In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, precision_score, recall_score
from scipy.ndimage import gaussian_filter


In [3]:
# Load the sonar data set from the CSV file (path is relative to the working directory)
df = pd.read_csv("sonar.all-data.csv")


In [4]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Freq_1,Freq_2,Freq_3,Freq_4,Freq_5,Freq_6,Freq_7,Freq_8,Freq_9,Freq_10,...,Freq_52,Freq_53,Freq_54,Freq_55,Freq_56,Freq_57,Freq_58,Freq_59,Freq_60,Label
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [6]:
# Seed NumPy's global RNG so the noise drawn further down is repeatable
np.random.seed(42)

# Every column except the last is a feature; the last column holds the label
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Apply Gaussian noise to data
def apply_gaussian_noise(X, scale=0.25):
    """Return ``X`` with zero-mean Gaussian noise added element-wise.

    Args:
        X: Array-like that exposes ``.shape`` and supports ``+`` with a
            NumPy array of that shape.
        scale: Standard deviation of the noise.

    Returns:
        ``X + noise`` where ``noise`` has the same shape as ``X``.

    The noise is drawn from NumPy's global random state, so seed it with
    ``np.random.seed`` beforehand if repeatable output is required.
    """
    return X + np.random.normal(0, scale, X.shape)

# Apply smoothing with a Gaussian filter
def apply_smoothing(X, sigma=0.1):
    """Gaussian-filter every 1-D slice of ``X`` taken along axis 0.

    Args:
        X: Array-like input; each 1-D slice along axis 0 is filtered
            independently of the others.
        sigma: Standard deviation of the Gaussian kernel.

    Returns:
        NumPy array holding the filtered slices.

    NOTE(review): for a samples-by-features table, axis 0 runs across rows,
    so this blends neighbouring samples rather than neighbouring features --
    confirm that is the intended direction.
    NOTE(review): with ``sigma=0.1`` and scipy's default ``truncate=4.0`` the
    kernel radius appears to round down to 0, which would make the default
    call an identity transform -- confirm the intended sigma.
    """
    def _filter_slice(values):
        # Closure over ``sigma`` instead of forwarding it through apply_along_axis
        return gaussian_filter(values, sigma=sigma)

    return np.apply_along_axis(_filter_slice, 0, X)

# Process and generate synthetic data using GMM
def generate_synthetic_data(X, y):
    """Sample one synthetic data set per class from class-wise Gaussian mixtures.

    For each of the labels ``'R'`` and ``'M'`` a two-component
    ``GaussianMixture`` (``random_state=42``) is fitted to the rows of ``X``
    whose label matches, and as many points are sampled from it as there
    were rows in that class. Negative sampled values are clipped to zero.

    Args:
        X: Feature matrix; must accept boolean-mask indexing with
            ``y == label``.
        y: Labels aligned with the rows of ``X``.

    Returns:
        Tuple ``(synthetic_R, synthetic_M)`` of sampled arrays.
    """
    def _draw(label):
        # Rows belonging to this class only
        class_rows = X[y == label]
        mixture = GaussianMixture(n_components=2, random_state=42)
        mixture.fit(class_rows)
        # Match the original class size; the component assignments are discarded
        samples, _ = mixture.sample(n_samples=len(class_rows))
        # Clip negatives to zero
        return np.maximum(samples, 0)

    synthetic_R = _draw('R')
    synthetic_M = _draw('M')
    return synthetic_R, synthetic_M

# Perturb the features with Gaussian noise, then run the smoothing step
X_noisy = apply_gaussian_noise(X)
X_smooth = apply_smoothing(X_noisy)

# Draw per-class synthetic samples from mixtures fitted to the processed data
synthetic_R, synthetic_M = generate_synthetic_data(X_smooth, y)

# Restore the feature column names and attach each class label
synthetic_data_R = pd.DataFrame(synthetic_R, columns=X.columns).assign(Label='R')
synthetic_data_M = pd.DataFrame(synthetic_M, columns=X.columns).assign(Label='M')

# Stack both classes, shuffle the rows reproducibly and renumber the index
synthetic_data = (
    pd.concat([synthetic_data_R, synthetic_data_M])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

synthetic_data.head()



Unnamed: 0,Freq_1,Freq_2,Freq_3,Freq_4,Freq_5,Freq_6,Freq_7,Freq_8,Freq_9,Freq_10,...,Freq_52,Freq_53,Freq_54,Freq_55,Freq_56,Freq_57,Freq_58,Freq_59,Freq_60,Label
0,0.172737,0.101495,0.0,0.337014,0.272193,0.0,0.0,0.046416,0.355184,0.013217,...,0.084394,0.25169,0.090909,0.0,0.243754,0.237907,0.0,0.17882,0.294573,M
1,0.076474,0.010928,0.0,0.0,0.0,0.030567,0.23796,0.233535,0.104934,0.374905,...,0.02605,0.0,0.0,0.377,0.169423,0.132765,0.0,0.0,0.0,R
2,0.21281,0.0,0.0,0.163172,0.054232,0.068321,0.0,0.29433,0.295069,0.427567,...,0.0,0.0,0.0,0.27413,0.091332,0.245878,0.279333,0.325832,0.0,R
3,0.0,0.0,0.0,0.0,0.0,0.189273,0.431482,0.0,0.0,0.380771,...,0.0,0.306687,0.0,0.033175,0.35182,0.0,0.0,0.0,0.0,R
4,0.0,0.203828,0.0,0.0,0.448534,0.370336,0.280846,0.622841,0.279216,0.476197,...,0.0,0.0,0.0,0.0,0.165609,0.399648,0.0,0.0,0.083054,M


In [7]:
def process_dataset(dataset):
    """Train and score four classifiers on a labelled dataset.

    The frame is split 80/20 (``random_state=42``) and the features are
    standardised with statistics taken from the training split only.
    ``n_neighbors`` for KNN is chosen by a 5-fold cross-validated grid search
    over 1..29. Random Forest, KNN (with the chosen k), MLP and LightGBM are
    then each fitted on the scaled training data and scored on the scaled
    test data.

    Args:
        dataset: DataFrame with a ``'Label'`` column; every other column is
            used as a feature.

    Returns:
        ``(results, optimal_k)`` where ``results`` maps a model name to
        ``[accuracy, macro precision, macro recall]`` on the test split and
        ``optimal_k`` is the ``n_neighbors`` value picked by the grid search.
    """
    X = dataset.drop('Label', axis=1)
    y = dataset['Label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale once, using training statistics only, for the final model comparison
    scaler = StandardScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    # Tune k inside a pipeline fed with the unscaled training data, so scaling
    # is part of what gets cross-validated. The pipeline gets its own
    # StandardScaler so the scaler fitted above is not shared with the search.
    knn_pipeline = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])
    param_grid = {'knn__n_neighbors': list(range(1, 30))}
    full_cv_classifier = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='accuracy')
    full_cv_classifier.fit(X_train, y_train)

    optimal_k = full_cv_classifier.best_params_['knn__n_neighbors']

    # KNN is built with the tuned k and fed the same once-scaled arrays as the
    # other models; passing the tuned scaler+KNN pipeline here instead would
    # standardise the data a second time.
    models = {
        "Random Forest": RandomForestClassifier(random_state=42),
        "KNN": KNeighborsClassifier(n_neighbors=optimal_k),
        "MLP": MLPClassifier(random_state=42),
        "LightGBM": lgb.LGBMClassifier(),
    }

    def train_evaluate(model):
        """Fit ``model`` on the scaled train split; return test metrics as a list."""
        model.fit(scaled_X_train, y_train)
        predictions = model.predict(scaled_X_test)
        accuracy = accuracy_score(y_test, predictions)
        precision = precision_score(y_test, predictions, average='macro')
        recall = recall_score(y_test, predictions, average='macro')
        return [accuracy, precision, recall]

    # Dict order is insertion order, so models are trained and reported in the
    # order listed above.
    results = {name: train_evaluate(model) for name, model in models.items()}

    return results, optimal_k


# Evaluate the same model suite on the original and on the synthetic dataset
results_df, optimal_k_df = process_dataset(df)
results_data, optimal_k_data = process_dataset(synthetic_data)

# Print one report per dataset: the tuned K for KNN, then each model's metrics
for heading, best_k, report in (
    ("Results for dataset original :", optimal_k_df, results_df),
    ("\nResults for dataset modified:", optimal_k_data, results_data),
):
    print(heading)
    print(f"Optimal K for KNN: {best_k}")
    for model, scores in report.items():
        accuracy, precision, recall = scores
        print(f"{model}: Accuracy={accuracy}, Precision={precision}, Recall={recall}")



[LightGBM] [Info] Number of positive: 81, number of negative: 85
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000946 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3302
[LightGBM] [Info] Number of data points in the train set: 166, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.487952 -> initscore=-0.048202
[LightGBM] [Info] Start training from score -0.048202
[LightGBM] [Info] Number of positive: 74, number of negative: 92
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2676
[LightGBM] [Info] Number of data points in the train set: 166, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.445783 -> initscore=-0.217723
[LightGBM] [Info] 

