In [329]:
import numpy as np
import pandas as pd

In [330]:
# This is basically the numerical representation of the audio file
data = np.load('arr_mfcc.npy')
data = np.moveaxis(data,2,0)
#Read the df
df = pd.read_csv('primary_dataframe.csv')
y = df.diagnosis_Healthy
del df # Get this out of memory

# Load the data
data = np.load('arr_mfcc.npy')
data = np.moveaxis(data, 2, 0)

# Create a list to store the data
data_list = []

# Loop over each audio file
for i in range(len(data)):
    # Loop over each time step
    time_steps = []
    for j in range(1, len(data[i])):
        time_steps.append(data[i][j])
    
    # Append the data for this audio file to the list
    data_list.append(time_steps)

# Create the dataframe
df = pd.DataFrame({'data': data_list, 'label': y})

# Under Sampling Approaches

In [331]:
from sklearn.model_selection import train_test_split

# Extract the data and labels from the dataframe
X = df['data'].values
y = df['label'].values

# Split the data and labels into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [332]:
from imblearn.under_sampling import RandomUnderSampler

# Perform under-sampling
rus = RandomUnderSampler(sampling_strategy=0.2, random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train.reshape(-1, 1), y_train)

# Convert the resampled data back to arrays
X_train = X_train_resampled.flatten()
y_train = y_train_resampled.flatten()

In [333]:
# Reshape X_train and X_test to have shape (num_samples, num_features)
X_train = np.array([np.array(x).reshape(-1) for x in X_train])
#X_validate = np.array([np.array(x).reshape(-1) for x in X_validate])
X_test = np.array([np.array(x).reshape(-1) for x in X_test])

#The result of this is basically a shape of (num_samples, every_single_observation_of_features)


In [334]:
X_train.shape

(180, 33618)

# SVMs & Different Validation (Undersampling)

In [335]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, make_scorer

In [336]:
#Fiting on raw data that has not been scaled

# Define the scorer
scorer = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'f1': make_scorer(f1_score),
    'recall_score': make_scorer(recall_score)
}

# Define the parameter grid for grid search
param_grid = {
    'kernel' : ['linear'],
    'C': [100],
}

# Create an SVM model with a linear kernel
model = SVC()


# Create a grid search object with cross-validation
grid_search = GridSearchCV(
    model, param_grid, cv=10, scoring= scorer, n_jobs=-1, verbose=1, refit=  'accuracy'
)

# Train the grid search object on the training data
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [337]:
cv_results = pd.DataFrame(grid_search.cv_results_) #adjust this to just use means
cv_results[['params', 'mean_fit_time', 'mean_test_accuracy', 'std_test_accuracy', 'mean_test_precision', 'std_test_precision', 'mean_test_f1','std_test_f1', 'mean_test_recall_score', 'std_test_recall_score']]

Unnamed: 0,params,mean_fit_time,mean_test_accuracy,std_test_accuracy,mean_test_precision,std_test_precision,mean_test_f1,std_test_f1,mean_test_recall_score,std_test_recall_score
0,"{'C': 100, 'kernel': 'linear'}",0.769152,0.872222,0.061111,0.641667,0.372771,0.509524,0.291466,0.466667,0.305505


In [338]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# Use cross_val_predict to get the predicted labels for each sample using the best model
y_pred = cross_val_predict(grid_search.best_estimator_, X_train, y_train, cv=10)

# Get the confusion matrix
confusion_mat = confusion_matrix(y_train, y_pred)

print('Confusion matrix:\n', confusion_mat)

Confusion matrix:
 [[143   7]
 [ 16  14]]


In [339]:
# Repeating the procedure for the scaled data
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object
scaler = StandardScaler()
# Scale the training data
X_train_scaled = scaler.fit_transform(X_train)

# Create a grid search object with cross-validation
grid_search_scaled = GridSearchCV(
    model, param_grid, cv=10, scoring= scorer, n_jobs=-1, verbose=1, refit=  'accuracy'
)

# Train the grid search object on the training data
grid_search_scaled.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [340]:
cv_results = pd.DataFrame(grid_search_scaled.cv_results_) #adjust this to just use means
cv_results[['params', 'mean_fit_time', 'mean_test_accuracy', 'std_test_accuracy', 'mean_test_precision', 'std_test_precision', 'mean_test_f1','std_test_f1', 'mean_test_recall_score', 'std_test_recall_score']]

Unnamed: 0,params,mean_fit_time,mean_test_accuracy,std_test_accuracy,mean_test_precision,std_test_precision,mean_test_f1,std_test_f1,mean_test_recall_score,std_test_recall_score
0,"{'C': 100, 'kernel': 'linear'}",0.724077,0.916667,0.037268,0.841667,0.160078,0.712381,0.147515,0.666667,0.210819


In [341]:
# Use cross_val_predict to get the predicted labels for each sample using the best model
y_pred = cross_val_predict(grid_search_scaled.best_estimator_, X_train_scaled, y_train, cv=10)

# Get the confusion matrix
confusion_mat = confusion_matrix(y_train, y_pred)

print('Confusion matrix:\n', confusion_mat)

Confusion matrix:
 [[145   5]
 [ 10  20]]


In [342]:
# Repeating the procedure for the scaled data with PCA applied
from sklearn.decomposition import PCA

# Create a PCA object
pca = PCA(n_components=0.95)
pca.fit(X_train_scaled)
# Transform the training data
X_train_scaled_pca = pca.transform(X_train_scaled)

# Create a grid search object with cross-validation
grid_search_scaled_pca = GridSearchCV(
    model, param_grid, cv=10, scoring= scorer, n_jobs=-1, verbose=1, refit=  'accuracy'
)

# Train the grid search object on the training data
grid_search_scaled_pca.fit(X_train_scaled_pca, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [343]:
cv_results = pd.DataFrame(grid_search_scaled_pca.cv_results_) #adjust this to just use means
cv_results[['params', 'mean_fit_time', 'mean_test_accuracy', 'std_test_accuracy', 'mean_test_precision', 'std_test_precision', 'mean_test_f1','std_test_f1', 'mean_test_recall_score', 'std_test_recall_score']]

Unnamed: 0,params,mean_fit_time,mean_test_accuracy,std_test_accuracy,mean_test_precision,std_test_precision,mean_test_f1,std_test_f1,mean_test_recall_score,std_test_recall_score
0,"{'C': 100, 'kernel': 'linear'}",0.00271,0.916667,0.04479,0.85,0.189297,0.71,0.161967,0.633333,0.179505


In [344]:
# Use cross_val_predict to get the predicted labels for each sample using the best model
y_pred = cross_val_predict(grid_search_scaled_pca.best_estimator_, X_train_scaled_pca, y_train, cv=10)

# Get the confusion matrix
confusion_mat = confusion_matrix(y_train, y_pred)

print('Confusion matrix:\n', confusion_mat)

Confusion matrix:
 [[146   4]
 [ 11  19]]


In [345]:
# Decide to use model
model = grid_search_scaled.best_estimator_

# Test Data

In [346]:
from sklearn.metrics import classification_report


# Predict on the validation data
y_pred = model.predict(scaler.fit_transform(X_test))
# Evaluate the model on the test data and print precision, recall, and F1-score
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      0.93      0.97       225
         1.0       0.25      1.00      0.40         5

    accuracy                           0.93       230
   macro avg       0.62      0.97      0.68       230
weighted avg       0.98      0.93      0.95       230



In [347]:
# Create a labelled confusion matrix
cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
print(cm)

Predicted  0.0  1.0
Actual             
0.0        210   15
1.0          0    5


# Save the model

In [348]:
import pickle
# Save the model to your directory
with open("svm_model.pkl", "wb") as f:
    pickle.dump(model, f)

In [349]:
# Save the scaler to your directory
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [350]:
# Save the test file as a npy file
np.save('X_test.npy', X_test)