In [16]:
import pandas as pd
import numpy as np

from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

import optuna

Support Vector Machine with no data feature scaling or data imbalance handling

In [17]:
asd_sm = pd.read_csv('asd_sm.csv')
kd_sm = pd.read_csv('kd_sm.csv')
normal_sm = pd.read_csv('normal_sm.csv')
rhd_sm = pd.read_csv('rhd_sm.csv')

normal_sm['state'] = 0
asd_sm['state'] = 1
kd_sm['state'] = 2
rhd_sm['state'] = 3

combined_df = pd.concat([normal_sm, asd_sm, kd_sm, rhd_sm], axis=0)

print('normal_sm:', round(combined_df['state'].value_counts()[0]/len(combined_df) * 100,2), '% of the dataset')
print('asd_sm:', round(combined_df['state'].value_counts()[1]/len(combined_df) * 100,2), '% of the dataset')
print('kd_sm:', round(combined_df['state'].value_counts()[2]/len(combined_df) * 100,2), '% of the dataset')
print('rhd_sm:', round(combined_df['state'].value_counts()[3]/len(combined_df) * 100,2), '% of the dataset')

raw_data = combined_df.copy()
raw_data.drop_duplicates(inplace=True)

X = raw_data.drop('state', axis=1)
y = raw_data['state']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.25, random_state = 42)

normal_sm: 96.83 % of the dataset
asd_sm: 1.43 % of the dataset
kd_sm: 0.64 % of the dataset
rhd_sm: 1.1 % of the dataset


In [18]:
from sklearn import svm

clf_rbf = svm.SVC(kernel="rbf", decision_function_shape='ovo')
clf_rbf.fit(X_train, y_train)
training_output = clf_rbf.predict(X_train)
training_f1 = f1_score(y_train, training_output, average='macro')

validation_output = clf_rbf.predict(X_validation)
validation_f1 = f1_score(y_validation, validation_output, average='macro')

print('Support Vector Machine with no data feature scaling or data imbalance handling')
print('-' * 50)
print(f'SVC With rbg kernel -> Training F1 Score: {training_f1} | Validation F1 Score: {validation_f1}')

clf_linear = svm.SVC(kernel='poly', decision_function_shape='ovo')
clf_linear.fit(X_train, y_train)
training_output = clf_linear.predict(X_train)
training_f1 = f1_score(y_train, training_output, average='macro')

validation_output = clf_linear.predict(X_validation)
validation_f1 = f1_score(y_validation, validation_output, average='macro')

print(f'SVC With polynomial kernel -> Training F1 Score: {training_f1} | Validation F1 Score: {validation_f1}')

Support Vector Machine with no data feature scaling or data imbalance handling
--------------------------------------------------
SVC With rbg kernel -> Training F1 Score: 0.385302406649829 | Validation F1 Score: 0.30905111425539444
SVC With polynomial kernel -> Training F1 Score: 0.48143741705359683 | Validation F1 Score: 0.37950796974053846


Support Vector Machine with data feature scaling or data imbalance handling

In [19]:
X = raw_data.drop('state', axis=1)
y = raw_data['state']

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.25, random_state = 42)

def Standard_Scaler (df, col_names):
    features = df[col_names]
    scaler = StandardScaler().fit(features.values)
    features = scaler.transform(features.values)
    df[col_names] = features
    
    return df

col_names = X.columns

X_train = Standard_Scaler(X_train, col_names)
X_validation = Standard_Scaler(X_validation, col_names)

resample = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))

X_train_resampled, y_train_resampled = resample.fit_resample(X_train, y_train)

print('normal_sm:', round(y_train_resampled.value_counts()[0]/len(y_train_resampled) * 100,2), '% of the dataset')
print('asd_sm:', round(y_train_resampled.value_counts()[1]/len(y_train_resampled) * 100,2), '% of the dataset')
print('kd_sm:', round(y_train_resampled.value_counts()[2]/len(y_train_resampled) * 100,2), '% of the dataset')
print('rhd_sm:', round(y_train_resampled.value_counts()[3]/len(y_train_resampled) * 100,2), '% of the dataset')

normal_sm: 25.0 % of the dataset
asd_sm: 25.0 % of the dataset
kd_sm: 25.0 % of the dataset
rhd_sm: 25.0 % of the dataset


In [20]:
from sklearn import svm

clf_rbf = svm.SVC(kernel="rbf", decision_function_shape='ovo')
clf_rbf.fit(X_train_resampled, y_train_resampled)
training_output = clf_rbf.predict(X_train_resampled)
training_f1 = f1_score(y_train_resampled, training_output, average='macro')

validation_output = clf_rbf.predict(X_validation)
validation_f1 = f1_score(y_validation, validation_output, average='macro')

print('Support Vector Machine with data feature scaling and data imbalance handling')
print('-' * 50)
print(f'SVC With rbg kernel -> Training F1 Score: {training_f1} | Validation F1 Score: {validation_f1}')

clf_linear = svm.SVC(kernel='poly', decision_function_shape='ovo')
clf_linear.fit(X_train_resampled, y_train_resampled)
training_output = clf_linear.predict(X_train_resampled)
training_f1 = f1_score(y_train_resampled, training_output, average='macro')

validation_output = clf_linear.predict(X_validation)
validation_f1 = f1_score(y_validation, validation_output, average='macro')

print(f'SVC With polynomial kernel -> Training F1 Score: {training_f1} | Validation F1 Score: {validation_f1}')

Support Vector Machine with data feature scaling and data imbalance handling
--------------------------------------------------
SVC With rbg kernel -> Training F1 Score: 0.9951462557831576 | Validation F1 Score: 0.44755383025371037
SVC With polynomial kernel -> Training F1 Score: 0.9814887584152926 | Validation F1 Score: 0.4422124874700002


Test Set

In [21]:
# pd.set_option('future.no_silent_downcasting', True)
# Test_Set = pd.read_csv('test_all.csv').copy()

# X = Test_Set.drop('state', axis=1)
# y = Test_Set['state']

# mapping = {
#     'normal': 0,
#     'asd': 1,
#     'kd': 2,
#     'rhd': 3
# }

# y = y.replace(mapping)
# y.fillna(0, inplace=True)
# y = y.astype(int)

# def Standard_Scaler (df, col_names):
#     features = df[col_names]
#     scaler = StandardScaler().fit(features.values)
#     features = scaler.transform(features.values)
#     df[col_names] = features
    
#     return df

# col_names = X.columns

# X_test = Standard_Scaler(X, col_names)

# test_output = clf_rbf.predict(X_test)

# test_f1 = f1_score(y, test_output, average='macro')

# print(f'Test Set F1 Score: {test_f1}')

# y_test_prediction = pd.DataFrame(test_output, columns=['prediction'])

# # Now saving it to a CSV file
# y_test_prediction.to_csv('Support_Vector_Machine_Answer.csv', index=False)

Test Set F1 Score: 0.4172886724881772


Support Vector Machine With Hyperparameter Tuning Using Optuna

In [22]:
def create_objective(X_train, y_train, X_val, y_val):

    def objective(trial):

        C = trial.suggest_float("C", 1e-2, 1, log=True)
        gamma = trial.suggest_float("gamma", 1e-2, 1, log=True)
    
        clf_rbf = svm.SVC(kernel="rbf", C=C, gamma=gamma, decision_function_shape='ovo')
        clf_rbf.fit(X_train, y_train)
        
        validation_output = clf_rbf.predict(X_val)
        validation_f1 = f1_score(y_val, validation_output, average='macro')
        
        return validation_f1  
    
    return objective

# Now create the objective function using the closure
objective = create_objective(X_train_resampled, y_train_resampled, X_validation, y_validation)

# Create and run the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-05-10 22:33:42,049] A new study created in memory with name: no-name-33ecd94c-a46a-48b6-ac5d-8736250c66d0
[I 2024-05-10 22:37:35,667] Trial 0 finished with value: 0.32081381170600853 and parameters: {'C': 0.6945999732582975, 'gamma': 0.01651252323917206}. Best is trial 0 with value: 0.32081381170600853.
[I 2024-05-10 22:42:29,035] Trial 1 finished with value: 0.3250001356204364 and parameters: {'C': 0.08272458714053041, 'gamma': 0.027132787351504938}. Best is trial 1 with value: 0.3250001356204364.
[I 2024-05-10 22:48:21,805] Trial 2 finished with value: 0.3156987765124126 and parameters: {'C': 0.47466185016705675, 'gamma': 0.021782828673264126}. Best is trial 1 with value: 0.3250001356204364.
[I 2024-05-10 23:05:14,334] Trial 3 finished with value: 0.24618101545253862 and parameters: {'C': 0.5338693112034739, 'gamma': 0.11095671852028988}. Best is trial 1 with value: 0.3250001356204364.
[I 2024-05-10 23:25:11,779] Trial 4 finished with value: 0.24618101545253862 and parameters

Best trial:
  Value: 0.3250001356204364
  Params: 
    C: 0.08272458714053041
    gamma: 0.027132787351504938
