In [4]:
import random
random.seed(42)  # Set the random seed to 42 for reproducibility

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle


In [6]:
# Load Dataset

file_path = "/Users/Sebastiano/ML_MRI copy.xlsx"
df = pd.read_excel(file_path)

df = shuffle(df)
df.reset_index(inplace=True, drop=True)

print("N° of patients: {}".format(len(df)))
print("N° of columns: {}".format(df.shape[1]))
df.head()

N° of patients: 47
N° of columns: 932


Unnamed: 0,Patient,Gender,Age,NP-SLE,Event,Scale factor,SNR,White Matter (WM) volume cm3,White Matter (WM) volume %,Normal Appearing White Matter volume cm3,...,FO left thickness mm,FO left thickness norm.,FO thickness asymmetry,PO total thickness mm,PO total thickness norm.,PO right thickness mm,PO right thickness norm.,PO left thickness mm,PO left thickness norm.,PO thickness asymmetry
0,job1625737,0,45,2,Na,0.69628,35.5452,371.5893,31.4143,358.0627,...,3.6794,0.034791,-14.0668,2.5826,0.02442,2.4164,0.022849,2.7343,0.025854,-12.3408
1,Paziente 19,0,21,1,Movement Disorder,0.61716,52.8577,378.7339,33.9454,378.1877,...,3.4939,0.033687,2.0921,2.5906,0.024978,2.4642,0.023759,2.6966,0.026,-9.0078
2,paziente 33,1,48,0,Na,0.82252,38.7983,550.644,37.0593,550.6037,...,2.607,0.022846,20.3536,2.0938,0.018349,2.1978,0.01926,1.9565,0.017146,11.6179
3,job1625740,0,34,2,Na,0.77808,34.9677,394.7451,29.6633,355.4317,...,4.2384,0.038533,-0.41052,3.7057,0.03369,3.5324,0.032115,3.8456,0.034963,-8.4905
4,job1628628,0,50,2,Na,0.71459,34.9528,358.0635,29.6291,325.1168,...,3.9499,0.037083,-5.5596,2.5781,0.024204,2.341,0.021978,2.7756,0.026058,-16.9897


In [7]:
# Drop unwanted columns

df = df.drop(['Patient', 'Gender', 'Age','Event', 'Scale factor', 'SNR'], axis = 'columns')
# drop columns that include "%" in their name
#cols_to_drop = [col for col in df.columns if "%" in col]
#df = df.drop(columns=cols_to_drop)
print("Effective features to consider: {} ".format(len(df.columns)-1))

Effective features to consider: 925 


In [8]:
from sklearn.preprocessing import MinMaxScaler

features_to_normalize = df.columns.difference(['NP-SLE']) 
# Normalize the selected features
scaler = MinMaxScaler()
df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])

In [9]:
# Define your features and target variable
X = df.drop(['NP-SLE'], axis=1)  
y = df['NP-SLE']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define the hyperparameter grid to search
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],  # Applicable for 'poly' kernel
}

# Create the SVM classifier
svm_classifier = SVC()

# Initialize StratifiedKFold with 5 folds
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the GridSearchCV with StratifiedKFold
grid_search = GridSearchCV(svm_classifier, param_grid, cv=stratified_kfold)

# Fit the GridSearchCV to your training data
grid_search.fit(X_train, y_train)

# Get the best parameters found by grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best SVM classifier with the best hyperparameters
best_svm_classifier = grid_search.best_estimator_

# Train the best SVM classifier on the training data
best_svm_classifier.fit(X_train, y_train)

# Perform k-fold cross-validation and calculate the average accuracy
from sklearn.model_selection import cross_val_score

cross_val_scores = cross_val_score(best_svm_classifier, X_train, y_train, cv=stratified_kfold)
average_cross_val_accuracy = cross_val_scores.mean()

# Predict on the test set
y_pred = best_svm_classifier.predict(X_test)

# Calculate accuracy and print a classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0)

print("Average Cross-Validation Accuracy:", average_cross_val_accuracy)
print("Accuracy on Test Set:", accuracy)
print("Classification Report:\n", report)


Best Hyperparameters: {'C': 0.1, 'degree': 3, 'gamma': 0.01, 'kernel': 'poly'}
Average Cross-Validation Accuracy: 0.7535714285714287
Accuracy on Test Set: 0.7
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.40      1.00      0.57         2
           2       1.00      1.00      1.00         5

    accuracy                           0.70        10
   macro avg       0.47      0.67      0.52        10
weighted avg       0.58      0.70      0.61        10

