In [11]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

In [12]:
# Set the random seed for reproducibility
random.seed(42)

In [13]:
# Load Dataset
file_path = "/Users/Sebastiano/ML_MRI copy.xlsx"
df = pd.read_excel(file_path)

print("N° of patients: {}".format(len(df)))
print("N° of columns: {}".format(df.shape[1]))
df.head()

N° of patients: 47
N° of columns: 932


Unnamed: 0,Patient,Gender,Age,NP-SLE,Event,Scale factor,SNR,White Matter (WM) volume cm3,White Matter (WM) volume %,Normal Appearing White Matter volume cm3,...,FO left thickness mm,FO left thickness norm.,FO thickness asymmetry,PO total thickness mm,PO total thickness norm.,PO right thickness mm,PO right thickness norm.,PO left thickness mm,PO left thickness norm.,PO thickness asymmetry
0,Paziente 1,0,38,1,Mood abnormalities (depressive),0.67586,42.3566,438.3091,35.4223,438.2523,...,2.2623,0.021072,18.2292,2.4475,0.022797,2.293,0.021358,2.597,0.02419,-12.4336
1,Paziente 2,0,41,0,Na,0.70729,105.5166,472.6302,37.2214,466.0998,...,1.8574,0.017152,-18.2462,1.3628,0.012585,1.2929,0.01194,1.4317,0.013222,-10.1909
2,job1625735,0,58,2,Na,0.80425,32.0757,444.9024,32.3276,444.4705,...,2.5364,0.022803,19.376,2.8289,0.025432,2.4842,0.022334,3.1091,0.027951,-22.3438
3,Paziente 3,0,32,0,Na,0.65236,49.4839,407.0018,33.7657,406.977,...,2.6216,0.024634,6.8561,2.3106,0.021711,2.484,0.023341,2.1159,0.019882,16.004
4,job1625738,0,42,2,Na,0.66906,32.9605,354.9196,31.1643,339.6141,...,2.7589,0.026419,-1.7345,1.9995,0.019147,1.8184,0.017413,2.1486,0.020574,-16.6438


In [14]:
# Drop unwanted columns
df = df.drop(['Patient', 'Gender', 'Age', 'Event', 'Scale factor', 'SNR'], axis='columns')

# Normalize the selected features
features_to_normalize = df.columns.difference(['NP-SLE'])
scaler = MinMaxScaler()
df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])


In [15]:
# Separate features and target variable
X = df.drop(['NP-SLE'], axis=1)
y = df['NP-SLE']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Initialize and train the XGBoost model
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', random_state=42)
xgb_clf.fit(X_train, y_train)

In [17]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define the hyperparameter grid to search
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],  # Applicable for 'poly' kernel
}

# Create the SVM classifier
svm_classifier = SVC(probability=True, random_state=42)

# Initialize StratifiedKFold with 5 folds
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the GridSearchCV with StratifiedKFold
grid_search = GridSearchCV(svm_classifier, param_grid, cv=stratified_kfold)

# Fit the GridSearchCV to your training data
grid_search.fit(X_train, y_train)

# Get the best parameters found by grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best SVM classifier with the best hyperparameters
best_svm_classifier = grid_search.best_estimator_

# Train the best SVM classifier on the training data
best_svm_classifier.fit(X_train, y_train)

Best Hyperparameters: {'C': 0.1, 'degree': 4, 'gamma': 1, 'kernel': 'poly'}


In [18]:
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train, y_train)

In [19]:
from sklearn.ensemble import VotingClassifier

# Assuming xgb_clf is already defined and trained or configured
voting_clf = VotingClassifier(
    estimators=[('svm', best_svm_classifier), ('rf', rf), ('xgb', xgb_clf)],
    voting='soft'  # or 'hard'
)

# Train the VotingClassifier
voting_clf.fit(X_train, y_train)

# Evaluate the model
y_pred_voting = voting_clf.predict(X_test)
accuracy_voting = accuracy_score(y_test, y_pred_voting)
report_voting = classification_report(y_test, y_pred_voting, zero_division=0)

print("Accuracy on Test Set:", accuracy_voting)
print("Classification Report:\n", report_voting)


Accuracy on Test Set: 0.8
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           1       0.50      0.50      0.50         2
           2       1.00      1.00      1.00         6

    accuracy                           0.80        10
   macro avg       0.67      0.67      0.67        10
weighted avg       0.80      0.80      0.80        10



In [20]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

# Assuming your classifiers (best_svm_classifier, rf, xgb_clf) are already defined
voting_clf = VotingClassifier(
    estimators=[('svm', best_svm_classifier), ('rf', rf), ('xgb', xgb_clf)],
    voting='soft'
)

# Initialize StratifiedKFold for cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform 5-fold cross-validation and calculate the average accuracy
cross_val_scores = cross_val_score(voting_clf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
average_cross_val_accuracy = np.mean(cross_val_scores)

# Train the VotingClassifier on the entire training set
voting_clf.fit(X_train, y_train)

# Predict on the test set
y_pred_voting = voting_clf.predict(X_test)

# Calculate accuracy and print a classification report
accuracy_voting = accuracy_score(y_test, y_pred_voting)
report_voting = classification_report(y_test, y_pred_voting, zero_division=0)

# Output the results
print("Average Cross-Validation Accuracy:", average_cross_val_accuracy)
print("Accuracy on Test Set:", accuracy_voting)
print("Classification Report:\n", report_voting)

Average Cross-Validation Accuracy: 0.7535714285714287
Accuracy on Test Set: 0.8
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.50      0.50         2
           1       0.50      0.50      0.50         2
           2       1.00      1.00      1.00         6

    accuracy                           0.80        10
   macro avg       0.67      0.67      0.67        10
weighted avg       0.80      0.80      0.80        10

