In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier

In [2]:
# Set the random seed for reproducibility
random.seed(42)

In [3]:
# Load Dataset
file_path = "/Users/Sebastiano/ML_MRI copy.xlsx"
df = pd.read_excel(file_path)

df = shuffle(df)
df.reset_index(inplace=True, drop=True)

print("N° of patients: {}".format(len(df)))
print("N° of columns: {}".format(df.shape[1]))
df.head()

N° of patients: 47
N° of columns: 932


Unnamed: 0,Patient,Gender,Age,NP-SLE,Event,Scale factor,SNR,White Matter (WM) volume cm3,White Matter (WM) volume %,Normal Appearing White Matter volume cm3,...,FO left thickness mm,FO left thickness norm.,FO thickness asymmetry,PO total thickness mm,PO total thickness norm.,PO right thickness mm,PO right thickness norm.,PO left thickness mm,PO left thickness norm.,PO thickness asymmetry
0,paziente 28,0,62,1,Mood abnormalities (depressive),0.62973,43.8559,393.3745,34.427,392.581,...,3.5535,0.03399,-23.6272,2.7065,0.025888,2.3707,0.022676,2.9864,0.028565,-22.9866
1,paziente 31,0,48,1,Psychosis,0.75214,45.3995,457.9349,34.1854,456.3373,...,2.069,0.018769,3.1731,1.914,0.017363,2.0025,0.018165,1.8525,0.016805,7.7805
2,Paziente 15,0,60,1,Psychosis,0.69406,51.4202,467.2516,36.3049,466.9031,...,1.5857,0.014578,25.8052,1.9698,0.018109,1.906,0.017523,2.031,0.018671,-6.3473
3,Paziente 6,0,55,0,Na,0.73812,49.6214,499.9131,37.3108,499.7323,...,2.4721,0.022424,11.9474,2.3978,0.02175,2.4657,0.022366,2.3259,0.021098,5.8332
4,paziente 32,0,37,0,Na,0.57994,102.2972,415.1286,39.2274,411.1932,...,1.5729,0.015435,-5.1432,1.0979,0.010774,1.157,0.011353,1.0314,0.010121,11.4765


In [4]:
# Drop unwanted columns
df = df.drop(['Patient', 'Gender', 'Age', 'Event', 'Scale factor', 'SNR'], axis='columns')

# Normalize the selected features
features_to_normalize = df.columns.difference(['NP-SLE'])
scaler = MinMaxScaler()
df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])


In [5]:
# Separate features and target variable
X = df.drop(['NP-SLE'], axis=1)
y = df['NP-SLE']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Initialize and train the XGBoost model
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', random_state=42)
xgb_clf.fit(X_train, y_train)

In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define the hyperparameter grid to search
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],  # Applicable for 'poly' kernel
}

# Create the SVM classifier
svm_classifier = SVC(probability=True, random_state=42)

# Initialize StratifiedKFold with 5 folds
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the GridSearchCV with StratifiedKFold
grid_search = GridSearchCV(svm_classifier, param_grid, cv=stratified_kfold)

# Fit the GridSearchCV to your training data
grid_search.fit(X_train, y_train)

# Get the best parameters found by grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best SVM classifier with the best hyperparameters
best_svm_classifier = grid_search.best_estimator_

# Train the best SVM classifier on the training data
best_svm_classifier.fit(X_train, y_train)

Best Hyperparameters: {'C': 1, 'degree': 2, 'gamma': 0.01, 'kernel': 'poly'}


In [8]:
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train, y_train)

In [9]:
from sklearn.ensemble import StackingClassifier

# Define the base learners
base_learners = [
    ('svm', best_svm_classifier),
    ('rf', rf),
    ('xgb', xgb_clf)
]

# Initialize StackingClassifier
stacking_clf = StackingClassifier(
    estimators=base_learners, final_estimator=RandomForestClassifier()
)

# Train the StackingClassifier
stacking_clf.fit(X_train, y_train)

# Evaluate the model
y_pred_stacking = stacking_clf.predict(X_test)
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
report_stacking = classification_report(y_test, y_pred_stacking, zero_division=0)

print("Accuracy on Test Set:", accuracy_stacking)
print("Classification Report:\n", report_stacking)


Accuracy on Test Set: 0.6
Classification Report:
               precision    recall  f1-score   support

           0       0.25      0.50      0.33         2
           1       0.50      0.25      0.33         4
           2       1.00      1.00      1.00         4

    accuracy                           0.60        10
   macro avg       0.58      0.58      0.56        10
weighted avg       0.65      0.60      0.60        10



In [10]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

# Define the base learners
base_learners = [
    ('svm', best_svm_classifier),
    ('rf', rf),
    ('xgb', xgb_clf)
]

# Initialize StackingClassifier
stacking_clf = StackingClassifier(
    estimators=base_learners, final_estimator=RandomForestClassifier()
)

# Initialize StratifiedKFold for cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform 5-fold cross-validation and calculate the average accuracy
cross_val_scores = cross_val_score(stacking_clf, X_train, y_train, cv=stratified_kfold, scoring='accuracy')
average_cross_val_accuracy = np.mean(cross_val_scores)

# Train the StackingClassifier
stacking_clf.fit(X_train, y_train)

# Evaluate the model
y_pred_stacking = stacking_clf.predict(X_test)

# Calculate accuracy and print a classification report
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
report_stacking = classification_report(y_test, y_pred_stacking, zero_division=0)

# Output the results
print("Average Cross-Validation Accuracy:", average_cross_val_accuracy)
print("Accuracy on Test Set:", accuracy_stacking)
print("Classification Report:\n", report_stacking)

Average Cross-Validation Accuracy: 0.6785714285714286
Accuracy on Test Set: 0.6
Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.33      0.33         3
           1       0.33      0.33      0.33         3
           2       1.00      1.00      1.00         4

    accuracy                           0.60        10
   macro avg       0.56      0.56      0.56        10
weighted avg       0.60      0.60      0.60        10

