In [2]:
import random
random.seed(42)  # Set the random seed to 42 for reproducibility

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report

In [4]:
# Load Dataset

file_path = "/Users/Sebastiano/NeuroLupus_ML/ML_MRI copy.xlsx"
df = pd.read_excel(file_path)

print("N° of patients: {}".format(len(df)))
print("N° of columns: {}".format(df.shape[1]))
df.head()

N° of patients: 47
N° of columns: 932


Unnamed: 0,Patient,Gender,Age,NP-SLE,Event,Scale factor,SNR,White Matter (WM) volume cm3,White Matter (WM) volume %,Normal Appearing White Matter volume cm3,...,FO left thickness mm,FO left thickness norm.,FO thickness asymmetry,PO total thickness mm,PO total thickness norm.,PO right thickness mm,PO right thickness norm.,PO left thickness mm,PO left thickness norm.,PO thickness asymmetry
0,Paziente 1,0,38,1,Mood abnormalities (depressive),0.67586,42.3566,438.3091,35.4223,438.2523,...,2.2623,0.021072,18.2292,2.4475,0.022797,2.293,0.021358,2.597,0.02419,-12.4336
1,Paziente 2,0,41,0,Na,0.70729,105.5166,472.6302,37.2214,466.0998,...,1.8574,0.017152,-18.2462,1.3628,0.012585,1.2929,0.01194,1.4317,0.013222,-10.1909
2,Paziente 3,0,32,0,Na,0.65236,49.4839,407.0018,33.7657,406.977,...,2.6216,0.024634,6.8561,2.3106,0.021711,2.484,0.023341,2.1159,0.019882,16.004
3,Paziente 4,0,31,1,Seizure,0.65564,44.808,424.9121,35.646,424.8701,...,3.0341,0.028616,-6.5858,2.1641,0.02041,2.2997,0.021689,2.0193,0.019045,12.9849
4,Paziente 5,0,43,0,Na,0.76373,94.5834,548.5729,41.4234,547.8604,...,3.9152,0.035652,-10.4521,2.596,0.02364,2.5593,0.023305,2.6209,0.023866,-2.3788


In [5]:
# Drop unwanted columns

df = df.drop(['Patient', 'Gender', 'Age','Event', 'Scale factor', 'SNR'], axis = 'columns')
# drop columns that include "%" in their name
#cols_to_drop = [col for col in df.columns if "%" in col]
#df = df.drop(columns=cols_to_drop)
print("Effective features to consider: {} ".format(len(df.columns)-1))

Effective features to consider: 925 


In [6]:
from sklearn.preprocessing import MinMaxScaler

features_to_normalize = df.columns.difference(['NP-SLE']) 
# Normalize the selected features
scaler = MinMaxScaler()
df[features_to_normalize] = scaler.fit_transform(df[features_to_normalize])

In [7]:
# Define your features and target variable
X = df.drop(['NP-SLE'], axis=1)  
y = df['NP-SLE']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define the hyperparameter grid to search
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],  
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],  # Applicable for 'poly' kernel
}

# Create the SVM classifier
svm_classifier = SVC()

# Initialize StratifiedKFold with 5 folds
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize the GridSearchCV with StratifiedKFold
grid_search = GridSearchCV(svm_classifier, param_grid, cv=stratified_kfold)

# Fit the GridSearchCV to your training data
grid_search.fit(X_train, y_train)

# Get the best parameters found by grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Get the best SVM classifier with the best hyperparameters
best_svm_classifier = grid_search.best_estimator_

# Train the best SVM classifier on the training data
best_svm_classifier.fit(X_train, y_train)

# Perform k-fold cross-validation and calculate the average accuracy
from sklearn.model_selection import cross_val_score

cross_val_scores = cross_val_score(best_svm_classifier, X_train, y_train, cv=stratified_kfold)
average_cross_val_accuracy = cross_val_scores.mean()

# Predict on the test set
y_pred = best_svm_classifier.predict(X_test)

# Calculate accuracy and print a classification report
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Average Cross-Validation Accuracy:", average_cross_val_accuracy)
print("Accuracy on Test Set:", accuracy)
print("Classification Report:\n", report)


Best Hyperparameters: {'C': 10, 'degree': 2, 'gamma': 0.01, 'kernel': 'rbf'}
Average Cross-Validation Accuracy: 0.8714285714285716
Accuracy on Test Set: 0.8
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.50      1.00      0.67         2
           2       1.00      1.00      1.00         4

    accuracy                           0.80        10
   macro avg       0.83      0.83      0.78        10
weighted avg       0.90      0.80      0.80        10



In [12]:
import numpy as np

# Initialize a list to store the names of highly correlated features
highly_correlated_features = []

# Set the correlation threshold (e.g., 90%)
correlation_threshold = 0.9

# Iterate through each feature in the DataFrame
for feature in df.columns:
    if feature != 'NP-SLE':  # Skip the target variable
        correlation = np.abs(df[feature].corr(y))
        if correlation > correlation_threshold:
            print(f"Feature '{feature}' is highly correlated with 'NP-SLE' (Correlation: {correlation:.2f})")
            highly_correlated_features.append(feature)

# Print the names of highly correlated features
print("\nHighly correlated features to be eliminated:")
print(highly_correlated_features)

# Drop the highly correlated features from the DataFrame
df_filtered = df.drop(columns=highly_correlated_features)


Highly correlated features to be eliminated:
[]


In [13]:
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import LeaveOneOut, KFold
import numpy as np
from sklearn.feature_selection import SelectFromModel


# Initialize lists to store results for each repetition
accuracies = []
precision_list = []
recall_list = []
f1_score_list = []

# Repeat the process 100 times
for _ in range(100):
    # Initialize lists to store results for each fold
    fold_actual_labels = []
    fold_predicted_labels = []

    # Perform nested Leave-10-Out cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=None)
    for train_index, test_index in kfold.split(df):
        x_train, x_test = df.iloc[train_index], df.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Feature selection using Elastic Net
        # Assuming x_train and x_test are already standardized
        elastic_net = ElasticNet(l1_ratio=0.5, alpha=0.5, max_iter=10000)
        elastic_net.fit(x_train, y_train)

        # Select features based on Elastic Net feature importances
        sfm = SelectFromModel(elastic_net)
        sfm.fit(x_train, y_train)

        x_train_selected = sfm.transform(x_train)
        x_test_selected = sfm.transform(x_test)

        # Train the random forest model on the selected features
        svm = SVC(random_state=42)
        svm.fit(x_train_selected, y_train)

        # Predict class labels for the test subjects
        y_pred = svm.predict(x_test_selected)

        # Append the actual and predicted labels to the lists for this fold
        fold_actual_labels.extend(y_test)
        fold_predicted_labels.extend(y_pred)

    # Calculate performance metrics for this repetition
    accuracy = accuracy_score(fold_actual_labels, fold_predicted_labels)
    classification_report_str = classification_report(fold_actual_labels, fold_predicted_labels, output_dict=True)

    # Store results for this repetition
    accuracies.append(accuracy)
    precision_list.append(classification_report_str['macro avg']['precision'])
    recall_list.append(classification_report_str['macro avg']['recall'])
    f1_score_list.append(classification_report_str['macro avg']['f1-score'])

# Calculate the average accuracy and classification report over 100 repetitions
average_accuracy = np.mean(accuracies)
average_precision = np.mean(precision_list)
average_recall = np.mean(recall_list)
average_f1_score = np.mean(f1_score_list)

# Compute 95% confidence intervals using the percentile method
confidence_interval_lower = np.percentile(accuracies, 2.5)
confidence_interval_upper = np.percentile(accuracies, 97.5)

# Print the summary results
print(f"Average Accuracy: {average_accuracy}")
print(f"Confidence Interval: [{confidence_interval_lower}, {confidence_interval_upper}]")
print(f"Average Precision: {average_precision}")
print(f"Average Recall: {average_recall}")
print(f"Average F1-Score: {average_f1_score}")


Average Accuracy: 1.0
Confidence Interval: [1.0, 1.0]
Average Precision: 1.0
Average Recall: 1.0
Average F1-Score: 1.0
