In [1]:
!pip install boruta
!pip install numpy==1.23.1



In [2]:
import pandas as pd
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import RandomOverSampler, SMOTENC, SMOTE

In [3]:
# Load Dataset

file_path = "/Users/Sebastiano/data/Clinical_MRI.xlsx"
df = pd.read_excel(file_path)

print("N° of patients: {}".format(len(df)))
print("N° of columns: {}".format(df.shape[1]))
df.head()

N° of patients: 27
N° of columns: 969


Unnamed: 0,Patient,Date of Birth,Gender,Education,Disease duration (months),Age,SLEDAI-2k (at the time of NP event),PGA (at the time of fMRI),SLICC-DI (at the time of NP event),anti-dsDNA Titre (0=absent; 1=present) ),...,FO left thickness mm,FO left thickness norm.,FO thickness asymmetry,PO total thickness mm,PO total thickness norm.,PO right thickness mm,PO right thickness norm.,PO left thickness mm,PO left thickness norm.,PO thickness asymmetry
0,Paziente 1,30084,0,High School,109.5,38.0,0,0.0,1,1,...,2.2623,0.021072,18.2292,2.4475,0.022797,2.293,0.021358,2.597,0.02419,-12.4336
1,Paziente 2,26505,0,University,96.0,41.17,13,2.1,0,1,...,1.8574,0.017152,-18.2462,1.3628,0.012585,1.2929,0.01194,1.4317,0.013222,-10.1909
2,Paziente 3,31954,0,University,76.5,32.0,2,0.5,0,1,...,2.6216,0.024634,6.8561,2.3106,0.021711,2.484,0.023341,2.1159,0.019882,16.004
3,Paziente 4,32438,0,University,79.1,31.0,2,0.0,1,1,...,3.0341,0.028616,-6.5858,2.1641,0.02041,2.2997,0.021689,2.0193,0.019045,12.9849
4,Paziente 5,28445,0,high school,42.0,43.0,2,0.4,0,1,...,3.9152,0.035652,-10.4521,2.596,0.02364,2.5593,0.023305,2.6209,0.023866,-2.3788


In [4]:
# Drop unwanted columns

df = df.drop(['Patient','Date of Birth', 'Gender', 'Education', 'Age','Event'], axis = 'columns')
# drop columns that include "%" in their name
#cols_to_drop = [col for col in df.columns if "%" in col]
#df = df.drop(columns=cols_to_drop)
print("Effective features to consider: {} ".format(len(df.columns)-1))

Effective features to consider: 962 


In [5]:
# One Hot Encoding for Scores, Antiplatelets and Coagulants, Therapy, NP Event

from sklearn.preprocessing import OneHotEncoder

categ = ['Antiplatelet', 'Anticoagulant', 'Antimalarial', 'Immunosuppressant', 'Biologic']
ohe = OneHotEncoder(categories='auto',sparse=False)
df_enc = ohe.fit_transform(df[categ])
df_enc = pd.DataFrame(df_enc,columns=ohe.get_feature_names_out(categ))
df = pd.concat([df, df_enc], axis=1)
df = df.drop(categ, axis=1)
df.head()



Unnamed: 0,Disease duration (months),SLEDAI-2k (at the time of NP event),PGA (at the time of fMRI),SLICC-DI (at the time of NP event),anti-dsDNA Titre (0=absent; 1=present) ),anti-dsDNA Titre (insert NV here <7 ),Anti-Ro-SSA,Anti-La-SSB,Anti-RNP,anti-Sm,...,Immunosuppressant_Azathioprine,Immunosuppressant_Cyclophosphamide,Immunosuppressant_Methotrexate,Immunosuppressant_Mycophenolate,Immunosuppressant_None,Immunosuppressant_azathioprine,Biologic_Belimumab,Biologic_None,Biologic_Rituximab,Biologic_tocilizumab
0,109.5,0,0.0,1,1,3.0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,96.0,13,2.1,0,1,84.4,1,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,76.5,2,0.5,0,1,4.0,1,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,79.1,2,0.0,1,1,5.0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,42.0,2,0.4,0,1,37.2,1,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [6]:
import random
random.seed(42)  # Set the random seed to 42 for reproducibility

## Random Forest for SLEDAI

In [7]:
# Separate the features (input variables) and the target variable
X = df.drop(["SLEDAI-2k (at the time of NP event)"], axis=1)
y = df["SLEDAI-2k (at the time of NP event)"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Initialize the random forest regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Initialize the Boruta feature selector
boruta = BorutaPy(rf, n_estimators="auto", random_state=42)

# Perform feature selection
boruta.fit(X_train.values, y_train.values)

# Get the selected features
selected_features = X.columns[boruta.support_].tolist()

# Filter the training and testing sets to include only the selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [10]:
# Initialize a new random forest regressor using the selected features
rf_selected = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_selected.fit(X_train_selected, y_train)

# Predict the target variable for the test data
y_pred = rf_selected.predict(X_test_selected)

from sklearn.metrics import mean_squared_error, r2_score

# Calculate R-squared score
r2_score_rf = rf_selected.score(X_test_selected, y_test)
print("Random Forest R-squared score: ", round(r2_score_rf, 2))

# Predict the target variable for the test data
y_predicted = rf_selected.predict(X_test_selected)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_predicted)
print("Mean Squared Error: ", mse)


Random Forest R-squared score:  0.51
Mean Squared Error:  27.08740772826908


## Random Forest SLICCDI

In [11]:
# Separate the features (input variables) and the target variable
X = df.drop(["SLICC-DI (at the time of NP event)"], axis=1)
y = df["SLICC-DI (at the time of NP event)"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Initialize the random forest regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Initialize the Boruta feature selector
boruta = BorutaPy(rf, n_estimators="auto", random_state=42)

# Perform feature selection
boruta.fit(X_train.values, y_train.values)

# Get the selected features
selected_features = X.columns[boruta.support_].tolist()

# Filter the training and testing sets to include only the selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [13]:
# Initialize a new random forest regressor using the selected features
rf_selected = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model on the training data
rf_selected.fit(X_train_selected, y_train)

# Predict the target variable for the test data
y_pred = rf_selected.predict(X_test_selected)

from sklearn.metrics import mean_squared_error, r2_score

# Calculate R-squared score
r2_score_rf = rf_selected.score(X_test_selected, y_test)
print("Random Forest R-squared score: ", round(r2_score_rf, 2))

# Predict the target variable for the test data
y_predicted = rf_selected.predict(X_test_selected)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_predicted)
print("Mean Squared Error: ", mse)


Random Forest R-squared score:  -0.21
Mean Squared Error:  4.048816666666667
