In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

## Load and Clean Data

In [None]:
# Read the clinical/MRI spreadsheet into a DataFrame.
# NOTE(review): this absolute path only exists on one machine; consider
# making the data location configurable.
file_path = "/Users/Sebastiano/SLE/Clinical_MRI.xlsx"
db = pd.read_excel(file_path)

# Report the size of the raw table (rows are reported as patients).
n_patients, n_columns = db.shape
print(f"N° of patients: {n_patients}")
print(f"N° of columns: {n_columns}")
db.head()

In [None]:
# Build the working frame `df` from the raw table, leaving out the
# identifier/demographic columns listed below.
excluded_columns = ['Patient', 'Date of Birth', 'Gender', 'Education', 'Age']
df = db.drop(columns=excluded_columns)

# Optional step, currently disabled: also drop every column whose name
# contains "%".
#cols_to_drop = [col for col in df.columns if "%" in col]
#df = df.drop(columns=cols_to_drop)

# One column is subtracted from the total — presumably the outcome column
# ('NP-SLE'); TODO confirm.
n_features = len(df.columns) - 1
print("Effective features to consider: {} ".format(n_features))

In [None]:
# Derive the outcome column 'result' from 'NP-SLE':
#   0 = No Event
#   1 = NP Event
# Rows whose 'NP-SLE' value is neither 0 nor 1 are not assigned here.
for outcome_code in (0, 1):
    matches_code = df['NP-SLE'] == outcome_code
    df.loc[matches_code, 'result'] = outcome_code

In [None]:
# Remove the source label column; 'result' already carries the outcome.
# DataFrame.drop returns a new frame rather than modifying df, so the result
# must be assigned back — otherwise 'NP-SLE' stays in df and leaks the
# outcome into the feature set.
# errors='ignore' keeps this cell safe to re-run once the column is gone.
df = df.drop(columns=['NP-SLE'], errors='ignore')
df.head()

In [None]:
## transform columns with high skewness.

# Compute the per-column skewness here: `skewness` was not defined anywhere
# earlier in the notebook, so this cell failed on a fresh kernel.
# Columns considered for the transform:
#   * numeric columns only;
#   * not the outcome columns ('NP-SLE' / 'result');
#   * more than two distinct values — a two-valued column stays two-valued
#     under log1p, so its skewness cannot change;
#   * minimum strictly above -1, the domain of log1p (otherwise the
#     transform would silently produce NaN / -inf).
label_cols = ['NP-SLE', 'result']
candidate_cols = [
    col for col in df.select_dtypes(include=np.number).columns
    if col not in label_cols
    and df[col].nunique() > 2
    and df[col].min() > -1
]
skewness = df[candidate_cols].skew()

# Identify columns with high skewness
high_skew_cols = skewness[abs(skewness) > 1].index.tolist()
print('-------------------')
print('High skewness columns:')
print(high_skew_cols)

# Apply log transformation to high skewness columns
# NOTE(review): this overwrites df in place, so re-running the cell will
# transform again any column that is still highly skewed.
for col in high_skew_cols:
    df[col] = np.log1p(df[col])

In [None]:
# One-hot encode the categorical columns listed in `categ` and replace the
# original columns with their indicator columns.

from sklearn.preprocessing import OneHotEncoder

categ = ['Antiplatelet', 'Anticoagulant', 'Antimalarial', 'Immunosuppressant', 'Biologic']

# scikit-learn renamed `sparse` to `sparse_output` (deprecated in 1.2,
# removed in 1.4). Try the current keyword first and fall back to the old
# one so the cell runs on both older and newer releases.
try:
    ohe = OneHotEncoder(categories='auto', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(categories='auto', sparse=False)

df_enc = ohe.fit_transform(df[categ])
# Reuse df's index so that the concat below aligns row-for-row even when df
# does not carry a default 0..n-1 index.
df_enc = pd.DataFrame(df_enc, columns=ohe.get_feature_names_out(categ), index=df.index)
df = pd.concat([df, df_enc], axis=1)
df = df.drop(categ, axis=1)
df.head()

## Regression Analysis

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# Build the feature matrix and target from the prepared frame `df`.
# (This cell previously called load_data(), which is not defined anywhere in
# the notebook and raised NameError.)
# Rows without a 'result' value are dropped because the model cannot be
# fitted on a missing target.
labelled = df.dropna(subset=['result'])
y = labelled['result']
# Exclude the target itself and its source column 'NP-SLE' so the outcome
# does not leak into the features; errors='ignore' covers the case where
# 'NP-SLE' has already been removed.
X = labelled.drop(columns=['result', 'NP-SLE'], errors='ignore')
# NOTE(review): assumes every remaining column is numeric and has no missing
# values — StandardScaler / Ridge will raise otherwise. TODO confirm.

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# standardize data (scaler is fitted on the training split only, then
# applied unchanged to the test split)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# fit ridge regression model
alpha = 0.5 # regularization parameter
ridge = Ridge(alpha=alpha)
ridge.fit(X_train, y_train)

# evaluate model on test set
y_pred = ridge.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)


In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Candidate regularization strengths and the search grid built from them.
alpha_range = [0.01, 0.1, 1, 10, 100]
param_grid = dict(alpha=alpha_range)

# Cross-validated search (5 folds) over alpha, fitted on the training split
# produced by the previous cell.
ridge = Ridge()
grid = GridSearchCV(estimator=ridge, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Report the selected alpha.
best_alpha = grid.best_params_['alpha']
print("Best alpha value found:", best_alpha)

# Score the selected model on the held-out test split.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test set MSE:", mse)
