In [None]:
# data management
import numpy as np                   # for linear algebra
import pandas as pd                  # for tabular data manipulation and processing
import category_encoders             # for categorical encoding

# machine learning
import sklearn                       # for data prep and classical ML

# data visualization and graphics
import matplotlib.pyplot as plt      # for visualization fundamentals
import seaborn as sns                # for pretty visualizations
# make "magma" the default seaborn colour palette for the plots that follow
sns.set_palette("magma")

# misc
import math                          # for calculation
import sys                           # for system manipulation
import os                            # for file manipulation

# stats
from scipy import stats              # statistical functions
import statsmodels.api as sm         # regression models (sm.Logit / sm.OLS are fitted below)
# presumably used for confidence intervals of proportions later in the notebook
from statsmodels.stats.proportion import proportion_confint

In [None]:
# Load Dataset

# The workbook location can be overridden with the CLINICAL_MRI_XLSX
# environment variable, so the notebook also runs on machines where the
# original absolute path does not exist. When the variable is unset the
# original path is used unchanged.
file_path = os.environ.get(
    "CLINICAL_MRI_XLSX",
    "/Users/Sebastiano/data/Clinical_MRI.xlsx",
)
db = pd.read_excel(file_path)

# Show every column whenever a DataFrame is displayed; this is a global
# pandas option and therefore also affects all later cells.
pd.set_option('display.max_columns', None)

print("N° of patients: {}".format(len(db)))
print("N° of columns: {}".format(db.shape[1]))

# NOTE(review): this preview shows the raw table, including the 'Patient'
# and 'Date of Birth' columns that are only dropped in a later cell --
# clear the cell output before sharing the notebook.
db.head()

In [None]:
# Drop unwanted columns

# Identifier and demographic columns are removed from the raw table `db`;
# the result is the working frame `df` used by every later cell.
# Columns with "%" in their name are NOT removed here (the filter that
# used to drop them is disabled).
unwanted_columns = ['Patient', 'Date of Birth', 'Gender', 'Education', 'Age']
df = db.drop(columns=unwanted_columns)

# one column is subtracted from the total before reporting the count
n_effective_features = df.shape[1] - 1
print("Effective features to consider: {} ".format(n_effective_features))

In [None]:
# One Hot Encoding for the therapy columns (Antiplatelet, Anticoagulant,
# Antimalarial, Immunosuppressant, Biologic) and the NP Event column

from sklearn.preprocessing import OneHotEncoder

categ = ['Antiplatelet', 'Anticoagulant', 'Antimalarial', 'Immunosuppressant', 'Biologic', 'Event']

# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling. Try the current keyword first and fall back to
# the legacy one so the cell runs on both old and new releases.
try:
    ohe = OneHotEncoder(categories='auto', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(categories='auto', sparse=False)

# Build the encoded frame on df's own index: pd.concat aligns on index
# labels, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
# whenever df's index is not a plain RangeIndex.
df_enc = pd.DataFrame(
    ohe.fit_transform(df[categ]),
    columns=ohe.get_feature_names_out(categ),
    index=df.index,
)

# Replace the original categorical columns by their indicator columns;
# the indicators are appended after the remaining original columns.
df = pd.concat([df.drop(columns=categ), df_enc], axis=1)
df.tail()

## Regression

In [None]:
# Logistic regression of 'NP-SLE' on a hand-picked set of predictors.
# NOTE: no missing/infinite-value validation is performed before fitting.

# response variable
y = df['NP-SLE']

# explanatory variables, with the intercept column ('const') prepended by
# statsmodels -- the model classes do not add one on their own
X = sm.add_constant(
    df[
        [
            'SCA thickness asymmetry',
            'Temporal thickness asymmetry',
            'Anti-Rib-P',
            'TMP thickness asymmetry',
            'MTG thickness asymmetry',
            'AnAb ',
            'AIns thickness asymmetry',
            'SOG thickness asymmetry',
            'Inf. Lateral Ventricle left volume cm3',
        ]
    ]
)

# specify the logit model, then estimate it
logit_spec = sm.Logit(y, X)
model = logit_spec.fit()

# report coefficients and fit statistics
print(model.summary())

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# predictors without the intercept column (kept for later inspection)
X_no_constant = X.drop('const', axis=1)

# variance_inflation_factor regresses one column on all the others with a
# plain OLS that does NOT add an intercept itself. The design matrix passed
# to it must therefore still contain the constant column; computing the VIF
# on the constant-free matrix yields misleading values. The VIF is computed
# on the full X and reported only for the real predictors.
predictor_positions = [
    position for position, column in enumerate(X.columns) if column != 'const'
]

# calculate VIF for each independent variable
vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(X.values, position)
    for position in predictor_positions
]
vif["features"] = X.columns[predictor_positions]

# print the VIF results
print(vif)
print("The more  VIF increases, the less reliable  regression results are going to be")
print("The more  VIF increases, the less reliable  regression results are going to be")

In [None]:
# OLS regression of the SLEDAI-2k score on the 10 candidate columns that
# are most strongly Spearman-correlated with it.
# (`sm` is statsmodels.api, imported once in the imports cell.)

# define the target variable
target_variable = 'SLEDAI-2k (at the time of NP event)'

# compute the correlation matrix using Spearman's coefficient
correlation_matrix = df.corr(method='spearman')

# Correlation of the target with every candidate column. Candidates are the
# columns from positional index 48 (i.e. the 49th column) onwards.
# NOTE(review): 48 is a positional offset -- confirm it still matches the
# column layout after the drop / one-hot-encoding cells.
# The target is removed from its own candidate list: if it sits in that
# range its self-correlation of 1 would always rank first and the model
# would regress the target on itself.
target_correlation = (
    correlation_matrix.iloc[:, 48:]
    .loc[target_variable]
    .drop(labels=[target_variable], errors='ignore')
)

# get the 10 most correlated variables (by absolute correlation)
top_10_variables = target_correlation.abs().nlargest(10).index.tolist()

# define the dependent and independent variables;
# add a constant to the independent variables (required for regression analysis)
y = df[target_variable]
X = sm.add_constant(df[top_10_variables])

# Fit the multiple regression model. missing='drop' removes rows with a NaN
# in y or X (listwise deletion); the statsmodels default does no NaN
# handling at all. The number of rows actually used is shown as
# "No. Observations" in the summary.
model = sm.OLS(y, X, missing='drop').fit()

# print the summary of the regression model
print(model.summary())

In [None]:
# OLS regression of the SLICC-DI score on the 10 candidate columns that
# are most strongly Spearman-correlated with it.
# (`sm` is statsmodels.api, imported once in the imports cell.)

# define the target variable
target_variable = 'SLICC-DI (at the time of NP event)'

# compute the correlation matrix using Spearman's coefficient
correlation_matrix = df.corr(method='spearman')

# Correlation of the target with every candidate column. Candidates are the
# columns from positional index 48 (i.e. the 49th column) onwards.
# NOTE(review): 48 is a positional offset -- confirm it still matches the
# column layout after the drop / one-hot-encoding cells.
# The target is removed from its own candidate list: if it sits in that
# range its self-correlation of 1 would always rank first and the model
# would regress the target on itself.
target_correlation = (
    correlation_matrix.iloc[:, 48:]
    .loc[target_variable]
    .drop(labels=[target_variable], errors='ignore')
)

# get the 10 most correlated variables (by absolute correlation)
top_10_variables = target_correlation.abs().nlargest(10).index.tolist()

# define the dependent and independent variables;
# add a constant to the independent variables (required for regression analysis)
y = df[target_variable]
X = sm.add_constant(df[top_10_variables])

# Fit the multiple regression model. missing='drop' removes rows with a NaN
# in y or X (listwise deletion); the statsmodels default does no NaN
# handling at all. The number of rows actually used is shown as
# "No. Observations" in the summary.
model = sm.OLS(y, X, missing='drop').fit()

# print the summary of the regression model
print(model.summary())

In [None]:
# Logistic regression of 'AnAb ' on the 10 candidate columns that are most
# strongly Spearman-correlated with it.
# (`sm` is statsmodels.api, imported once in the imports cell.)

# define the target variable (the trailing space is part of the column name)
target_variable = 'AnAb '

# compute the correlation matrix using Spearman's coefficient
correlation_matrix = df.corr(method='spearman')

# Correlation of the target with every candidate column. Candidates are the
# columns from positional index 48 (i.e. the 49th column) onwards.
# NOTE(review): 48 is a positional offset -- confirm it still matches the
# column layout after the drop / one-hot-encoding cells.
# The target is removed from its own candidate list: if it sits in that
# range its self-correlation of 1 would always rank first and the target
# would end up predicting itself (perfect separation in a logit model).
target_correlation = (
    correlation_matrix.iloc[:, 48:]
    .loc[target_variable]
    .drop(labels=[target_variable], errors='ignore')
)

# get the 10 most correlated variables (by absolute correlation)
top_10_variables = target_correlation.abs().nlargest(10).index.tolist()

# define the dependent and independent variables;
# add a constant to the independent variables (required for regression analysis)
y = df[target_variable]
X = sm.add_constant(df[top_10_variables])

# Fit the logistic regression model. missing='drop' removes rows with a NaN
# in y or X (listwise deletion); the statsmodels default does no NaN
# handling at all. The number of rows actually used is shown as
# "No. Observations" in the summary.
model = sm.Logit(y, X, missing='drop').fit()

# print the summary of the regression model
print(model.summary())

In [None]:
# OLS regression of the anti-dsDNA titre on the 10 candidate columns that
# are most strongly Spearman-correlated with it.
# (`sm` is statsmodels.api, imported once in the imports cell.)

# define the target variable (spelling must match the spreadsheet header exactly)
target_variable = 'anti-dsDNA Titre  (insert NV here <7 )'

# compute the correlation matrix using Spearman's coefficient
correlation_matrix = df.corr(method='spearman')

# Correlation of the target with every candidate column. Candidates are the
# columns from positional index 48 (i.e. the 49th column) onwards.
# NOTE(review): 48 is a positional offset -- confirm it still matches the
# column layout after the drop / one-hot-encoding cells.
# The target is removed from its own candidate list: if it sits in that
# range its self-correlation of 1 would always rank first and the model
# would regress the target on itself.
target_correlation = (
    correlation_matrix.iloc[:, 48:]
    .loc[target_variable]
    .drop(labels=[target_variable], errors='ignore')
)

# get the 10 most correlated variables (by absolute correlation)
top_10_variables = target_correlation.abs().nlargest(10).index.tolist()

# define the dependent and independent variables;
# add a constant to the independent variables (required for regression analysis)
y = df[target_variable]
X = sm.add_constant(df[top_10_variables])

# Fit the multiple regression model. missing='drop' removes rows with a NaN
# in y or X (listwise deletion); the statsmodels default does no NaN
# handling at all. The number of rows actually used is shown as
# "No. Observations" in the summary.
model = sm.OLS(y, X, missing='drop').fit()

# print the summary of the regression model
print(model.summary())

In [None]:
# Logistic regression of anti-dsDNA presence on the 10 candidate columns
# that are most strongly Spearman-correlated with it.
# (`sm` is statsmodels.api, imported once in the imports cell.)

# define the target variable (spelling must match the spreadsheet header exactly)
target_variable = 'anti-dsDNA Titre  (0=absent; 1=present) )'

# compute the correlation matrix using Spearman's coefficient
correlation_matrix = df.corr(method='spearman')

# Correlation of the target with every candidate column. Candidates are the
# columns from positional index 48 (i.e. the 49th column) onwards.
# NOTE(review): 48 is a positional offset -- confirm it still matches the
# column layout after the drop / one-hot-encoding cells.
# The target is removed from its own candidate list: if it sits in that
# range its self-correlation of 1 would always rank first and the target
# would end up predicting itself (perfect separation in a logit model).
target_correlation = (
    correlation_matrix.iloc[:, 48:]
    .loc[target_variable]
    .drop(labels=[target_variable], errors='ignore')
)

# get the 10 most correlated variables (by absolute correlation)
top_10_variables = target_correlation.abs().nlargest(10).index.tolist()

# define the dependent and independent variables;
# add a constant to the independent variables (required for regression analysis)
y = df[target_variable]
X = sm.add_constant(df[top_10_variables])

# The column label declares a 0/1 outcome, so a logistic model is fitted,
# consistent with the Logit fits used elsewhere in this notebook. OLS on a
# binary outcome can predict values outside [0, 1] and its standard errors
# rest on assumptions a 0/1 response violates.
# NOTE(review): assumes the column really holds only 0/1 -- sm.Logit raises
# otherwise; confirm against the data.
# missing='drop' removes rows with a NaN in y or X (listwise deletion); the
# number of rows actually used is shown as "No. Observations" in the summary.
model = sm.Logit(y, X, missing='drop').fit()

# print the summary of the regression model
print(model.summary())