# Infertility - Psoriasis

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#import xgboost as xgb

from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score

#import prince 
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report
from scipy.stats import chi2_contingency

In [2]:
def days_between(d1, d2):
    
    if pd.isna(d1) or pd.isna(d2):
        return np.nan
    else:
        d1_ = d1.date()
        d2_ = d2.date()
        
        return (d2_ - d1_).days

In [3]:
def months_between(d1, d2):
    if pd.isnull(d1) or pd.isnull(d2):
        return np.nan
    else:
        return (d2 - d1).days / 30.44  # Average days in a month

In [4]:
def create_table(df1, df2, col, n_cols):
    
    y1 = df1[col].value_counts(sort=False).sort_index()
    y1_indeces = y1.index.to_numpy()
    y1_values  = y1.values
    
    y2 = df2[col].value_counts(sort=False).sort_index()
    y2_indeces = y2.index.to_numpy()
    y2_values  = y2.values
    
    table = np.zeros((2, n_cols), dtype=int)
    
    for idx, val in enumerate(y1_indeces):
        table[0,val+1] = y1_values[idx]   
    
    for idx, val in enumerate(y2_indeces):
        table[1,val+1] = y2_values[idx]
    
    return table


def run_chi2(dataframe1, dataframe2, feature_list, label1="INF", label2="INF+PSO", columns=[-1,0,1], verbose=True):

    important_features = {}

    for feature in feature_list:

        if verbose:
            print("* Testing feature %s"%feature)
    
        table = create_table(dataframe1, dataframe2, feature, len(columns))
    
        df = pd.DataFrame(data=table, index=[label1, label2], columns=columns)
        
        if verbose:
            display(df)
    
        try:
            res = chi2_contingency(table, correction=True) 
        
            if res.pvalue <= 0.05:
                if verbose:
                    print(res.pvalue, res.pvalue <= 0.05)
                important_features[feature] = {"p-val": res.pvalue, "table": df}
        
        except:
            pass
    
        if verbose:
            print()
            print()

    return important_features

In [5]:
# Define the adjusted create_table function
def create_table_month(df1, df2, col, categories):
    y1 = df1[col].value_counts(sort=False)
    y1 = y1.reindex(categories, fill_value=0)
    y1_values  = y1.values

    y2 = df2[col].value_counts(sort=False)
    y2 = y2.reindex(categories, fill_value=0)
    y2_values  = y2.values

    table = np.array([y1_values, y2_values])

    return table, categories

# Define the adjusted run_chi2 function
def run_chi2_month(dataframe1, dataframe2, feature_list, label1="INF", label2="INF+PSO", categories=None, verbose=False):
    important_features = {}

    for feature in feature_list:
        table, columns = create_table_month(dataframe1, dataframe2, feature, categories)
        df = pd.DataFrame(data=table, index=[label1, label2], columns=columns)

        # Perform the chi-squared test
        try:
            chi2, p_value, dof, expected = chi2_contingency(table)
            # Store only if p-value <= 0.05
            if p_value <= 0.05:
                if verbose:
                    print(f"* Significant feature: {feature}")
                    print(f"P-value: {p_value}")
                    display(df)
                important_features[feature] = {"p-val": p_value, "table": df}
        except Exception as e:
            if verbose:
                print(f"An error occurred with feature {feature}: {e}")

    return important_features

# Read Dataset

In [6]:
df = pd.read_excel("/Users/kryptonempyrean/Desktop/Tesi Material/OneDrive_1_08-09-2024/Psoriasis_2017_Erez_Data2_Coded codifica in corso.xlsx", sheet_name="PsoriasisPanel")

In [7]:
df["sex"].value_counts()

sex
F    141769
M    138684
Name: count, dtype: int64

# Select only Men and All the Comorbities

In [8]:
df = df[df["sex"] == "M"]

In [9]:
columns = pd.read_excel("/Users/kryptonempyrean/Desktop/Tesi Material/OneDrive_1_08-09-2024/Psoriasis_2017_Erez_Data2_Coded codifica in corso.xlsx", sheet_name="Foglio1", header=None)
columns = columns[1].tolist()

comorbidities = columns[:142]
selected_comorbidities = sorted(set(df.columns.tolist()) & set(comorbidities),  key = df.columns.tolist().index)
selected_comorbidities.remove("Psoriasis")

selected_columns = ['date_of_birth', "Infertility ", "Psoriasis"] + selected_comorbidities

In [10]:
data = df[selected_columns]
data

Unnamed: 0,date_of_birth,Infertility,Psoriasis,Tuberculosis,Tuberculosis s/p,Syphilis / Gonorrhea,Hepatitis B Carrier,Hepatitis C Carrier,Familial Mediteranean Fever,Amyloidosis,...,OncBone,OncSaracoma,OncGenitalia,Oncmyeloma,OncPolycythemiaVera,OncMyelodysplastic,OncLympholiferative,OncNeurofibromatosis,OncOther,OncUnKnow
0,1906-05-15 00:00:00,,,,,,,,,,...,,,,,,,,,,
1,1906-06-15 00:00:00,,,,,,,,,,...,,,,,,,,,,
2,1907-06-15 00:00:00,,,,,,,,,,...,,,,,,,,,,
3,1907-05-14 00:00:00,,,,,,,,,,...,,,,,,,,,,
4,1909-01-01 00:00:00,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138679,2016-01-30 00:00:00,,,,,,,,,,...,,,,,,,,,,
138680,2016-05-25 00:00:00,,,,,,,,,,...,,,,,,,,,,
138681,2016-06-15 00:00:00,,,,,,,,,,...,,,,,,,,,,
138682,2016-10-13 00:00:00,,,,,,,,,,...,,,,,,,,,,


# Remove the Comorbities where entries are all null

In [11]:
data_subset = data.dropna(axis='columns', how='all').copy()

In [12]:
data.columns.tolist()

['date_of_birth',
 'Infertility ',
 'Psoriasis',
 'Tuberculosis',
 'Tuberculosis s/p',
 'Syphilis / Gonorrhea',
 'Hepatitis B Carrier',
 'Hepatitis C Carrier',
 'Familial Mediteranean Fever',
 'Amyloidosis',
 'Malignancy',
 'Benign Brain Tumor',
 'Hyperthyroidism',
 'Hypothyroidism',
 'Diabetes',
 'Gaucher Disease',
 'Hypo/Hyperparathyroidism',
 'Acromegaly',
 'Obesity',
 'Hyperlipidemia',
 'Cystic Fibrosis',
 'Hyperprolactinemia',
 'Other Endocrine and Metabolic Disease',
 'Other Hematologic Dis (excl. Iron Def Anemia)',
 'Psychoses',
 'Neuroses',
 'Depression',
 'Anxiety',
 'Smoking',
 'Dementia / Alzheimers / OMS',
 'Myasthenia Gravis',
 'Parkinsons Disease',
 'Epilepsy',
 'Multiple Sclerosis',
 'Cerebral Palsy',
 'Hereditary Neurological Disease',
 'Muscular Dystrophy',
 'Motor Neuron Disease',
 'Other Neurological Disease',
 'Retinopathy',
 'Glaucoma',
 'Blindness',
 'Retinitis Pigmentosum',
 'Deafness',
 'IHD',
 'CHF',
 'Cardiomyopathy',
 'IHSS',
 'Arrhythmia',
 'Hypertension',
 

In [13]:
data_subset.columns

Index(['date_of_birth', 'Infertility ', 'Psoriasis', 'Tuberculosis',
       'Tuberculosis s/p', 'Syphilis / Gonorrhea', 'Hepatitis B Carrier',
       'Hepatitis C Carrier', 'Familial Mediteranean Fever', 'Amyloidosis',
       ...
       'OncBone', 'OncSaracoma', 'OncGenitalia', 'Oncmyeloma',
       'OncPolycythemiaVera', 'OncMyelodysplastic', 'OncLympholiferative',
       'OncNeurofibromatosis', 'OncOther', 'OncUnKnow'],
      dtype='object', length=123)

# Binarize the columns

In [14]:
for var in data_subset.columns:
    if var not in ['GroupName', 'date_of_birth']:
        data_subset["%s_binary"%var] = data_subset["%s"%var].fillna(False)
        data_subset["%s_binary"%var] = data_subset["%s_binary"%var] != False

# Select Patients with Infertility

In [15]:
patients_inf = data_subset[data_subset["Infertility _binary"]]

In [16]:
patients_inf[["Infertility ", "Psoriasis"]].head(10)

Unnamed: 0,Infertility,Psoriasis
836,2006-02-03 00:00:00,
1429,2012-11-19 00:00:00,1998-08-01 00:00:00
1592,2004-09-17 00:00:00,
1809,2002-01-28 00:00:00,
2522,2004-12-19 00:00:00,2014-02-23 00:00:00
2792,2004-03-24 00:00:00,
3451,2003-02-02 00:00:00,
4152,2001-04-24 00:00:00,
4658,2002-02-03 00:00:00,
5371,2001-10-31 00:00:00,2014-02-03 00:00:00


In [17]:
values = patients_inf.apply(lambda x: days_between(x['Infertility '], x["Psoriasis"]), axis=1)
indeces = patients_inf[pd.isna(values)].index.tolist() + patients_inf[values > 0].index.tolist()
filtered_inf = patients_inf[patients_inf.index.isin(indeces)]
filtered_inf.head(10)

Unnamed: 0,date_of_birth,Infertility,Psoriasis,Tuberculosis,Tuberculosis s/p,Syphilis / Gonorrhea,Hepatitis B Carrier,Hepatitis C Carrier,Familial Mediteranean Fever,Amyloidosis,...,OncBone_binary,OncSaracoma_binary,OncGenitalia_binary,Oncmyeloma_binary,OncPolycythemiaVera_binary,OncMyelodysplastic_binary,OncLympholiferative_binary,OncNeurofibromatosis_binary,OncOther_binary,OncUnKnow_binary
836,1921-06-16 00:00:00,2006-02-03 00:00:00,,,,,,,,,...,False,False,False,False,False,False,True,False,False,False
1592,1923-04-01 00:00:00,2004-09-17 00:00:00,,,1999-12-19 00:00:00,,,,,,...,False,False,False,False,True,False,False,False,False,False
1809,1924-06-20 00:00:00,2002-01-28 00:00:00,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
2522,1925-01-19 00:00:00,2004-12-19 00:00:00,2014-02-23 00:00:00,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
2792,1926-01-01 00:00:00,2004-03-24 00:00:00,,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
3451,1927-04-10 00:00:00,2003-02-02 00:00:00,,,,,,,,,...,False,False,False,False,True,False,False,False,False,False
4152,1928-01-01 00:00:00,2001-04-24 00:00:00,,,,,,,,,...,False,False,False,False,False,True,False,False,False,False
4658,1929-04-24 00:00:00,2002-02-03 00:00:00,,,,,,,,,...,False,False,False,False,False,False,False,False,True,False
5371,1930-12-20 00:00:00,2001-10-31 00:00:00,2014-02-03 00:00:00,,,,,,,,...,False,False,False,False,False,False,False,False,False,False
5576,1930-01-01 00:00:00,1999-10-27 00:00:00,,,,,,,,,...,False,False,False,False,False,False,False,False,False,True


## Assign feature values to comorbidities
- -1: diagnosed before infertility
- 0: not diagnosed or diagnosed after psoriasis
- 1: diagnosed after infertility but before psoriasis
- compute month differences between diagnoses

In [18]:
for var in filtered_inf.columns[3:123]:
    # Compute months difference between comorbidity and infertility
    months_inf = filtered_inf.apply(lambda x: months_between(x["Infertility "], x[var]), axis=1)
    # Compute months difference between comorbidity and psoriasis
    months_pso = filtered_inf.apply(lambda x: months_between(x["Psoriasis"], x[var]), axis=1)
    
    # Create a DataFrame to hold these values
    df = pd.DataFrame({'months_inf': months_inf, 'months_pso': months_pso})

    # Initialize a column to hold the months difference
    df["months_diff"] = np.nan

    # Condition where comorbidity occurs after infertility but before psoriasis
    condition_inf_before = (df["months_inf"] >= 0) & ((df["months_pso"].isna()) | (df["months_pso"] >= df["months_inf"]))
    df.loc[condition_inf_before, "months_diff"] = df.loc[condition_inf_before, "months_inf"]

    # Condition where comorbidity occurs after psoriasis
    condition_pso_before = (df["months_pso"] >= 0) & ((df["months_inf"].isna()) | (df["months_inf"] > df["months_pso"]))
    df.loc[condition_pso_before, "months_diff"] = df.loc[condition_pso_before, "months_pso"]

    # Adjusted condition for comorbidity occurring before both infertility and psoriasis
    condition_before_both = (df["months_inf"] < 0) & ((df["months_pso"] < 0) | df["months_pso"].isna())
    df.loc[condition_before_both, "months_diff"] = df.loc[condition_before_both, ["months_inf", "months_pso"]].max(axis=1)

    # Initialize the group column
    df["group"] = 0

    # Group -1: Comorbidity occurs before infertility
    df.loc[df["months_inf"] < 0, "group"] = -1

    # Group 1: Comorbidity occurs after infertility but before psoriasis
    df.loc[(df["months_inf"] >= 0) & ((df["months_pso"] <= 0) | df["months_pso"].isna()), "group"] = 1

    # Add the months_diff to the main DataFrame
    filtered_inf[var + "_months_diff"] = df["months_diff"]

    # Add the group as before
    filtered_inf[var + "_feature"] = df["group"].astype("category")

In [19]:
feature_list = filtered_inf.columns[filtered_inf.columns.str.contains("_feature")]

In [20]:
feature_list_month = filtered_inf.columns[filtered_inf.columns.str.contains("_months_diff")]

In [21]:
df = filtered_inf[feature_list]

In [22]:
df

Unnamed: 0,Tuberculosis_feature,Tuberculosis s/p_feature,Syphilis / Gonorrhea_feature,Hepatitis B Carrier_feature,Hepatitis C Carrier_feature,Familial Mediteranean Fever_feature,Amyloidosis_feature,Malignancy_feature,Benign Brain Tumor_feature,Hyperthyroidism_feature,...,OncBone_feature,OncSaracoma_feature,OncGenitalia_feature,Oncmyeloma_feature,OncPolycythemiaVera_feature,OncMyelodysplastic_feature,OncLympholiferative_feature,OncNeurofibromatosis_feature,OncOther_feature,OncUnKnow_feature
836,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1592,0,-1,0,0,0,0,0,-1,0,0,...,0,0,0,0,-1,0,0,0,0,0
1809,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2792,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
124273,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130280,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
130681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
patients_only_inf = filtered_inf[filtered_inf["Psoriasis_binary"]==0]
patients_inf_pso  = filtered_inf[filtered_inf["Psoriasis_binary"]==1]

In [24]:
dictionary_test1 = run_chi2(patients_only_inf,
                            patients_inf_pso,
                            feature_list,
                            verbose=False)

for feature in dictionary_test1:
    print(feature)
    display(dictionary_test1[feature]["table"])

Hyperthyroidism_feature


Unnamed: 0,-1,0,1
INF,2,680,10
INF+PSO,2,59,0


Diabetes_feature


Unnamed: 0,-1,0,1
INF,34,563,95
INF+PSO,3,57,1


COPD_feature


Unnamed: 0,-1,0,1
INF,4,663,25
INF+PSO,2,59,0


In [25]:
from scipy.stats import chi2_contingency

# Define your months_between function if not already defined
def months_between(d1, d2):
    if pd.isnull(d1) or pd.isnull(d2):
        return np.nan
    else:
        return (d2 - d1).days / 30.44  # Average days in a month

# Define bins and labels
bins = [-np.inf, -24, -12, 0, 12, 24, np.inf]
labels = ['<= -24', '-24 to -12', '-12 to 0', '0 to 12', '12 to 24', '>24']
categories = labels  # Ensure consistent ordering


feature_list_month = [col for col in filtered_inf.columns if col.endswith('_months_diff')]

# Prepare feature list for binned months difference columns
feature_list_binned = []
for feature in feature_list_month:
    bin_col = feature + '_binned'
    patients_only_inf[bin_col] = pd.cut(patients_only_inf[feature], bins=bins, labels=labels)
    patients_inf_pso[bin_col] = pd.cut(patients_inf_pso[feature], bins=bins, labels=labels)
    feature_list_binned.append(bin_col)

In [26]:
# Run chi-squared tests
significant_features = run_chi2_month(
    patients_only_inf,
    patients_inf_pso,
    feature_list_binned,
    label1="INF",
    label2="INF+PSO",
    categories=categories,
    verbose=False  # Suppress intermediate outputs
)

# Display only significant features
for feature in significant_features:
    print(f"Significant feature: {feature}")
    print(f"P-value: {significant_features[feature]['p-val']}")
    display(significant_features[feature]["table"])


Significant feature: Chronic Renal Failure_months_diff_binned
P-value: 0.019292400951408124


Unnamed: 0,<= -24,-24 to -12,-12 to 0,0 to 12,12 to 24,>24
INF,7,1,5,1,0,23
INF+PSO,0,0,0,0,1,2


Significant feature: Arthropathy_months_diff_binned
P-value: 0.007220681483009218


Unnamed: 0,<= -24,-24 to -12,-12 to 0,0 to 12,12 to 24,>24
INF,41,23,24,23,19,63
INF+PSO,5,4,7,1,0,1


## Cluster the Patients

In [27]:
#One-Hot Encoding
comorbidity_cols = df.columns
data_encoded = df[comorbidity_cols].astype(str)
data_ohe = pd.get_dummies(data_encoded, columns=comorbidity_cols, prefix_sep='_')

In [28]:
data_ohe.head(10)

Unnamed: 0,Tuberculosis_feature_0,Tuberculosis_feature_1,Tuberculosis s/p_feature_-1,Tuberculosis s/p_feature_0,Tuberculosis s/p_feature_1,Syphilis / Gonorrhea_feature_0,Hepatitis B Carrier_feature_-1,Hepatitis B Carrier_feature_0,Hepatitis B Carrier_feature_1,Hepatitis C Carrier_feature_-1,...,OncMyelodysplastic_feature_1,OncLympholiferative_feature_0,OncLympholiferative_feature_1,OncNeurofibromatosis_feature_0,OncOther_feature_-1,OncOther_feature_0,OncOther_feature_1,OncUnKnow_feature_-1,OncUnKnow_feature_0,OncUnKnow_feature_1
836,True,False,False,True,False,True,False,True,False,False,...,False,False,True,True,False,True,False,False,True,False
1592,True,False,True,False,False,True,False,True,False,False,...,False,True,False,True,False,True,False,False,True,False
1809,True,False,False,True,False,True,False,True,False,False,...,False,True,False,True,False,True,False,False,True,False
2522,True,False,False,True,False,True,False,True,False,False,...,False,True,False,True,False,True,False,False,True,False
2792,True,False,False,True,False,True,False,True,False,False,...,False,True,False,True,False,True,False,False,True,False
3451,True,False,False,True,False,True,False,True,False,False,...,False,True,False,True,False,True,False,False,True,False
4152,True,False,False,True,False,True,False,True,False,False,...,True,True,False,True,False,True,False,False,True,False
4658,True,False,False,True,False,True,False,True,False,False,...,False,True,False,True,False,False,True,False,True,False
5371,True,False,False,True,False,True,False,True,False,False,...,False,True,False,True,False,True,False,False,True,False
5576,True,False,False,True,False,True,False,True,False,False,...,False,True,False,True,False,True,False,False,False,True


In [29]:
import prince

for n in range(10,80,10):
    mca = prince.MCA(n_components=n, random_state=42)
    data_mca = mca.fit_transform(data_ohe)
    
    silhouette = {}
    print(f"Silhouette Scores with MCA number of components of {n}")
    for n_clusters in range(2, 10):
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        silhouette[n_clusters] = silhouette_score(data_mca, kmeans.fit_predict(data_mca))
        
        
    print(silhouette)
    print("\n\n")

Silhouette Scores with MCA number of components of 10
{2: 0.7693643380718449, 3: 0.7768020272832513, 4: 0.7725422497843246, 5: 0.6734680629615154, 6: 0.6966986909764474, 7: 0.6795756363891466, 8: 0.657976248261588, 9: 0.5762736281540155}



Silhouette Scores with MCA number of components of 20
{2: 0.7600304288557171, 3: 0.7312219496333875, 4: 0.7350802851932369, 5: 0.7356346653120144, 6: 0.7255404549058196, 7: 0.6636954092330416, 8: 0.6572808113001813, 9: 0.51887203225218}



Silhouette Scores with MCA number of components of 30
{2: 0.7221357145589105, 3: 0.743229198577517, 4: 0.7216367672515869, 5: 0.7074523100274841, 6: 0.7107955210024202, 7: 0.5193314768625333, 8: 0.7094659731227626, 9: 0.445280905773965}



Silhouette Scores with MCA number of components of 40
{2: 0.7010252738998722, 3: 0.7026417069347621, 4: 0.6911142608716689, 5: 0.6847792528183054, 6: 0.6793359340219485, 7: 0.6821344063364413, 8: 0.6811918646856948, 9: 0.6773960597689589}



Silhouette Scores with MCA number of 

In [30]:
for n in range(10,80,10):
    mca = prince.MCA(n_components=n, random_state=42)
    data_mca = mca.fit_transform(data_ohe)
    
    silhouette = {}
    print(f"Silhouette Scores with MCA number of components of {n}")
    for n_clusters in range(2, 10):
        agglo = AgglomerativeClustering(n_clusters=n_clusters)
        silhouette[n_clusters] = silhouette_score(data_mca, agglo.fit_predict(data_mca))
        
        
    print(silhouette)
    print("\n\n")

Silhouette Scores with MCA number of components of 10
{2: 0.850973327576855, 3: 0.844437180030405, 4: 0.8366808580449892, 5: 0.5868407232092473, 6: 0.5878656781218317, 7: 0.5853468646376141, 8: 0.5875260715831835, 9: 0.5528179124250203}



Silhouette Scores with MCA number of components of 20
{2: 0.8358950756488513, 3: 0.8319179340504653, 4: 0.8262303167313246, 5: 0.6554470073989774, 6: 0.6559615439042932, 7: 0.6522060574391965, 8: 0.6531111447208412, 9: 0.6586764516389352}



Silhouette Scores with MCA number of components of 30
{2: 0.7956786881643123, 3: 0.7912709658765266, 4: 0.7855061989548552, 5: 0.7873210639990873, 6: 0.7818626982712852, 7: 0.7703930071889911, 8: 0.6646252930535343, 9: 0.657174466587454}



Silhouette Scores with MCA number of components of 40
{2: 0.7881167413225358, 3: 0.7838397359645987, 4: 0.7779195440926584, 5: 0.6252422427013538, 6: 0.6256678580846641, 7: 0.6226692517138525, 8: 0.6231742474087791, 9: 0.6265572615829677}



Silhouette Scores with MCA number o

In [31]:
for n in range(10,80,10):
    mca = prince.MCA(n_components=n, random_state=42)
    data_mca = mca.fit_transform(data_ohe)
    
    silhouette = {}
    print(f"Silhouette Scores with MCA number of components of {n}")
    for n_clusters in range(2, 10):
        spectral = SpectralClustering(n_clusters=n_clusters, random_state=42)
        silhouette[n_clusters] = silhouette_score(data_mca, spectral.fit_predict(data_mca))
        
        
    print(silhouette)
    print("\n\n")

Silhouette Scores with MCA number of components of 10
{2: 0.8633047803913422, 3: 0.8590653933845361, 4: 0.8267157235377203, 5: 0.8323368099405751, 6: 0.8348722480992173, 7: 0.8162781563455642, 8: 0.8356428980662497, 9: 0.8355332736278948}



Silhouette Scores with MCA number of components of 20
{2: 0.7753360187441928, 3: 0.7782549295604345, 4: 0.7857291977986028, 5: 0.7782837026505799, 6: 0.7767613915803712, 7: 0.7787678234910322, 8: 0.7846354614226888, 9: 0.7856914267774175}



Silhouette Scores with MCA number of components of 30
{2: 0.7428133004746682, 3: 0.7532816304642693, 4: 0.6993742517896706, 5: 0.6928635311409858, 6: 0.7520542797005109, 7: 0.7513835821268268, 8: 0.7031871723194647, 9: 0.6975994078837349}



Silhouette Scores with MCA number of components of 40
{2: 0.7087936991345585, 3: 0.6572451097028259, 4: 0.6778121423938342, 5: 0.6773369566974511, 6: 0.6697587877710046, 7: 0.673843381412204, 8: 0.6754129964224398, 9: 0.675573477850073}



Silhouette Scores with MCA number 

In [32]:
for n in range(10,80,10):
    mca = prince.MCA(n_components=n, random_state=42)
    data_mca = mca.fit_transform(data_ohe)
    
    silhouette = {}
    print(f"Silhouette Scores with MCA number of components of {n}")
    for n_clusters in range(2, 10):
        gaussian = GaussianMixture(n_components=n_clusters, random_state=42)
        silhouette[n_clusters] = silhouette_score(data_mca, gaussian.fit_predict(data_mca))
        
        
    print(silhouette)
    print("\n\n")

Silhouette Scores with MCA number of components of 10
{2: 0.9005680496706594, 3: 0.4054365407585177, 4: 0.24674504238431375, 5: 0.24801060927914936, 6: 0.2490306888525401, 7: 0.24960706618577005, 8: 0.25151685629804865, 9: 0.23157460682593559}



Silhouette Scores with MCA number of components of 20
{2: 0.8724136824517612, 3: 0.872186994366598, 4: 0.36185081727717716, 5: 0.3608605431179707, 6: 0.3581688380376489, 7: 0.356597008218977, 8: 0.35579849536950814, 9: 0.35760241517411445}



Silhouette Scores with MCA number of components of 30
{2: 0.432229524728802, 3: 0.42079151299171863, 4: 0.436094235739309, 5: 0.42888083121979803, 6: 0.38336756536363936, 7: 0.3986381868448014, 8: 0.36404321519751187, 9: 0.35602191449080167}



Silhouette Scores with MCA number of components of 40
{2: 0.8286264217263947, 3: 0.819973688566282, 4: 0.4137244124038344, 5: 0.6847792528183054, 6: 0.6806780452649555, 7: 0.6822322410778022, 8: 0.3808961387802703, 9: 0.38042980615062405}



Silhouette Scores with 

In [34]:
mca = prince.MCA(n_components=10, random_state=42)
data_mca = mca.fit_transform(data_ohe)

In [35]:
data_mca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
836,1.103457,-0.151476,0.446340,-0.347508,0.193591,0.100313,-0.241305,-0.019153,0.663499,0.173098
1592,0.333570,-0.021799,-0.263165,-0.383516,0.645322,0.092263,0.287595,-0.168128,-0.027834,0.390809
1809,0.739152,-0.213336,-0.242766,-0.261089,0.069927,-0.247627,-0.078986,-0.129864,-0.207209,0.231815
2522,0.572568,-0.133040,-0.023236,-0.171501,0.026675,0.091771,0.180475,0.075825,0.065274,-0.124266
2792,1.382336,-0.181981,-0.812211,-0.044585,-0.401766,-0.724287,0.532892,0.909143,0.004376,0.013954
...,...,...,...,...,...,...,...,...,...,...
122421,-0.106521,-0.002074,0.004114,-0.002581,-0.001387,0.009135,0.044750,0.028757,-0.000743,0.035021
124273,-0.106521,-0.002074,0.004114,-0.002581,-0.001387,0.009135,0.044750,0.028757,-0.000743,0.035021
130280,-0.114553,0.004623,0.008726,-0.006136,0.001169,0.011424,0.056592,0.022477,0.000310,0.039213
130681,-0.085624,-0.015068,0.000858,0.000513,-0.003939,-0.016861,0.004732,-0.009043,-0.008783,0.065181


In [36]:
gaussian = GaussianMixture(n_components=2, random_state=42)
filtered_inf["Cluster"] = gaussian.fit_predict(data_mca)
filtered_inf["Cluster"] = filtered_inf["Cluster"].astype("category")
filtered_inf["Cluster"].value_counts()

Cluster
0    752
1      1
Name: count, dtype: int64

In [37]:
patients_group0 = filtered_inf[filtered_inf["Cluster"]==0]
patients_group1 = filtered_inf[filtered_inf["Cluster"]==1]

patients_group0_only_inf = patients_group0[patients_group0["Psoriasis_binary"] == 0]
patients_group1_only_inf = patients_group1[patients_group1["Psoriasis_binary"] == 0]

patients_group0_inf_pso = patients_group0[patients_group0["Psoriasis_binary"] == 1]
patients_group1_inf_pso = patients_group1[patients_group1["Psoriasis_binary"] == 1]

### Group 1 vs. Group 2 (all patients)

In [38]:
dictionary_test2 = run_chi2(patients_group0,
                            patients_group1,
                            feature_list,
                            label1="GROUP1",
                            label2="GROUP2",
                            verbose=False)

for feature in dictionary_test2:
    print(feature, "p-val: %2.3e"%dictionary_test2[feature]["p-val"])
    display(dictionary_test2[feature]["table"])

Hepatitis C Carrier_feature p-val: 4.375e-55


Unnamed: 0,-1,0,1
GROUP1,2,742,8
GROUP2,1,0,0


Malignancy_feature p-val: 5.959e-08


Unnamed: 0,-1,0,1
GROUP1,21,699,32
GROUP2,1,0,0


Hypothyroidism_feature p-val: 6.990e-17


Unnamed: 0,-1,0,1
GROUP1,9,721,22
GROUP2,1,0,0


Diabetes_feature p-val: 6.199e-05


Unnamed: 0,-1,0,1
GROUP1,36,620,96
GROUP2,1,0,0


Hypo/Hyperparathyroidism_feature p-val: 3.077e-164


Unnamed: 0,-1,0,1
GROUP1,0,751,1
GROUP2,1,0,0


Other Hematologic Dis (excl. Iron Def Anemia)_feature p-val: 2.962e-33


Unnamed: 0,-1,0,1
GROUP1,4,736,12
GROUP2,1,0,0


Cardiomyopathy_feature p-val: 2.253e-82


Unnamed: 0,-1,0,1
GROUP1,1,746,5
GROUP2,1,0,0


Arrhythmia_feature p-val: 3.330e-12


Unnamed: 0,-1,0,1
GROUP1,13,715,24
GROUP2,1,0,0


Chronic Act/Per Hepatitis_feature p-val: 2.253e-82


Unnamed: 0,-1,0,1
GROUP1,1,747,4
GROUP2,1,0,0


Arthropathy_feature p-val: 3.661e-02


Unnamed: 0,-1,0,1
GROUP1,98,537,117
GROUP2,1,0,0


OncUnKnow_feature p-val: 1.928e-41


Unnamed: 0,-1,0,1
GROUP1,3,745,4
GROUP2,1,0,0


### Group 1 (only infertility) vs. Group 2 (only infertility)

In [39]:
dictionary_test3 = run_chi2(patients_group0_only_inf,
                            patients_group1_only_inf,
                            feature_list,
                            label1="GROUP1_INF",
                            label2="GROUP2_INF",
                            verbose=False)

for feature in dictionary_test3:
    print(feature, "p-val: %2.3e"%dictionary_test3[feature]["p-val"])
    display(dictionary_test3[feature]["table"])

Hepatitis C Carrier_feature p-val: 1.139e-50


Unnamed: 0,-1,0,1
GROUP1_INF,2,682,7
GROUP2_INF,1,0,0


Malignancy_feature p-val: 1.983e-08


Unnamed: 0,-1,0,1
GROUP1_INF,18,641,32
GROUP2_INF,1,0,0


Hypothyroidism_feature p-val: 3.141e-17


Unnamed: 0,-1,0,1
GROUP1_INF,8,663,20
GROUP2_INF,1,0,0


Diabetes_feature p-val: 6.187e-05


Unnamed: 0,-1,0,1
GROUP1_INF,33,563,95
GROUP2_INF,1,0,0


Hypo/Hyperparathyroidism_feature p-val: 5.421e-151


Unnamed: 0,-1,0,1
GROUP1_INF,0,690,1
GROUP2_INF,1,0,0


Other Hematologic Dis (excl. Iron Def Anemia)_feature p-val: 3.950e-38


Unnamed: 0,-1,0,1
GROUP1_INF,3,676,12
GROUP2_INF,1,0,0


Cardiomyopathy_feature p-val: 9.458e-76


Unnamed: 0,-1,0,1
GROUP1_INF,1,685,5
GROUP2_INF,1,0,0


Arrhythmia_feature p-val: 4.383e-12


Unnamed: 0,-1,0,1
GROUP1_INF,12,659,20
GROUP2_INF,1,0,0


Chronic Act/Per Hepatitis_feature p-val: 9.458e-76


Unnamed: 0,-1,0,1
GROUP1_INF,1,687,3
GROUP2_INF,1,0,0


Arthropathy_feature p-val: 3.217e-02


Unnamed: 0,-1,0,1
GROUP1_INF,87,499,105
GROUP2_INF,1,0,0


OncUnKnow_feature p-val: 3.950e-38


Unnamed: 0,-1,0,1
GROUP1_INF,3,684,4
GROUP2_INF,1,0,0


### Group 1 (only infertility) vs. Group 1 (infertility followed by psoriasis)

In [40]:
dictionary_test4 = run_chi2(patients_group0_only_inf,
                            patients_group0_inf_pso,
                            feature_list,
                            label1="GROUP1_INF",
                            label2="GROUP1_INF+PSO",
                            verbose=False)

for feature in dictionary_test4:
    print(feature, "p-val: %2.3e"%dictionary_test4[feature]["p-val"])
    display(dictionary_test4[feature]["table"])

Hyperthyroidism_feature p-val: 5.763e-03


Unnamed: 0,-1,0,1
GROUP1_INF,2,679,10
GROUP1_INF+PSO,2,59,0


Diabetes_feature p-val: 2.455e-02


Unnamed: 0,-1,0,1
GROUP1_INF,33,563,95
GROUP1_INF+PSO,3,57,1


COPD_feature p-val: 2.557e-02


Unnamed: 0,-1,0,1
GROUP1_INF,4,662,25
GROUP1_INF+PSO,2,59,0


### Group 2 (only infertility) vs. Group 2 (infertility followed psoriasis)

In [42]:
dictionary_test5 = run_chi2(patients_group1_only_inf,
                            patients_group1_inf_pso,
                            feature_list,
                            label1="GROUP2_INF",
                            label2="GROUP2_INF+PSO",
                            verbose=False)

for feature in dictionary_test5:
    print(feature, "p-val: %2.3e"%dictionary_test5[feature]["p-val"])
    display(dictionary_test5[feature]["table"])

### Group 1 (infertility followed by psoriasis) vs. Group 2 (infertility followed by psoriasis)

In [43]:
dictionary_test6 = run_chi2(patients_group0_inf_pso,
                            patients_group1_inf_pso,
                            feature_list,
                            label1="GROUP1_INF+PSO",
                            label2="GROUP2_INF+PSO",
                            verbose=False)

for feature in dictionary_test6:
    print(feature, "p-val: %2.3e"%dictionary_test6[feature]["p-val"])
    display(dictionary_test6[feature]["table"])

In [44]:
for var in patients_group0_inf_pso.columns[3:123]:
    values_inf = patients_group0_inf_pso.apply(lambda x: days_between(x["Infertility "], x[var]), axis=1)
    values_pso = patients_group0_inf_pso.apply(lambda x: days_between(x["Psoriasis"], x[var]), axis=1)
    df = pd.DataFrame([values_inf.fillna(0), values_pso.fillna(0)], index=["inf", "pso"]).T
    df["group"] = 0
    df["group"][df["inf"] < 0] = -1
    df["group"][(df["inf"] > 0) & (df["pso"] <= 0)] = 1
    df["group"][df["pso"] > 0] = 2
    
    patients_group0_inf_pso[var+"_feature"] = df["group"].astype("category")

In [45]:
for var in patients_group1_inf_pso.columns[3:123]:
    values_inf = patients_group1_inf_pso.apply(lambda x: days_between(x["Infertility "], x[var]), axis=1)
    values_pso = patients_group1_inf_pso.apply(lambda x: days_between(x["Psoriasis"], x[var]), axis=1)
    df = pd.DataFrame([values_inf.fillna(0), values_pso.fillna(0)], index=["inf", "pso"]).T
    df["group"] = 0
    df["group"][df["inf"] < 0] = -1
    df["group"][(df["inf"] > 0) & (df["pso"] <= 0)] = 1
    df["group"][df["pso"] > 0] = 2
    
    patients_group1_inf_pso[var+"_feature"] = df["group"].astype("category")

In [46]:
dictionary_test7 = run_chi2(patients_group0_inf_pso,
                            patients_group1_inf_pso,
                            feature_list,
                            label1="GROUP1_INF+PSO",
                            label2="GROUP2_INF+PSO",
                            columns=[-1,0,1,2],
                            verbose=False)

for feature in dictionary_test7:
    print(feature, "p-val: %2.3e"%dictionary_test7[feature]["p-val"])
    display(dictionary_test7[feature]["table"])

In [47]:
from stepmix.stepmix import StepMix

In [48]:
# Define base model
# Here we use a continuous (gaussian) structural response and don't fit
# any variance parameter
model = StepMix(n_components=4,
                measurement='categorical',
                verbose=1,
                random_state=42)

# Fit data
# Provide both measurement data X and structural data Y
model.fit(data_mca)

Initializations (n_init) :   0%|          | 0/1 [00:00<?, ?it/s]

Fitting StepMix...


Initializations (n_init) : 100%|██████████| 1/1 [00:01<00:00,  1.92s/it, max_LL=-143, max_avg_LL=-.191]

MODEL REPORT
    Measurement model parameters
          model_name     categorical                     
          class_no                 0       1    2       3
          param variable                                 
          pis   0_0           0.3333  0.9999  0.0  0.9843
                0_1           0.6667  0.0001  1.0  0.0157
                1_0           0.6667  0.9945  1.0  1.0000
                1_1           0.1667  0.0055  0.0  0.0000
                1_3           0.1667  0.0000  0.0  0.0000
                2_0           0.6667  1.0000  1.0  1.0000
                2_1           0.1667  0.0000  0.0  0.0000
                2_2           0.1667  0.0000  0.0  0.0000
                2_3           0.5000  0.0000  0.0  0.0000
                3_0           0.0000  1.0000  1.0  1.0000
                3_1           0.5000  0.0000  0.0  0.0000
                4_0           0.6667  1.0000  1.0  1.0000
                4_1           0.1667  0.0000  0.0  0.0000
                4_2       




In [49]:
preds = model.predict(data_mca)
preds

array([3, 3, 3, 3, 3, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

In [50]:
model = StepMix(n_components=2, measurement="categorical", verbose=0, random_state=120)

# Fit model
model.fit(data_mca)

# Class predictions
predictions = model.predict(data_mca)
filtered_inf["Prediction"] = predictions

pd.crosstab(filtered_inf["Psoriasis_binary"], filtered_inf['Prediction'])

Initializations (n_init) : 100%|██████████| 1/1 [00:00<00:00,  7.13it/s, max_LL=-165, max_avg_LL=-.219]

Fitting StepMix...





Prediction,0,1
Psoriasis_binary,Unnamed: 1_level_1,Unnamed: 2_level_1
False,686,6
True,61,0


In [51]:
df = data_mca

silhouette = {}
for n_clusters in range(2, 6):
    model = StepMix(n_components=n_clusters, measurement="categorical", verbose=0, random_state=42)
    model.fit(df)
    silhouette[n_clusters] = silhouette_score(df, model.predict(df))

silhouette

Initializations (n_init) :   0%|          | 0/1 [00:00<?, ?it/s]

Fitting StepMix...


Initializations (n_init) : 100%|██████████| 1/1 [00:00<00:00,  4.41it/s, max_LL=-177, max_avg_LL=-.235]
Initializations (n_init) :   0%|          | 0/1 [00:00<?, ?it/s]

Fitting StepMix...


Initializations (n_init) : 100%|██████████| 1/1 [00:00<00:00,  3.70it/s, max_LL=-139, max_avg_LL=-.185]
Initializations (n_init) :   0%|          | 0/1 [00:00<?, ?it/s]

Fitting StepMix...


Initializations (n_init) : 100%|██████████| 1/1 [00:01<00:00,  1.85s/it, max_LL=-143, max_avg_LL=-.191]
Initializations (n_init) :   0%|          | 0/1 [00:00<?, ?it/s]

Fitting StepMix...


Initializations (n_init) : 100%|██████████| 1/1 [00:01<00:00,  1.73s/it, max_LL=-135, max_avg_LL=-.179]


{2: 0.870997896311345,
 3: 0.8860393244702007,
 4: 0.8118216694073624,
 5: 0.8007162146185477}

In [52]:
model.report(df)

MODEL REPORT
    Measurement model parameters
          model_name     categorical                             
          class_no                 0    1       2       3       4
          param variable                                         
          pis   0_0           0.3333  0.0  0.9974  0.3333  0.9867
                0_1           0.6667  1.0  0.0026  0.6667  0.0133
                1_0           0.6667  1.0  0.9946  0.6667  1.0000
                1_1           0.0000  0.0  0.0054  0.3333  0.0000
                1_3           0.3333  0.0  0.0000  0.0000  0.0000
                2_0           0.6667  1.0  1.0000  0.6667  1.0000
                2_1           0.0000  0.0  0.0000  0.3333  0.0000
                2_2           0.3333  0.0  0.0000  0.0000  0.0000
                2_3           1.0000  0.0  0.0000  0.0000  0.0000
                3_0           0.0000  1.0  1.0000  0.0000  1.0000
                3_1           0.0000  0.0  0.0000  1.0000  0.0000
                4_0           

In [53]:
from stepmix.stepmix import StepMixClassifier

In [95]:
df = data_mca
Y = 1*filtered_inf["Psoriasis_binary"].values

In [96]:
# Split dataset
X_train, X_test, Y_train, Y_test = train_test_split(df, Y, random_state=42)

# Fit StepMix
clf = StepMixClassifier(n_components=2, measurement='categorical', structural='binary', random_state=42, verbose=0, progress_bar=0)
clf.fit(X_train, Y_train)

In [91]:
Y_test

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [57]:
print(f'\nTest Accuracy: {balanced_accuracy_score(Y_test, clf.predict_class(X_test)):.4f}')


Test Accuracy: 0.4940


In [58]:
clf = GaussianMixture(n_components=2, random_state=42)
clf.fit(X_train, Y_train)

In [60]:
print(f'\nTest Accuracy: {balanced_accuracy_score(Y_test, clf.predict(X_test)):.4f}')


Test Accuracy: 0.5446


In [61]:
clf = SpectralClustering(n_clusters=n_clusters, random_state=42)
clf.fit(X_train, Y_train)

In [63]:
print(f'\nTest Accuracy: {balanced_accuracy_score(Y_test, clf.fit_predict(X_test)):.4f}')


Test Accuracy: 0.4851


In [65]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Assuming 'data_mca' is your feature matrix after MCA
# and 'psoriasis_status' is your target variable (0 or 1)

# Split data into features and target
X = data_mca
Y = 1*filtered_inf["Psoriasis_binary"].values

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba)}")


              precision    recall  f1-score   support

           0       0.89      1.00      0.94       134
           1       0.00      0.00      0.00        17

    accuracy                           0.89       151
   macro avg       0.44      0.50      0.47       151
weighted avg       0.79      0.89      0.83       151

AUC-ROC: 0.4514925373134328


In [66]:
# Split data into features and target
X = filtered_inf[feature_list]
Y = 1*filtered_inf["Psoriasis_binary"].values

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba)}")


              precision    recall  f1-score   support

           0       0.89      1.00      0.94       134
           1       0.00      0.00      0.00        17

    accuracy                           0.89       151
   macro avg       0.44      0.50      0.47       151
weighted avg       0.79      0.89      0.83       151

AUC-ROC: 0.48683055311676904


In [67]:
from sklearn.model_selection import GridSearchCV

# Split data into features and target
X = data_mca
Y = 1*filtered_inf["Psoriasis_binary"].values

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_


In [68]:
best_model

In [72]:
best_model.score(X_test, y_test)

0.8874172185430463

In [73]:
from sklearn.metrics import roc_auc_score

# Get predicted probabilities for the positive class
y_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate ROC AUC score
auc = roc_auc_score(y_test, y_proba)
print(f"Test set ROC AUC score: {auc:.4f}")

Test set ROC AUC score: 0.4427


In [74]:
class_counts = np.bincount(y_test)
print(f"Class counts in test set: {class_counts}")

Class counts in test set: [134  17]


In [77]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [78]:
grid_search.fit(X_train_resampled, y_train_resampled)

In [79]:
best_model = grid_search.best_estimator_

In [80]:
y_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate ROC AUC score
auc = roc_auc_score(y_test, y_proba)
print(f"Test set ROC AUC score: {auc:.4f}")

Test set ROC AUC score: 0.4855


In [92]:
# Split data into features and target
X = filtered_inf[feature_list]
Y = 1*filtered_inf["Psoriasis_binary"].values

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

In [94]:
# Get predicted probabilities for the positive class
y_proba = best_model.predict_proba(X_test)[:,1]

# Calculate ROC AUC score
auc = roc_auc_score(y_test, y_proba)
print(f"Test set ROC AUC score: {auc:.4f}")

Test set ROC AUC score: 0.4480


In [86]:
class_counts = np.bincount(y_test)
print(f"Class counts in test set: {class_counts}")

Class counts in test set: [134  17]


In [87]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)


In [88]:
grid_search.fit(X_train_resampled, y_train_resampled)

In [89]:
best_model = grid_search.best_estimator_
y_proba = best_model.predict_proba(X_test)[:, 1]

# Calculate ROC AUC score
auc = roc_auc_score(y_test, y_proba)
print(f"Test set ROC AUC score: {auc:.4f}")

Test set ROC AUC score: 0.5360
