This is data mining code for the HAP780 final project after exporting the datasets from All of Us

In [1]:
# Import library
import pandas as pd

# Load the datasets
df_analysis = pd.read_csv('./data/df_analysis_final.csv')

Descriptive statistics

In [2]:
# import libraries
import numpy as np
from scipy.stats import chi2_contingency # for chi-square tests

In [3]:
# Get sample size data
N_total = len(df_analysis)
N_no_remission = len(df_analysis[df_analysis['No_remission'] == 1])
N_with_remission = len(df_analysis[df_analysis['No_remission'] == 0])

print(f"N Total: {N_total} | N No Remission: {N_no_remission} | N With Remission: {N_with_remission}")

N Total: 1621 | N No Remission: 429 | N With Remission: 1192


In [4]:
# Get race statistics
race_columns = [
    "race_Another single population", "race_Asian", "race_Black or African American",
    "race_I prefer not to answer", "race_More than one population",
    "race_None Indicated", "race_None of these", "race_PMI: Skip", "race_White"
]

# Prepare the dataframe for statistics
descriptive_stats = pd.DataFrame(columns=["Participant Characteristics", "Entire Dataset", "Without Remission", "With Remission", "p-value", "Significance"])

# Populate the N values as the first row
descriptive_stats.loc[0] = ["N", N_total, N_no_remission, N_with_remission, "-", "-"]

# Populate the data for each race category
for race in race_columns:
    N_race_total = df_analysis[race].sum()
    N_race_no_remission = df_analysis[df_analysis['No_remission'] == 1][race].sum()
    N_race_with_remission = df_analysis[df_analysis['No_remission'] == 0][race].sum()

    percent_race_total = (N_race_total / N_total) * 100
    percent_race_no_remission = (N_race_no_remission / N_no_remission) * 100
    percent_race_with_remission = (N_race_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table = [[N_race_no_remission, N_race_with_remission], 
                         [N_no_remission - N_race_no_remission, N_with_remission - N_race_with_remission]]
    
    chi2, p, _, _ = chi2_contingency(contingency_table)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        race, 
        f"{N_race_total} ({percent_race_total:.2f}%)", 
        f"{N_race_no_remission} ({percent_race_no_remission:.2f}%)",
        f"{N_race_with_remission} ({percent_race_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [5]:
# Get ethnicity statistics
ethnicity_columns = [
    "ethnicity_Hispanic or Latino", "ethnicity_Not Hispanic or Latino",
    "ethnicity_PMI: Prefer Not To Answer", "ethnicity_PMI: Skip",
    "ethnicity_What Race Ethnicity: Race Ethnicity None Of These"
]

# Populate the data for each ethnicity category
for ethnicity in ethnicity_columns:
    N_ethnicity_total = df_analysis[ethnicity].sum()
    N_ethnicity_no_remission = df_analysis[df_analysis['No_remission'] == 1][ethnicity].sum()
    N_ethnicity_with_remission = df_analysis[df_analysis['No_remission'] == 0][ethnicity].sum()

    percent_ethnicity_total = (N_ethnicity_total / N_total) * 100
    percent_ethnicity_no_remission = (N_ethnicity_no_remission / N_no_remission) * 100
    percent_ethnicity_with_remission = (N_ethnicity_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_ethnicity = [
        [N_ethnicity_no_remission, N_ethnicity_with_remission], 
        [N_no_remission - N_ethnicity_no_remission, N_with_remission - N_ethnicity_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_ethnicity)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        ethnicity, 
        f"{N_ethnicity_total} ({percent_ethnicity_total:.2f}%)", 
        f"{N_ethnicity_no_remission} ({percent_ethnicity_no_remission:.2f}%)",
        f"{N_ethnicity_with_remission} ({percent_ethnicity_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [6]:
# Get insurace coverage statistics
insurance_columns = [
    "insurance_Health Insurance: No", "insurance_Health Insurance: Yes",
    "insurance_PMI: Dont Know", "insurance_PMI: Prefer Not To Answer",
    "insurance_PMI: Skip"
]

# Populate the data for each insurance category
for insurance in insurance_columns:
    N_insurance_total = df_analysis[insurance].sum()
    N_insurance_no_remission = df_analysis[df_analysis['No_remission'] == 1][insurance].sum()
    N_insurance_with_remission = df_analysis[df_analysis['No_remission'] == 0][insurance].sum()

    percent_insurance_total = (N_insurance_total / N_total) * 100
    percent_insurance_no_remission = (N_insurance_no_remission / N_no_remission) * 100
    percent_insurance_with_remission = (N_insurance_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_insurance = [
        [N_insurance_no_remission, N_insurance_with_remission], 
        [N_no_remission - N_insurance_no_remission, N_with_remission - N_insurance_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_insurance)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        insurance, 
        f"{N_insurance_total} ({percent_insurance_total:.2f}%)", 
        f"{N_insurance_no_remission} ({percent_insurance_no_remission:.2f}%)",
        f"{N_insurance_with_remission} ({percent_insurance_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [7]:
# Get income statistics
income_columns = [
    "income_Annual Income: 100k 150k", "income_Annual Income: 10k 25k",
    "income_Annual Income: 150k 200k", "income_Annual Income: 25k 35k",
    "income_Annual Income: 35k 50k", "income_Annual Income: 50k 75k",
    "income_Annual Income: 75k 100k", "income_Annual Income: less 10k",
    "income_Annual Income: more 200k", "income_PMI: Prefer Not To Answer",
    "income_PMI: Skip"
]

# Populate the data for each income category
for income in income_columns:
    N_income_total = df_analysis[income].sum()
    N_income_no_remission = df_analysis[df_analysis['No_remission'] == 1][income].sum()
    N_income_with_remission = df_analysis[df_analysis['No_remission'] == 0][income].sum()

    percent_income_total = (N_income_total / N_total) * 100
    percent_income_no_remission = (N_income_no_remission / N_no_remission) * 100
    percent_income_with_remission = (N_income_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_income = [
        [N_income_no_remission, N_income_with_remission], 
        [N_no_remission - N_income_no_remission, N_with_remission - N_income_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_income)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        income, 
        f"{N_income_total} ({percent_income_total:.2f}%)", 
        f"{N_income_no_remission} ({percent_income_no_remission:.2f}%)",
        f"{N_income_with_remission} ({percent_income_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [8]:
# Get SDOH - Near Store statistics
near_store_columns = [
    "near_store_PMI: Skip", "near_store_Somewhat agree",
    "near_store_Somewhat disagree", "near_store_Strongly agree",
    "near_store_Strongly disagree"
]

# Populate the data for each "near store" category
for store in near_store_columns:
    N_store_total = df_analysis[store].sum()
    N_store_no_remission = df_analysis[df_analysis['No_remission'] == 1][store].sum()
    N_store_with_remission = df_analysis[df_analysis['No_remission'] == 0][store].sum()

    percent_store_total = (N_store_total / N_total) * 100
    percent_store_no_remission = (N_store_no_remission / N_no_remission) * 100
    percent_store_with_remission = (N_store_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_store = [
        [N_store_no_remission, N_store_with_remission], 
        [N_no_remission - N_store_no_remission, N_with_remission - N_store_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_store)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        store, 
        f"{N_store_total} ({percent_store_total:.2f}%)", 
        f"{N_store_no_remission} ({percent_store_no_remission:.2f}%)",
        f"{N_store_with_remission} ({percent_store_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [9]:
# Get SDOH - Near Transit statistics
near_transit_columns = [
    "near_transit_PMI: Dont Know", "near_transit_PMI: Skip", 
    "near_transit_Somewhat agree", "near_transit_Somewhat disagree",
    "near_transit_Strongly agree", "near_transit_Strongly disagree"
]

# Populate the data for each "near transit" category
for transit in near_transit_columns:
    N_transit_total = df_analysis[transit].sum()
    N_transit_no_remission = df_analysis[df_analysis['No_remission'] == 1][transit].sum()
    N_transit_with_remission = df_analysis[df_analysis['No_remission'] == 0][transit].sum()

    percent_transit_total = (N_transit_total / N_total) * 100
    percent_transit_no_remission = (N_transit_no_remission / N_no_remission) * 100
    percent_transit_with_remission = (N_transit_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_transit = [
        [N_transit_no_remission, N_transit_with_remission], 
        [N_no_remission - N_transit_no_remission, N_with_remission - N_transit_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_transit)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        transit, 
        f"{N_transit_total} ({percent_transit_total:.2f}%)", 
        f"{N_transit_no_remission} ({percent_transit_no_remission:.2f}%)",
        f"{N_transit_with_remission} ({percent_transit_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [10]:
# Get SDOH - No Food statistics
no_food_columns = [
    "no_food_Never true", "no_food_Often true", 
    "no_food_PMI: Skip", "no_food_Sometimes true"
]

# Populate the data for each "no food" category
for food in no_food_columns:
    N_food_total = df_analysis[food].sum()
    N_food_no_remission = df_analysis[df_analysis['No_remission'] == 1][food].sum()
    N_food_with_remission = df_analysis[df_analysis['No_remission'] == 0][food].sum()

    percent_food_total = (N_food_total / N_total) * 100
    percent_food_no_remission = (N_food_no_remission / N_no_remission) * 100
    percent_food_with_remission = (N_food_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_food = [
        [N_food_no_remission, N_food_with_remission], 
        [N_no_remission - N_food_no_remission, N_with_remission - N_food_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_food)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        food, 
        f"{N_food_total} ({percent_food_total:.2f}%)", 
        f"{N_food_no_remission} ({percent_food_no_remission:.2f}%)",
        f"{N_food_with_remission} ({percent_food_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [11]:
# import library
from scipy.stats import ttest_ind

In [12]:
# Get age at first diagnosis statistics

# Mean age calculations
mean_age_total = df_analysis['age_at_first_diagnosis'].mean()
mean_age_no_remission = df_analysis[df_analysis['No_remission'] == 1]['age_at_first_diagnosis'].mean()
mean_age_with_remission = df_analysis[df_analysis['No_remission'] == 0]['age_at_first_diagnosis'].mean()

# t-test
age_no_remission = df_analysis[df_analysis['No_remission'] == 1]['age_at_first_diagnosis']
age_with_remission = df_analysis[df_analysis['No_remission'] == 0]['age_at_first_diagnosis']

t_stat, p_value = ttest_ind(age_no_remission, age_with_remission)

significance = "*" if p_value < 0.05 else "-"

# Inserting the results into the dataframe right after N (index 0.5)
descriptive_stats.loc[0.5] = ["Age at First Diagnosis (Mean)", 
                            f"{mean_age_total:.2f}", 
                            f"{mean_age_no_remission:.2f}", 
                            f"{mean_age_with_remission:.2f}",
                            f"{p_value:.4f}", 
                            significance]

# Fixing the order
descriptive_stats = descriptive_stats.sort_index().reset_index(drop=True)

In [13]:
# Get tobacco ever statistics

# Calculate the percentages
percentage_total = df_analysis['tobacco_ever'].mean() * 100
percentage_no_remission = df_analysis[df_analysis['No_remission'] == 1]['tobacco_ever'].mean() * 100
percentage_with_remission = df_analysis[df_analysis['No_remission'] == 0]['tobacco_ever'].mean() * 100

# Create contingency table for chi-square test
contingency_table = pd.crosstab(df_analysis['No_remission'], df_analysis['tobacco_ever'])

# Chi-square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)

significance = "*" if p_value < 0.05 else "-"

# Appending results to the descriptive_stats dataframe
descriptive_stats.loc[len(descriptive_stats)] = [
    "Ever Used Tobacco (%)",
    f"{percentage_total:.2f}",
    f"{percentage_no_remission:.2f}",
    f"{percentage_with_remission:.2f}",
    f"{p_value:.4f}",
    significance
]

In [14]:
# Get tobacco current statistics
# Calculate the percentages for current tobacco use
percentage_total_current = df_analysis['tobacco_current'].mean() * 100
percentage_no_remission_current = df_analysis[df_analysis['No_remission'] == 1]['tobacco_current'].mean() * 100
percentage_with_remission_current = df_analysis[df_analysis['No_remission'] == 0]['tobacco_current'].mean() * 100

# Create contingency table for chi-square test on current tobacco use
contingency_table_current = pd.crosstab(df_analysis['No_remission'], df_analysis['tobacco_current'])

# Chi-square test for current tobacco use
chi2_current, p_value_current, _, _ = chi2_contingency(contingency_table_current)

significance_current = "*" if p_value_current < 0.05 else "-"

# Appending the row directly to descriptive_stats dataframe using loc[]
descriptive_stats.loc[len(descriptive_stats)] = [
    "Currently Using Tobacco (%)",
    f"{percentage_total_current:.2f}",
    f"{percentage_no_remission_current:.2f}",
    f"{percentage_with_remission_current:.2f}",
    f"{p_value_current:.4f}",
    significance_current
]

In [15]:
# Get obesity statistics
# Calculate the percentages for obesity
percentage_total_obesity = df_analysis['obesity'].mean() * 100
percentage_no_remission_obesity = df_analysis[df_analysis['No_remission'] == 1]['obesity'].mean() * 100
percentage_with_remission_obesity = df_analysis[df_analysis['No_remission'] == 0]['obesity'].mean() * 100

# Create contingency table for chi-square test on obesity
contingency_table_obesity = pd.crosstab(df_analysis['No_remission'], df_analysis['obesity'])

# Chi-square test for obesity
chi2_obesity, p_value_obesity, _, _ = chi2_contingency(contingency_table_obesity)

significance_obesity = "*" if p_value_obesity < 0.05 else "-"

# Appending the row directly to descriptive_stats dataframe using loc[]
descriptive_stats.loc[len(descriptive_stats)] = [
    "Obesity (%)",
    f"{percentage_total_obesity:.2f}",
    f"{percentage_no_remission_obesity:.2f}",
    f"{percentage_with_remission_obesity:.2f}",
    f"{p_value_obesity:.4f}",
    significance_obesity
]

In [16]:
descriptive_stats

Unnamed: 0,Participant Characteristics,Entire Dataset,Without Remission,With Remission,p-value,Significance
0,N,1621,429,1192,-,-
1,Age at First Diagnosis (Mean),52.74,53.39,52.50,0.1340,-
2,race_Another single population,11 (0.68%),1 (0.23%),10 (0.84%),0.3332,-
3,race_Asian,40 (2.47%),11 (2.56%),29 (2.43%),1.0000,-
4,race_Black or African American,168 (10.36%),51 (11.89%),117 (9.82%),0.2647,-
5,race_I prefer not to answer,7 (0.43%),3 (0.70%),4 (0.34%),0.5783,-
6,race_More than one population,23 (1.42%),5 (1.17%),18 (1.51%),0.7799,-
7,race_None Indicated,152 (9.38%),24 (5.59%),128 (10.74%),0.0024,*
8,race_None of these,14 (0.86%),4 (0.93%),10 (0.84%),1.0000,-
9,race_PMI: Skip,25 (1.54%),8 (1.86%),17 (1.43%),0.6864,-


Feature Creation: Create Interaction Variables (2-way only due to memory constraints)

In [17]:
# Import library
from sklearn.preprocessing import PolynomialFeatures

In [18]:
# Drop the target variable
X = df_analysis.drop(columns=['No_remission'])

# Create polynomial features (interaction terms only)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)

# Create a dataframe for the interaction terms
df_interactions = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X.columns))

# Join the original df_analysis with the interactions dataframe
df_analysis_extended = pd.concat([df_analysis, df_interactions], axis=1)

In [19]:
df_analysis_extended.head()

Unnamed: 0,race_Another single population,race_Asian,race_Black or African American,race_I prefer not to answer,race_More than one population,race_None Indicated,race_None of these,race_PMI: Skip,race_White,ethnicity_Hispanic or Latino,...,no_food_Never true no_food_Often true,no_food_Never true no_food_PMI: Skip,no_food_Never true no_food_Sometimes true,no_food_Never true age_at_first_diagnosis,no_food_Often true no_food_PMI: Skip,no_food_Often true no_food_Sometimes true,no_food_Often true age_at_first_diagnosis,no_food_PMI: Skip no_food_Sometimes true,no_food_PMI: Skip age_at_first_diagnosis,no_food_Sometimes true age_at_first_diagnosis
0,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,66.201232,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,59.748118,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,52.490075,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,49.333333,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,55.96167,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
df_analysis_extended.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1621 entries, 0 to 1620
Columns: 30628 entries, race_Another single population to no_food_Sometimes true age_at_first_diagnosis
dtypes: float64(30382), int64(246)
memory usage: 378.8 MB


Split the data into Training (80%) and Test (20%)

In [21]:
# Import library
from sklearn.model_selection import train_test_split

In [22]:
# Splitting the data into training and test sets
train_set, test_set = train_test_split(df_analysis_extended, test_size=0.20, random_state=42)

In [23]:
train_set.head()

Unnamed: 0,race_Another single population,race_Asian,race_Black or African American,race_I prefer not to answer,race_More than one population,race_None Indicated,race_None of these,race_PMI: Skip,race_White,ethnicity_Hispanic or Latino,...,no_food_Never true no_food_Often true,no_food_Never true no_food_PMI: Skip,no_food_Never true no_food_Sometimes true,no_food_Never true age_at_first_diagnosis,no_food_Often true no_food_PMI: Skip,no_food_Often true no_food_Sometimes true,no_food_Often true age_at_first_diagnosis,no_food_PMI: Skip no_food_Sometimes true,no_food_PMI: Skip age_at_first_diagnosis,no_food_Sometimes true age_at_first_diagnosis
905,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,63.742642,0.0,0.0,0.0,0.0,0.0,0.0
1088,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,43.715264,0.0,0.0,0.0,0.0,0.0,0.0
1330,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,43.917864,0.0,0.0,0.0,0.0,0.0,0.0
398,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.390144
1603,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,59.904175,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296 entries, 905 to 1126
Columns: 30628 entries, race_Another single population to no_food_Sometimes true age_at_first_diagnosis
dtypes: float64(30382), int64(246)
memory usage: 302.9 MB


In [25]:
test_set.head()

Unnamed: 0,race_Another single population,race_Asian,race_Black or African American,race_I prefer not to answer,race_More than one population,race_None Indicated,race_None of these,race_PMI: Skip,race_White,ethnicity_Hispanic or Latino,...,no_food_Never true no_food_Often true,no_food_Never true no_food_PMI: Skip,no_food_Never true no_food_Sometimes true,no_food_Never true age_at_first_diagnosis,no_food_Often true no_food_PMI: Skip,no_food_Often true no_food_Sometimes true,no_food_Often true age_at_first_diagnosis,no_food_PMI: Skip no_food_Sometimes true,no_food_PMI: Skip age_at_first_diagnosis,no_food_Sometimes true age_at_first_diagnosis
135,0,0,0,0,0,1,0,0,0,1,...,0.0,0.0,0.0,59.868583,0.0,0.0,0.0,0.0,0.0,0.0
843,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,65.453799,0.0,0.0,0.0,0.0,0.0,0.0
1233,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,48.175222,0.0,0.0,0.0,0.0,0.0,0.0
1214,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,47.482546,0.0,0.0,0.0,0.0,0.0,0.0
628,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,47.096509,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 325 entries, 135 to 707
Columns: 30628 entries, race_Another single population to no_food_Sometimes true age_at_first_diagnosis
dtypes: float64(30382), int64(246)
memory usage: 75.9 MB


Feature Selection: LASSO Regression with Cross-Validation

In [27]:
# Import library
from sklearn.linear_model import LassoCV

In [28]:
# Splitting the data into features and target
X_train = train_set.drop(columns=['No_remission'])
y_train = train_set['No_remission']

# Initializing LassoCV (5 folds)
lasso_cv = LassoCV(cv=5, random_state=42)

# Fitting the model
lasso_cv.fit(X_train, y_train)

# Get the feature coefficients
coef = pd.Series(lasso_cv.coef_, index=X_train.columns)

# Filter out the features which have a coefficient of zero
selected_features = coef[coef != 0].index.tolist()

print(f"Number of features selected: {len(selected_features)}")
print("\nSelected features:")
print(selected_features)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Number of features selected: 28

Selected features:
['ethnicity_Hispanic or Latino age_at_first_diagnosis', 'disease_group_928000 age_at_first_diagnosis', 'disease_group_2492009 age_at_first_diagnosis', 'disease_group_3723001 age_at_first_diagnosis', 'disease_group_11061003 age_at_first_diagnosis', 'disease_group_17226007 age_at_first_diagnosis', 'disease_group_40733004 age_at_first_diagnosis', 'disease_group_42030000 age_at_first_diagnosis', 'disease_group_49601007 age_at_first_diagnosis', 'disease_group_53619000 age_at_first_diagnosis', 'disease_group_74627003 age_at_first_diagnosis', 'disease_group_111273006 age_at_first_diagnosis', 'disease_group_118940003 age_at_first_diagnosis', 'disease_group_123946008 age_at_first_diagnosis', 'disease_group_128127008 age_at_first_diagnosis', 'disease_group_197480006 age_at_first_diagnosis', 'disease_group_234337006 age_at_first_diagnosis', 'disease_group_266607004 age_at_first_diagnosis', 'disease_group_362966006 age_at_first_diagnosis', 'disea

Model Training and Testing Using Selected Features

In [29]:
# Import libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import (confusion_matrix, 
                             precision_score, 
                             recall_score, 
                             f1_score, 
                             matthews_corrcoef, 
                             roc_auc_score, 
                             average_precision_score)

In [30]:
# Splitting the test data into features and target
X_test = test_set.drop(columns=['No_remission'])
y_test = test_set['No_remission']

# Filter train and test sets for selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Metrics collection
results = {}

for name, model in models.items():
    # Train model
    model.fit(X_train_selected, y_train)
    
    # Predict
    y_pred = model.predict(X_test_selected)
    
    # Metrics
    confusion = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    fmeasure = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_area = roc_auc_score(y_test, model.predict_proba(X_test_selected)[:, 1])
    prc_area = average_precision_score(y_test, model.predict_proba(X_test_selected)[:, 1])
    
    results[name] = {
        'Confusion Matrix': confusion,
        'Precision': precision,
        'Recall': recall,
        'F-Measure': fmeasure,
        'MCC': mcc,
        'ROC Area': roc_area,
        'PRC Area': prc_area
    }
    
# Display results
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    print("\n")


Model: Logistic Regression
Confusion Matrix: [[224  13]
 [ 66  22]]
Precision: 0.6285714285714286
Recall: 0.25
F-Measure: 0.35772357723577236
MCC: 0.2797344016266691
ROC Area: 0.7242040659762179
PRC Area: 0.5288581262877398


Model: Random Forest
Confusion Matrix: [[222  15]
 [ 66  22]]
Precision: 0.5945945945945946
Recall: 0.25
F-Measure: 0.352
MCC: 0.26120611040541486
ROC Area: 0.6829449558879939
PRC Area: 0.44852170876872866


Model: Naive Bayes
Confusion Matrix: [[184  53]
 [ 45  43]]
Precision: 0.4479166666666667
Recall: 0.48863636363636365
F-Measure: 0.4673913043478261
MCC: 0.25811937081521247
ROC Area: 0.6838319907940161
PRC Area: 0.4658230646589678


Model: XGBoost
Confusion Matrix: [[200  37]
 [ 58  30]]
Precision: 0.44776119402985076
Recall: 0.3409090909090909
F-Measure: 0.3870967741935484
MCC: 0.2029779896905173
ROC Area: 0.684095703874185
PRC Area: 0.46313702415963814


