This is data mining code for the HAP780 final project after exporting the datasets from All of Us

In [1]:
# Import library
import pandas as pd

# Load the datasets
df_analysis = pd.read_csv('./data/df_analysis_final.csv')

Descriptive statistics

In [2]:
# import libraries
import numpy as np
from scipy.stats import chi2_contingency # for chi-square tests

In [3]:
# Get sample size data
N_total = len(df_analysis)
N_no_remission = len(df_analysis[df_analysis['No_remission'] == 1])
N_with_remission = len(df_analysis[df_analysis['No_remission'] == 0])

print(f"N Total: {N_total} | N No Remission: {N_no_remission} | N With Remission: {N_with_remission}")

N Total: 1621 | N No Remission: 429 | N With Remission: 1192


In [7]:
# Get race statistics
race_columns = [
    "race_Another single population", "race_Asian", "race_Black or African American",
    "race_I prefer not to answer", "race_More than one population",
    "race_None Indicated", "race_None of these", "race_PMI: Skip", "race_White"
]

# Prepare the dataframe for statistics
descriptive_stats = pd.DataFrame(columns=["Participant Characteristics", "Entire Dataset", "Without Remission", "With Remission", "p-value", "Significance"])

# Populate the N values as the first row
descriptive_stats.loc[0] = ["N", N_total, N_no_remission, N_with_remission, "-", "-"]

# Populate the data for each race category
for race in race_columns:
    N_race_total = df_analysis[race].sum()
    N_race_no_remission = df_analysis[df_analysis['No_remission'] == 1][race].sum()
    N_race_with_remission = df_analysis[df_analysis['No_remission'] == 0][race].sum()

    percent_race_total = (N_race_total / N_total) * 100
    percent_race_no_remission = (N_race_no_remission / N_no_remission) * 100
    percent_race_with_remission = (N_race_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table = [[N_race_no_remission, N_race_with_remission], 
                         [N_no_remission - N_race_no_remission, N_with_remission - N_race_with_remission]]
    
    chi2, p, _, _ = chi2_contingency(contingency_table)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        race, 
        f"{N_race_total} ({percent_race_total:.2f}%)", 
        f"{N_race_no_remission} ({percent_race_no_remission:.2f}%)",
        f"{N_race_with_remission} ({percent_race_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [8]:
# Get ethnicity statistics
ethnicity_columns = [
    "ethnicity_Hispanic or Latino", "ethnicity_Not Hispanic or Latino",
    "ethnicity_PMI: Prefer Not To Answer", "ethnicity_PMI: Skip",
    "ethnicity_What Race Ethnicity: Race Ethnicity None Of These"
]

# Populate the data for each ethnicity category
for ethnicity in ethnicity_columns:
    N_ethnicity_total = df_analysis[ethnicity].sum()
    N_ethnicity_no_remission = df_analysis[df_analysis['No_remission'] == 1][ethnicity].sum()
    N_ethnicity_with_remission = df_analysis[df_analysis['No_remission'] == 0][ethnicity].sum()

    percent_ethnicity_total = (N_ethnicity_total / N_total) * 100
    percent_ethnicity_no_remission = (N_ethnicity_no_remission / N_no_remission) * 100
    percent_ethnicity_with_remission = (N_ethnicity_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_ethnicity = [
        [N_ethnicity_no_remission, N_ethnicity_with_remission], 
        [N_no_remission - N_ethnicity_no_remission, N_with_remission - N_ethnicity_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_ethnicity)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        ethnicity, 
        f"{N_ethnicity_total} ({percent_ethnicity_total:.2f}%)", 
        f"{N_ethnicity_no_remission} ({percent_ethnicity_no_remission:.2f}%)",
        f"{N_ethnicity_with_remission} ({percent_ethnicity_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [9]:
# Get insurace coverage statistics
insurance_columns = [
    "insurance_Health Insurance: No", "insurance_Health Insurance: Yes",
    "insurance_PMI: Dont Know", "insurance_PMI: Prefer Not To Answer",
    "insurance_PMI: Skip"
]

# Populate the data for each insurance category
for insurance in insurance_columns:
    N_insurance_total = df_analysis[insurance].sum()
    N_insurance_no_remission = df_analysis[df_analysis['No_remission'] == 1][insurance].sum()
    N_insurance_with_remission = df_analysis[df_analysis['No_remission'] == 0][insurance].sum()

    percent_insurance_total = (N_insurance_total / N_total) * 100
    percent_insurance_no_remission = (N_insurance_no_remission / N_no_remission) * 100
    percent_insurance_with_remission = (N_insurance_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_insurance = [
        [N_insurance_no_remission, N_insurance_with_remission], 
        [N_no_remission - N_insurance_no_remission, N_with_remission - N_insurance_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_insurance)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        insurance, 
        f"{N_insurance_total} ({percent_insurance_total:.2f}%)", 
        f"{N_insurance_no_remission} ({percent_insurance_no_remission:.2f}%)",
        f"{N_insurance_with_remission} ({percent_insurance_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [10]:
# Get income statistics
income_columns = [
    "income_Annual Income: 100k 150k", "income_Annual Income: 10k 25k",
    "income_Annual Income: 150k 200k", "income_Annual Income: 25k 35k",
    "income_Annual Income: 35k 50k", "income_Annual Income: 50k 75k",
    "income_Annual Income: 75k 100k", "income_Annual Income: less 10k",
    "income_Annual Income: more 200k", "income_PMI: Prefer Not To Answer",
    "income_PMI: Skip"
]

# Populate the data for each income category
for income in income_columns:
    N_income_total = df_analysis[income].sum()
    N_income_no_remission = df_analysis[df_analysis['No_remission'] == 1][income].sum()
    N_income_with_remission = df_analysis[df_analysis['No_remission'] == 0][income].sum()

    percent_income_total = (N_income_total / N_total) * 100
    percent_income_no_remission = (N_income_no_remission / N_no_remission) * 100
    percent_income_with_remission = (N_income_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_income = [
        [N_income_no_remission, N_income_with_remission], 
        [N_no_remission - N_income_no_remission, N_with_remission - N_income_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_income)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        income, 
        f"{N_income_total} ({percent_income_total:.2f}%)", 
        f"{N_income_no_remission} ({percent_income_no_remission:.2f}%)",
        f"{N_income_with_remission} ({percent_income_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [11]:
# Get SDOH - Near Store statistics
near_store_columns = [
    "near_store_PMI: Skip", "near_store_Somewhat agree",
    "near_store_Somewhat disagree", "near_store_Strongly agree",
    "near_store_Strongly disagree"
]

# Populate the data for each "near store" category
for store in near_store_columns:
    N_store_total = df_analysis[store].sum()
    N_store_no_remission = df_analysis[df_analysis['No_remission'] == 1][store].sum()
    N_store_with_remission = df_analysis[df_analysis['No_remission'] == 0][store].sum()

    percent_store_total = (N_store_total / N_total) * 100
    percent_store_no_remission = (N_store_no_remission / N_no_remission) * 100
    percent_store_with_remission = (N_store_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_store = [
        [N_store_no_remission, N_store_with_remission], 
        [N_no_remission - N_store_no_remission, N_with_remission - N_store_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_store)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        store, 
        f"{N_store_total} ({percent_store_total:.2f}%)", 
        f"{N_store_no_remission} ({percent_store_no_remission:.2f}%)",
        f"{N_store_with_remission} ({percent_store_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [12]:
# Get SDOH - Near Transit statistics
near_transit_columns = [
    "near_transit_PMI: Dont Know", "near_transit_PMI: Skip", 
    "near_transit_Somewhat agree", "near_transit_Somewhat disagree",
    "near_transit_Strongly agree", "near_transit_Strongly disagree"
]

# Populate the data for each "near transit" category
for transit in near_transit_columns:
    N_transit_total = df_analysis[transit].sum()
    N_transit_no_remission = df_analysis[df_analysis['No_remission'] == 1][transit].sum()
    N_transit_with_remission = df_analysis[df_analysis['No_remission'] == 0][transit].sum()

    percent_transit_total = (N_transit_total / N_total) * 100
    percent_transit_no_remission = (N_transit_no_remission / N_no_remission) * 100
    percent_transit_with_remission = (N_transit_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_transit = [
        [N_transit_no_remission, N_transit_with_remission], 
        [N_no_remission - N_transit_no_remission, N_with_remission - N_transit_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_transit)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        transit, 
        f"{N_transit_total} ({percent_transit_total:.2f}%)", 
        f"{N_transit_no_remission} ({percent_transit_no_remission:.2f}%)",
        f"{N_transit_with_remission} ({percent_transit_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [13]:
# Get SDOH - No Food statistics
no_food_columns = [
    "no_food_Never true", "no_food_Often true", 
    "no_food_PMI: Skip", "no_food_Sometimes true"
]

# Populate the data for each "no food" category
for food in no_food_columns:
    N_food_total = df_analysis[food].sum()
    N_food_no_remission = df_analysis[df_analysis['No_remission'] == 1][food].sum()
    N_food_with_remission = df_analysis[df_analysis['No_remission'] == 0][food].sum()

    percent_food_total = (N_food_total / N_total) * 100
    percent_food_no_remission = (N_food_no_remission / N_no_remission) * 100
    percent_food_with_remission = (N_food_with_remission / N_with_remission) * 100

    # Chi-square test
    contingency_table_food = [
        [N_food_no_remission, N_food_with_remission], 
        [N_no_remission - N_food_no_remission, N_with_remission - N_food_with_remission]
    ]
    
    chi2, p, _, _ = chi2_contingency(contingency_table_food)

    significance = "*" if p < 0.05 else "-"

    descriptive_stats.loc[len(descriptive_stats)] = [
        food, 
        f"{N_food_total} ({percent_food_total:.2f}%)", 
        f"{N_food_no_remission} ({percent_food_no_remission:.2f}%)",
        f"{N_food_with_remission} ({percent_food_with_remission:.2f}%)",
        f"{p:.4f}",
        significance
    ]

In [14]:
# import library
from scipy.stats import ttest_ind

In [15]:
# Get age at first diagnosis statistics

# Mean age calculations
mean_age_total = df_analysis['age_at_first_diagnosis'].mean()
mean_age_no_remission = df_analysis[df_analysis['No_remission'] == 1]['age_at_first_diagnosis'].mean()
mean_age_with_remission = df_analysis[df_analysis['No_remission'] == 0]['age_at_first_diagnosis'].mean()

# t-test
age_no_remission = df_analysis[df_analysis['No_remission'] == 1]['age_at_first_diagnosis']
age_with_remission = df_analysis[df_analysis['No_remission'] == 0]['age_at_first_diagnosis']

t_stat, p_value = ttest_ind(age_no_remission, age_with_remission)

significance = "*" if p_value < 0.05 else "-"

# Inserting the results into the dataframe right after N (index 0.5)
descriptive_stats.loc[0.5] = ["Age at First Diagnosis (Mean)", 
                            f"{mean_age_total:.2f}", 
                            f"{mean_age_no_remission:.2f}", 
                            f"{mean_age_with_remission:.2f}",
                            f"{p_value:.4f}", 
                            significance]

# Fixing the order
descriptive_stats = descriptive_stats.sort_index().reset_index(drop=True)

In [16]:
# Get tobacco ever statistics

# Calculate the percentages
percentage_total = df_analysis['tobacco_ever'].mean() * 100
percentage_no_remission = df_analysis[df_analysis['No_remission'] == 1]['tobacco_ever'].mean() * 100
percentage_with_remission = df_analysis[df_analysis['No_remission'] == 0]['tobacco_ever'].mean() * 100

# Create contingency table for chi-square test
contingency_table = pd.crosstab(df_analysis['No_remission'], df_analysis['tobacco_ever'])

# Chi-square test
chi2, p_value, _, _ = chi2_contingency(contingency_table)

significance = "*" if p_value < 0.05 else "-"

# Appending results to the descriptive_stats dataframe
descriptive_stats.loc[len(descriptive_stats)] = [
    "Ever Used Tobacco (%)",
    f"{percentage_total:.2f}",
    f"{percentage_no_remission:.2f}",
    f"{percentage_with_remission:.2f}",
    f"{p_value:.4f}",
    significance
]

In [17]:
# Get tobacco current statistics
# Calculate the percentages for current tobacco use
percentage_total_current = df_analysis['tobacco_current'].mean() * 100
percentage_no_remission_current = df_analysis[df_analysis['No_remission'] == 1]['tobacco_current'].mean() * 100
percentage_with_remission_current = df_analysis[df_analysis['No_remission'] == 0]['tobacco_current'].mean() * 100

# Create contingency table for chi-square test on current tobacco use
contingency_table_current = pd.crosstab(df_analysis['No_remission'], df_analysis['tobacco_current'])

# Chi-square test for current tobacco use
chi2_current, p_value_current, _, _ = chi2_contingency(contingency_table_current)

significance_current = "*" if p_value_current < 0.05 else "-"

# Appending the row directly to descriptive_stats dataframe using loc[]
descriptive_stats.loc[len(descriptive_stats)] = [
    "Currently Using Tobacco (%)",
    f"{percentage_total_current:.2f}",
    f"{percentage_no_remission_current:.2f}",
    f"{percentage_with_remission_current:.2f}",
    f"{p_value_current:.4f}",
    significance_current
]

In [18]:
# Get obesity statistics
# Calculate the percentages for obesity
percentage_total_obesity = df_analysis['obesity'].mean() * 100
percentage_no_remission_obesity = df_analysis[df_analysis['No_remission'] == 1]['obesity'].mean() * 100
percentage_with_remission_obesity = df_analysis[df_analysis['No_remission'] == 0]['obesity'].mean() * 100

# Create contingency table for chi-square test on obesity
contingency_table_obesity = pd.crosstab(df_analysis['No_remission'], df_analysis['obesity'])

# Chi-square test for obesity
chi2_obesity, p_value_obesity, _, _ = chi2_contingency(contingency_table_obesity)

significance_obesity = "*" if p_value_obesity < 0.05 else "-"

# Appending the row directly to descriptive_stats dataframe using loc[]
descriptive_stats.loc[len(descriptive_stats)] = [
    "Obesity (%)",
    f"{percentage_total_obesity:.2f}",
    f"{percentage_no_remission_obesity:.2f}",
    f"{percentage_with_remission_obesity:.2f}",
    f"{p_value_obesity:.4f}",
    significance_obesity
]

In [19]:
descriptive_stats

Unnamed: 0,Participant Characteristics,Entire Dataset,Without Remission,With Remission,p-value,Significance
0,N,1621,429,1192,-,-
1,Age at First Diagnosis (Mean),52.74,53.39,52.50,0.1340,-
2,race_Another single population,11 (0.68%),1 (0.23%),10 (0.84%),0.3332,-
3,race_Asian,40 (2.47%),11 (2.56%),29 (2.43%),1.0000,-
4,race_Black or African American,168 (10.36%),51 (11.89%),117 (9.82%),0.2647,-
5,race_I prefer not to answer,7 (0.43%),3 (0.70%),4 (0.34%),0.5783,-
6,race_More than one population,23 (1.42%),5 (1.17%),18 (1.51%),0.7799,-
7,race_None Indicated,152 (9.38%),24 (5.59%),128 (10.74%),0.0024,*
8,race_None of these,14 (0.86%),4 (0.93%),10 (0.84%),1.0000,-
9,race_PMI: Skip,25 (1.54%),8 (1.86%),17 (1.43%),0.6864,-


Feature Creation: Age as dummy variables in decades

In [17]:
# Define bins
bins = [0, 18, 28, 38, 48, 58, 68, 78, 88, 98, float('inf')]
labels = ['<18', '18-27', '28-37', '38-47', '48-57', '58-67', '68-77', '78-87', '88-97', '98+']

# Cut the age_at_first_diagnosis into bins
df_analysis['age_group'] = pd.cut(df_analysis['age_at_first_diagnosis'], bins=bins, labels=labels, right=False)

# Convert the binned data into dummy variables
age_dummies = pd.get_dummies(df_analysis['age_group'])

# Concatenate the dummy variables with the original dataframe if needed
df_analysis = pd.concat([df_analysis, age_dummies], axis=1)

# Drop the 'age_at_first_diagnosis' and 'age_group' columns from the dataframe
df_analysis = df_analysis.drop(['age_at_first_diagnosis', 'age_group'], axis=1)

In [18]:
df_analysis.head()

Unnamed: 0,race_Another single population,race_Asian,race_Black or African American,race_I prefer not to answer,race_More than one population,race_None Indicated,race_None of these,race_PMI: Skip,race_White,ethnicity_Hispanic or Latino,...,<18,18-27,28-37,38-47,48-57,58-67,68-77,78-87,88-97,98+
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


Feature Creation: Create Interaction Variables (2-way only due to memory constraints)

In [19]:
# Import library
from sklearn.preprocessing import PolynomialFeatures

In [20]:
# Drop the target variable
X = df_analysis.drop(columns=['No_remission'])

# Create polynomial features (interaction terms only)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)

# Create a dataframe for the interaction terms
df_interactions = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X.columns))

# Identify interaction terms only (exclude original feature columns)
interaction_columns = df_interactions.columns[~df_interactions.columns.isin(X.columns)]

# Join the original df_analysis with the interaction terms dataframe
df_analysis_extended = pd.concat([df_analysis, df_interactions[interaction_columns]], axis=1)

Data cleaning: Remove columns that have all zeroes or all ones

In [21]:
# Get all columns that are all ones or all zeroes for dropping
columns_to_drop = df_analysis_extended.columns[(
    df_analysis_extended.sum(axis=0) == len(df_analysis_extended)) | (df_analysis_extended.sum(axis=0) == 0)]

# Drop these columns
df_analysis_extended = df_analysis_extended.drop(columns=columns_to_drop)

In [22]:
df_analysis_extended.head()

Unnamed: 0,race_Another single population,race_Asian,race_Black or African American,race_I prefer not to answer,race_More than one population,race_None Indicated,race_None of these,race_PMI: Skip,race_White,ethnicity_Hispanic or Latino,...,no_food_Often true 38-47,no_food_Often true 48-57,no_food_Often true 58-67,no_food_PMI: Skip 38-47,no_food_PMI: Skip 48-57,no_food_Sometimes true 28-37,no_food_Sometimes true 38-47,no_food_Sometimes true 48-57,no_food_Sometimes true 58-67,no_food_Sometimes true 68-77
0,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,0,0,0,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df_analysis_extended.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1621 entries, 0 to 1620
Columns: 15239 entries, race_Another single population to no_food_Sometimes true 68-77
dtypes: float64(14986), int64(246), uint8(7)
memory usage: 188.4 MB


Split the data into Training (80%) and Test (20%)

In [24]:
# Import library
from sklearn.model_selection import train_test_split

In [25]:
# Splitting the data into training and test sets
train_set, test_set = train_test_split(df_analysis_extended, test_size=0.20, random_state=42)

In [26]:
train_set.head()

Unnamed: 0,race_Another single population,race_Asian,race_Black or African American,race_I prefer not to answer,race_More than one population,race_None Indicated,race_None of these,race_PMI: Skip,race_White,ethnicity_Hispanic or Latino,...,no_food_Often true 38-47,no_food_Often true 48-57,no_food_Often true 58-67,no_food_PMI: Skip 38-47,no_food_PMI: Skip 48-57,no_food_Sometimes true 28-37,no_food_Sometimes true 38-47,no_food_Sometimes true 48-57,no_food_Sometimes true 58-67,no_food_Sometimes true 68-77
905,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1088,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1330,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
398,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1603,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296 entries, 905 to 1126
Columns: 15239 entries, race_Another single population to no_food_Sometimes true 68-77
dtypes: float64(14986), int64(246), uint8(7)
memory usage: 150.6 MB


In [28]:
test_set.head()

Unnamed: 0,race_Another single population,race_Asian,race_Black or African American,race_I prefer not to answer,race_More than one population,race_None Indicated,race_None of these,race_PMI: Skip,race_White,ethnicity_Hispanic or Latino,...,no_food_Often true 38-47,no_food_Often true 48-57,no_food_Often true 58-67,no_food_PMI: Skip 38-47,no_food_PMI: Skip 48-57,no_food_Sometimes true 28-37,no_food_Sometimes true 38-47,no_food_Sometimes true 48-57,no_food_Sometimes true 58-67,no_food_Sometimes true 68-77
135,0,0,0,0,0,1,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
843,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1233,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1214,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
628,0,0,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 325 entries, 135 to 707
Columns: 15239 entries, race_Another single population to no_food_Sometimes true 68-77
dtypes: float64(14986), int64(246), uint8(7)
memory usage: 37.8 MB


Baseline models without feature selection

In [30]:
# Import libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import (confusion_matrix, 
                             precision_score, 
                             recall_score, 
                             f1_score, 
                             matthews_corrcoef, 
                             roc_auc_score, 
                             average_precision_score)

In [31]:
# Splitting the data into features and target
X_train = train_set.drop(columns=['No_remission'])
y_train = train_set['No_remission']


# Splitting the test data into features and target
X_test = test_set.drop(columns=['No_remission'])
y_test = test_set['No_remission']

In [32]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Metrics collection
results = {}

for name, model in models.items():
    # Train model
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Metrics
    confusion = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    fmeasure = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_area = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    prc_area = average_precision_score(y_test, model.predict_proba(X_test)[:, 1])
    
    results[name] = {
        'Confusion Matrix': confusion,
        'Precision': precision,
        'Recall': recall,
        'F-Measure': fmeasure,
        'MCC': mcc,
        'ROC Area': roc_area,
        'PRC Area': prc_area
    }
    
# Display results
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    print("\n")


Model: Logistic Regression
Confusion Matrix: [[210  27]
 [ 51  37]]
Precision: 0.578125
Recall: 0.42045454545454547
F-Measure: 0.4868421052631579
MCC: 0.34251449452951
ROC Area: 0.6740506329113923
PRC Area: 0.4858911041187189


Model: Random Forest
Confusion Matrix: [[228   9]
 [ 73  15]]
Precision: 0.625
Recall: 0.17045454545454544
F-Measure: 0.26785714285714285
MCC: 0.22510052462717733
ROC Area: 0.691072113540468
PRC Area: 0.49988793745405424


Model: Naive Bayes
Confusion Matrix: [[113 124]
 [ 47  41]]
Precision: 0.24848484848484848
Recall: 0.4659090909090909
F-Measure: 0.32411067193675885
MCC: -0.05092727430328025
ROC Area: 0.47135116992711923
PRC Area: 0.2603867344776436


Model: XGBoost
Confusion Matrix: [[213  24]
 [ 62  26]]
Precision: 0.52
Recall: 0.29545454545454547
F-Measure: 0.3768115942028986
MCC: 0.23915979090632775
ROC Area: 0.6588991177598772
PRC Area: 0.4233854269576437




Feature Selection: LASSO Regression with Cross-Validation

In [33]:
# Import library
from sklearn.linear_model import LassoCV

In [34]:
# Initializing LassoCV (5 folds)
lasso_cv = LassoCV(cv=5, random_state=42)

# Fitting the model
lasso_cv.fit(X_train, y_train)

# Get the feature coefficients
coef = pd.Series(lasso_cv.coef_, index=X_train.columns)

# Filter out the features which have a coefficient of zero
selected_features_lasso = coef[coef != 0].index.tolist()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [35]:
# Print selected feature and direction
print(f"Number of features selected: {len(selected_features_lasso)}\n")

for feature, coef in zip(selected_features_lasso, lasso_cv.coef_):
    effect = 'increase' if coef > 0 else 'decrease' if coef < 0 else 'no effect'
    print(f"feature: '{feature}' | effect: {effect}")

Number of features selected: 27

feature: 'race_White disease_group_363171009' | effect: no effect
feature: 'race_White disease_group_400096001' | effect: no effect
feature: 'ethnicity_Not Hispanic or Latino disease_group_2492009' | effect: no effect
feature: 'ethnicity_Not Hispanic or Latino disease_group_49601007' | effect: no effect
feature: 'ethnicity_Not Hispanic or Latino disease_group_197480006' | effect: no effect
feature: 'ethnicity_Not Hispanic or Latino near_transit_Strongly agree' | effect: no effect
feature: 'disease_group_928000 disease_group_2492009' | effect: no effect
feature: 'disease_group_928000 disease_group_3723001' | effect: no effect
feature: 'disease_group_928000 disease_group_32895009' | effect: no effect
feature: 'disease_group_928000 disease_group_42030000' | effect: no effect
feature: 'disease_group_928000 disease_group_49601007' | effect: no effect
feature: 'disease_group_928000 disease_group_53619000' | effect: no effect
feature: 'disease_group_928000 dis

Model Training and Testing Using Selected Features (LASSO)

In [36]:
# Filter train and test sets for selected features
X_train_selected_lasso = X_train[selected_features_lasso]
X_test_selected_lasso = X_test[selected_features_lasso]

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Metrics collection
results = {}

for name, model in models.items():
    # Train model
    model.fit(X_train_selected_lasso, y_train)
    
    # Predict
    y_pred = model.predict(X_test_selected_lasso)
    
    # Metrics
    confusion = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    fmeasure = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_area = roc_auc_score(y_test, model.predict_proba(X_test_selected_lasso)[:, 1])
    prc_area = average_precision_score(y_test, model.predict_proba(X_test_selected_lasso)[:, 1])
    
    results[name] = {
        'Confusion Matrix': confusion,
        'Precision': precision,
        'Recall': recall,
        'F-Measure': fmeasure,
        'MCC': mcc,
        'ROC Area': roc_area,
        'PRC Area': prc_area
    }
    
# Display results
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    print("\n")

Model: Logistic Regression
Confusion Matrix: [[222  15]
 [ 58  30]]
Precision: 0.6666666666666666
Recall: 0.3409090909090909
F-Measure: 0.4511278195488721
MCC: 0.3571723095971254
ROC Area: 0.7018364019946298
PRC Area: 0.524258839334735


Model: Random Forest
Confusion Matrix: [[212  25]
 [ 60  28]]
Precision: 0.5283018867924528
Recall: 0.3181818181818182
F-Measure: 0.39716312056737585
MCC: 0.2558312787257721
ROC Area: 0.6659953970080552
PRC Area: 0.47157682441214155


Model: Naive Bayes
Confusion Matrix: [[185  52]
 [ 41  47]]
Precision: 0.47474747474747475
Recall: 0.5340909090909091
F-Measure: 0.5026737967914439
MCC: 0.30381910378282784
ROC Area: 0.6923427311085538
PRC Area: 0.4855105937945749


Model: XGBoost
Confusion Matrix: [[209  28]
 [ 59  29]]
Precision: 0.5087719298245614
Recall: 0.32954545454545453
F-Measure: 0.39999999999999997
MCC: 0.24701297419671975
ROC Area: 0.6172564250095895
PRC Area: 0.478238192428965




Class Balancing Using Synthetic Minority Over-sampling Technique (SMOTE)

In [37]:
# Import libraries
from imblearn.over_sampling import SMOTE

In [38]:
# Using SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Resampled sample size
print(f"Original training sample size: {X_train.shape[0]}")
print(f"Resampled training sample size: {X_resampled.shape[0]}")

Original training sample size: 1296
Resampled training sample size: 1910


Model Training and Testing Using Baseline Balanced Data

In [39]:
# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Metrics collection
results = {}

for name, model in models.items():
    # Train model
    model.fit(X_resampled, y_resampled)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Metrics
    confusion = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    fmeasure = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_area = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    prc_area = average_precision_score(y_test, model.predict_proba(X_test)[:, 1])
    
    results[name] = {
        'Confusion Matrix': confusion,
        'Precision': precision,
        'Recall': recall,
        'F-Measure': fmeasure,
        'MCC': mcc,
        'ROC Area': roc_area,
        'PRC Area': prc_area
    }
    
# Display results
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    print("\n")

Model: Logistic Regression
Confusion Matrix: [[201  36]
 [ 49  39]]
Precision: 0.52
Recall: 0.4431818181818182
F-Measure: 0.47852760736196326
MCC: 0.3072063137656701
ROC Area: 0.666235136171845
PRC Area: 0.47616874900832173


Model: Random Forest
Confusion Matrix: [[222  15]
 [ 69  19]]
Precision: 0.5588235294117647
Recall: 0.2159090909090909
F-Measure: 0.3114754098360656
MCC: 0.2215823563986596
ROC Area: 0.7042577675489067
PRC Area: 0.4981169149386957


Model: Naive Bayes
Confusion Matrix: [[113 124]
 [ 47  41]]
Precision: 0.24848484848484848
Recall: 0.4659090909090909
F-Measure: 0.32411067193675885
MCC: -0.05092727430328025
ROC Area: 0.47135116992711923
PRC Area: 0.2603867344776436


Model: XGBoost
Confusion Matrix: [[212  25]
 [ 63  25]]
Precision: 0.5
Recall: 0.2840909090909091
F-Measure: 0.36231884057971014
MCC: 0.21996795583359777
ROC Area: 0.6711258151131569
PRC Area: 0.4638713045574123




Feature Selection: LASSO Regression with Cross-Validation

In [40]:
# Initializing LassoCV (5 folds)
lasso_cv_2 = LassoCV(cv=5, random_state=42)

# Fitting the model
lasso_cv_2.fit(X_resampled, y_resampled)

# Get the feature coefficients
coef_2 = pd.Series(lasso_cv_2.coef_, index=X_resampled.columns)

# Filter out the features which have a coefficient of zero
selected_features_lasso_2 = coef_2[coef_2 != 0].index.tolist()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [41]:
# Print selected feature and direction
print(f"Number of features selected: {len(selected_features_lasso_2)}\n")

for feature, coef in zip(selected_features_lasso_2, lasso_cv_2.coef_):
    effect = 'Increase' if coef > 0 else 'Decrease' if coef < 0 else 'Unknown'
    print(f"feature: '{feature}' | effect: {effect}")

Number of features selected: 675

feature: 'race_White' | effect: Unknown
feature: 'ethnicity_Not Hispanic or Latino' | effect: Unknown
feature: 'disease_group_928000' | effect: Unknown
feature: 'disease_group_2492009' | effect: Unknown
feature: 'disease_group_3723001' | effect: Unknown
feature: 'disease_group_11061003' | effect: Unknown
feature: 'disease_group_32895009' | effect: Unknown
feature: 'disease_group_35688006' | effect: Unknown
feature: 'disease_group_40733004' | effect: Decrease
feature: 'disease_group_41266007' | effect: Unknown
feature: 'disease_group_42030000' | effect: Decrease
feature: 'disease_group_46206005' | effect: Unknown
feature: 'disease_group_49601007' | effect: Unknown
feature: 'disease_group_50043002' | effect: Unknown
feature: 'disease_group_53619000' | effect: Unknown
feature: 'disease_group_55342001' | effect: Decrease
feature: 'disease_group_116225000' | effect: Unknown
feature: 'disease_group_118940003' | effect: Decrease
feature: 'disease_group_123946

Model Training and Testing Using Balanced Data and Selected Features (LASSO)

In [42]:
# Filter train and test sets for selected features
X_train_selected_lasso_2 = X_resampled[selected_features_lasso_2]
X_test_selected_lasso_2 = X_test[selected_features_lasso_2]

# Define models
models = {
    'Logistic Regression': LogisticRegression(max_iter=5000),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Metrics collection
results = {}

for name, model in models.items():
    # Train model
    model.fit(X_train_selected_lasso_2, y_resampled)
    
    # Predict
    y_pred = model.predict(X_test_selected_lasso_2)
    
    # Metrics
    confusion = confusion_matrix(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    fmeasure = f1_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    roc_area = roc_auc_score(y_test, model.predict_proba(X_test_selected_lasso_2)[:, 1])
    prc_area = average_precision_score(y_test, model.predict_proba(X_test_selected_lasso_2)[:, 1])
    
    results[name] = {
        'Confusion Matrix': confusion,
        'Precision': precision,
        'Recall': recall,
        'F-Measure': fmeasure,
        'MCC': mcc,
        'ROC Area': roc_area,
        'PRC Area': prc_area
    }
    
# Display results
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    print("\n")

Model: Logistic Regression
Confusion Matrix: [[192  45]
 [ 51  37]]
Precision: 0.45121951219512196
Recall: 0.42045454545454547
F-Measure: 0.43529411764705883
MCC: 0.2359006866438814
ROC Area: 0.6725642500958957
PRC Area: 0.47098096596633543


Model: Random Forest
Confusion Matrix: [[216  21]
 [ 68  20]]
Precision: 0.4878048780487805
Recall: 0.22727272727272727
F-Measure: 0.31007751937984496
MCC: 0.1855802182600859
ROC Area: 0.7020281933256617
PRC Area: 0.44216512977252864


Model: Naive Bayes
Confusion Matrix: [[188  49]
 [ 56  32]]
Precision: 0.3950617283950617
Recall: 0.36363636363636365
F-Measure: 0.378698224852071
MCC: 0.16116115207122697
ROC Area: 0.573024549290372
PRC Area: 0.3620609935057382


Model: XGBoost
Confusion Matrix: [[206  31]
 [ 57  31]]
Precision: 0.5
Recall: 0.3522727272727273
F-Measure: 0.41333333333333333
MCC: 0.25047174080276274
ROC Area: 0.6739547372458764
PRC Area: 0.4849458000634811




Hyperparameter Tuning Of Best Data and Feature Combination On Recall
- Scoring will be based on recall
- Chosen training set with highest recall: 
-- Logistic Regression - Balanced no Feature Selection
-- Random Forest - Unbalanced with Feature Selection
-- Naive Bayes - Unbalanced with Feature Selection
-- XGBoost - Balanced with Feature Selection

In [43]:
# Import library
from sklearn.model_selection import GridSearchCV

In [44]:
# Logistic Regression

# Define training sets as balanced no feature selection (X_resampled, y_resampled)

# Define the parameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

# Initialize the GridSearchCV object for Logistic Regression
grid_search_lr = GridSearchCV(estimator=LogisticRegression(solver='liblinear'), 
                              param_grid=param_grid_lr, 
                              scoring=['recall'], 
                              refit='recall', 
                              cv=5)

# Fit the grid search to the data
grid_search_lr.fit(X_resampled, y_resampled)

# After fitting, we can check the best performance in the training set
print("Best parameters set found on training set:")
print(grid_search_lr.best_params_)

# Predict
best_estimator = grid_search_lr.best_estimator_
y_pred = best_estimator.predict(X_test)
    
# Metrics
confusion = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
fmeasure = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
roc_area = roc_auc_score(y_test, best_estimator.predict_proba(X_test)[:, 1])
prc_area = average_precision_score(y_test, best_estimator.predict_proba(X_test)[:, 1])
    
results = {
    'Confusion Matrix': confusion,
    'Precision': precision,
    'Recall': recall,
    'F-Measure': fmeasure,
    'MCC': mcc,
    'ROC Area': roc_area,
    'PRC Area': prc_area
}
    
# Display results
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

Best parameters set found on training set:
{'C': 10, 'penalty': 'l2'}
Confusion Matrix: [[195  42]
 [ 47  41]]
Precision: 0.4939759036144578
Recall: 0.4659090909090909
F-Measure: 0.47953216374269003
MCC: 0.294175424586991
ROC Area: 0.6591388569236671
PRC Area: 0.4542908435893699


In [45]:
# Random Forest

# Define training sets as unbalanced with feature selection (X_train_selected_lasso, y_train)
# Define test set as unbalanced with feature selection (X_test_selected_lasso)

# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the GridSearchCV object for Random Forest
grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(), 
                              param_grid=param_grid_rf, 
                              scoring=['recall'], 
                              refit='recall', 
                              cv=5)

# Fit the grid search to the data
grid_search_rf.fit(X_train_selected_lasso, y_train)

# After fitting, we can check the best performance in the training set
print("Best parameters set found on training set:")
print(grid_search_rf.best_params_)

# Predict
best_estimator = grid_search_rf.best_estimator_
y_pred = best_estimator.predict(X_test_selected_lasso)
    
# Metrics
confusion = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
fmeasure = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
roc_area = roc_auc_score(y_test, best_estimator.predict_proba(X_test_selected_lasso)[:, 1])
prc_area = average_precision_score(y_test, best_estimator.predict_proba(X_test_selected_lasso)[:, 1])
    
results = {
    'Confusion Matrix': confusion,
    'Precision': precision,
    'Recall': recall,
    'F-Measure': fmeasure,
    'MCC': mcc,
    'ROC Area': roc_area,
    'PRC Area': prc_area
}
    
# Display results
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

Best parameters set found on training set:
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10}
Confusion Matrix: [[219  18]
 [ 60  28]]
Precision: 0.6086956521739131
Recall: 0.3181818181818182
F-Measure: 0.41791044776119407
MCC: 0.30879255101243663
ROC Area: 0.665012466436517
PRC Area: 0.4732360733215331


In [46]:
# Naive Bayes

# Define training sets as unbalanced with feature selection (X_train_selected_lasso, y_train)
# Define test set as unbalanced with feature selection (X_test_selected_lasso)

# Define the parameter grid for GaussianNB
param_grid_gnb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

# Initialize the GridSearchCV object for GaussianNB
grid_search_gnb = GridSearchCV(estimator=GaussianNB(), 
                               param_grid=param_grid_gnb, 
                               scoring=['recall'], 
                               refit='recall', 
                               cv=5)

# Fit the grid search to the data
grid_search_gnb.fit(X_train_selected_lasso, y_train)

# After fitting, we can check the best performance in the training set
print("Best parameters set found on training set:")
print(grid_search_gnb.best_params_)

# Predict
best_estimator = grid_search_gnb.best_estimator_
y_pred = best_estimator.predict(X_test_selected_lasso)
    
# Metrics
confusion = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
fmeasure = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
roc_area = roc_auc_score(y_test, best_estimator.predict_proba(X_test_selected_lasso)[:, 1])
prc_area = average_precision_score(y_test, best_estimator.predict_proba(X_test_selected_lasso)[:, 1])
    
results = {
    'Confusion Matrix': confusion,
    'Precision': precision,
    'Recall': recall,
    'F-Measure': fmeasure,
    'MCC': mcc,
    'ROC Area': roc_area,
    'PRC Area': prc_area
}
    
# Display results
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

Best parameters set found on training set:
{'var_smoothing': 0.008111308307896872}
Confusion Matrix: [[185  52]
 [ 41  47]]
Precision: 0.47474747474747475
Recall: 0.5340909090909091
F-Measure: 0.5026737967914439
MCC: 0.30381910378282784
ROC Area: 0.692150939777522
PRC Area: 0.48580411161330267


In [47]:
# XGBoost

# Define training sets as balanced with feature selection (X_train_selected_lasso_2, y_resampled)
# Define test set as balanced with feature selection (X_test_selected_lasso_2)

# Define the parameter grid for XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7],
    'colsample_bytree': [0.3, 0.7, 1]
}

# Initialize the GridSearchCV object for XGBoost
grid_search_xgb = GridSearchCV(estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
                               param_grid=param_grid_xgb, 
                               scoring=['recall'], 
                               refit='recall', 
                               cv=5)

# Fit the grid search to the data
grid_search_xgb.fit(X_train_selected_lasso_2, y_resampled)

# After fitting, we can check the best performance in the training set
print("Best parameters set found on training set:")
print(grid_search_xgb.best_params_)

# Predict
best_estimator = grid_search_xgb.best_estimator_
y_pred = best_estimator.predict(X_test_selected_lasso_2)
    
# Metrics
confusion = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
fmeasure = f1_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
roc_area = roc_auc_score(y_test, best_estimator.predict_proba(X_test_selected_lasso_2)[:, 1])
prc_area = average_precision_score(y_test, best_estimator.predict_proba(X_test_selected_lasso_2)[:, 1])
    
results = {
    'Confusion Matrix': confusion,
    'Precision': precision,
    'Recall': recall,
    'F-Measure': fmeasure,
    'MCC': mcc,
    'ROC Area': roc_area,
    'PRC Area': prc_area
}
    
# Display results
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

Best parameters set found on training set:
{'colsample_bytree': 0.7, 'learning_rate': 0.5, 'max_depth': 5, 'n_estimators': 100}
Confusion Matrix: [[197  40]
 [ 59  29]]
Precision: 0.42028985507246375
Recall: 0.32954545454545453
F-Measure: 0.3694267515923567
MCC: 0.17469220086087794
ROC Area: 0.6763521288837745
PRC Area: 0.4702358038972644
