Data mining code for the HAP780 final project. It runs on the datasets exported from All of Us.

In [1]:
# Import libraries
import pandas as pd
import numpy as np

In [2]:
# Read the analysis dataset from its CSV export into a single DataFrame.
# NOTE(review): the head() preview recorded below starts with an "Unnamed: 0"
# header. If that is a saved index column it is kept as an ordinary column and
# would be part of X in the LASSO cells -- confirm, and consider index_col=0.
analysis_csv_path = './data/df_analysis_final.csv'
df_analysis = pd.read_csv(analysis_csv_path)

Feature Creation: Age at first diagnosis as dummy variables in 10-year bands (<18, 18-27, ..., 98+)

In [3]:
# Age bands: under 18, then 10-year bands starting at 18, then 98 and over.
# Note: re-running this cell on the same kernel fails with a KeyError because
# 'age_at_first_diagnosis' is dropped at the end of it.
bins = [0, 18, 28, 38, 48, 58, 68, 78, 88, 98, float('inf')]
labels = ['<18', '18-27', '28-37', '38-47', '48-57', '58-67', '68-77', '78-87', '88-97', '98+']

# Label every row with its band; right=False makes each band [lower, upper)
df_analysis = df_analysis.assign(
    age_group=pd.cut(
        df_analysis['age_at_first_diagnosis'],
        bins=bins,
        labels=labels,
        right=False,
    )
)

# One indicator column per band, named after the band label
age_dummies = pd.get_dummies(df_analysis['age_group'])

# Append the indicators, then remove the raw age and the helper band column
df_analysis = pd.concat([df_analysis, age_dummies], axis=1).drop(
    columns=['age_at_first_diagnosis', 'age_group']
)

In [4]:
# Preview the first five rows to check that the age-band columns were appended
df_analysis.head(n=5)

Unnamed: 0,race_Another single population,race_Asian,race_Black or African American,race_I prefer not to answer,race_More than one population,race_None Indicated,race_None of these,race_PMI: Skip,race_White,ethnicity_Hispanic or Latino,...,<18,18-27,28-37,38-47,48-57,58-67,68-77,78-87,88-97,98+
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [5]:
# Do feature selection with LASSO limit the number of predictors to 10
#
# The fit and the R-squared below both use every row of df_analysis, so the
# reported fit quality is in-sample.

# Import library
from sklearn.linear_model import Lasso
from sklearn.metrics import log_loss

# Set variables: every column except the outcome is a candidate predictor
X = df_analysis.drop(columns=['No_remission'])
y = df_analysis['No_remission']

# Regularisation strength, set by hand and adjusted until 10 predictors remain
alpha = 0.01685
# Coefficients whose magnitude is at or below this value count as dropped
coef_threshold = 0.001

# Initializing and fitting the model
lasso = Lasso(alpha=alpha)
lasso.fit(X, y)

# Calculate McFadden's R-squared
# Lasso is a linear regressor, so predict() returns unbounded scores rather
# than probabilities. Clip them into the open interval (0, 1) explicitly so the
# log-likelihood is always defined, instead of relying on log_loss internals.
prob_eps = np.finfo(float).eps
y_pred = np.clip(lasso.predict(X), prob_eps, 1 - prob_eps)

# Mean log-likelihood of the fitted model
log_likelihood_model = -log_loss(y, y_pred)

# Mean log-likelihood of the null model (outcome prevalence for every row)
null_model_probs = np.full_like(y_pred, y.mean())
log_likelihood_null_model = -log_loss(y, null_model_probs)

# McFadden's R-squared = 1 - LL(model) / LL(null)
mcfadden_r2 = 1 - (log_likelihood_model / log_likelihood_null_model)

print("\nMcFadden's R-squared:", mcfadden_r2)

# alpha is the hand-set value above; no LassoCV search is run in this cell
print("\nMinimum lambda (selected alpha):", alpha)

# Print the intercept
print("Intercept:", lasso.intercept_)

# Keep features whose absolute coefficient exceeds coef_threshold
selected_features = [
    (feature, coef)
    for feature, coef in zip(X.columns, lasso.coef_)
    if abs(coef) > coef_threshold
]

# Print selected feature names and coefficients
print("\nSelected features with coefficients:")
for feature, coef in selected_features:
    print(f"{feature}: {coef}")


# Show results to Dr. Alemi to check if interaction is needed


McFadden's R-squared: 0.0812504501100052

Minimum lambda (selected alpha): 0.01685
Intercept: 0.20384628503540322

Selected features with coefficients:
dx_34713006: 0.03732661328729958
dx_36923009: 0.02835567241261096
dx_40930008: 0.02955844469140588
dx_41256004: 0.00170519315765604
dx_59621000: 0.0010321843388197852
dx_64859006: 0.0032077997316158928
dx_76069003: 0.13524425896264614
dx_239873007: 0.004939783055518465
dx_266435005: 0.06589316687660099
dx_400096001: 0.04895022609304274


In [6]:
# Notes on selected diseases
# Vitamin D deficiency (disorder) - dx_34713006: 0.03732661328729958
# Major depression, single episode (disorder) - dx_36923009: 0.02835567241261096
# Hypothyroidism (disorder) - dx_40930008: 0.02955844469140588
# Presbyopia (disorder) - dx_41256004: 0.00170519315765604
# Essential hypertension (disorder) - dx_59621000: 0.0010321843388197852
# Osteoporosis (disorder) - dx_64859006: 0.0032077997316158928
# Disorder of bone (disorder) - dx_76069003: 0.13524425896264614
# Osteoarthritis of knee (disorder) - dx_239873007: 0.004939783055518465
# Gastroesophageal reflux disease without esophagitis (disorder) - dx_266435005: 0.06589316687660099
# Melanocytic nevus (disorder) - dx_400096001: 0.04895022609304274

In [7]:
# Do feature selection with LASSO limit the number of predictors to 20
#
# NOTE(review): selected_features from this cell is reused further down to
# pick the model columns, but the LASSO here is fitted on all of df_analysis,
# including rows that train_test_split later assigns to the test set. The
# selection has therefore seen the test data; consider fitting it on the
# training split only.

# Import library
from sklearn.linear_model import Lasso
from sklearn.metrics import log_loss

# Set variables: every column except the outcome is a candidate predictor
X = df_analysis.drop(columns=['No_remission'])
y = df_analysis['No_remission']

# Regularisation strength, set by hand and adjusted until 20 predictors remain
alpha = 0.00785
# Coefficients whose magnitude is at or below this value count as dropped
coef_threshold = 0.001

# Initializing and fitting the model
lasso = Lasso(alpha=alpha)
lasso.fit(X, y)

# Calculate McFadden's R-squared
# Lasso is a linear regressor, so predict() returns unbounded scores rather
# than probabilities. Clip them into the open interval (0, 1) explicitly so the
# log-likelihood is always defined, instead of relying on log_loss internals.
prob_eps = np.finfo(float).eps
y_pred = np.clip(lasso.predict(X), prob_eps, 1 - prob_eps)

# Mean log-likelihood of the fitted model
log_likelihood_model = -log_loss(y, y_pred)

# Mean log-likelihood of the null model (outcome prevalence for every row)
null_model_probs = np.full_like(y_pred, y.mean())
log_likelihood_null_model = -log_loss(y, null_model_probs)

# McFadden's R-squared = 1 - LL(model) / LL(null)
mcfadden_r2 = 1 - (log_likelihood_model / log_likelihood_null_model)

print("\nMcFadden's R-squared:", mcfadden_r2) # Needs to be cross validated

# alpha is the hand-set value above; no LassoCV search is run in this cell
print("\nMinimum lambda (selected alpha):", alpha)

# Print the intercept
print("Intercept:", lasso.intercept_)

# Keep features whose absolute coefficient exceeds coef_threshold
selected_features = [
    (feature, coef)
    for feature, coef in zip(X.columns, lasso.coef_)
    if abs(coef) > coef_threshold
]

# Print selected feature names and coefficients
print("\nSelected features with coefficients:")
for feature, coef in selected_features:
    print(f"{feature}: {coef}")

# Show results to Dr. Alemi to check if interaction is needed

# Do interaction terms


McFadden's R-squared: 0.12602614800716316

Minimum lambda (selected alpha): 0.00785
Intercept: 0.1605714364423537

Selected features with coefficients:
race_White: -0.017605256791602245
dx_11314008: 0.021926136760082943
dx_18070006: 0.05414097964901779
dx_34713006: 0.047872005975313665
dx_36923009: 0.05668637871086478
dx_40930008: 0.06140726882908559
dx_41256004: 0.028011954366830707
dx_46152009: 0.026007346007453754
dx_59621000: 0.00983361068412485
dx_64859006: 0.020804416115625935
dx_65846009: 0.020899711271691268
dx_70153002: 0.021459827297058604
dx_76069003: 0.1369148676597445
dx_193462001: -0.010548064418124942
dx_193570009: 0.06890836283817868
dx_238810007: -0.0020719296644726506
dx_239873007: 0.028903546231882325
dx_266435005: 0.07507036440009997
dx_271737000: 0.004711805052241484
dx_400096001: 0.11086809183568944


In [8]:
# Notes on selected diseases
# Polyp of corpus uteri (disorder) - dx_11314008: 0.021926136760082943
# Impacted cerumen (disorder) - dx_18070006: 0.05414097964901779
# Vitamin D deficiency (disorder) - dx_34713006: 0.047872005975313665
# Major depression, single episode (disorder) - dx_36923009: 0.05668637871086478
# Hypothyroidism (disorder) - dx_40930008: 0.06140726882908559
# Presbyopia (disorder) - dx_41256004: 0.028011954366830707
# Tear film insufficiency (disorder) - dx_46152009: 0.026007346007453754
# Essential hypertension (disorder) - dx_59621000: 0.00983361068412485
# Osteoporosis (disorder) - dx_64859006: 0.020804416115625935
# Primary ovarian failure (disorder) - dx_65846009: 0.020899711271691268
# Hemorrhoids (disorder) - dx_70153002: 0.021459827297058604
# Disorder of bone (disorder) - dx_76069003: 0.1369148676597445
# Insomnia (disorder) - dx_193462001: -0.010548064418124942
# Cataract (disorder) - dx_193570009: 0.06890836283817868
# Flushing (disorder) - dx_238810007: -0.0020719296644726506
# Osteoarthritis of knee (disorder) - dx_239873007: 0.028903546231882325
# Gastroesophageal reflux disease without esophagitis (disorder) - dx_266435005: 0.07507036440009997
# Anemia (disorder) - dx_271737000: 0.004711805052241484
# Melanocytic nevus (disorder) - dx_400096001: 0.11086809183568944

# Model training

## Train test split

In [9]:
# Import library
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
# NOTE(review): no stratify= argument is passed, so the outcome rate in the two
# parts is left to chance -- confirm this is intended.
train_set, test_set = train_test_split(
    df_analysis,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Save both partitions as CSV (without the row index) for later reference
for split_frame, split_path in (
    (train_set, "./data/train.csv"),
    (test_set, "./data/test.csv"),
):
    split_frame.to_csv(split_path, index=False)

In [11]:
# Import libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import (confusion_matrix, 
                             precision_score, 
                             recall_score, 
                             f1_score, 
                             matthews_corrcoef, 
                             roc_auc_score, 
                             average_precision_score)

In [12]:
# Separate predictors from the outcome column in the training partition
X_train, y_train = train_set.drop(columns=['No_remission']), train_set['No_remission']

# Same separation for the held-out partition
X_test, y_test = test_set.drop(columns=['No_remission']), test_set['No_remission']

# Keep only the names from the (name, coefficient) pairs left by the most
# recently run LASSO cell
selected_features_lasso = [name for name, _coef in selected_features]

# Restrict both design matrices to the LASSO-selected columns
X_train_selected_lasso = X_train.loc[:, selected_features_lasso]
X_test_selected_lasso = X_test.loc[:, selected_features_lasso]

Hyperparameter Tuning Of Unbalanced Data Models
- Scoring will be based on recall

In [13]:
# Import library
from sklearn.model_selection import GridSearchCV

In [14]:
# Logistic Regression
#
# Tuned with 5-fold cross-validation on the training partition and then scored
# once on the held-out partition. Both use only the LASSO-selected columns and
# are taken as produced by the split (no rebalancing step appears before here).

# Candidate hyperparameters
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

# Grid search that ranks candidates by recall and refits the best one
grid_search_lr = GridSearchCV(
    LogisticRegression(solver='liblinear'),
    param_grid_lr,
    scoring=['recall'],
    refit='recall',
    cv=5,
)
grid_search_lr.fit(X_train_selected_lasso, y_train)

# Best hyperparameters according to cross-validated recall
print("Best parameters set found on training set:")
print(grid_search_lr.best_params_)

# Hard predictions and positive-class scores on the held-out partition
best_estimator = grid_search_lr.best_estimator_
y_pred = best_estimator.predict(X_test_selected_lasso)
y_score = best_estimator.predict_proba(X_test_selected_lasso)[:, 1]

# Threshold-based metrics use y_pred; the two area metrics use y_score
results = {
    'Confusion Matrix': confusion_matrix(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F-Measure': f1_score(y_test, y_pred),
    'MCC': matthews_corrcoef(y_test, y_pred),
    'ROC Area': roc_auc_score(y_test, y_score),
    'PRC Area': average_precision_score(y_test, y_score),
}

# Display results
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

# Keep the fitted model for the plotting and subgroup cells
unbal_logreg = best_estimator

Best parameters set found on training set:
{'C': 10, 'penalty': 'l2'}
Confusion Matrix: [[210  14]
 [ 70  31]]
Precision: 0.6888888888888889
Recall: 0.3069306930693069
F-Measure: 0.4246575342465753
MCC: 0.3275330751107881
ROC Area: 0.7145067185289957
PRC Area: 0.5736680056289689


In [15]:
# Random Forest
#
# Tuned with 5-fold cross-validation on the training partition and then scored
# once on the held-out partition. Both use only the LASSO-selected columns and
# are taken as produced by the split (no rebalancing step appears before here).

# Candidate hyperparameters
param_grid_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search that ranks candidates by recall and refits the best one;
# the forest's seed is fixed so repeated runs build the same trees
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    scoring=['recall'],
    refit='recall',
    cv=5,
)
grid_search_rf.fit(X_train_selected_lasso, y_train)

# Best hyperparameters according to cross-validated recall
print("Best parameters set found on training set:")
print(grid_search_rf.best_params_)

# Hard predictions and positive-class scores on the held-out partition
best_estimator = grid_search_rf.best_estimator_
y_pred = best_estimator.predict(X_test_selected_lasso)
y_score = best_estimator.predict_proba(X_test_selected_lasso)[:, 1]

# Threshold-based metrics use y_pred; the two area metrics use y_score
results = {
    'Confusion Matrix': confusion_matrix(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F-Measure': f1_score(y_test, y_pred),
    'MCC': matthews_corrcoef(y_test, y_pred),
    'ROC Area': roc_auc_score(y_test, y_score),
    'PRC Area': average_precision_score(y_test, y_score),
}

# Display results
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

# Keep the fitted model for the plotting and subgroup cells
unbal_rf = best_estimator

Best parameters set found on training set:
{'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Confusion Matrix: [[198  26]
 [ 65  36]]
Precision: 0.5806451612903226
Recall: 0.3564356435643564
F-Measure: 0.44171779141104295
MCC: 0.2831266746830076
ROC Area: 0.6915001768033947
PRC Area: 0.515873955145144


In [16]:
# Naive Bayes
#
# Tuned with 5-fold cross-validation on the training partition and then scored
# once on the held-out partition. Both use only the LASSO-selected columns and
# are taken as produced by the split (no rebalancing step appears before here).

# Candidate smoothing values: 100 log-spaced points from 1 down to 1e-9
param_grid_gnb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

# Grid search that ranks candidates by recall and refits the best one
grid_search_gnb = GridSearchCV(
    GaussianNB(),
    param_grid_gnb,
    scoring=['recall'],
    refit='recall',
    cv=5,
)
grid_search_gnb.fit(X_train_selected_lasso, y_train)

# Best hyperparameters according to cross-validated recall
print("Best parameters set found on training set:")
print(grid_search_gnb.best_params_)

# Hard predictions and positive-class scores on the held-out partition
best_estimator = grid_search_gnb.best_estimator_
y_pred = best_estimator.predict(X_test_selected_lasso)
y_score = best_estimator.predict_proba(X_test_selected_lasso)[:, 1]

# Threshold-based metrics use y_pred; the two area metrics use y_score
results = {
    'Confusion Matrix': confusion_matrix(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F-Measure': f1_score(y_test, y_pred),
    'MCC': matthews_corrcoef(y_test, y_pred),
    'ROC Area': roc_auc_score(y_test, y_score),
    'PRC Area': average_precision_score(y_test, y_score),
}

# Display results
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

# Keep the fitted model for the plotting and subgroup cells
unbal_nb = best_estimator

Best parameters set found on training set:
{'var_smoothing': 0.008111308307896872}
Confusion Matrix: [[187  37]
 [ 53  48]]
Precision: 0.5647058823529412
Recall: 0.4752475247524752
F-Measure: 0.5161290322580645
MCC: 0.3265336144278757
ROC Area: 0.7092910183875529
PRC Area: 0.5495002642841829


In [17]:
# XGBoost
#
# Tuned with 5-fold cross-validation on the training partition and then scored
# once on the held-out partition. Both use only the LASSO-selected columns and
# are taken as produced by the split (no rebalancing step appears before here).

# Candidate hyperparameters
param_grid_xgb = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7],
    'colsample_bytree': [0.3, 0.7, 1]
}

# Grid search that ranks candidates by recall and refits the best one.
# NOTE(review): use_label_encoder looks deprecated in recent XGBoost releases --
# confirm against the installed version.
grid_search_xgb = GridSearchCV(
    XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    param_grid_xgb,
    scoring=['recall'],
    refit='recall',
    cv=5,
)
grid_search_xgb.fit(X_train_selected_lasso, y_train)

# Best hyperparameters according to cross-validated recall
print("Best parameters set found on training set:")
print(grid_search_xgb.best_params_)

# Hard predictions and positive-class scores on the held-out partition
best_estimator = grid_search_xgb.best_estimator_
y_pred = best_estimator.predict(X_test_selected_lasso)
y_score = best_estimator.predict_proba(X_test_selected_lasso)[:, 1]

# Threshold-based metrics use y_pred; the two area metrics use y_score
results = {
    'Confusion Matrix': confusion_matrix(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred),
    'Recall': recall_score(y_test, y_pred),
    'F-Measure': f1_score(y_test, y_pred),
    'MCC': matthews_corrcoef(y_test, y_pred),
    'ROC Area': roc_auc_score(y_test, y_score),
    'PRC Area': average_precision_score(y_test, y_score),
}

# Display results
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

# Keep the fitted model for the plotting and subgroup cells
unbal_xgb = best_estimator

Plotting the ROC curves using specificity and recall

In [None]:
# Import the libraries
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

In [None]:
# Fitted models to compare, keyed by the name shown in the legend
models = {
    'Logistic Regression': unbal_logreg,
    'Random Forest': unbal_rf,
    'Naive Bayes': unbal_nb,
    'XGBoost': unbal_xgb,
}

fig, ax = plt.subplots(figsize=(10, 8))

# One ROC curve per model, scored on the held-out partition
for name, model in models.items():
    positive_scores = model.predict_proba(X_test_selected_lasso)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    ax.plot(fpr, tpr, lw=2, label=f'{name} (area = {auc(fpr, tpr):.2f})')

# Diagonal reference line
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve, auc


plt.figure(figsize=(10, 8))

# Precision-recall curve and its summary area for each model on the held-out partition
for name, model in models.items():
    positive_scores = model.predict_proba(X_test_selected_lasso)[:, 1]
    # Loop-local names, so the scalar `precision` / `recall` metrics set by the
    # evaluation cells are not overwritten with arrays
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)
    # Use average precision for the legend so it agrees with the 'PRC Area'
    # value the evaluation cells report for the same model and data. The
    # trapezoidal auc(recall, precision) used before interpolates linearly
    # between points and gives a different number.
    auprc = average_precision_score(y_test, positive_scores)
    plt.plot(pr_recall, pr_precision, lw=2, label=f'{name} (area = {auprc:.2f})')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall (Sensitivity)')
plt.ylabel('Precision (Positive Predictive Value)')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower right") 
plt.show()

Race Mix and Ethnicity Mix Analysis

In [None]:
# Split the held-out partition into four race/ethnicity subgroups.
# Each subgroup gets the LASSO-selected predictors (X_*) and the outcome (y_*).

# Row masks shared by the subgroup definitions
not_hispanic_mask = test_set['ethnicity_Not Hispanic or Latino'] == 1
nh_white_mask = (test_set['race_White'] == 1) & not_hispanic_mask
nh_black_mask = (test_set['race_Black or African American'] == 1) & not_hispanic_mask
hispanic_mask = test_set['ethnicity_Hispanic or Latino'] == 1
# "Other" is every row that falls in none of the three groups above
other_mask = ~(nh_white_mask | nh_black_mask | hispanic_mask)

# Non-Hispanic White
test_set_NHWhite = test_set[nh_white_mask]
X_test_NHWhite = test_set_NHWhite.drop(columns=['No_remission'])[selected_features_lasso]
y_test_NHWhite = test_set_NHWhite['No_remission']

# Non-Hispanic Black
test_set_NHBlack = test_set[nh_black_mask]
X_test_NHBlack = test_set_NHBlack.drop(columns=['No_remission'])[selected_features_lasso]
y_test_NHBlack = test_set_NHBlack['No_remission']

# Hispanic (any race)
test_set_Hispanic = test_set[hispanic_mask]
X_test_Hispanic = test_set_Hispanic.drop(columns=['No_remission'])[selected_features_lasso]
y_test_Hispanic = test_set_Hispanic['No_remission']

# Other
test_set_Other = test_set[other_mask]
X_test_Other = test_set_Other.drop(columns=['No_remission'])[selected_features_lasso]
y_test_Other = test_set_Other['No_remission']

# Row count of each subgroup
print(f"""Non-Hispanic White: {test_set_NHWhite.shape[0]} | 
      Non-Hispanic Black: {test_set_NHBlack.shape[0]} | 
      Hispanic: {test_set_Hispanic.shape[0]} | 
      Other: {test_set_Other.shape[0]}""")

In [None]:
# Test models on Non-Hispanic White test set

# Define models
models = {
    'Logistic Regression (Unbalanced)': unbal_logreg,
    'Random Forest (Unbalanced)': unbal_rf,
    'Naive Bayes (Unbalanced)': unbal_nb,
    'XGBoost (Unbalanced)': unbal_xgb
}

# Metrics collection
results = {}

# ROC AUC is undefined when only one outcome class is present in the subgroup
subgroup_has_both_classes = y_test_NHWhite.nunique() == 2

if len(y_test_NHWhite) == 0:
    # Nothing to score: the models cannot predict on zero rows
    print("No rows in the Non-Hispanic White test set; skipping evaluation.")
else:
    for name, model in models.items():

        # Hard predictions, plus positive-class scores computed once for both area metrics
        y_pred = model.predict(X_test_NHWhite)
        y_score = model.predict_proba(X_test_NHWhite)[:, 1]

        # Report NaN instead of failing when the AUC cannot be computed
        if subgroup_has_both_classes:
            roc_area = roc_auc_score(y_test_NHWhite, y_score)
        else:
            roc_area = float('nan')

        results[name] = {
            'Confusion Matrix': confusion_matrix(y_test_NHWhite, y_pred),
            'Precision': precision_score(y_test_NHWhite, y_pred),
            'Recall': recall_score(y_test_NHWhite, y_pred),
            'F-Measure': f1_score(y_test_NHWhite, y_pred),
            'MCC': matthews_corrcoef(y_test_NHWhite, y_pred),
            'ROC Area': roc_area,
            'PRC Area': average_precision_score(y_test_NHWhite, y_score)
        }

# Display results
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    print("\n")

In [None]:
# Test models on Non-Hispanic Black test set

# Define models
models = {
    'Logistic Regression (Unbalanced)': unbal_logreg,
    'Random Forest (Unbalanced)': unbal_rf,
    'Naive Bayes (Unbalanced)': unbal_nb,
    'XGBoost (Unbalanced)': unbal_xgb
}

# Metrics collection
results = {}

# ROC AUC is undefined when only one outcome class is present in the subgroup
subgroup_has_both_classes = y_test_NHBlack.nunique() == 2

if len(y_test_NHBlack) == 0:
    # Nothing to score: the models cannot predict on zero rows
    print("No rows in the Non-Hispanic Black test set; skipping evaluation.")
else:
    for name, model in models.items():

        # Hard predictions, plus positive-class scores computed once for both area metrics
        y_pred = model.predict(X_test_NHBlack)
        y_score = model.predict_proba(X_test_NHBlack)[:, 1]

        # Report NaN instead of failing when the AUC cannot be computed
        if subgroup_has_both_classes:
            roc_area = roc_auc_score(y_test_NHBlack, y_score)
        else:
            roc_area = float('nan')

        results[name] = {
            'Confusion Matrix': confusion_matrix(y_test_NHBlack, y_pred),
            'Precision': precision_score(y_test_NHBlack, y_pred),
            'Recall': recall_score(y_test_NHBlack, y_pred),
            'F-Measure': f1_score(y_test_NHBlack, y_pred),
            'MCC': matthews_corrcoef(y_test_NHBlack, y_pred),
            'ROC Area': roc_area,
            'PRC Area': average_precision_score(y_test_NHBlack, y_score)
        }

# Display results
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    print("\n")

In [None]:
# Test models on Hispanic test set

# Define models
models = {
    'Logistic Regression (Unbalanced)': unbal_logreg,
    'Random Forest (Unbalanced)': unbal_rf,
    'Naive Bayes (Unbalanced)': unbal_nb,
    'XGBoost (Unbalanced)': unbal_xgb
}

# Metrics collection
results = {}

# ROC AUC is undefined when only one outcome class is present in the subgroup
subgroup_has_both_classes = y_test_Hispanic.nunique() == 2

if len(y_test_Hispanic) == 0:
    # Nothing to score: the models cannot predict on zero rows
    print("No rows in the Hispanic test set; skipping evaluation.")
else:
    for name, model in models.items():

        # Hard predictions, plus positive-class scores computed once for both area metrics
        y_pred = model.predict(X_test_Hispanic)
        y_score = model.predict_proba(X_test_Hispanic)[:, 1]

        # Report NaN instead of failing when the AUC cannot be computed
        if subgroup_has_both_classes:
            roc_area = roc_auc_score(y_test_Hispanic, y_score)
        else:
            roc_area = float('nan')

        results[name] = {
            'Confusion Matrix': confusion_matrix(y_test_Hispanic, y_pred),
            'Precision': precision_score(y_test_Hispanic, y_pred),
            'Recall': recall_score(y_test_Hispanic, y_pred),
            'F-Measure': f1_score(y_test_Hispanic, y_pred),
            'MCC': matthews_corrcoef(y_test_Hispanic, y_pred),
            'ROC Area': roc_area,
            'PRC Area': average_precision_score(y_test_Hispanic, y_score)
        }

# Display results
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    print("\n")

In [None]:
# Test models on "Other" test set

# Define models
models = {
    'Logistic Regression (Unbalanced)': unbal_logreg,
    'Random Forest (Unbalanced)': unbal_rf,
    'Naive Bayes (Unbalanced)': unbal_nb,
    'XGBoost (Unbalanced)': unbal_xgb
}

# Metrics collection
results = {}

# ROC AUC is undefined when only one outcome class is present in the subgroup
subgroup_has_both_classes = y_test_Other.nunique() == 2

if len(y_test_Other) == 0:
    # Nothing to score: the models cannot predict on zero rows
    print("No rows in the Other test set; skipping evaluation.")
else:
    for name, model in models.items():

        # Hard predictions, plus positive-class scores computed once for both area metrics
        y_pred = model.predict(X_test_Other)
        y_score = model.predict_proba(X_test_Other)[:, 1]

        # Report NaN instead of failing when the AUC cannot be computed
        if subgroup_has_both_classes:
            roc_area = roc_auc_score(y_test_Other, y_score)
        else:
            roc_area = float('nan')

        results[name] = {
            'Confusion Matrix': confusion_matrix(y_test_Other, y_pred),
            'Precision': precision_score(y_test_Other, y_pred),
            'Recall': recall_score(y_test_Other, y_pred),
            'F-Measure': f1_score(y_test_Other, y_pred),
            'MCC': matthews_corrcoef(y_test_Other, y_pred),
            'ROC Area': roc_area,
            'PRC Area': average_precision_score(y_test_Other, y_score)
        }

# Display results
for name, metrics in results.items():
    print(f"Model: {name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    print("\n")

In [None]:
# Fitted models to compare, keyed by the name shown in the legend
models = {
    'Logistic Regression': unbal_logreg,
    'Random Forest': unbal_rf,
    'Naive Bayes': unbal_nb,
    'XGBoost': unbal_xgb,
}

# Subgroup test sets, each stored as [predictors, outcome]
rm = {
    'Non-Hispanic White' : [X_test_NHWhite, y_test_NHWhite],
    'Non-Hispanic Black' : [X_test_NHBlack, y_test_NHBlack],
    'Hispanic' : [X_test_Hispanic, y_test_Hispanic],
    'Other Ethnicity-Race' : [X_test_Other, y_test_Other]
}

fig, ax = plt.subplots(figsize=(10, 8))

# One ROC curve for every (model, subgroup) pair
for name, model in models.items():
    for rm_name, (rm_X, rm_y) in rm.items():
        positive_scores = model.predict_proba(rm_X)[:, 1]
        fpr, tpr, _ = roc_curve(rm_y, positive_scores)
        ax.plot(fpr, tpr, lw=2, label=f'{rm_name}: {name}  (area = {auc(fpr, tpr):.2f})')

# Diagonal reference line
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate (1 - Specificity)')
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

In [None]:
plt.figure(figsize=(10, 8))

# Precision-recall curve and its summary area for every (model, subgroup) pair
for name, model in models.items():
    for rm_name, rm_test_set in rm.items():
        positive_scores = model.predict_proba(rm_test_set[0])[:, 1]
        # Loop-local names, so the scalar `precision` / `recall` metrics set by
        # the evaluation cells are not overwritten with arrays
        pr_precision, pr_recall, _ = precision_recall_curve(rm_test_set[1], positive_scores)
        # Use average precision for the legend so it agrees with the 'PRC Area'
        # value the subgroup evaluation cells report. The trapezoidal
        # auc(recall, precision) used before interpolates linearly between
        # points and gives a different number.
        auprc = average_precision_score(rm_test_set[1], positive_scores)
        plt.plot(pr_recall, pr_precision, lw=2, label=f'{rm_name}: {name} (area = {auprc:.2f})')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall (Sensitivity)')
plt.ylabel('Precision (Positive Predictive Value)')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower right") 
plt.show()