In [136]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report,make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso


In [137]:
data = pd.read_csv("../processed_data/merged_data.csv")

In [138]:
list_bp = ['avg_dbp', 'avg_diff', 'avg_sbp', 'max_sbp']
list_ed = ['age', 'sex', 'language', 'insurance_type', 'primary_care', 
            'ed_name', 'bpa_response', 'htn_on_pl', 'htn_on_pmh', 
            'hld_on_pl', 'hld_on_pmh', 'family_dm', 'tobacco_user', 
            'htn_meds', 'statin_meds', 'disposition', 'detailed_race', 
            'weight', 'bmi', 'hba1c', 'height', 'sbp_1st', 'dbp_1st', 
            'poct_gluc']
list_lab = ['max_value_GLUCOSE', 'avg_value_GLUCOSE', 'max_value_CREATININE', 
            'min_value_CREATININE', 'min_value_GLUCOSE',  'avg_value_CREATININE', 
            'avg_value_HEMOGLOBIN A1C', 'max_value_HEMOGLOBIN A1C', 'min_value_HEMOGLOBIN A1C',  
            'min_value_GLUCOSE, POC', 'avg_value_GLUCOSE, POC', 'max_value_GLUCOSE, POC']
list_geo = [
    'po_box', 'homeless', 'total_pop', 'households', 'housing_units', 
    'p_children', 'p_elderly', 'p_adults', 'p_female', 'mdn_age', 
    'p_nhwhite', 'p_nhblack', 'p_hispanic', 'p_nhasian', 'p_other', 
    'p_moved', 'p_longcommute', 'p_marriednone', 'p_marriedkids', 
    'p_singlenone', 'p_malekids', 'p_femalekids', 'p_cohabitkids', 
    'p_nohsdeg', 'p_hsonly', 'p_somecollege', 'p_collegeplus', 
    'p_onlyenglish', 'p_spanishlimited', 'p_asianlimited', 'p_otherlimited', 
    'p_limitedall', 'p_notlimited', 'p_popbelow1fpl', 'p_popbelow2fpl', 
    'p_povmarriedfam', 'p_povmalefam', 'p_povfemalefam', 'hh_mdnincome', 
    'p_pubassist', 'p_foodstamps', 'p_assistorfood', 'p_unemployed', 
    'h_vacant', 'h_renter', 'h_occupants', 'h_novehicles', 'h_mdnrent', 
    'h_rentpercent', 'h_houseprice', 'p_private', 'p_medicare', 'p_medicaid', 
    'p_otherinsur', 'p_uninsured', 'h_nointernet', 'h_nocomputer', 
    'p_foreign', 'p_disabled']
list_visit = ['visit_type']

In [139]:
lists = list_bp+ list_ed+ list_lab+ list_geo+ list_visit
X_all = data[lists]
y = data['pcp_followup'].map({'Yes': 1, 'No': 0})
y = np.array(y).astype(int)

In [140]:
# encode

numeric_cols = X_all.select_dtypes(include=['number']).columns
categorical_cols = X_all.select_dtypes(exclude=['number']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

X_preprocessed = preprocessor.fit_transform(X_all)
if hasattr(X_preprocessed, "toarray"):
    X_preprocessed = X_preprocessed.toarray()


numeric_feature_names = numeric_cols 
cat_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
all_feature_names = list(numeric_feature_names) + list(cat_feature_names)

In [141]:
# selected by importance

X_train_val, X_test, y_train_val, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=50)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=50)

lasso = Lasso(alpha=1e-3,
              max_iter=1035,
              tol=1e-10,
              random_state=50,
              selection='cyclic',)
lasso.fit(X_train, y_train)

coef = lasso.coef_

feature_importance = pd.DataFrame({
    'feature': all_feature_names,
    'importance': coef
})

feature_importance = feature_importance.sort_values('importance', ascending=False)

selected_feature_names = feature_importance[feature_importance['importance'] < 0]
selected_feature_names = selected_feature_names.tail(20)
X_train = pd.DataFrame(X_train, columns=all_feature_names)[selected_feature_names['feature']]
X_val = pd.DataFrame(X_val, columns=all_feature_names)[selected_feature_names['feature']]
X_test= pd.DataFrame(X_test, columns=all_feature_names)[selected_feature_names['feature']]

In [142]:
'''# select features

selected_feature_names  = ['insurance_type_MEDICARE', 'insurance_type_SELFPAY', 'insurance_type_MEDICAID',
                     'detailed_race_Hispanic', 'detailed_race_Other',
                     'language_English', 'language_Other', 
                     'p_longcommute']
X_preprocessed = pd.DataFrame(X_preprocessed, columns=all_feature_names)
# split sets 

X_train_val, X_test, y_train_val, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=50)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=50)
'''

"# select features\n\nselected_feature_names  = ['insurance_type_MEDICARE', 'insurance_type_SELFPAY', 'insurance_type_MEDICAID',\n                     'detailed_race_Hispanic', 'detailed_race_Other',\n                     'language_English', 'language_Other', \n                     'p_longcommute']\nX_preprocessed = pd.DataFrame(X_preprocessed, columns=all_feature_names)\n# split sets \n\nX_train_val, X_test, y_train_val, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=50)\nX_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=50)\n"

In [143]:
# logistic regression


In [144]:
# GridSearch for hyperparameter tuning
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer,f1_score,recall_score


param_grid = {
    'tol': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0, 1, 10],
    'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0, 1, 10],
    'max_iter': [100, 200, 500, 1000],
    'class_weight': [None, 'balanced', {0: 3, 1: 1}, {0: 4, 1: 1}, {0: 5, 1: 1}],
    'solver':   ['newton-cg', 'lbfgs', 'liblinear'],
}

logistic = LogisticRegression(random_state=50)

scorer = make_scorer(recall_score, labels=[0])


cv_strategy = StratifiedKFold(
    n_splits=10, 
    shuffle=True, 
    random_state=50)

grid_search = GridSearchCV(
    logistic, 
    param_grid,
    cv=cv_strategy, 
    scoring=scorer, 
    n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)


Best parameters: {'C': 1e-05, 'class_weight': None, 'max_iter': 100, 'solver': 'newton-cg', 'tol': 1e-05}


In [145]:
logistic = LogisticRegression(
    tol=grid_search.best_params_['tol'],
    C=grid_search.best_params_['C'],
    class_weight=grid_search.best_params_['class_weight'],
    max_iter=grid_search.best_params_['max_iter'],
    solver=grid_search.best_params_['solver'],
    
    penalty ='l2', 
    random_state=50)

In [146]:
logistic.fit(X_train, y_train)


In [147]:
from sklearn.metrics import precision_recall_curve, roc_curve
y_proba = logistic.predict_proba(X_val)[:, 1]
precision, recall, thresholds = precision_recall_curve(
    y_val, 
    y_proba,
    pos_label=1)
f1_scores = 2 * (precision * recall) / (precision + recall)

best_index = np.argmax(f1_scores)
optimal_threshold = thresholds[best_index]
print("Optimal threshold:", optimal_threshold)

Optimal threshold: 0.8394312312194462


In [148]:
y_val_pred = np.where(y_proba > optimal_threshold, 1, 0)
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print(classification_report(y_val, y_val_pred))


Accuracy: 0.7561
Precision: 0.7714
Recall: 0.9310
F1 Score: 0.8438
Confusion Matrix:
[[ 4  8]
 [ 2 27]]
              precision    recall  f1-score   support

           0       0.67      0.33      0.44        12
           1       0.77      0.93      0.84        29

    accuracy                           0.76        41
   macro avg       0.72      0.63      0.64        41
weighted avg       0.74      0.76      0.73        41



In [149]:
y_proba = logistic.predict_proba(X_test)[:, 1]

y_test_pred = np.where(y_proba > optimal_threshold, 1, 0)
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print(classification_report(y_test, y_test_pred))


Accuracy: 0.7059
Precision: 0.7857
Recall: 0.8462
F1 Score: 0.8148
Confusion Matrix:
[[ 3  9]
 [ 6 33]]
              precision    recall  f1-score   support

           0       0.33      0.25      0.29        12
           1       0.79      0.85      0.81        39

    accuracy                           0.71        51
   macro avg       0.56      0.55      0.55        51
weighted avg       0.68      0.71      0.69        51

