In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report,make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso


In [2]:
data = pd.read_csv("../processed_data/merged_data.csv")

In [3]:
list_bp = ['avg_dbp', 'avg_diff', 'avg_sbp', 'max_sbp']
list_ed = ['age', 'sex', 'language', 'insurance_type', 'primary_care', 
            'ed_name', 'bpa_response', 'htn_on_pl', 'htn_on_pmh', 
            'hld_on_pl', 'hld_on_pmh', 'family_dm', 'tobacco_user', 
            'htn_meds', 'statin_meds', 'disposition', 'detailed_race', 
            'weight', 'bmi', 'hba1c', 'height', 'sbp_1st', 'dbp_1st', 
            'poct_gluc']
list_lab = ['max_value_GLUCOSE', 'avg_value_GLUCOSE', 'max_value_CREATININE', 
            'min_value_CREATININE', 'min_value_GLUCOSE',  'avg_value_CREATININE', 
            'avg_value_HEMOGLOBIN A1C', 'max_value_HEMOGLOBIN A1C', 'min_value_HEMOGLOBIN A1C',  
            'min_value_GLUCOSE, POC', 'avg_value_GLUCOSE, POC', 'max_value_GLUCOSE, POC']
list_geo = [
    'po_box', 'homeless', 'total_pop', 'households', 'housing_units', 
    'p_children', 'p_elderly', 'p_adults', 'p_female', 'mdn_age', 
    'p_nhwhite', 'p_nhblack', 'p_hispanic', 'p_nhasian', 'p_other', 
    'p_moved', 'p_longcommute', 'p_marriednone', 'p_marriedkids', 
    'p_singlenone', 'p_malekids', 'p_femalekids', 'p_cohabitkids', 
    'p_nohsdeg', 'p_hsonly', 'p_somecollege', 'p_collegeplus', 
    'p_onlyenglish', 'p_spanishlimited', 'p_asianlimited', 'p_otherlimited', 
    'p_limitedall', 'p_notlimited', 'p_popbelow1fpl', 'p_popbelow2fpl', 
    'p_povmarriedfam', 'p_povmalefam', 'p_povfemalefam', 'hh_mdnincome', 
    'p_pubassist', 'p_foodstamps', 'p_assistorfood', 'p_unemployed', 
    'h_vacant', 'h_renter', 'h_occupants', 'h_novehicles', 'h_mdnrent', 
    'h_rentpercent', 'h_houseprice', 'p_private', 'p_medicare', 'p_medicaid', 
    'p_otherinsur', 'p_uninsured', 'h_nointernet', 'h_nocomputer', 
    'p_foreign', 'p_disabled']
list_visit = ['visit_type']

In [4]:
lists = list_bp+ list_ed+ list_lab+ list_geo+ list_visit
X_all = data[lists]
y = data['pcp_followup'].map({'Yes': 1, 'No': 0})
y = np.array(y).astype(int)

In [5]:
# encode

numeric_cols = X_all.select_dtypes(include=['number']).columns
categorical_cols = X_all.select_dtypes(exclude=['number']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

X_preprocessed = preprocessor.fit_transform(X_all)
if hasattr(X_preprocessed, "toarray"):
    X_preprocessed = X_preprocessed.toarray()


numeric_feature_names = numeric_cols 
cat_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
all_feature_names = list(numeric_feature_names) + list(cat_feature_names)

In [6]:
'''
# selected by importance

X_train_val, X_test, y_train_val, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=50)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=50)

lasso = Lasso(alpha=1e-3,
              max_iter=1035,
              tol=1e-10,
              random_state=50,
              selection='cyclic',)
lasso.fit(X_train, y_train)

coef = lasso.coef_

feature_importance = pd.DataFrame({
    'feature': all_feature_names,
    'importance': coef
})

feature_importance = feature_importance.sort_values('importance', ascending=False)

selected_feature_names = feature_importance[feature_importance['importance'] < 0]
selected_feature_names = selected_feature_names.tail(20)
X_train = pd.DataFrame(X_train, columns=all_feature_names)[selected_feature_names['feature']]
X_val = pd.DataFrame(X_val, columns=all_feature_names)[selected_feature_names['feature']]
X_test= pd.DataFrame(X_test, columns=all_feature_names)[selected_feature_names['feature']]'''

"\n# selected by importance\n\nX_train_val, X_test, y_train_val, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=50)\nX_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=50)\n\nlasso = Lasso(alpha=1e-3,\n              max_iter=1035,\n              tol=1e-10,\n              random_state=50,\n              selection='cyclic',)\nlasso.fit(X_train, y_train)\n\ncoef = lasso.coef_\n\nfeature_importance = pd.DataFrame({\n    'feature': all_feature_names,\n    'importance': coef\n})\n\nfeature_importance = feature_importance.sort_values('importance', ascending=False)\n\nselected_feature_names = feature_importance[feature_importance['importance'] < 0]\nselected_feature_names = selected_feature_names.tail(20)\nX_train = pd.DataFrame(X_train, columns=all_feature_names)[selected_feature_names['feature']]\nX_val = pd.DataFrame(X_val, columns=all_feature_names)[selected_feature_names['feature']]\nX_test= pd.DataFrame(X_test

In [7]:
# select features

selected_feature_names  = ['insurance_type_MEDICARE', 'insurance_type_SELFPAY', 'insurance_type_MEDICAID',
                     'detailed_race_Hispanic', 'detailed_race_Other',
                     'language_English', 'language_Other', 
                     'p_longcommute']
X_preprocessed = pd.DataFrame(X_preprocessed, columns=all_feature_names)
# split sets 

X_train_val, X_test, y_train_val, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=50)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=50)


In [8]:
# logistic regression


In [9]:
# GridSearch for hyperparameter tuning
from sklearn.model_selection import StratifiedKFold

param_grid = {
    'penalty': ['l1', 'l2'],  
    'C': [0.001, 0.01, 0.1, 1, 10, 100], 
    'max_iter': [100, 200, 500, 1000],
    'class_weight': [None, 'balanced']
}

logistic = LogisticRegression(solver='liblinear', random_state=50)

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=50)

grid_search = GridSearchCV(logistic, param_grid, cv=cv_strategy, scoring='balanced_accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)


Best parameters: {'C': 0.001, 'class_weight': 'balanced', 'max_iter': 100, 'penalty': 'l2'}


In [10]:
logistic = LogisticRegression(
    C=0.001, 
    class_weight='balanced', 
    max_iter=100, 
    penalty ='l2', 
    solver='liblinear', 
    random_state=50)

In [11]:
logistic.fit(X_train, y_train)


In [12]:
y_val_pred = logistic.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print(classification_report(y_val, y_val_pred))


Accuracy: 0.5854
Precision: 0.7727
Recall: 0.5862
F1 Score: 0.6667
Confusion Matrix:
[[ 7  5]
 [12 17]]
              precision    recall  f1-score   support

           0       0.37      0.58      0.45        12
           1       0.77      0.59      0.67        29

    accuracy                           0.59        41
   macro avg       0.57      0.58      0.56        41
weighted avg       0.65      0.59      0.60        41



In [13]:
y_test_pred = logistic.predict(X_test)

accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print('Confusion Matrix:')
print(conf_matrix)
print(classification_report(y_test, y_test_pred))


Accuracy: 0.5882
Precision: 0.8000
Recall: 0.6154
F1 Score: 0.6957
Confusion Matrix:
[[ 6  6]
 [15 24]]
              precision    recall  f1-score   support

           0       0.29      0.50      0.36        12
           1       0.80      0.62      0.70        39

    accuracy                           0.59        51
   macro avg       0.54      0.56      0.53        51
weighted avg       0.68      0.59      0.62        51

