In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV,train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report,make_scorer,precision_recall_curve,average_precision_score
from sklearn.linear_model import LogisticRegression, Lasso
import xgboost as xgb



In [2]:
# load the data

data = pd.read_csv("../processed_data/merged_data.csv")

list_bp = ['avg_dbp', 'avg_diff', 'avg_sbp', 'max_sbp']
list_ed = ['age', 'sex', 'language', 'insurance_type', 'primary_care', 
            'ed_name', 'bpa_response', 'htn_on_pl', 'htn_on_pmh', 
            'hld_on_pl', 'hld_on_pmh', 'family_dm', 'tobacco_user', 
            'htn_meds', 'statin_meds', 'disposition', 'detailed_race', 
            'weight', 'bmi', 'hba1c', 'height', 'sbp_1st', 'dbp_1st', 
            'poct_gluc']
list_lab = ['max_value_GLUCOSE', 'avg_value_GLUCOSE', 'max_value_CREATININE', 
            'min_value_CREATININE', 'min_value_GLUCOSE',  'avg_value_CREATININE', 
            'avg_value_HEMOGLOBIN A1C', 'max_value_HEMOGLOBIN A1C', 'min_value_HEMOGLOBIN A1C',  
            'min_value_GLUCOSE, POC', 'avg_value_GLUCOSE, POC', 'max_value_GLUCOSE, POC']
list_geo = [
    'po_box', 'homeless', 'total_pop', 'households', 'housing_units', 
    'p_children', 'p_elderly', 'p_adults', 'p_female', 'mdn_age', 
    'p_nhwhite', 'p_nhblack', 'p_hispanic', 'p_nhasian', 'p_other', 
    'p_moved', 'p_longcommute', 'p_marriednone', 'p_marriedkids', 
    'p_singlenone', 'p_malekids', 'p_femalekids', 'p_cohabitkids', 
    'p_nohsdeg', 'p_hsonly', 'p_somecollege', 'p_collegeplus', 
    'p_onlyenglish', 'p_spanishlimited', 'p_asianlimited', 'p_otherlimited', 
    'p_limitedall', 'p_notlimited', 'p_popbelow1fpl', 'p_popbelow2fpl', 
    'p_povmarriedfam', 'p_povmalefam', 'p_povfemalefam', 'hh_mdnincome', 
    'p_pubassist', 'p_foodstamps', 'p_assistorfood', 'p_unemployed', 
    'h_vacant', 'h_renter', 'h_occupants', 'h_novehicles', 'h_mdnrent', 
    'h_rentpercent', 'h_houseprice', 'p_private', 'p_medicare', 'p_medicaid', 
    'p_otherinsur', 'p_uninsured', 'h_nointernet', 'h_nocomputer', 
    'p_foreign', 'p_disabled']
list_visit = ['visit_type']


lists = list_bp+ list_ed+ list_lab+ list_geo+ list_visit
X_all = data[lists]
y = data['pcp_followup'].map({'Yes': 1, 'No': 0})
y = np.array(y).astype(int)

In [3]:
# encode

numeric_cols = X_all.select_dtypes(include=['number']).columns
categorical_cols = X_all.select_dtypes(exclude=['number']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

X_preprocessed = preprocessor.fit_transform(X_all)
if hasattr(X_preprocessed, "toarray"):
    X_preprocessed = X_preprocessed.toarray()


numeric_feature_names = numeric_cols 
cat_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
all_feature_names = list(numeric_feature_names) + list(cat_feature_names)

In [4]:
# selected by importance
# select tail 10 features
X_train_val, X_test, y_train_val, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=50)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=50)

lasso = Lasso(alpha=1e-3,
              max_iter=1035,
              tol=1e-10,
              random_state=50,
              selection='cyclic',)
lasso.fit(X_train, y_train)

coef = lasso.coef_

feature_importance = pd.DataFrame({
    'feature': all_feature_names,
    'importance': coef
})

feature_importance = feature_importance.sort_values('importance', ascending=False)

selected_feature_names = feature_importance[feature_importance['importance'] < 0]
selected_feature_names = selected_feature_names.tail(10)
X_train = pd.DataFrame(X_train, columns=all_feature_names)[selected_feature_names['feature']]
X_val = pd.DataFrame(X_val, columns=all_feature_names)[selected_feature_names['feature']]
X_test= pd.DataFrame(X_test, columns=all_feature_names)[selected_feature_names['feature']]

In [5]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

In [6]:
from collections import Counter

class_counts = Counter(y_train) 
negative_class_count = class_counts[0]
positive_class_count = class_counts[1]
class_weight_ratio = negative_class_count / positive_class_count

In [7]:
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',  
    eval_metric='logloss',
    early_stopping_rounds=100,
    scale_pos_weight=class_weight_ratio
)

In [13]:
param_grid = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'learning_rate': [0.1, 0.3, 0.5],
    'n_estimators': [50, 100, 200, 500],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.3],
    'min_child_weight': [1, 5, 10],
    'max_delta_step': [0, 1, 5],
    'lambda': [0, 0.1, 1, 10],
    'alpha': [0, 0.1, 1, 10]
    }

In [14]:
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='accuracy',  
    cv=5,
    verbose=1,
    n_jobs=-1
)

In [15]:
grid_search.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

Fitting 5 folds for each of 663552 candidates, totalling 3317760 fits


In [16]:
print(grid_search.best_params_)
best_model = grid_search.best_estimator_


{'alpha': 0, 'colsample_bytree': 0.7, 'gamma': 0, 'lambda': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 50, 'subsample': 0.7}


In [17]:
y_pred = best_model.predict(X_test)

y_prob = best_model.predict_proba(X_test)[:, 1]  
threshold = 0.5
y_pred = (y_prob >= threshold).astype(int)

accuracy = accuracy_score(y_test, y_pred)
print(f"accuracy: {accuracy:.2f}")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy: 0.59
[[ 4  8]
 [13 26]]
              precision    recall  f1-score   support

           0       0.24      0.33      0.28        12
           1       0.76      0.67      0.71        39

    accuracy                           0.59        51
   macro avg       0.50      0.50      0.49        51
weighted avg       0.64      0.59      0.61        51

