In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


In [13]:
# Load the cleaned, feature-engineered patient table (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer='cleaned_featured_patient_data.csv')



In [15]:
# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,patient_id,age,gender,race,admission_date,discharge_date,primary_diagnosis,num_procedures,num_medications,comorbidities_count,...,insurance_type,followup_required,hospital_unit,readmitted_within_30d,discharge_notes,length_of_stay,had_prior_visits,high_medication_flag,diagnosis_group,high_risk_diagnosis
0,PID00001,71,female,asian,2023-11-24,2023-12-01,chf,1,7,4,...,medicare,True,oncology,0,Patient admitted for chf. Condition stable. Fo...,7,1,0,chronic,False
1,PID00002,34,male,white,2023-02-27,2023-03-01,pneumonia,5,12,0,...,medicaid,False,cardiology,1,Patient admitted for pneumonia. Condition crit...,2,1,0,acute,False
2,PID00003,80,male,white,2023-01-13,2023-01-22,diabetes,0,17,2,...,self-pay,False,er,1,Patient admitted for diabetes. Condition stabl...,9,1,1,chronic,False
3,PID00004,40,male,asian,2023-05-21,2023-05-25,hypertension,1,17,5,...,medicaid,True,icu,1,Patient admitted for hypertension. Condition s...,4,1,1,chronic,False
4,PID00005,43,male,asian,2023-05-06,2023-05-09,covid-19,4,10,4,...,private,False,er,0,Patient admitted for covid-19. Condition stabl...,3,1,0,acute,True


In [17]:
# Target: the 30-day readmission flag we are trying to predict.
y = df['readmitted_within_30d']

# Features: every column except the target itself, the free-text discharge
# notes and the patient identifier.
X = df.drop(['readmitted_within_30d', 'discharge_notes', 'patient_id'], axis='columns')


In [21]:
# Hold out 20% of the rows for evaluation. Stratifying on y preserves the
# class proportions in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [23]:
# Columns that will be one-hot encoded.
categorical_features = [
    'gender',
    'race',
    'primary_diagnosis',
    'discharge_disposition',
    'insurance_type',
    'hospital_unit',
    'diagnosis_group',
]

# Columns that will be standardised.
numerical_features = [
    'age',
    'num_procedures',
    'num_medications',
    'comorbidities_count',
    'prior_visits',
    'length_of_stay',
    'high_medication_flag',
]

# Route each column group through its own transformer. Columns of X that are
# in neither list are not passed on (ColumnTransformer's default remainder is
# 'drop'). handle_unknown='ignore' lets the encoder accept categories at
# predict time that were not present when it was fitted.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])


In [25]:
# Chain preprocessing and the classifier into one estimator so that the
# scaler/encoder are fitted together with the model on whatever data is
# passed to fit().
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])


In [27]:
# Fit the preprocessing steps and the logistic regression on the training split.
clf.fit(X=X_train, y=y_train)


In [29]:
# Evaluate the fitted pipeline on the held-out split.
# y_pred holds hard class labels; y_proba holds the predicted probability of
# the second entry in clf.classes_ (label 1 for this 0/1 target), which is
# what roc_auc_score expects as the score for the positive class.
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# sep="" stops print() from inserting its default space right after the "\n",
# which would otherwise shift the first line of each multi-line report by one
# column and misalign it with the rows below.
print("Classification Report:\n", classification_report(y_test, y_pred), sep="")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), sep="")
print("ROC AUC Score:", roc_auc_score(y_test, y_proba))


Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.60      0.61        97
           1       0.64      0.67      0.65       103

    accuracy                           0.64       200
   macro avg       0.63      0.63      0.63       200
weighted avg       0.63      0.64      0.63       200

Confusion Matrix:
 [[58 39]
 [34 69]]
ROC AUC Score: 0.6623961565408869
