In [2]:
import pandas as pd

In [3]:
# Use forward slashes: the backslash form contained the invalid escape "\h"
# (a SyntaxWarning on recent Python versions) and only resolved on Windows.
file_path = 'data/hospital_knnimputer.csv'
df = pd.read_csv(file_path)

In [4]:
# Show the distinct raw labels present in the target column.
print(pd.unique(df['readmitted']))

['no' 'yes']


In [5]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE

In [6]:
# Normalise whitespace and case, then encode the target as 1 ('yes') / 0 ('no').
# Any label outside this mapping becomes NaN.
df['readmitted'] = (
    df['readmitted']
    .str.strip()
    .str.lower()
    .map({'yes': 1, 'no': 0})
)

In [7]:
# Columns that are standardised with StandardScaler.
# NOTE(review): 'n_outpatient' is used in the interaction features further
# down but is not listed here — confirm whether that omission is intentional.
numerical_features = [
    'n_inpatient',
    'n_emergency',
    'time_in_hospital',
    'n_procedures',
    'n_lab_procedures',
    'n_medications',
]

# Columns that are one-hot encoded.
categorical_features = [
    'age',
    'diag_1',
    'diag_2',
    'diag_3',
    'medical_specialty',
    'change',
    'diabetes_med',
    'glucose_test',
    'A1Ctest',
]

In [8]:

# Target vector as a plain NumPy array.
y = df['readmitted'].to_numpy()

In [9]:
# One-hot encode the categorical columns into a dense array, dropping the first
# level of each feature.
# NOTE(review): the encoder is fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook — confirm this is acceptable.
encoder = OneHotEncoder(sparse_output=False, drop='first')
X_cat = encoder.fit(df[categorical_features]).transform(df[categorical_features])

In [10]:
# Standardise the numeric columns (zero mean, unit variance).
scaler = StandardScaler().fit(df[numerical_features])
X_num = scaler.transform(df[numerical_features])

In [11]:
# Hand-crafted features, one column each (unscaled at this point):
#   0: inpatient visits x length of stay
#   1: lab procedures + other procedures
#   2: medications per day of stay (+1 in the denominator avoids division by zero)
#   3: inpatient + outpatient + emergency visit counts
X_interactions = np.column_stack((
    df['n_inpatient'].mul(df['time_in_hospital']),
    df['n_lab_procedures'].add(df['n_procedures']),
    df['n_medications'].div(df['time_in_hospital'].add(1)),
    df['n_inpatient'].add(df['n_outpatient']).add(df['n_emergency']),
))

In [12]:
# Standardise the interaction features with their own throw-away scaler.
X_interactions_scaled = StandardScaler().fit(X_interactions).transform(X_interactions)

In [14]:
# Final design matrix: scaled numerics | one-hot categoricals | scaled interactions.
X_full = np.concatenate((X_num, X_cat, X_interactions_scaled), axis=1)

In [15]:
smote = SMOTE(random_state=42)

In [16]:
X_res, y_res = smote.fit_resample(X_full, y)



In [17]:
from sklearn.model_selection import train_test_split

In [20]:

X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
# Throw-away forest used only to rank features by impurity-based importance.
rf_temp = RandomForestClassifier(random_state=42, n_estimators=300)
rf_temp.fit(X=X_train, y=y_train)

In [None]:
# Per-feature importances and the column order from most to least important.
importances = rf_temp.feature_importances_
indices = np.flip(np.argsort(importances))

In [None]:
# Running total of importance over the ranked features.
cumulative_importance = np.cumsum(importances[indices])
# Keep the leading features whose cumulative importance stays within 85%.
# The cumulative sum is non-decreasing, so the qualifying features form a prefix
# of `indices`. Always keep at least the top feature: if it alone exceeded the
# threshold, the selection would otherwise be empty and every downstream
# matrix would have zero columns.
n_selected = max(1, int(np.count_nonzero(cumulative_importance <= 0.85)))
top_features_idx = indices[:n_selected]

In [None]:
# Restrict both splits to the selected feature columns (same order in each).
X_train_sel = np.take(X_train, top_features_idx, axis=1)
X_test_sel = np.take(X_test, top_features_idx, axis=1)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
import xgboost as xgb

In [27]:
# %pip installs into the environment of the running kernel (a bare `!pip` may
# target a different interpreter). The version is pinned to the one recorded in
# this notebook's output so that re-runs are reproducible.
%pip install -q catboost==1.2.8

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
from catboost import CatBoostClassifier

In [31]:
from lightgbm import LGBMClassifier

In [None]:
# NOTE(review): this cell looks like a truncated paste. The lines after the
# first statement are indented without any enclosing block, so the cell cannot
# run as written, and none of RocCurveDisplay, plt, fpr, tpr, roc_auc, roc_disp
# or y_true is defined or imported in the visible cells. The display object is
# bound to `p` while `roc_disp` is plotted — presumably the assignment target
# was cut off; confirm against the original source.
p = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
    roc_disp.plot(color='darkorange') # Change ROC curve color
    plt.title("ROC Curve (Last Fold)")
    plt.show()

    # Error Rate Visualization (Example: Simple bar plot of misclassification)
    # Fraction of samples whose prediction differs from the true label.
    errors = y_true != y_pred
    error_rate = errors.mean()
    print(f"\n✅ Error Rate (Last Fold): {error_rate:.4f}")

    # A more detailed error analysis could involve plotting specific types of errors (false positives, false negatives)


# Main execution
# NOTE(review): load_data, perform_feature_engineering, preprocess_data,
# train_and_evaluate_model and visualize_results are not defined in the visible
# cells. Running this would also rebind `file_path`, `df` and `y`, which the
# earlier cells of this notebook already use.
file_path = '/content/hospital_readmissions.csv'
target_column = 'readmitted'

# 1. Load Data
df = load_data(file_path)

# 2. Feature Engineering
df = perform_feature_engineering(df)

# 3. Preprocessing
X, y, numerical_cols = preprocess_data(df, target_column)

# 4. Train and Evaluate Model
avg_f1, avg_roc_auc, avg_acc, last_fold_results = train_and_evaluate_model(X, y, numerical_cols)

# 5. Visualization (Last fold)
visualize_results(last_fold_results['y_true'], last_fold_results['y_pred'], last_fold_results['y_probs'])

In [None]:
# Disabled experiment: the XGBoost configuration is wrapped in a string literal,
# so no model is created when this cell runs.
'''xgb_model = xgb.XGBClassifier(
    n_estimators=500, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, use_label_encoder=False,
    eval_metric='logloss', random_state=42
)'''

In [None]:
# Disabled experiment: the CatBoost configuration is wrapped in a string literal,
# so no model is created when this cell runs.
'''cat_model = CatBoostClassifier(
    iterations=500, depth=6, learning_rate=0.05,
    verbose=0, random_state=42
)'''

In [None]:
# Disabled experiment: the LightGBM configuration is wrapped in a string literal,
# so no model is created when this cell runs.
'''lgb_model = LGBMClassifier(
    n_estimators=500, max_depth=6, learning_rate=0.05, random_state=42
)'''

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
# GridSearchCV is used below but is not imported in any visible cell, so this
# cell raised NameError after Restart & Run All.
from sklearn.model_selection import GridSearchCV

# NOTE(review): the base learners are fitted/tuned on the four unscaled
# interaction features of the full dataset (X_interactions, y), whereas the
# stacking model is later trained on X_train_sel / y_train — confirm that
# tuning on a different feature set is intended.

# Logistic regression baseline.
log_clf = LogisticRegression(solver='liblinear', max_iter=2000, random_state=42)
log_clf.fit(X_interactions, y)

# Random forest: small grid over tree depth, 3-fold CV scored by F1.
rf_clf = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(rf_clf, {'n_estimators': [100], 'max_depth': [10, 20]}, cv=3, scoring='f1', n_jobs=-1)
grid_rf.fit(X_interactions, y)
best_rf = grid_rf.best_estimator_

# AdaBoost: small grid over the learning rate, 3-fold CV scored by F1.
ada_clf = AdaBoostClassifier(random_state=42)
grid_ada = GridSearchCV(ada_clf, {'n_estimators': [100], 'learning_rate': [0.1, 0.5]}, cv=3, scoring='f1', n_jobs=-1)
grid_ada.fit(X_interactions, y)
best_ada = grid_ada.best_estimator_

print(best_ada,best_rf)

AdaBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=42) RandomForestClassifier(max_depth=10, random_state=42)


In [34]:
# Fresh, unfitted base learners configured with the hyper-parameters printed by
# the grid searches.
rf_clf = RandomForestClassifier(random_state=42, max_depth=10)
ada_clf = AdaBoostClassifier(random_state=42, n_estimators=100, learning_rate=0.1)

In [35]:
# Stacked ensemble: three base learners whose 5-fold cross-validated predictions
# feed a logistic-regression meta-learner.
stack_model = StackingClassifier(
    cv=5,
    n_jobs=-1,
    final_estimator=LogisticRegression(max_iter=2000),
    estimators=[
        ('logistic', log_clf),
        ('adab', ada_clf),
        ('ranfo', rf_clf),
    ],
)

In [36]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [37]:
# Train the stacked ensemble on the selected training features.
stack_model.fit(X=X_train_sel, y=y_train)

In [38]:
# Positive-class probabilities (second column) and hard labels for the test set.
y_prob = stack_model.predict_proba(X_test_sel)[:, 1]
y_pred = stack_model.predict(X_test_sel)

In [39]:
# Report test-set metrics rounded to four decimals; ROC-AUC uses the predicted
# probabilities, the others use the hard labels.
for label, score in (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("F1-Score:", f1_score(y_test, y_pred)),
    ("Precision:", precision_score(y_test, y_pred)),
    ("Recall:", recall_score(y_test, y_pred)),
    ("ROC-AUC:", roc_auc_score(y_test, y_prob)),
):
    print(label, round(score, 4))

Accuracy: 0.6275
F1-Score: 0.6122
Precision: 0.6383
Recall: 0.5881
ROC-AUC: 0.6775
