<a href="https://colab.research.google.com/github/Vyshnavi2k5/2k25/blob/main/Untitled31.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ================= ENSEMBLE LEARNING FOR HEALTHCARE DATA =================
import pandas as pd
import numpy as np
import zipfile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
    VotingClassifier,
    StackingClassifier
)
from xgboost import XGBClassifier

# ========== LOAD DATA ==========
# Read the dataset straight out of the zip archive into a DataFrame.
zip_path = "/content/Healthcare dataset.zip"   # Update path if needed
with zipfile.ZipFile(zip_path, "r") as z:
    # Do not trust member order: archives may start with a directory entry or
    # macOS resource-fork metadata, neither of which is the dataset itself.
    csv_members = [
        member
        for member in z.namelist()
        if member.lower().endswith(".csv") and not member.startswith("__MACOSX/")
    ]
    if not csv_members:
        raise FileNotFoundError(f"No CSV file found inside {zip_path!r}")
    file_name = csv_members[0]  # first real CSV inside the zip
    # Close the member handle as soon as pandas has consumed it.
    with z.open(file_name) as csv_file:
        df = pd.read_csv(csv_file)

print("Dataset Shape:", df.shape)
print("Columns:", df.columns.tolist())

# ========== SELECT TARGET ==========
# Example: predict "Medical Condition"
target = "Medical Condition"
if df[target].isna().any():
    # factorize() would silently map missing labels to -1, which is not a class.
    raise ValueError(f"Target column {target!r} contains missing values")

X = df.drop(columns=[target])

# XGBClassifier is also fitted directly (the "Boosting_XGB" entry), and it
# expects class labels encoded as integers 0..n_classes-1 rather than raw
# strings. Encode once here so every model trains on the same label space.
# sort=True keeps the code order aligned with the alphabetical class order.
label_codes, class_names = pd.factorize(df[target], sort=True)
y = pd.Series(label_codes, index=df.index, name=target)

# Reports downstream show the integer codes; print the mapping to decode them.
print("Class encoding:", dict(enumerate(class_names)))

# ========== TRAIN/TEST SPLIT ==========
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target and seeded so repeated runs produce the same partition.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ========== PREPROCESSING ==========
# Numeric columns are standardised; low-cardinality text columns are one-hot
# encoded. Columns not listed in a transformer are dropped by ColumnTransformer.

# Upper bound on distinct values for a text column to be one-hot encoded.
# Identifier-like columns (presumably "Name", "Doctor", "Hospital" and the
# date strings in this dataset - verify) would otherwise create one dense
# output column per distinct value and exhaust memory.
MAX_ONEHOT_CATEGORIES = 50

num_cols = X.select_dtypes(include=["int64", "float64"]).columns
text_cols = X.select_dtypes(include=["object"]).columns

# Cardinality is measured on the training rows only.
category_counts = X_train[text_cols].nunique()
cat_cols = category_counts[category_counts <= MAX_ONEHOT_CATEGORIES].index
skipped_cols = category_counts[category_counts > MAX_ONEHOT_CATEGORIES].index.tolist()
if skipped_cols:
    print("Skipping high-cardinality columns:", skipped_cols)

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    # handle_unknown="ignore": categories unseen during fit encode as all zeros.
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

# ========== BASE MODELS ==========
# One shared seed keeps every stochastic learner reproducible.
RANDOM_STATE = 42

log_reg = LogisticRegression(max_iter=200)
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
ada = AdaBoostClassifier(random_state=RANDOM_STATE)
et = ExtraTreesClassifier(random_state=RANDOM_STATE)
# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# argument that current releases ignore with an "unused parameter" warning.
xgb = XGBClassifier(eval_metric="mlogloss", random_state=RANDOM_STATE)

# ========== ENSEMBLE TECHNIQUES ==========
# Member line-ups for the composite ensembles.
voting_members = [("lr", log_reg), ("rf", rf), ("xgb", xgb)]
stacking_members = [("rf", rf), ("gb", gb), ("xgb", xgb)]

# Display name -> estimator; insertion order is the evaluation order.
ensembles = dict(
    # Majority vote over predicted labels.
    Voting_Hard=VotingClassifier(estimators=list(voting_members), voting="hard"),
    # Vote over averaged predicted probabilities.
    Voting_Soft=VotingClassifier(estimators=list(voting_members), voting="soft"),
    # Stand-alone bagging-style and boosting-style learners.
    Bagging_RF=rf,
    Bagging_ET=et,
    Boosting_GB=gb,
    Boosting_Ada=ada,
    Boosting_XGB=xgb,
    # Logistic regression meta-learner; passthrough=True also hands it the
    # original features alongside the base-model predictions.
    Stacking=StackingClassifier(
        estimators=list(stacking_members),
        final_estimator=LogisticRegression(),
        passthrough=True,
    ),
)

# ========== TRAIN & EVALUATE ==========
# Fit every candidate behind the shared preprocessor, then score it on the
# held-out split and print its accuracy and per-class report.
for name, model in ensembles.items():
    clf = Pipeline([("preprocessor", preprocessor), ("model", model)])
    # fit() returns the pipeline itself, so prediction can be chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    print(f"\n===== {name} =====")
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    report = classification_report(y_test, y_pred)
    print(report[:600])  # shortened output


Dataset Shape: (55500, 15)
Columns: ['Name', 'Age', 'Gender', 'Blood Type', 'Medical Condition', 'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider', 'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date', 'Medication', 'Test Results']
