In [None]:
# =====================================
# CIRRHOSIS PATIENT SURVIVAL PREDICTION
# Models: Logistic Regression, Decision Tree, Random Forest, SVM, KNN
# =====================================

# ✅ STEP 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# =====================================
# ✅ STEP 2: Load dataset
# =====================================
# Colab-only step: files.upload() shows a file picker and returns a dict keyed
# by the name of every uploaded file.
from google.colab import files
uploaded = files.upload()

# An empty result means the picker was dismissed. Fail with an actionable
# message instead of an IndexError from the filename lookup below.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select the CSV file.")

# The first uploaded file is used; upload exactly one CSV to avoid ambiguity.
csv_file = next(iter(uploaded))
df = pd.read_csv(csv_file)
print("Dataset shape:", df.shape)
# A bare `df.head()` is only rendered when it is the last expression of a
# cell, so print it explicitly to actually show the preview.
print(df.head())

# =====================================
# ✅ STEP 3: Detect target column
# =====================================
# The outcome column is expected to be called 'Status'. Match it
# case-insensitively (ignoring surrounding whitespace) so exports that name it
# 'status' or 'STATUS' still work, and fail early with the list of available
# columns when nothing matches instead of a bare KeyError further down.
_target_matches = [c for c in df.columns if str(c).strip().lower() == 'status']
if not _target_matches:
    raise KeyError(
        f"Target column 'Status' not found. Available columns: {list(df.columns)}"
    )

# Prefer the exact spelling when present; otherwise take the first match.
found = 'Status' if 'Status' in _target_matches else _target_matches[0]
print(f"Target column used: {found}")

# =====================================
# ✅ STEP 4: Prepare data
# =====================================
def _is_id_like(column_name):
    """Return True when a column name looks like a row/patient identifier.

    The name is split into words on separators and camelCase boundaries and
    whole words are compared. A plain substring test for 'id' also matches
    real features such as 'Spiders' or 'Tryglicerides'.
    """
    import re  # local import: only this helper needs it

    words = re.findall(r'[A-Z]+(?![a-z])|[A-Z]?[a-z]+|[0-9]+', str(column_name))
    words = [w.lower() for w in words]
    return 'id' in words or any(w.startswith('patient') for w in words)


# Rows without a target label cannot be used for supervised training.
# Left in place, the category encoding below would turn them into a spurious
# class with code -1.
_has_label = df[found].notna()
if not _has_label.all():
    print(f"Dropped {int((~_has_label).sum())} rows with missing target '{found}'")
y = df.loc[_has_label, found]
X = df.loc[_has_label].drop(columns=[found]).copy()

# Encode target (multiclass friendly).
# pandas' own dtype check is used because np.issubdtype raises TypeError on
# pandas extension dtypes (string, category). Booleans are still encoded.
if pd.api.types.is_bool_dtype(y) or not pd.api.types.is_numeric_dtype(y):
    _y_cat = y.astype('category')
    # Show which integer code stands for which label so the reports printed
    # later (which only show the codes) can be interpreted.
    print("Target encoding:", dict(enumerate(_y_cat.cat.categories)))
    y = _y_cat.cat.codes

# Drop ID-like columns
id_like = [c for c in X.columns if _is_id_like(c)]
if id_like:
    X = X.drop(columns=id_like)
    print("Dropped ID-like columns:", id_like)

# Measurements stored as text would otherwise be one-hot encoded value by
# value. Convert a text column to numeric when at least this share of its
# non-missing entries parse as numbers; entries that do not parse become NaN
# and are filled by the numeric imputer.
NUMERIC_TEXT_THRESHOLD = 0.9
_converted = []
for _col in list(X.columns):
    _values = X[_col]
    if not (pd.api.types.is_object_dtype(_values) or pd.api.types.is_string_dtype(_values)):
        continue
    _n_present = int(_values.notna().sum())
    _as_number = pd.to_numeric(_values, errors='coerce')
    if _n_present and _as_number.notna().sum() >= NUMERIC_TEXT_THRESHOLD * _n_present:
        X[_col] = _as_number
        _converted.append(_col)
if _converted:
    print("Converted text columns to numeric:", _converted)

# Separate numeric & categorical
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# Preprocessors
# Numeric features: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill gaps with the literal 'missing', then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ]
)

# Split data: 80/20 hold-out, stratified on the target, fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# =====================================
# ✅ STEP 5: Train models
# =====================================
models = {
    'LogisticRegression': LogisticRegression(max_iter=2000, solver='liblinear'),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

results = []
# name -> (class labels, test-set probabilities). Kept so the ROC plot in
# Step 7 can reuse them instead of fitting every model a second time.
test_probabilities = {}

for name, model in models.items():
    print("\n" + "="*40)
    print(f"Training and evaluating: {name}")

    pipe = Pipeline([
        ('preproc', preprocessor),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:\n", cm)

    # ROC AUC is best-effort: a failure must not abort the comparison, but it
    # is reported so a NaN in the summary can be explained.
    roc = np.nan
    try:
        y_proba = pipe.predict_proba(X_test)
        test_probabilities[name] = (model.classes_, y_proba)
        if y_proba.shape[1] == 2:
            roc = roc_auc_score(y_test, y_proba[:, 1])
        else:
            roc = roc_auc_score(y_test, y_proba, multi_class='ovr')
        print(f"ROC AUC: {roc:.4f}")
    except Exception as exc:
        print(f"ROC AUC not available for {name}: {exc}")

    results.append({'Model': name, 'Accuracy': acc, 'ROC_AUC': roc})

# =====================================
# ✅ STEP 6: Summary of results
# =====================================
summary = pd.DataFrame(results)
print("\nModel Performance Summary:\n")
print(summary)

# Save summary
summary.to_csv("model_performance_summary.csv", index=False)
print("\nSaved: model_performance_summary.csv")

# =====================================
# ✅ STEP 7: ROC Curves (binary only)
# =====================================
if y.nunique() == 2:
    plt.figure(figsize=(8,6))
    curves_drawn = 0
    for name, (class_labels, y_proba) in test_probabilities.items():
        try:
            # Column 1 holds the probability of class_labels[1]. Passing it as
            # pos_label keeps this working for binary targets not coded 0/1.
            fpr, tpr, _ = roc_curve(y_test, y_proba[:, 1], pos_label=class_labels[1])
        except Exception as exc:
            # Skip this curve but say why, rather than dropping it silently.
            print(f"ROC curve skipped for {name}: {exc}")
            continue
        plt.plot(fpr, tpr, label=f"{name}")
        curves_drawn += 1
    plt.plot([0,1], [0,1], 'k--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves (Binary Classification)")
    if curves_drawn:
        # Calling legend() with no labelled curves only triggers a warning.
        plt.legend()
    plt.show()


Saving cirrhosis.csv to cirrhosis.csv
Dataset shape: (418, 20)
status asdoansdasjndsaoin
Status
Target column used: Status
Dropped ID-like columns: ['ID', 'Spiders', 'Tryglicerides']
Numeric columns: ['N_Days', 'Age', 'Bilirubin', 'Albumin', 'Alk_Phos', 'SGOT', 'Prothrombin', 'Stage']
Categorical columns: ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Edema', 'Cholesterol', 'Copper', 'Platelets']

Training and evaluating: LogisticRegression
Accuracy: 0.7619
              precision    recall  f1-score   support

           0       0.75      0.89      0.82        47
           1       0.00      0.00      0.00         5
           2       0.79      0.69      0.73        32

    accuracy                           0.76        84
   macro avg       0.51      0.53      0.52        84
weighted avg       0.72      0.76      0.74        84

Confusion matrix:
 [[42  0  5]
 [ 4  0  1]
 [10  0 22]]

Training and evaluating: DecisionTree
Accuracy: 0.6310
              precision    recall  f1-score   su