# Liver Cirrhosis Stage Prediction — Training Notebook

This notebook trains a **Random Forest** classifier to predict liver cirrhosis stage using clinical features.
It saves:
- `model.pkl` — the trained pipeline (preprocessing + model)
- `features.json` — schema of the expected features (all feature columns, split into categorical and numeric) plus the target column name

In [None]:

import pandas as pd
import numpy as np
import json, joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# All paths in this notebook are resolved relative to the working directory.
project_root = Path(".")
dataset_path = project_root / "data" / "liver_cirrhosis.csv"

# Load the raw dataset and preview its first rows.
df = pd.read_csv(dataset_path)
df.head()


In [None]:

# Make sure the label column exists under the canonical name 'Stage'.
if "Stage" not in df.columns:
    # Accept the first of a few common alternative names for the target.
    alias = next(
        (c for c in ("stage", "Outcome", "target", "Diagnosis") if c in df.columns),
        None,
    )
    if alias is not None:
        df.rename(columns={alias: "Stage"}, inplace=True)
    else:
        # No recognizable target: synthesize a three-level label by cutting the
        # first numeric column at its 33rd and 66th percentiles.
        # NOTE(review): the source column stays in the feature set, so the
        # model can trivially recover these labels -- treat any score obtained
        # through this fallback as a smoke test only.
        num_cols_tmp = df.select_dtypes(include=[np.number]).columns.tolist()
        if not num_cols_tmp:
            # Seeded generator: the split below and the forest both use seed
            # 42, so the synthetic column should be reproducible as well.
            df["dummy_metric"] = np.random.default_rng(42).standard_normal(len(df))
            num_cols_tmp = ["dummy_metric"]
        q = df[num_cols_tmp[0]].quantile([0.33, 0.66]).values
        bins = np.digitize(df[num_cols_tmp[0]], q, right=True)  # values in {0, 1, 2}
        df["Stage"] = np.array(["Mild", "Moderate", "Severe"])[bins]

# Rows without a label cannot be used for supervised training, and NaN labels
# make the classifier's fit fail, so drop them up front.
missing_target = df["Stage"].isna()
if missing_target.any():
    print(f"Dropping {int(missing_target.sum())} rows with missing 'Stage'")
    df = df.loc[~missing_target].copy()

# Every column except the target is a feature; record the dtype-based split.
feature_cols = [c for c in df.columns if c != "Stage"]
cat_cols = df[feature_cols].select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()

X = df[feature_cols]
y = df["Stage"]

# Hold out 20% for evaluation, keeping class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:

# Column groups are derived from the training frame's dtypes.
numeric_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Preprocessing and the classifier live in one pipeline so they are fitted
# (and later saved) together.
forest = RandomForestClassifier(
    n_estimators=300,
    min_samples_split=4,
    random_state=42,
    n_jobs=-1,
)
clf = Pipeline(steps=[("preprocess", preprocess), ("model", forest)])


In [None]:

# Fit the full pipeline on the training split.
clf.fit(X_train, y_train)

# Score the held-out split: overall accuracy, then the per-class report.
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", round(test_accuracy, 4))

# zero_division=0 reports 0 for metrics whose denominator is zero.
report = classification_report(y_test, y_pred, zero_division=0)
print(report)


In [None]:

# Persist the fitted pipeline (preprocessing + model) as a single artifact.
joblib.dump(clf, project_root / "model.pkl")

# Record the feature schema next to the model: every feature column, the
# dtype-based categorical/numeric split, and the name of the target column.
categorical_columns = X.select_dtypes(include=["object", "category"]).columns.tolist()
numeric_columns = X.select_dtypes(include=[np.number]).columns.tolist()
features_info = dict(
    feature_columns=list(X.columns),
    categorical_columns=categorical_columns,
    numeric_columns=numeric_columns,
    target_column="Stage",
)

features_path = project_root / "features.json"
features_path.write_text(json.dumps(features_info, indent=2))
print("Saved model.pkl and features.json")
