# Multi-Class Prediction of Cirrhosis Outcomes

This notebook demonstrates preprocessing, training, evaluation, and visualization of multiple machine learning models to predict cirrhosis patient outcomes.

## Importing Libraries

First, we need to install the libraries used in this project.

To do so, run the following command in the terminal:

```bash
pip install -r requirements.txt
```

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc
import warnings
import time
from sklearn.exceptions import ConvergenceWarning

# ML models imports
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

# Suppress known-noisy warnings so the notebook output stays readable.
# Any warning whose message mentions `use_label_encoder`
# (presumably raised by XGBoost - TODO confirm against the installed version).
warnings.filterwarnings("ignore", message=".*use_label_encoder.*")
# Any warning whose message mentions `ResourceTracker`
# (emitting library is not visible from this file).
warnings.filterwarnings("ignore", message=".*ResourceTracker.*")
# scikit-learn ConvergenceWarning, regardless of message text.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## Data Preprocessing Pipeline

#### Data Loading

Loads the dataset from the specified path into a Pandas DataFrame

#### ID Column Removal

Drops the `id` column as it does not contribute to model training

#### Fixing Data Types for Numerical Columns

Ensures that specific columns are numeric, converting invalid values to NaN

#### Encoding Categorical Variables

Encodes the categorical feature columns using `LabelEncoder`, and encodes the target column `Status` with a separate `LabelEncoder`.

#### Handling Missing Values

Imputes missing values in numeric columns using the mean strategy

#### Feature Scaling

Standardizes numeric columns to have zero mean and unit variance

#### Feature-Target Split

Separates the dataset into features and target

#### Train-Test Split

Splits the data into training and testing sets with stratified sampling


In [None]:
# --- Data loading -----------------------------------------------------------
DATA_PATH = 'data/train.csv'
df = pd.read_csv(DATA_PATH)

# The row identifier is not a feature.
df = df.drop('id', axis=1)

# --- Type repair ------------------------------------------------------------
# Force these columns to numeric; anything unparsable becomes NaN and is
# filled by the imputer further down.
numeric_columns_to_fix = ['Cholesterol', 'Copper', 'Tryglicerides', 'Platelets', 'Stage']
for col in numeric_columns_to_fix:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# --- Categorical encoding ---------------------------------------------------
# One LabelEncoder per column, kept so the integer codes can be mapped back.
categorical_cols = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# The target gets its own encoder; `target_encoder.classes_` holds the
# original status labels in encoded order.
target_encoder = LabelEncoder()
df['Status'] = target_encoder.fit_transform(df['Status'])

numeric_cols = ['N_Days', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
                'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage']

# --- Feature/target split and train/test split ------------------------------
X = df.drop('Status', axis=1)
y = df['Status']

# Split BEFORE fitting the imputer and scaler. Fitting them on the full
# dataset would let test-set means/variances leak into the training features
# and make the held-out scores optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Work on explicit copies so the column assignments below do not write
# through to (or warn about) slices of `X`.
X_train = X_train.copy()
X_test = X_test.copy()

# --- Imputation (statistics learned from the training split only) -----------
imputer = SimpleImputer(strategy='mean')
X_train[numeric_cols] = imputer.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = imputer.transform(X_test[numeric_cols])

# --- Scaling (statistics learned from the training split only) --------------
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

__all__ = ['X_train', 'X_test', 'y_train', 'y_test', 'target_encoder']


## Model Evaluation Function

#### Function: `evaluate_model`

- Evaluates a model with optional cross-validation.
- If `cv` is set, performs Stratified K-Fold CV to get mean and std of accuracy, precision, recall, and F1-score.
- Trains the model and records training time.
- Predicts on test set and records testing time.
- Calculates accuracy, precision, recall, F1-score, and confusion matrix on test data.
- Returns a dictionary with all these metrics, the trained model, and predictions.

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, average='macro', cv=None):
    """Fit ``model``, score it on the test split, and optionally cross-validate it.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit`` and ``predict``. It is fitted in place on
        the training data and returned in the result dictionary.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels used for the final metrics.
    average : str, default 'macro'
        Averaging mode passed to precision/recall/F1, both for the test
        metrics and for the cross-validation scorers (``precision_<average>``,
        ``recall_<average>``, ``f1_<average>``).
    cv : int or None, default None
        Number of stratified folds. When falsy, cross-validation is skipped
        and no ``cv_*`` keys are returned.

    Returns
    -------
    dict
        Always contains ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``training_time``, ``testing_time`` (seconds), ``confusion_matrix``,
        ``model`` and ``y_pred``. With ``cv`` set it also contains
        ``cv_accuracy``, ``cv_precision``, ``cv_recall``, ``cv_f1_score``
        (fold means) and ``cv_std`` (dict of fold standard deviations keyed by
        ``accuracy``, ``precision``, ``recall``, ``f1``).
    """
    # Imported here so this function is self-contained with respect to the
    # file-level import cell, which does not import `cross_validate`.
    from sklearn.model_selection import cross_validate

    results = {}

    if cv:
        skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
        scoring = {
            'accuracy': 'accuracy',
            'precision': f'precision_{average}',
            'recall': f'recall_{average}',
            'f1': f'f1_{average}',
        }
        # A single multi-metric run fits each fold once and scores it on all
        # four metrics, instead of refitting every fold once per metric.
        cv_scores = cross_validate(model, X_train, y_train, cv=skf, scoring=scoring)
        accuracy_scores = cv_scores['test_accuracy']
        precision_scores = cv_scores['test_precision']
        recall_scores = cv_scores['test_recall']
        f1_scores = cv_scores['test_f1']

        results.update({
            'cv_accuracy': accuracy_scores.mean(),
            'cv_precision': precision_scores.mean(),
            'cv_recall': recall_scores.mean(),
            'cv_f1_score': f1_scores.mean(),
            'cv_std': {
                'accuracy': accuracy_scores.std(),
                'precision': precision_scores.std(),
                'recall': recall_scores.std(),
                'f1': f1_scores.std()
            },
        })

    # perf_counter is monotonic and high-resolution, so the measured durations
    # are not affected by system clock adjustments.
    start_train = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_train

    start_test = time.perf_counter()
    y_pred = model.predict(X_test)
    testing_time = time.perf_counter() - start_test

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0: a class that is never predicted scores 0 instead of
    # triggering a warning.
    precision = precision_score(y_test, y_pred, average=average, zero_division=0)
    recall = recall_score(y_test, y_pred, average=average, zero_division=0)
    f1 = f1_score(y_test, y_pred, average=average, zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    results.update({
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'training_time': training_time,
        'testing_time': testing_time,
        'confusion_matrix': cm,
        'model': model,
        'y_pred': y_pred
    })

    return results

##  Confusion Matrix Plotting Function

####  Function: `plot_confusion_matrix`

This function visualizes the confusion matrix of a classification model's predictions. It:

- Accepts a confusion matrix (`cm`) and an optional model name.
- Uses Seaborn's heatmap to plot the matrix with annotated counts.
- Labels the axes as "Actual" (rows) and "Predicted" (columns).
- Displays class labels: `['C', 'CL', 'D']`.

---

##  Learning Curve Plotting Function

####  Function: `plot_learning_curve`

This function plots the learning curve of a model training process. It:

- Takes a model, features `X`, labels `y`, and an optional plot title.
- Uses 5-fold cross-validation to evaluate training and validation accuracy.
- Computes scores for training set sizes from 10% to 100%.
- Plots training and validation accuracy vs. training set size.
- Helps diagnose underfitting or overfitting by visualizing model performance with varying training data.

---

##  ROC Curve Plotting Function

####  Function: `plot_roc_curve`

This function plots the Receiver Operating Characteristic (ROC) curve(s) to evaluate model classification performance. It:

- Accepts a trained model, test features `X_test`, true labels `y_test`, and optional model name.
- Attempts to get prediction probabilities using `predict_proba`.
- If binary classification, plots a single ROC curve with AUC.
- If multiclass, plots one ROC curve per class, labeling each with class name and AUC.
- Adds a diagonal line for reference (random classifier).
- Shows axis labels, legend, and grid for clarity.

---


In [None]:
def plot_confusion_matrix(cm, model_name='Model'):
    """Display a confusion matrix as an annotated heatmap.

    Parameters
    ----------
    cm
        Confusion matrix of integer counts; rows are drawn on the "Actual"
        axis and columns on the "Predicted" axis.
    model_name : str, default 'Model'
        Name appended to the figure title.
    """
    # Tick labels for both axes, in the order used by the rows/columns of cm.
    class_names = ['C', 'CL', 'D']
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, **heatmap_options)

    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')

    plt.tight_layout()
    plt.show()


def plot_learning_curve(model, X, y, title='Learning Curve'):
    """Plot mean training and validation accuracy against training-set size.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``learning_curve``.
    X, y
        Features and labels used for the curve.
    title : str, default 'Learning Curve'
        Figure title.
    """
    # Five evenly spaced fractions of the available training data: 10% .. 100%.
    fractions = np.linspace(0.1, 1.0, 5)
    sizes, fit_scores, holdout_scores = learning_curve(
        model, X, y,
        train_sizes=fractions,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )

    # Average across folds (axis 1) to get one point per training size.
    curves = (
        ('Training score', fit_scores.mean(axis=1)),
        ('Validation score', holdout_scores.mean(axis=1)),
    )

    plt.figure(figsize=(8, 5))
    for curve_label, fold_mean in curves:
        plt.plot(sizes, fold_mean, label=curve_label)

    plt.title(title)
    plt.xlabel('Training Set Size')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid()
    plt.tight_layout()
    plt.show()


def plot_roc_curve(model, X_test, y_test, model_name='Model'):
    """Plot ROC curve(s) for a fitted classifier on the test data.

    Parameters
    ----------
    model
        Fitted classifier. If it has no usable ``predict_proba`` a message is
        printed and nothing is plotted.
    X_test
        Test features passed to ``predict_proba``.
    y_test
        True labels. In the multiclass branch they are compared against the
        column index of the probability matrix, so they are expected to be
        integer-encoded as 0..n_classes-1.
    model_name : str, default 'Model'
        Name used in the title and in the fallback message.

    Returns
    -------
    None
    """
    # Only the probability call is guarded: an AttributeError raised later
    # (e.g. while plotting) is a real bug and must not be reported as
    # "model does not support probability prediction".
    try:
        y_score = model.predict_proba(X_test)
    except AttributeError:
        print(f"Model {model_name} does not support probability prediction.")
        return

    class_labels = ['C', 'CL', 'D']

    plt.figure(figsize=(8, 5))

    if len(np.unique(y_test)) == 2:
        # Binary case: a single curve using the second probability column.
        fpr, tpr, _ = roc_curve(y_test, y_score[:, 1])
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
    else:
        # Multiclass case: one-vs-rest curve per probability column.
        for i in range(y_score.shape[1]):
            fpr, tpr, _ = roc_curve(y_test == i, y_score[:, i])
            roc_auc = auc(fpr, tpr)
            # Fall back to the numeric index when there are more classes than
            # known label names, instead of raising IndexError.
            class_name = class_labels[i] if i < len(class_labels) else str(i)
            plt.plot(fpr, tpr, label=f'Status {class_name} (area = {roc_auc:.2f})')

    # Diagonal reference line: performance of a random classifier.
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend(loc='lower right')
    plt.grid()
    plt.tight_layout()
    plt.show()


## Plotting Functions Results

#### CatBoost

<img src="images/Figure_01.png" width="30%"/>
<img src="images/Figure_02.png" width="32%"/>
<img src="images/Figure_03.png" width="32%"/>

---

#### XGBoost

<img src="images/Figure_11.png" width="30%"/>
<img src="images/Figure_12.png" width="32%"/>
<img src="images/Figure_13.png" width="32%"/>

---

#### LightGBM

<img src="images/Figure_21.png" width="30%"/>
<img src="images/Figure_22.png" width="32%"/>
<img src="images/Figure_23.png" width="32%"/>

---

#### StackingClassifier

<img src="images/Figure_31.png" width="30%"/>
<img src="images/Figure_32.png" width="32%"/>
<img src="images/Figure_33.png" width="32%"/>


### Analysing the Data

#### For Confusion Matrix
In general, it is evident that the models are good at identifying Class C (likely the majority class). They struggle significantly with Class CL. Class D performs moderately well but still suffers from some overlap with Class C.

#### For Learning Curve

The models learn the training data very well (95%+ accuracy), but they perform noticeably worse on unseen data (80%+ accuracy). There's a gap (~15%) between training and validation performance.
This suggests that the models likely have enough capacity to fit the data but are slightly overfitting — memorizing details from the training set that don't generalize well.
However, a validation score above 80% on 7,905 samples with 18 features is still quite good.

#### For ROC Curve

Status C: AUC = 0.9 → Excellent discrimination — the models can very well distinguish class C from the others.
Status CL: AUC = 0.77 → Fair discrimination — the models are okay but less confident in distinguishing CL from other classes. Could improve.
Status D: AUC = 0.9 → Excellent discrimination — the models are strong at identifying class D.
Overall the models perform very well on classes C and D. Class CL is a bit harder to predict confidently because it's less represented and overlaps with other classes.

---

## Main Function: Model Training and Evaluation Pipeline

#### Function: `main`

- Defines a dictionary of different classifiers with specified parameters.
- Iterates over each model, training and evaluating them using the `evaluate_model` function.
- Prints cross-validated accuracy, precision, recall, and F1-score, plus test accuracy and training/testing time for each model.
- Visualizes the confusion matrix, ROC curve, and learning curve for each model.
- Returns a dictionary containing evaluation results for all models.

In [None]:
def main():
    """Train, evaluate and visualise every configured classifier.

    Builds CatBoost, XGBoost, LightGBM and a stacking ensemble, runs each
    through ``evaluate_model`` with 5-fold cross-validation on the module-level
    ``X_train``/``y_train`` and scores it on ``X_test``/``y_test``. For each
    model it prints the headline metrics and shows the confusion matrix,
    learning curve and ROC curve, then prints a summary table.

    Returns
    -------
    dict
        Maps model name to a dict with ``cv_accuracy``, ``cv_precision``,
        ``cv_recall``, ``cv_f1_score``, ``test_accuracy``, ``precision``,
        ``recall``, ``f1_score``, ``training_time`` and ``testing_time``.
    """
    models = {
        'CatBoost': CatBoostClassifier(
            random_state=42,
            verbose=0
        ),
        'XGBoost': XGBClassifier(
            eval_metric='mlogloss',
            objective='multi:softmax',
            num_class=len(target_encoder.classes_),
            random_state=42
        ),
        'LightGBM': LGBMClassifier(
            random_state=42,
            max_depth=7,
            min_gain_to_split=0,
            min_child_samples=20,
            verbosity=-1
        )
    }

    # Base learners for the stacking ensemble; SVC needs probability=True so
    # that it exposes predict_proba.
    estimators = [
        ('svm', SVC(probability=True, random_state=42)),
        ('knn', KNeighborsClassifier()),
        ('mlp', MLPClassifier(random_state=42, max_iter=1000, early_stopping=True)),
        ('et', ExtraTreesClassifier(n_estimators=100, random_state=42)),
        ('dt', DecisionTreeClassifier(max_depth=5, random_state=42))
    ]

    # passthrough=False: the final estimator sees only the base learners'
    # outputs, not the original features.
    stacking_clf = StackingClassifier(
        estimators=estimators,
        final_estimator=LogisticRegression(max_iter=200),
        passthrough=False,
        n_jobs=-1
    )

    models['StackingClassifier'] = stacking_clf

    results_summary = {}

    for name, model in models.items():
        print(f"\n🔍 Training and evaluating: {name}")
        results = evaluate_model(
            model,
            X_train, y_train,
            X_test, y_test,
            average='macro',
            cv=5
        )

        print(f"→ CV Accuracy:  {results['cv_accuracy']:.4f}")
        print(f"→ CV Precision: {results['cv_precision']:.4f}")
        print(f"→ CV Recall:    {results['cv_recall']:.4f}")
        print(f"→ CV F1 Score:  {results['cv_f1_score']:.4f}")
        print(f"→ Test Accuracy: {results['accuracy']:.4f}")
        print(f"→ Training Time: {results['training_time']:.2f} s")
        print(f"→ Testing Time:  {results['testing_time']:.4f} s")

        # Keep only scalar metrics; the fitted model, predictions and
        # confusion matrix are left out of the summary table.
        results_summary[name] = {
            'cv_accuracy': results['cv_accuracy'],
            'cv_precision': results['cv_precision'],
            'cv_recall': results['cv_recall'],
            'cv_f1_score': results['cv_f1_score'],
            'test_accuracy': results['accuracy'],
            'precision': results['precision'],
            'recall': results['recall'],
            'f1_score': results['f1_score'],
            'training_time': results['training_time'],
            'testing_time': results['testing_time']
        }

        plot_confusion_matrix(results['confusion_matrix'], model_name=name)
        plot_learning_curve(model, X_train, y_train, title=f'{name} Learning Curve')
        plot_roc_curve(model, X_test, y_test, model_name=name)

    print("\n=== 🧾 Model Performance Summary ===")
    summary_df = pd.DataFrame(results_summary).T.round(4)
    print(summary_df)

    # Hand the collected metrics back to the caller; previously the function
    # built this dictionary and then discarded it.
    return results_summary


if __name__ == "__main__":
    main()
