In [12]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import RobustScaler,OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_auc_score,roc_curve

In [13]:
# Load the cleaned liver-cirrhosis dataset into a DataFrame.
# NOTE(review): this is an absolute path on a local D: drive, so the cell only
# runs on a machine with the same folder layout.
dataset_path = r"D:\Internship\Liver-cirrhosis-prediction-working\datasets\cleaned_dataset.csv"
df = pd.read_csv(dataset_path)

In [14]:
# Preview the first rows to check which columns were loaded from the CSV.
df.head()

Unnamed: 0,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,2221,C,Placebo,50,0,0,1,0,N,0.5,149.0,4.04,227.0,598.0,52.7,57.0,256,9.9,1
1,1230,C,Placebo,54,1,1,0,1,N,0.5,219.0,3.93,22.0,663.0,45.0,75.0,220,10.8,2
2,4184,C,Placebo,32,0,0,0,0,N,0.5,320.0,3.54,51.0,1243.0,122.45,80.0,225,10.0,2
3,2090,D,Placebo,45,0,0,0,0,N,0.7,255.0,3.74,23.0,1024.0,77.5,58.0,151,10.2,2
4,2105,D,Placebo,59,0,0,1,0,N,1.9,486.0,3.54,74.0,1052.0,108.5,109.0,151,11.5,1


In [15]:
# Continuous measurements; these are the columns handed to the scaler.
num_cols = ['N_Days','Age','Bilirubin','Cholesterol','Albumin','Copper','Alk_Phos','SGOT','Tryglicerides','Platelets','Prothrombin']

# Columns that must never be treated as features:
#   * 'Stage'      - the prediction target.
#   * 'Unnamed: 0' - shows 0, 1, 2, ... in the preview, so it looks like a row
#                    index written out when the CSV was saved (TODO confirm).
#                    Left in, it would be one-hot encoded as a "category" with
#                    one level per row: a huge dense matrix of meaningless
#                    columns that are all zero for unseen rows.
non_feature_cols = ['Stage', 'Unnamed: 0']

# Every remaining column that is not numeric is treated as categorical.
cat_cols = [
    col for col in df.columns
    if col not in num_cols and col not in non_feature_cols
]

# errors='ignore' keeps this cell working if the CSV is later re-exported
# without the stray index column.
X = df.drop(columns=non_feature_cols, errors='ignore')
y = df['Stage']

In [16]:
# Re-code the Stage labels as consecutive integers starting at 0
# (the reports below show classes 0, 1, 2).
# NOTE(review): presumably needed because XGBClassifier expects zero-based
# class labels - confirm.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [17]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [18]:
# Column-wise preprocessing shared by every model:
# numeric columns are scaled, categorical columns are one-hot encoded.
numeric_scaler = RobustScaler()
category_encoder = OneHotEncoder(
    handle_unknown='ignore',
    drop='first',
    sparse_output=False,
)

preprocesser = ColumnTransformer(
    transformers=[
        ("Scaling", numeric_scaler, num_cols),
        ("Encoding", category_encoder, cat_cols),
    ]
)

In [19]:
# Candidate classifiers, keyed by the name printed in the evaluation output.
# The tree-based models are stochastic, so they are seeded with the same value
# as the train/test split (42); otherwise the reported metrics change on every
# Restart & Run All.
clfs = {
    "DecisionTree":DecisionTreeClassifier(random_state=42),
    "XGBClassifier":XGBClassifier(random_state=42),
    "RandomForestClassifier":RandomForestClassifier(random_state=42),
    "LogisticRegression":LogisticRegression()
    }

In [21]:
# Train each candidate behind the shared preprocessing step and print its
# per-class metrics and confusion matrix on the held-out test set.
for name, clf in clfs.items():
    # Preprocessing is a pipeline step, so it is fitted on X_train only.
    model = Pipeline(steps=[("preprocess", preprocesser), ("clf", clf)])
    y_pred = model.fit(X_train, y_train).predict(X_test)

    print(f"\n{name}")
    report = classification_report(y_test, y_pred)
    print(report)
    matrix = confusion_matrix(y_test, y_pred)
    print("\n", matrix)


DecisionTree
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      2486
           1       0.90      0.91      0.90      2564
           2       0.93      0.93      0.93      2450

    accuracy                           0.91      7500
   macro avg       0.91      0.91      0.91      7500
weighted avg       0.91      0.91      0.91      7500


 [[2243  165   78]
 [ 135 2329  100]
 [  69  104 2277]]

XGBClassifier
              precision    recall  f1-score   support

           0       0.97      0.95      0.96      2486
           1       0.95      0.96      0.96      2564
           2       0.97      0.97      0.97      2450

    accuracy                           0.96      7500
   macro avg       0.96      0.96      0.96      7500
weighted avg       0.96      0.96      0.96      7500


 [[2372   77   37]
 [  51 2473   40]
 [  28   45 2377]]

RandomForestClassifier
              precision    recall  f1-score   support

           0    