#  LIVER PATIENT CLASSIFICATION 

# Import Libraries

In [247]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as pls 
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_auc_score, log_loss, matthews_corrcoef,
    mean_absolute_error, mean_squared_error, r2_score,
    mean_absolute_percentage_error, explained_variance_score,
    classification_report
)

# Load data 

In [248]:
# Read the Indian Liver Patient Dataset CSV from the Kaggle input mount and
# preview the first five rows.
# NOTE(review): in the preview, the values under tot_proteins/albumin/ag_ratio
# (e.g. 187/16/18) and under sgpt/sgot/alkphos (e.g. 6.8/3.3/0.9) look swapped
# relative to what those names suggest -- confirm the CSV headers against the
# dataset's documentation before interpreting individual features.
csv_path = "/kaggle/input/indian-liver-patient-dataset/Indian Liver Patient Dataset (ILPD).csv"
df = pd.read_csv(csv_path)
df.head(5)

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


# Data Analysis

In [249]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(583, 11)

In [250]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

age                 0
gender              0
tot_bilirubin       0
direct_bilirubin    0
tot_proteins        0
albumin             0
ag_ratio            0
sgpt                0
sgot                0
alkphos             4
is_patient          0
dtype: int64

In [251]:
# Discard every row that has at least one missing value (defaults spelled out).
df = df.dropna(axis=0, how="any")

In [252]:
# Per-column dtypes; gender is still an object (string) column at this point.
df.dtypes

age                   int64
gender               object
tot_bilirubin       float64
direct_bilirubin    float64
tot_proteins          int64
albumin               int64
ag_ratio              int64
sgpt                float64
sgot                float64
alkphos             float64
is_patient            int64
dtype: object

In [253]:
# Summary of index range, non-null counts and dtypes after dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 579 entries, 0 to 582
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               579 non-null    int64  
 1   gender            579 non-null    object 
 2   tot_bilirubin     579 non-null    float64
 3   direct_bilirubin  579 non-null    float64
 4   tot_proteins      579 non-null    int64  
 5   albumin           579 non-null    int64  
 6   ag_ratio          579 non-null    int64  
 7   sgpt              579 non-null    float64
 8   sgot              579 non-null    float64
 9   alkphos           579 non-null    float64
 10  is_patient        579 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 54.3+ KB


In [254]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
count,579.0,579.0,579.0,579.0,579.0,579.0,579.0,579.0,579.0,579.0
mean,44.782383,3.315371,1.494128,291.366149,81.126079,110.414508,6.481693,3.138515,0.947064,1.284974
std,16.221786,6.227716,2.816499,243.561863,183.182845,289.850034,1.084641,0.794435,0.319592,0.451792
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,2.6,1.3,298.0,61.0,87.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [255]:
# Column labels of the cleaned frame.
df.columns

Index(['age', 'gender', 'tot_bilirubin', 'direct_bilirubin', 'tot_proteins',
       'albumin', 'ag_ratio', 'sgpt', 'sgot', 'alkphos', 'is_patient'],
      dtype='object')

In [256]:
# Class balance of the target. The labels are 1 and 2, not 0/1.
# NOTE(review): presumably 1 = liver patient and 2 = non-patient -- confirm
# against the dataset description.
df["is_patient"].value_counts()

is_patient
1    414
2    165
Name: count, dtype: int64

# Data Preprocessing 

In [257]:
# Distinct gender labels and their frequencies, checked before encoding them numerically.
df['gender'].value_counts()

gender
Male      439
Female    140
Name: count, dtype: int64

In [258]:
# Target class counts again (same check as in the Data Analysis section;
# nothing has changed the target column in between).
df['is_patient'].value_counts()

is_patient
1    414
2    165
Name: count, dtype: int64

In [259]:
# Encode gender as an integer feature (Male -> 1, Female -> 0).
# Series.map with an explicit dict avoids the deprecated silent downcasting that
# Series.replace performs on object columns, and DataFrame.assign builds a new
# frame instead of writing into the result of the earlier dropna().
gender_codes = {"Male": 1, "Female": 0}
if not pd.api.types.is_numeric_dtype(df["gender"]):  # already encoded when the cell is re-run
    gender_encoded = df["gender"].map(gender_codes)
    if gender_encoded.isna().any():
        # map() turns unknown labels into NaN; fail loudly rather than train on them.
        unexpected = sorted(df.loc[gender_encoded.isna(), "gender"].astype(str).unique())
        raise ValueError(f"Unexpected gender labels: {unexpected}")
    df = df.assign(gender=gender_encoded.astype("int64"))

  df['gender'] = df['gender'].replace({"Male":1,"Female":0})


# Splitting dataset 

In [260]:
# Target vector: the is_patient label column.
y = df["is_patient"]
# Feature matrix: every remaining column.
X = df.drop(columns="is_patient")

In [261]:
from sklearn.model_selection import train_test_split 

In [262]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [263]:
from sklearn.preprocessing import StandardScaler

In [264]:
# Standardizer for the features; it is fitted on the training split in the next cell.
scaler = StandardScaler()

In [265]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test-set information leaks in.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Implementation 


In [266]:
# Base learner: a shallow decision tree. class_weight='balanced' is set here
# because the target is imbalanced and BaggingClassifier has no class-weight
# option of its own.
base_model = DecisionTreeClassifier(max_depth=5, class_weight='balanced')

# Bag 100 copies of the base learner. The learner is passed via `estimator`:
# the former `base_estimator` keyword was renamed in scikit-learn 1.2 and
# removed in 1.4, so the old spelling warns or fails on current versions.
model = BaggingClassifier(estimator=base_model, n_estimators=100, random_state=42)

# Fit on the standardized training split.
model.fit(X_train_scaled, y_train)

# Hard label predictions for the test split.
y_pred = model.predict(X_test_scaled)
# Column 1 of predict_proba belongs to model.classes_[1], i.e. the larger of
# the two target labels (label 2 for this dataset's 1/2 coding).
y_probs = model.predict_proba(X_test_scaled)[:, 1]

  """Private function used to partition estimators between jobs."""


# Evaluation 


In [267]:
# Report test-set metrics. Each metric is evaluated lazily, right before its
# line is printed, so output appears in the same order as the computations.
# Note: the average='binary' metrics score label 1 as the positive class
# (scikit-learn's default pos_label), whereas y_probs holds the probability of
# the other label; ROC AUC and log loss are computed from y_probs.
print("📊 Classification Metrics (on Test Set)")

label_based_metrics = (
    ("Accuracy", lambda: accuracy_score(y_test, y_pred)),
    ("Precision", lambda: precision_score(y_test, y_pred, average='binary')),
    ("Recall", lambda: recall_score(y_test, y_pred, average='binary')),
    ("F1 Score", lambda: f1_score(y_test, y_pred, average='binary')),
    ("MCC", lambda: matthews_corrcoef(y_test, y_pred)),
)
for metric_name, compute in label_based_metrics:
    # Names are left-aligned in a 15-character field to keep the colons lined up.
    print(f"{metric_name:<15}: {compute():.4f}")

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

probability_based_metrics = (
    ("ROC AUC Score", lambda: roc_auc_score(y_test, y_probs)),
    ("Log Loss", lambda: log_loss(y_test, y_probs)),
)
for metric_name, compute in probability_based_metrics:
    print(f"{metric_name:<15}: {compute():.4f}")

print("\nClassification Report:\n", classification_report(y_test, y_pred))

📊 Classification Metrics (on Test Set)
Accuracy       : 0.6293
Precision      : 0.8704
Recall         : 0.5663
F1 Score       : 0.6861
MCC            : 0.3203
Confusion Matrix:
 [[47 36]
 [ 7 26]]
ROC AUC Score  : 0.7097
Log Loss       : 0.6198

Classification Report:
               precision    recall  f1-score   support

           1       0.87      0.57      0.69        83
           2       0.42      0.79      0.55        33

    accuracy                           0.63       116
   macro avg       0.64      0.68      0.62       116
weighted avg       0.74      0.63      0.65       116

