# Liver Disease Prediction Model

This notebook trains a classification model to predict liver disease based on blood test results.

**Features:** Age, Gender, Total Bilirubin, Direct Bilirubin, Alkaline Phosphatase, Alanine Aminotransferase, Aspartate Aminotransferase, Total Proteins, Albumin, Albumin/Globulin Ratio  
**Target:** `Dataset` column — 1 = liver patient, 2 = non-liver patient (recoded during preprocessing to 1 = liver disease, 0 = no disease)  
**Dataset:** Indian Liver Patient Dataset (ILPD)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_curve, roc_auc_score, precision_score, recall_score, f1_score
)
import joblib
import warnings
# Silence only routine deprecation chatter from the libraries. Every other
# warning category (for example UserWarning raised by the modelling libraries)
# stays visible, because those usually point at a real data or modelling
# problem rather than noise.
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

# Set plot styles: white grid background and a default 10x6 inch figure.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

print("✓ Libraries loaded!")

## Load Dataset

**Dataset:** https://www.kaggle.com/datasets/uciml/indian-liver-patient-records

In [None]:
# TODO: Load your liver disease dataset
# df = pd.read_csv('path/to/liver_disease.csv')
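# Expected columns (names are spelled exactly as in the Kaggle CSV, including
# "Phosphotase", "Alamine" and "Protiens" -- do not "correct" them):
#                   Age, Gender, Total_Bilirubin, Direct_Bilirubin,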
#                   Alkaline_Phosphotase, Alamine_Aminotransferase,
#                   Aspartate_Aminotransferase, Total_Protiens,
#                   Albumin, Albumin_and_Globulin_Ratio, Dataset
# display(df.head())
# print(f"Shape: {df.shape}")

## EDA

In [None]:
# print(df.info())
# print("\nMissing values:")
# print(df.isnull().sum())
# print("\nTarget distribution:")
# print(df['Dataset'].value_counts())  # 1=liver patient, 2=non-liver patient

In [None]:
# fig, axes = plt.subplots(2, 2, figsize=(14, 10))
# df['Dataset'].value_counts().plot(kind='bar', ax=axes[0,0])
# axes[0,0].set_title('Target Distribution')
# df['Age'].hist(bins=30, ax=axes[0,1])
# axes[0,1].set_title('Age Distribution')
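# # Gender is still a string column at this point, so restrict to numeric columns.
# sns.heatmap(df.corr(numeric_only=True), annot=False, cmap='coolwarm', ax=axes[1,0])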
# axes[1,0].set_title('Correlation Matrix')
# df.boxplot(column='Total_Bilirubin', by='Dataset', ax=axes[1,1])
# plt.tight_layout()
# plt.show()

## Preprocessing

In [None]:
# Handle missing values
# df = df.fillna(df.median(numeric_only=True))
# 
# # Encode gender
# le = LabelEncoder()
# df['Gender'] = le.fit_transform(df['Gender'])  # Male=1, Female=0
# 
# # Separate features and target
# X = df.drop('Dataset', axis=1)
# y = df['Dataset']
# # Convert target: 1=disease, 0=no disease
# y = (y == 1).astype(int)
# 
# print(f"Features: {X.shape}")
# print(f"Target: {y.value_counts()}")

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )
# 
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
# print("✓ Data split and scaled!")

## Model Training

In [None]:
# models = {
#     'Random Forest': RandomForestClassifier(n_estimators=200, random_state=42),
#     'Gradient Boosting': GradientBoostingClassifier(n_estimators=200, random_state=42),
#     'SVM': SVC(probability=True, kernel='rbf')
# }
# 
# for name, model in models.items():
#     model.fit(X_train_scaled, y_train)
#     y_pred = model.predict(X_test_scaled)
#     acc = accuracy_score(y_test, y_pred)
#     print(f"{name:20s} Accuracy: {acc:.4f}")

In [None]:
# Select best model and evaluate
# best_model = models['Random Forest']  # Replace with actual best
# y_pred = best_model.predict(X_test_scaled)
# 
# print("Classification Report:")
# print(classification_report(y_test, y_pred, target_names=['No Disease', 'Liver Disease']))
# 
# cm = confusion_matrix(y_test, y_pred)
# sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
# plt.title('Confusion Matrix - Liver Disease')
# plt.show()

In [None]:
# Feature importance (for tree-based models)
# if hasattr(best_model, 'feature_importances_'):
#     importance_df = pd.DataFrame({
#         'feature': X.columns,
#         'importance': best_model.feature_importances_
#     }).sort_values('importance', ascending=False)
#     
#     plt.figure(figsize=(10, 6))
#     plt.barh(importance_df['feature'], importance_df['importance'], color='mediumseagreen')
#     plt.xlabel('Importance Score', fontsize=12)
#     plt.title('Feature Importance - Liver Disease Prediction', fontsize=14, fontweight='bold')
#     plt.gca().invert_yaxis()
#     plt.tight_layout()
#     plt.show()
#     
#     print("\nMost Important Blood Test Markers:")
#     print(importance_df.to_string(index=False))

### Feature Importance

In [None]:
# Performance metrics visualization
# metrics = {
#     'Accuracy': accuracy_score(y_test, y_pred),
#     'Precision': precision_score(y_test, y_pred),
#     'Recall': recall_score(y_test, y_pred),
#     'F1-Score': f1_score(y_test, y_pred),
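#     # Computed here so this cell does not depend on a variable from a later cell.
#     'ROC-AUC': roc_auc_score(y_test, best_model.predict_proba(X_test_scaled)[:, 1])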
# }
# 
# plt.figure(figsize=(10, 6))
# colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12', '#9b59b6']
# bars = plt.bar(metrics.keys(), metrics.values(), color=colors)
# plt.ylim([0, 1])
# plt.ylabel('Score', fontsize=12)
# plt.title('Model Performance Metrics - Liver Disease', fontsize=14, fontweight='bold')
# plt.axhline(y=0.8, color='gray', linestyle='--', alpha=0.5, label='80% threshold')
# plt.grid(axis='y', alpha=0.3)
# plt.legend()
# 
# for i, (key, value) in enumerate(metrics.items()):
#     plt.text(i, value + 0.02, f'{value:.3f}', ha='center', fontweight='bold')
# 
# plt.xticks(rotation=15, ha='right')
# plt.tight_layout()
# plt.show()
# 
# print("\nMetrics Summary:")
# for metric, value in metrics.items():
#     print(f"  {metric:12s}: {value:.4f}")

### Performance Metrics

In [None]:
# ROC Curve
# y_proba = best_model.predict_proba(X_test_scaled)[:, 1]  # probability of class 1 (liver disease)
# fpr, tpr, thresholds = roc_curve(y_test, y_proba)
# roc_auc = roc_auc_score(y_test, y_proba)
# 
# plt.figure(figsize=(8, 6))
# plt.plot(fpr, tpr, color='darkgreen', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')
# plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate', fontsize=12)
# plt.ylabel('True Positive Rate', fontsize=12)
# plt.title('ROC Curve - Liver Disease Prediction', fontsize=14, fontweight='bold')
# plt.legend(loc="lower right")
# plt.grid(alpha=0.3)
# plt.tight_layout()
# plt.show()
# 
# print(f"Area Under ROC Curve: {roc_auc:.4f}")

### ROC Curve

In [None]:
# Detailed evaluation with enhanced confusion matrix
# y_pred = best_model.predict(X_test_scaled)
# y_proba = best_model.predict_proba(X_test_scaled)[:, 1]
# 
# print("Classification Report:")
# print("="*60)
# print(classification_report(y_test, y_pred, target_names=['No Disease', 'Liver Disease']))
# 
# # Enhanced Confusion Matrix
# cm = confusion_matrix(y_test, y_pred)
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
# 
# # Count-based confusion matrix
# sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
#             xticklabels=['No Disease', 'Liver Disease'],
#             yticklabels=['No Disease', 'Liver Disease'],
#             ax=ax1, cbar_kws={'label': 'Count'})
# ax1.set_title('Confusion Matrix (Counts)', fontsize=14, fontweight='bold')
# ax1.set_ylabel('Actual', fontsize=12)
# ax1.set_xlabel('Predicted', fontsize=12)
# 
# # Normalized confusion matrix
# cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
# sns.heatmap(cm_norm, annot=True, fmt='.2%', cmap='Greens',
#             xticklabels=['No Disease', 'Liver Disease'],
#             yticklabels=['No Disease', 'Liver Disease'],
#             ax=ax2, cbar_kws={'label': 'Percentage'})
# ax2.set_title('Normalized Confusion Matrix (%)', fontsize=14, fontweight='bold')
# ax2.set_ylabel('Actual', fontsize=12)
# ax2.set_xlabel('Predicted', fontsize=12)
# 
# plt.tight_layout()
# plt.show()
# 
# # Detailed metrics
# tn, fp, fn, tp = cm.ravel()
# print(f"\nConfusion Matrix Breakdown:")
# print(f"  True Negatives:  {tn}")
# print(f"  False Positives: {fp}")
# print(f"  False Negatives: {fn} ⚠️")
# print(f"  True Positives:  {tp}")

## Model Evaluation with Visualizations

## Save Model

In [None]:
# joblib.dump(best_model, '../Frontend/models/liver_model.sav')
# joblib.dump(scaler, '../Frontend/models/liver_scaler.sav')
# print("✓ Model and scaler saved!")

## Usage

In [None]:
# def predict_liver_disease(gender, age, total_bilirubin, direct_bilirubin,
#                          alkaline_phosphotase, alamine_aminotransferase,
#                          aspartate_aminotransferase, total_proteins,
#                          albumin, ag_ratio):
#     model = joblib.load('../Frontend/models/liver_model.sav')
#     scaler = joblib.load('../Frontend/models/liver_scaler.sav')
#     
#     # Feature order must match the training columns: Age first, then Gender.
#     input_data = np.array([[age, gender, total_bilirubin, direct_bilirubin,
#                             alkaline_phosphotase, alamine_aminotransferase,
#                             aspartate_aminotransferase, total_proteins,
#                             albumin, ag_ratio]])
#     input_scaled = scaler.transform(input_data)
#     prediction = model.predict(input_scaled)[0]
#     return 'Liver Disease' if prediction == 1 else 'No Disease'
# 
# # Test
# result = predict_liver_disease(1, 65, 0.7, 0.1, 187, 16, 18, 6.8, 3.3, 0.90)
# print(f"Prediction: {result}")