In [None]:
# Liver Cirrhosis Stage Detection
# Predicting histologic stage of disease (1, 2, or 3) from patient data

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Global plot appearance: seaborn white-grid theme, then a wide default figure size
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (10, 5)})


In [None]:
# Load the dataset and print a quick overview of it
rule_heavy = "=" * 80
rule_light = "-" * 80

print(rule_heavy)
print("LIVER CIRRHOSIS STAGE DETECTION")
print(rule_heavy)
print("\n[1] Loading Dataset...")

df = pd.read_csv('liver_cirrhosis.csv')
n_rows, n_cols = df.shape
print(f"✓ Dataset loaded: {n_rows} rows, {n_cols} columns")

# First rows plus the class balance of the target column
print("\n[2] Dataset Overview")
print(rule_light)
print(df.head())
print("\nTarget Distribution:")
stage_tally = df['Stage'].value_counts().sort_index()
print(stage_tally)

# Per-column count of null cells; only columns that actually have gaps are listed
print("\n[3] Missing Values Check")
print(rule_light)
missing = df.isnull().sum()
columns_with_gaps = missing[missing > 0]
if columns_with_gaps.empty:
    print("✓ No missing values!")
else:
    print(columns_with_gaps)


In [None]:
# Visualizations: class balance of the target, as absolute counts and as percentages
print("\n[4] Visualizations")
print("-" * 80)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Count rows per stage, ordered by stage value
stage_counts = df['Stage'].value_counts().sort_index()
stage_labels = stage_counts.index.tolist()

# One palette shared by both charts so a stage has the same colour in each
stage_palette = ['#3498db', '#2ecc71', '#e74c3c']

# Stage distribution
axes[0].bar(stage_labels, stage_counts.values, color=stage_palette)
axes[0].set_xlabel('Stage')
axes[0].set_ylabel('Count')
axes[0].set_title('Liver Cirrhosis Stage Distribution')
# Ticks come from the data rather than a fixed [1, 2, 3], so they always
# line up with the bars that were actually drawn
axes[0].set_xticks(stage_labels)

# Pie chart
axes[1].pie(stage_counts.values, labels=[f'Stage {i}' for i in stage_labels],
            autopct='%1.1f%%', colors=stage_palette)
axes[1].set_title('Stage Distribution (%)')
plt.tight_layout()
plt.show()

In [None]:
# Key feature distributions: one panel per feature, one overlaid histogram per stage
fig, axes = plt.subplots(2, 3, figsize=(14, 8))
features = ['Bilirubin', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Prothrombin']

# axes.flat walks the 2x3 grid row by row, so feature k lands on panel (k // 3, k % 3)
for ax, feat in zip(axes.flat, features):
    for stage in [1, 2, 3]:
        feat_for_stage = df.loc[df['Stage'] == stage, feat].dropna()
        ax.hist(feat_for_stage, alpha=0.5, label=f'Stage {stage}', bins=20)
    ax.set(xlabel=feat, ylabel='Frequency', title=f'{feat} by Stage')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Data Preprocessing
#
# Produces X_train / X_test / y_train / y_test for the modelling cells.
# The train/test split is taken BEFORE any statistic is learned from the data:
# imputation medians and scaling mean/std are fitted on the training partition
# only and then applied to the test partition. Fitting them on the full dataset
# would let test rows influence the preprocessing (data leakage) and make the
# held-out scores optimistic.
print("\n[5] Data Preprocessing")
print("-" * 80)

# Separate features and target
X = df.drop('Stage', axis=1)
y = df['Stage']

# Encode categorical variables
# Each object column is mapped to integer codes. Values are cast to str first,
# so a missing entry in a categorical column becomes its own 'nan' category.
# The encoders are fitted on all rows so that every category seen in the file
# has a code; this learns only the label vocabulary, not any target statistic.
print("Encoding categorical variables...")
label_encoders = {}
categorical_cols = X.select_dtypes(include=['object']).columns

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

print(f"✓ Encoded {len(categorical_cols)} categorical columns")

# Split data first (same rows as before: the partition depends only on y,
# test_size and random_state, not on the feature values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle missing values if any, using medians learned from the training rows only
if X.isnull().sum().sum() > 0:
    print("Filling missing values with median...")
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)
    X = X.fillna(train_medians)

# Feature scaling: fit on train, apply the same transform to test.
# The original row index is kept so features stay aligned with y_train / y_test.
print("Scaling features...")
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Full scaled frame in the original row order, kept for any later cell that
# still refers to X_scaled
X_scaled = pd.concat([X_train, X_test]).sort_index()

print(f"✓ Train set: {X_train.shape[0]} samples")
print(f"✓ Test set: {X_test.shape[0]} samples")

In [None]:
# Model Training: fit a random forest on the training split, then estimate
# its accuracy with 5-fold cross-validation on that same split
print("\n[6] Model Training - Random Forest")
print("-" * 80)

# Hyper-parameters gathered in one place; n_jobs=-1 uses every available core
rf_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42, 'n_jobs': -1}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)
print("✓ Model trained successfully!")

# cross_val_score refits clones of the estimator per fold; rf_model itself is unchanged
cv_scores = cross_val_score(estimator=rf_model, X=X_train, y=y_train, scoring='accuracy', cv=5)
cv_mean, cv_std = np.mean(cv_scores), np.std(cv_scores)
print(f"✓ Cross-validation accuracy: {cv_mean:.4f} (+/- {cv_std:.4f})")

In [None]:
# Model Evaluation: hold-out metrics, per-class report and confusion matrix
print("\n[7] Model Evaluation")
print("-" * 80)

# Predictions on the held-out split
y_pred = rf_model.predict(X_test)

# Accuracy on train (to gauge over-fitting) and on test, plus support-weighted F1
train_acc = rf_model.score(X_train, y_train)
test_acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Training Accuracy: {train_acc:.4f}")
print(f"Testing Accuracy: {test_acc:.4f}")
print(f"F1 Score: {f1:.4f}")

# Class labels and display names come from the fitted model rather than a
# fixed three-item list, and are passed explicitly to both metrics. Every
# report row and matrix row/column is then tied to a specific class, even if
# a class is absent from y_test or y_pred.
class_labels = rf_model.classes_
class_names = [f'Stage {label}' for label in class_labels]

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# Confusion Matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix', fontsize=14, fontweight='bold')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
# Feature Importance: rank the features by the forest's importance scores
print("\n[8] Feature Importance Analysis")
print("-" * 80)

# Pair each feature name with its score, highest first
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print("\nTop 10 Important Features:")
print(feature_importance.head(10))

# Horizontal bar chart of the 15 highest-ranked features
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

plt.figure(figsize=(10, 6))
plt.barh(bar_positions, top_features['Importance'])
plt.yticks(bar_positions, top_features['Feature'])
plt.title('Top 15 Feature Importance', fontweight='bold')
plt.xlabel('Importance')
plt.gca().invert_yaxis()  # flip the axis so the top-ranked feature is drawn at the top
plt.tight_layout()
plt.show()

In [None]:
# Sample Predictions: first ten held-out rows, actual vs. predicted stage
print("\n[9] Sample Predictions")
print("-" * 80)
sample_predictions = pd.DataFrame({
    'Actual': y_test.values[:10],
    'Predicted': y_pred[:10]
})
print(sample_predictions)

# .tolist() converts the NumPy scalars returned by unique() to plain Python
# values, so the list prints as [1, 2, 3] rather than as NumPy scalar reprs
# (e.g. np.int64(1)) on NumPy 2.x
target_classes = sorted(y.unique().tolist())

# Final Summary
print("\n" + "=" * 80)
print("MODEL SUMMARY")
print("=" * 80)
print("Model: Random Forest Classifier")
print(f"Training Samples: {len(X_train)}")
print(f"Testing Samples: {len(X_test)}")
print(f"Number of Features: {X.shape[1]}")
print(f"Target Classes: {target_classes}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"F1 Score: {f1:.4f}")
print("=" * 80)
# NOTE(review): this banner is printed unconditionally; it is not gated on any
# metric threshold or validation check.
print("✓ Model ready for deployment!")
print("=" * 80)