In [1]:
!pip install cloudpickle

Collecting cloudpickle
  Downloading cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Downloading cloudpickle-3.1.1-py3-none-any.whl (20 kB)
Installing collected packages: cloudpickle
Successfully installed cloudpickle-3.1.1


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
import cloudpickle

# -----------------------
# 1. Load Data
# -----------------------
df = pd.read_csv("C:/MultipleDiseasePrediction/env/Scripts/indian_liver_patient.csv")

# Fix missing values
df['Albumin_and_Globulin_Ratio'] = df['Albumin_and_Globulin_Ratio'].fillna(
    df['Albumin_and_Globulin_Ratio'].median()
)

# Encode Gender (Male=1, Female=0)
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})

# Target variable: Dataset (1 = disease, 0 = no disease)
y = df['Dataset'].map({1: 1, 2: 0})

# Features
X = df.drop(columns=['Dataset'])

# -----------------------
# 2. Feature Engineering (custom transformer)
# -----------------------
def add_custom_features(X_df):
    X_new = X_df.copy()
    X_new['Bilirubin_Ratio'] = X_new['Direct_Bilirubin'] / (X_new['Total_Bilirubin'] + 1e-6)
    X_new['Albumin_Protein_Ratio'] = X_new['Albumin'] / (X_new['Total_Protiens'] + 1e-6)
    return X_new

feature_engineer = FunctionTransformer(add_custom_features)

# -----------------------
# 3. Define Preprocessor
# -----------------------
num_features = X.columns.tolist() + ["Bilirubin_Ratio", "Albumin_Protein_Ratio"]

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), num_features)
])

# -----------------------
# 4. Base Pipeline (with XGB)
# -----------------------
pipeline = ImbPipeline(steps=[
    ('feature_engineer', feature_engineer),
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42
    ))
])

# -----------------------
# 5. Train-Test Split
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Compute scale_pos_weight (class imbalance)
scale_pos_weight = (y_train.value_counts()[0] / y_train.value_counts()[1])

# -----------------------
# 6. Hyperparameter Tuning
# -----------------------
param_grid = {
    "xgb__n_estimators": [200, 400, 600],
    "xgb__max_depth": [3, 5, 7],
    "xgb__learning_rate": [0.01, 0.05, 0.1],
    "xgb__subsample": [0.8, 1],
    "xgb__colsample_bytree": [0.7, 0.9, 1],
    "xgb__scale_pos_weight": [scale_pos_weight]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=2
)

print("\n🔍 Running GridSearchCV... (this may take a while)")
grid.fit(X_train, y_train)

print("\n✅ Best Parameters Found:", grid.best_params_)
print("✅ Best CV Accuracy:", grid.best_score_)

# -----------------------
# 7. Evaluate Model
# -----------------------
best_model = grid.best_estimator_

# Test set evaluation
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("\n--- Test Set Performance ---")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

# Train set evaluation
y_train_pred = best_model.predict(X_train)
y_train_prob = best_model.predict_proba(X_train)[:, 1]

print("\n--- Training Set Performance ---")
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("Training ROC-AUC:", roc_auc_score(y_train, y_train_prob))

# -----------------------
# 8. Save Best Model
# -----------------------
with open("liver_disease_pipeline.pkl", "wb") as f:
    cloudpickle.dump(best_model, f)

print("\n💾 Best pipeline saved as 'liver_disease_pipeline.pkl'")



🔍 Running GridSearchCV... (this may take a while)
Fitting 5 folds for each of 162 candidates, totalling 810 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ Best Parameters Found: {'xgb__colsample_bytree': 0.9, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 3, 'xgb__n_estimators': 600, 'xgb__scale_pos_weight': np.float64(0.3993993993993994), 'xgb__subsample': 1}
✅ Best CV Accuracy: 0.6952871196522536

--- Test Set Performance ---
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.68      0.65        34
           1       0.86      0.83      0.85        83

    accuracy                           0.79       117
   macro avg       0.74      0.75      0.75       117
weighted avg       0.79      0.79      0.79       117

Accuracy: 0.7863247863247863
ROC-AUC: 0.7905740609496812

--- Training Set Performance ---
Training Accuracy: 0.9978540772532188
Training ROC-AUC: 1.0

💾 Best pipeline saved as 'liver_disease_pipeline.pkl'


In [9]:
# RMSE
from sklearn.metrics import mean_squared_error
import numpy as np
rmse = np.sqrt(mean_squared_error(y_test, y_prob))
print("RMSE:", rmse)

RMSE: 0.43999513897052955
