In [None]:
import pandas as pd
import numpy as np
import time
import joblib
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from google.colab import files
import io

# Step 1: Upload Dataset
print("📂 Please upload your dataset...")
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; a CSV dataset is required.")
file_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[file_name]), encoding="ISO-8859-1")

# Define normal ranges (from medical references).
# Keys must equal the CSV column headers exactly: a key that matches no column
# produces no flag. The previous short names ("SGPT", "ALB", ...) matched only
# the two bilirubin columns, so six of the eight flags were never created.
# NOTE(review): the bounds are in clinical units and assume the CSV holds raw
# lab values -- confirm the uploaded dataset is not already standardized.
normal_ranges = {
    "Total Bilirubin": (0.1, 1.2),
    "Direct Bilirubin": (0.0, 0.3),
    "Alkphos Alkaline Phosphotase": (44, 147),
    "Sgpt Alamine Aminotransferase": (7, 56),
    "Sgot Aspartate Aminotransferase": (10, 40),
    "Total Protiens": (6.0, 8.3),
    "ALB Albumin": (3.5, 5.0),
    "A/G Ratio Albumin and Globulin Ratio": (1.1, 2.5),
}

# Step 2: Add Binary Features for Normal Ranges
# Each flag is 1 when the measurement lies outside [low, high], else 0.
unmatched_ranges = []
for feature, (low, high) in normal_ranges.items():
    if feature not in df.columns:
        unmatched_ranges.append(feature)
        continue
    df[f"{feature}_Abnormal"] = ((df[feature] < low) | (df[feature] > high)).astype(int)

if unmatched_ranges:
    # Surface header mismatches instead of silently training without the flags.
    print(f"⚠️ No dataset column found for: {unmatched_ranges} (no abnormal flag added)")

# Step 3: Prepare Data
target_column = "Result"
X = df.drop(columns=[target_column])
y = df[target_column]

# Step 4: Split Data (70% Training, 30% Testing)
# Split BEFORE scaling so the scaler never sees test-set statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize Numeric Features (fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, retained for any later cell that still reads `X_scaled`.
X_scaled = scaler.transform(X)

# Step 5: Define Models
models = {
    # XGBoost reports `use_label_encoder` as an unused parameter, so it is omitted.
    "XGBoost": XGBClassifier(eval_metric="logloss"),
    "Random Forest": RandomForestClassifier(),
    "LightGBM": LGBMClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Bagged Decision Tree": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gaussian Naïve Bayes": GaussianNB(),
}

# Step 6: Train & Evaluate Models
best_model = None
best_accuracy = 0
correct_models = []
y_test_values = np.asarray(y_test)

print("\n📊 Evaluating Models...\n")

for name, model in models.items():
    print(f"🔄 Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred) * 100
    # Vectorised count of matching labels (no Python-level loop over a Series).
    correct_count = int(np.sum(np.asarray(y_pred) == y_test_values))

    if correct_count > 0:
        correct_models.append((name, correct_count, acc))

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model

# Show models that predicted at least one correct result
print("\n✅ Models that predicted at least one correct result:")
for name, correct_count, acc in correct_models:
    print(f"✔️ {name}: {correct_count} correct predictions | Accuracy: {acc:.2f}%")

# Save the Best Model
if best_model is None:
    raise RuntimeError("No model scored above 0% accuracy; nothing to save.")

# NOTE: the pickled model was trained on inputs transformed by `scaler`; persist
# the scaler as well if the model is to be used outside this session.
joblib.dump(best_model, "Best_Liver_Disease_Model.pkl")
files.download("Best_Liver_Disease_Model.pkl")
print(f"\n💾 Best Model ({type(best_model).__name__}) saved with accuracy: {best_accuracy:.2f}%")

# Step 7: Function to Test New Patient Data
def predict_liver_disease():
    """Prompt for one patient's measurements and print the model's prediction.

    Reads the module-level ``X`` (for the feature order), ``normal_ranges`` and
    ``scaler``, and loads the model from ``Best_Liver_Disease_Model.pkl``.

    Only measured features are requested from the user. Every
    ``<feature>_Abnormal`` column whose base feature has an entry in
    ``normal_ranges`` is computed from the entered value with the same rule
    used when the training flags were built, rather than typed in by hand.

    Raises:
        ValueError: if an entered value cannot be parsed as a float.
    """
    print("\n📌 Enter patient details below:")
    flag_suffix = "_Abnormal"

    # Map each derivable flag column to the measured column it is computed from.
    derived_flags = {}
    for column in X.columns:
        name = str(column)
        if not name.endswith(flag_suffix):
            continue
        base = name[: -len(flag_suffix)]
        if base in X.columns and base in normal_ranges:
            derived_flags[column] = base

    # Ask for everything that cannot be derived, in training column order.
    patient = {}
    for column in X.columns:
        if column not in derived_flags:
            patient[column] = float(input(f"Enter {column}: "))

    # Flag = 1 when the value lies outside [low, high], mirroring training.
    for column, base in derived_flags.items():
        low, high = normal_ranges[base]
        patient[column] = int(patient[base] < low or patient[base] > high)

    # A one-row DataFrame with the training column names keeps the scaler's
    # feature-name check satisfied and guarantees the column order.
    patient_frame = pd.DataFrame([patient], columns=X.columns)
    patient_scaled = scaler.transform(patient_frame)

    # Load the best model
    best_model = joblib.load("Best_Liver_Disease_Model.pkl")

    # Make Prediction
    prediction = best_model.predict(patient_scaled)[0]
    result = "Liver Disease Detected (1)" if prediction == 1 else "Healthy (0)"

    print(f"\n🔍 Prediction: {result}")
# Step 8: Run User Input Function
# Starts the interactive session immediately: the call blocks on input() once
# per prompted feature, then prints the prediction.
predict_liver_disease()


📂 Please upload your dataset...


Saving finaldataset.csv to finaldataset.csv

📊 Evaluating Models...

🔄 Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



🔄 Training Random Forest...
🔄 Training LightGBM...
[LightGBM] [Info] Number of positive: 15147, number of negative: 15147
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003001 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1201
[LightGBM] [Info] Number of data points in the train set: 30294, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




🔄 Training Extra Trees...
🔄 Training Gradient Boosting...
🔄 Training Bagged Decision Tree...
🔄 Training Decision Tree...
🔄 Training Logistic Regression...
🔄 Training K-Nearest Neighbors...
🔄 Training Gaussian Naïve Bayes...

✅ Models that predicted at least one correct result:
✔️ XGBoost: 12943 correct predictions | Accuracy: 99.68%
✔️ Random Forest: 12951 correct predictions | Accuracy: 99.75%
✔️ LightGBM: 12920 correct predictions | Accuracy: 99.51%
✔️ Extra Trees: 12946 correct predictions | Accuracy: 99.71%
✔️ Gradient Boosting: 11438 correct predictions | Accuracy: 88.09%
✔️ Bagged Decision Tree: 12913 correct predictions | Accuracy: 99.45%
✔️ Decision Tree: 12874 correct predictions | Accuracy: 99.15%
✔️ Logistic Regression: 8955 correct predictions | Accuracy: 68.97%
✔️ K-Nearest Neighbors: 12575 correct predictions | Accuracy: 96.85%
✔️ Gaussian Naïve Bayes: 8654 correct predictions | Accuracy: 66.65%


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


💾 Best Model (RandomForestClassifier) saved with accuracy: 99.75%

📌 Enter patient details below:
Enter Age of the patient: -0.953230801
Enter Gender: 0.495063232
Enter Total Bilirubin: -0.642555554
Enter Direct Bilirubin: -0.542082829
Enter Alkphos Alkaline Phosphotase: -0.403203339
Enter Sgpt Alamine Aminotransferase: 2.169244172
Enter Sgot Aspartate Aminotransferase: 0.275259843
Enter Total Protiens: 0.167026416
Enter ALB Albumin: 0.489290622
Enter A/G Ratio Albumin and Globulin Ratio: 0.890427526
Enter Total Bilirubin_Abnormal: -0.642555554
Enter Direct Bilirubin_Abnormal: -0.542082829

🔍 Prediction: Healthy (0)




In [None]:
import pandas as pd
import numpy as np
import time
import joblib
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from google.colab import files
import io

# Step 1: Upload Dataset
print("📂 Please upload your dataset...")
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; a CSV dataset is required.")
file_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[file_name]), encoding="ISO-8859-1")

# Step 2: Prepare Data (without range-based features)
target_column = "Result"
X = df.drop(columns=[target_column])
y = df[target_column]

# Step 3: Split Data
# Split BEFORE scaling so the scaler never sees test-set statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize Numeric Features (fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, retained for any later cell that still reads `X_scaled`.
X_scaled = scaler.transform(X)

# Step 4: Define Models
HYBRID_NAME = "Hybrid (Random Forest + XGBoost)"
models = {
    # XGBoost reports `use_label_encoder` as an unused parameter, so it is omitted.
    "XGBoost": XGBClassifier(eval_metric="logloss"),
    "Random Forest": RandomForestClassifier(),
    "LightGBM": LGBMClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Bagged Decision Tree": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gaussian Naïve Bayes": GaussianNB(),
    HYBRID_NAME: None,  # Placeholder: the two-stage model is built inside the loop
}

# Step 5: Train & Evaluate Models
best_model = None
best_model_name = None  # initialised so the save step cannot hit a NameError
best_accuracy = 0
correct_models = []
y_test_values = np.asarray(y_test)

print("\n📊 Evaluating Models...\n")

for name, model in models.items():
    print(f"🔄 Training {name}...")

    if name == HYBRID_NAME:
        # The second stage must learn from probabilities the forest produced
        # for rows it was NOT trained on; in-sample forest probabilities are
        # far more confident than anything seen at prediction time.
        X_base, X_meta, y_base, y_meta = train_test_split(
            X_train, y_train, test_size=0.3, random_state=42, stratify=y_train
        )

        rf = RandomForestClassifier()
        rf.fit(X_base, y_base)
        meta_train_proba = rf.predict_proba(X_meta)[:, 1].reshape(-1, 1)

        # Train XGBoost on the held-out RF probabilities
        xgb = XGBClassifier(eval_metric="logloss")
        xgb.fit(meta_train_proba, y_meta)

        # Refit the forest on the whole training split for the final pipeline.
        rf.fit(X_train, y_train)

        # Predict using hybrid model (RF ➜ XGB)
        rf_test_proba = rf.predict_proba(X_test)[:, 1].reshape(-1, 1)
        y_pred = xgb.predict(rf_test_proba)

        model = (rf, xgb)  # Save both models for later use
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred) * 100
    # Vectorised count of matching labels (no Python-level loop over a Series).
    correct_count = int(np.sum(np.asarray(y_pred) == y_test_values))

    if correct_count > 0:
        correct_models.append((name, correct_count, acc))

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        best_model_name = name

# Step 6: Print results
print("\n✅ Models that predicted at least one correct result:")
for name, correct_count, acc in correct_models:
    print(f"✔️ {name}: {correct_count} correct predictions | Accuracy: {acc:.2f}%")

# Save best model
if best_model is None:
    raise RuntimeError("No model scored above 0% accuracy; nothing to save.")

# NOTE: the pickled models were trained on inputs transformed by `scaler`;
# persist the scaler as well if they are to be used outside this session.
if best_model_name == HYBRID_NAME:
    joblib.dump(best_model[0], "Best_RF.pkl")
    joblib.dump(best_model[1], "Best_XGB_on_RF.pkl")
    print(f"\n💾 Best Hybrid Model saved (RandomForest + XGBoost) | Accuracy: {best_accuracy:.2f}%")
    files.download("Best_RF.pkl")
    files.download("Best_XGB_on_RF.pkl")
else:
    joblib.dump(best_model, "Best_Liver_Disease_Model.pkl")
    print(f"\n💾 Best Model ({type(best_model).__name__}) saved | Accuracy: {best_accuracy:.2f}%")
    files.download("Best_Liver_Disease_Model.pkl")

# Step 7: Prediction Function
def predict_liver_disease():
    """Prompt for one patient's feature values and print the model's prediction.

    Reads the module-level ``X`` (for the feature order), ``scaler`` and
    ``best_model_name``. When the hybrid won the comparison, the random forest
    and the XGBoost stage are loaded from ``Best_RF.pkl`` and
    ``Best_XGB_on_RF.pkl``; otherwise the single model is loaded from
    ``Best_Liver_Disease_Model.pkl``.

    Raises:
        ValueError: if an entered value cannot be parsed as a float.
    """
    print("\n📌 Enter patient details below:")

    patient = {}
    for feature in X.columns:
        patient[feature] = float(input(f"Enter {feature}: "))

    # A one-row DataFrame with the training column names keeps the scaler's
    # feature-name check satisfied and guarantees the column order.
    patient_frame = pd.DataFrame([patient], columns=X.columns)
    patient_scaled = scaler.transform(patient_frame)

    print("\n🔍 Predicting using the best model...")

    if best_model_name == "Hybrid (Random Forest + XGBoost)":
        rf = joblib.load("Best_RF.pkl")
        xgb = joblib.load("Best_XGB_on_RF.pkl")

        # Stage 1 yields the positive-class probability; stage 2 classifies it.
        rf_proba = rf.predict_proba(patient_scaled)[:, 1].reshape(-1, 1)
        prediction = xgb.predict(rf_proba)[0]
    else:
        model = joblib.load("Best_Liver_Disease_Model.pkl")
        prediction = model.predict(patient_scaled)[0]

    result = "Liver Disease Detected (1)" if prediction == 1 else "Healthy (0)"
    print(f"\n🧬 Prediction: {result}")

# Step 8: Run Prediction
# Starts the interactive session immediately: the call blocks on input() once
# per feature column, then prints the prediction.
predict_liver_disease()


📂 Please upload your dataset...


Saving finaldataset.csv to finaldataset (1).csv

📊 Evaluating Models...

🔄 Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



🔄 Training Random Forest...
🔄 Training LightGBM...
[LightGBM] [Info] Number of positive: 15147, number of negative: 15147
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005712 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1195
[LightGBM] [Info] Number of data points in the train set: 30294, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




🔄 Training Extra Trees...
🔄 Training Gradient Boosting...
🔄 Training Bagged Decision Tree...
🔄 Training Decision Tree...
🔄 Training Logistic Regression...
🔄 Training K-Nearest Neighbors...
🔄 Training Gaussian Naïve Bayes...
🔄 Training Hybrid (Random Forest + XGBoost)...


Parameters: { "use_label_encoder" } are not used.




✅ Models that predicted at least one correct result:
✔️ XGBoost: 12943 correct predictions | Accuracy: 99.68%
✔️ Random Forest: 12949 correct predictions | Accuracy: 99.73%
✔️ LightGBM: 12920 correct predictions | Accuracy: 99.51%
✔️ Extra Trees: 12962 correct predictions | Accuracy: 99.83%
✔️ Gradient Boosting: 11438 correct predictions | Accuracy: 88.09%
✔️ Bagged Decision Tree: 12903 correct predictions | Accuracy: 99.38%
✔️ Decision Tree: 12873 correct predictions | Accuracy: 99.15%
✔️ Logistic Regression: 8966 correct predictions | Accuracy: 69.05%
✔️ K-Nearest Neighbors: 12597 correct predictions | Accuracy: 97.02%
✔️ Gaussian Naïve Bayes: 8743 correct predictions | Accuracy: 67.34%
✔️ Hybrid (Random Forest + XGBoost): 12935 correct predictions | Accuracy: 99.62%

💾 Best Model (ExtraTreesClassifier) saved | Accuracy: 99.83%


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


📌 Enter patient details below:
Enter Age of the patient: -0.953230801
Enter Gender: 0.495063232
Enter Total Bilirubin: -0.642555554
Enter Direct Bilirubin: -0.542082829
Enter Alkphos Alkaline Phosphotase: -0.403203339
Enter Sgpt Alamine Aminotransferase: 2.169244172
Enter Sgot Aspartate Aminotransferase: 0.275259843
Enter Total Protiens: 0.167026416
Enter ALB Albumin: 0.489290622
Enter A/G Ratio Albumin and Globulin Ratio: 0.890427526

🔍 Predicting using the best model...

🧬 Prediction: Healthy (0)




**with ranges**

In [None]:
import pandas as pd
import numpy as np
import time
import joblib
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from google.colab import files
import io

# Step 1: Upload Dataset
print("📂 Please upload your dataset...")
uploaded = files.upload()
if not uploaded:
    # Fail with a clear message instead of an IndexError on the lookup below.
    raise ValueError("No file was uploaded; a CSV dataset is required.")
file_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[file_name]), encoding="ISO-8859-1")

# Step 2: Add Binary Features for Normal Ranges
# Keys must equal the CSV column headers exactly: a key that matches no column
# produces no flag. The previous short names ("SGPT", "ALB", ...) matched only
# the two bilirubin columns, so six of the eight flags were never created.
# NOTE(review): the bounds are in clinical units and assume the CSV holds raw
# lab values -- confirm the uploaded dataset is not already standardized.
normal_ranges = {
    "Total Bilirubin": (0.1, 1.2),
    "Direct Bilirubin": (0.0, 0.3),
    "Alkphos Alkaline Phosphotase": (44, 147),
    "Sgpt Alamine Aminotransferase": (7, 56),
    "Sgot Aspartate Aminotransferase": (10, 40),
    "Total Protiens": (6.0, 8.3),
    "ALB Albumin": (3.5, 5.0),
    "A/G Ratio Albumin and Globulin Ratio": (1.1, 2.5),
}

# Each flag is 1 when the measurement lies outside [low, high], else 0.
unmatched_ranges = []
for feature, (low, high) in normal_ranges.items():
    if feature not in df.columns:
        unmatched_ranges.append(feature)
        continue
    df[f"{feature}_Abnormal"] = ((df[feature] < low) | (df[feature] > high)).astype(int)

if unmatched_ranges:
    # Surface header mismatches instead of silently training without the flags.
    print(f"⚠️ No dataset column found for: {unmatched_ranges} (no abnormal flag added)")

# Step 3: Prepare Data
target_column = "Result"
X = df.drop(columns=[target_column])
y = df[target_column]

# Step 4: Split Data
# Split BEFORE scaling so the scaler never sees test-set statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Standardize Numeric Features (fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full scaled matrix, retained for any later cell that still reads `X_scaled`.
X_scaled = scaler.transform(X)

# Step 5: Define Models
HYBRID_NAME = "Hybrid (Random Forest + XGBoost)"
models = {
    # XGBoost reports `use_label_encoder` as an unused parameter, so it is omitted.
    "XGBoost": XGBClassifier(eval_metric="logloss"),
    "Random Forest": RandomForestClassifier(),
    "LightGBM": LGBMClassifier(),
    "Extra Trees": ExtraTreesClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Bagged Decision Tree": BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10),
    "Decision Tree": DecisionTreeClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Gaussian Naïve Bayes": GaussianNB(),
    HYBRID_NAME: None,  # Placeholder: the two-stage model is built inside the loop
}

# Step 6: Train & Evaluate Models
best_model = None
best_model_name = None  # initialised so the save step cannot hit a NameError
best_accuracy = 0
correct_models = []
y_test_values = np.asarray(y_test)

print("\n📊 Evaluating Models...\n")

for name, model in models.items():
    print(f"🔄 Training {name}...")

    if name == HYBRID_NAME:
        # The second stage must learn from probabilities the forest produced
        # for rows it was NOT trained on; in-sample forest probabilities are
        # far more confident than anything seen at prediction time.
        X_base, X_meta, y_base, y_meta = train_test_split(
            X_train, y_train, test_size=0.3, random_state=42, stratify=y_train
        )

        # Train Random Forest
        rf = RandomForestClassifier()
        rf.fit(X_base, y_base)
        meta_train_proba = rf.predict_proba(X_meta)[:, 1].reshape(-1, 1)

        # Train XGBoost on the held-out RF prediction probabilities
        xgb = XGBClassifier(eval_metric="logloss")
        xgb.fit(meta_train_proba, y_meta)

        # Refit the forest on the whole training split for the final pipeline.
        rf.fit(X_train, y_train)

        # Predict using hybrid pipeline
        rf_test_proba = rf.predict_proba(X_test)[:, 1].reshape(-1, 1)
        y_pred = xgb.predict(rf_test_proba)

        model = (rf, xgb)  # Save the hybrid model
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred) * 100
    # Vectorised count of matching labels (no Python-level loop over a Series).
    correct_count = int(np.sum(np.asarray(y_pred) == y_test_values))

    if correct_count > 0:
        correct_models.append((name, correct_count, acc))

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        best_model_name = name

# Step 7: Show Models that Got Some Predictions Right
print("\n✅ Models that predicted at least one correct result:")
for name, correct_count, acc in correct_models:
    print(f"✔️ {name}: {correct_count} correct predictions | Accuracy: {acc:.2f}%")

# Step 8: Save Best Model
if best_model is None:
    raise RuntimeError("No model scored above 0% accuracy; nothing to save.")

# NOTE: the pickled models were trained on inputs transformed by `scaler`;
# persist the scaler as well if they are to be used outside this session.
if best_model_name == HYBRID_NAME:
    joblib.dump(best_model[0], "Best_RF.pkl")
    joblib.dump(best_model[1], "Best_XGB_on_RF.pkl")
    print(f"\n💾 Best Hybrid Model saved (RandomForest + XGBoost) | Accuracy: {best_accuracy:.2f}%")
    files.download("Best_RF.pkl")
    files.download("Best_XGB_on_RF.pkl")
else:
    joblib.dump(best_model, "Best_Liver_Disease_Model.pkl")
    print(f"\n💾 Best Model ({type(best_model).__name__}) saved | Accuracy: {best_accuracy:.2f}%")
    files.download("Best_Liver_Disease_Model.pkl")

# Step 9: Live Prediction Function
def predict_liver_disease():
    """Prompt for one patient's measurements and print the model's prediction.

    Reads the module-level ``X`` (for the feature order), ``normal_ranges``,
    ``scaler`` and ``best_model_name``. When the hybrid won the comparison,
    the random forest and the XGBoost stage are loaded from ``Best_RF.pkl``
    and ``Best_XGB_on_RF.pkl``; otherwise the single model is loaded from
    ``Best_Liver_Disease_Model.pkl``.

    Only measured features are requested from the user. Every
    ``<feature>_Abnormal`` column whose base feature has an entry in
    ``normal_ranges`` is computed from the entered value with the same rule
    used when the training flags were built, rather than typed in by hand.

    Raises:
        ValueError: if an entered value cannot be parsed as a float.
    """
    print("\n📌 Enter patient details below:")
    flag_suffix = "_Abnormal"

    # Map each derivable flag column to the measured column it is computed from.
    derived_flags = {}
    for column in X.columns:
        name = str(column)
        if not name.endswith(flag_suffix):
            continue
        base = name[: -len(flag_suffix)]
        if base in X.columns and base in normal_ranges:
            derived_flags[column] = base

    # Ask for everything that cannot be derived, in training column order.
    patient = {}
    for column in X.columns:
        if column not in derived_flags:
            patient[column] = float(input(f"Enter {column}: "))

    # Flag = 1 when the value lies outside [low, high], mirroring training.
    for column, base in derived_flags.items():
        low, high = normal_ranges[base]
        patient[column] = int(patient[base] < low or patient[base] > high)

    # A one-row DataFrame with the training column names keeps the scaler's
    # feature-name check satisfied and guarantees the column order.
    patient_frame = pd.DataFrame([patient], columns=X.columns)
    patient_scaled = scaler.transform(patient_frame)

    print("\n🔍 Predicting using the best model...")

    if best_model_name == "Hybrid (Random Forest + XGBoost)":
        rf = joblib.load("Best_RF.pkl")
        xgb = joblib.load("Best_XGB_on_RF.pkl")
        # Stage 1 yields the positive-class probability; stage 2 classifies it.
        rf_proba = rf.predict_proba(patient_scaled)[:, 1].reshape(-1, 1)
        prediction = xgb.predict(rf_proba)[0]
    else:
        model = joblib.load("Best_Liver_Disease_Model.pkl")
        prediction = model.predict(patient_scaled)[0]

    result = "Liver Disease Detected (1)" if prediction == 1 else "Healthy (0)"
    print(f"\n🧬 Prediction: {result}")

# Step 10: Run Prediction
# Starts the interactive session immediately: the call blocks on input() once
# per prompted feature, then prints the prediction.
predict_liver_disease()
