In [25]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from sklearn.impute import SimpleImputer

# Load dataset
dataset = pd.read_csv('liver_cirrhosis.csv')

# Encode categorical variables
dataset['sex'] = dataset['sex'].map({'M': 1, 'F': 0}).fillna(0)
dataset['ascites'] = dataset['ascites'].map({'Y': 1, 'N': 0}).fillna(0)
dataset['hepatomegaly'] = dataset['hepatomegaly'].map({'Y': 1, 'N': 0}).fillna(0)
dataset['spiders'] = dataset['spiders'].map({'Y': 1, 'N': 0}).fillna(0)
dataset['edema'] = dataset['edema'].map({'Y': 1, 'N': 0}).fillna(0)

# Handle missing values for numeric columns
imputer = SimpleImputer(strategy='mean')
dataset[['bilirubin', 'cholesterol', 'albumin', 'copper', 'alk_phos', 'sgot', 'tryglicerides', 'platelets', 'prothrombin']] = \
    imputer.fit_transform(dataset[['bilirubin', 'cholesterol', 'albumin', 'copper', 'alk_phos', 'sgot', 'tryglicerides', 'platelets', 'prothrombin']])

# Label encode target column (stage)
label_encoder = LabelEncoder()
dataset['stage'] = label_encoder.fit_transform(dataset['stage'])

# Features and target
X = dataset[['age', 'n_days', 'sex', 'ascites', 'hepatomegaly', 'spiders', 'edema', 
             'bilirubin', 'cholesterol', 'albumin', 'copper']]  # 11 features
y = dataset['stage']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Define base models for stacking
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('lr', LogisticRegression(random_state=42)),
    ('lgbm', lgb.LGBMClassifier(random_state=42))
]

# Final estimator (RandomForestClassifier)
final_estimator = RandomForestClassifier(n_estimators=100, random_state=42)

# Create Stacking Classifier
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=final_estimator)

# Train the model
stacking_model.fit(X_train_smote, y_train_smote)

# Save the model
joblib.dump(stacking_model, 'liver_cirrhosis_stacking_model.pkl')

# Check accuracy on test set
print(f"Accuracy on test set: {stacking_model.score(X_test, y_test):.2f}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001965 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1214
[LightGBM] [Info] Number of data points in the train set: 8004, number of used features: 11
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001422 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1238
[LightGBM] [Info] Number of data points in the train set: 6403, number of used features: 11
[LightGBM] [Info] Start training from score -1.098768
[LightGBM] [Info] Start training from score -1.098768
[LightGBM] [Info] Start training from score -1.098300
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1247
[LightGBM] [Info] Number of data points in the train set: 6403, number of used features: 11
[LightGBM] [Info] Start training from score -1.098300
[LightGBM] [Info] Start training from score -1.098768
[LightGBM] [Info] Start training from score -1.098768
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000712 secon