In [None]:
!pip install catboost matplotlib pandas scikit-learn kaggle optuna ipywidgets kaleido shap jupyterlab-rise

In [61]:
#!dir ../../../home/vscode

In [62]:
import os
from pathlib import Path
container_check = os.getenv("iscontainer")
if container_check=="y":
    config_dir = Path("/home/vscode/.config/kaggle")
    config_dir.mkdir(parents=True, exist_ok=True)
    
    with open(config_dir / "kaggle.json", "w") as dst:
        with open("./kaggle.json", "r") as src:
            dst.write(src.read())

# Download dataset

In [None]:
import os
import zipfile
from pathlib import Path
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()



dataset_name = "arshid/iris-flower-dataset"
download_folder = Path("data/iris-prediction")
download_folder.mkdir(parents=True, exist_ok=True)

api.dataset_download_files(dataset_name, path=str(download_folder), unzip=True)



In [None]:
!dir "./data/iris-prediction"

# Load data

In [None]:

import pandas as pd

df = pd.read_csv(download_folder / "iris.csv")

df.head(5)




     


# One-hot encoding categorical non-order features

In [67]:
from sklearn.preprocessing import OneHotEncoder

hot_encoder = OneHotEncoder(drop='first')

# Pipeline for transforming columns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
    ],
    remainder='passthrough'  
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

encoded_data = preprocessor.fit_transform(df)

encoded_data

# Creation of a data frame from transformed data

In [None]:
transformed_df = pd.DataFrame(
  encoded_data,
  columns=preprocessor.get_feature_names_out())

transformed_df

# Division into training and test data

In [None]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(transformed_df, test_size=0.2, random_state=42)

df_test.info()

In [None]:
df_train.info()





     


In [72]:
y_train = df_train.pop("remainder__species")
X_train = df_train


In [None]:
X_train.info()

# Automatic hyperparameter optimization for CatBoostClassifier using Optuna

In [None]:
import joblib
import optuna
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

outfolder = Path("results")
outfolder.mkdir(parents=True, exist_ok=True)

if isinstance(y_train, (pd.Series, pd.DataFrame)):
     y_train_np = y_train.values.ravel()
elif isinstance(y_train, list):
     y_train_np = np.array(y_train)
else:
     y_train_np = y_train

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train_np)

best_params_path = outfolder / "best_params.pkl"

if not best_params_path.is_file():
    X_train_opt, X_val_opt, y_train_opt_encoded, y_val_opt_encoded = train_test_split(X_train, y_train_encoded, test_size=0.25, random_state=42)

    def objective(trial):
        params = {
            "depth": trial.suggest_int("depth", 2, 10),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log=True),
            "iterations": trial.suggest_int("iterations", 100, 1000),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-5, 100.0, log=True),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 1.0),
            "random_strength": trial.suggest_float("random_strength", 1e-5, 100.0, log=True),
            "loss_function": "MultiClass",
            "eval_metric": "MultiClass"
        }
        model = CatBoostClassifier(**params, verbose=0, random_state=42)
        model.fit(X_train_opt, y_train_opt_encoded,
                  eval_set=(X_val_opt, y_val_opt_encoded),
                  early_stopping_rounds=50,
                  verbose=0)
        return model.get_best_score()["validation"]["MultiClass"]

    study_name = "catboost-multi-optimization"
    storage_name = f"sqlite:///{study_name}.db"

    print(f"Creating/loading Optuna study '{study_name}' from '{storage_name}'")
    study = optuna.create_study(study_name=study_name, storage=storage_name, direction="minimize", load_if_exists=True)

    print("Starting Optuna optimization...")
    study.optimize(objective, n_trials=50)
    print("Optimization finished.")
    print("Best trial:")
    print(" Value: ", study.best_trial.value)
    print(" Params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

    best_params = study.best_params
    joblib.dump(best_params, best_params_path)
    params = best_params
else:
    print("Loading best parameters from file.")
    params = joblib.load(best_params_path)

print("Best Parameters for CV:", params)

# Cross-validation of CatBoostClassifier with optimized parameters and saving results to CSV

In [87]:
import joblib
import optuna
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool, cv
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

outfolder = Path("results")
outfolder.mkdir(parents=True, exist_ok=True)

if isinstance(X_train, pd.DataFrame):
    X_train_processed = X_train
else:
    X_train_processed = pd.DataFrame(X_train) # Ensure X_train is a DataFrame for consistency

if isinstance(y_train, (pd.Series, pd.DataFrame)):
     y_train_np = y_train.values.ravel()
elif isinstance(y_train, list):
     y_train_np = np.array(y_train)
else:
     y_train_np = y_train

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train_np)

best_params_path = outfolder / "best_params.pkl"

if not best_params_path.is_file():
    X_train_opt, X_val_opt, y_train_opt_encoded, y_val_opt_encoded = train_test_split(X_train_processed, y_train_encoded, test_size=0.25, random_state=42)

    def objective(trial):
        params = {
            "depth": trial.suggest_int("depth", 2, 10),
            "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log=True),
            "iterations": trial.suggest_int("iterations", 100, 1000),
            "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-5, 100.0, log=True),
            "bagging_temperature": trial.suggest_float("bagging_temperature", 0.01, 1.0),
            "random_strength": trial.suggest_float("random_strength", 1e-5, 100.0, log=True),
            "loss_function": "MultiClass", # Fixed parameter in objective
            "eval_metric": "MultiClass"     # Fixed parameter for evaluation in objective
        }
        model = CatBoostClassifier(**params, verbose=0, random_state=42)
        model.fit(X_train_opt, y_train_opt_encoded,
                  eval_set=(X_val_opt, y_val_opt_encoded),
                  early_stopping_rounds=50,
                  verbose=0)
        return model.get_best_score()["validation"]["MultiClass"]

    study_name = "catboost-multi-optimization"
    storage_name = f"sqlite:///{study_name}.db"

    print(f"Creating/loading Optuna study '{study_name}' from '{storage_name}'")
    study = optuna.create_study(study_name=study_name, storage=storage_name, direction="minimize", load_if_exists=True)

    print("Starting Optuna optimization...")
    study.optimize(objective, n_trials=50)
    print("Optimization finished.")
    print("Best trial:")
    print(" Value: ", study.best_trial.value)
    print(" Params: ")
    for key, value in study.best_trial.params.items():
        print(f"    {key}: {value}")

    best_params = study.best_params
    joblib.dump(best_params, best_params_path)
    params = best_params
else:
    print("Loading best parameters from file.")
    params = joblib.load(best_params_path)

# --- FIX: Add loss_function back as it's required for cv and might not be saved by Optuna ---
params["loss_function"] = "MultiClass"
# --- Add other required parameters for CV / reporting ---
params["eval_metric"] = "MultiClass"


print("Parameters used for CV:", params)


Loading best parameters from file.
Parameters used for CV: {'depth': 2, 'learning_rate': 0.0860980163836657, 'iterations': 233, 'l2_leaf_reg': 0.04377587811072853, 'bagging_temperature': 0.35678838592729784, 'random_strength': 0.10944176769690127, 'loss_function': 'MultiClass', 'eval_metric': 'MultiClass', 'custom_metric': ['F1:macro', 'F1:weighted']}


# Plotting cross-validation F1 score with error bands using Plotly

In [86]:
# Use the processed DataFrame and encoded labels for CV
data = Pool(X_train_processed, y_train_encoded)

print("\nStarting CatBoost CV...")
cv_results = cv(
    params=params,
    pool=data,
    fold_count=5,
    partition_random_seed=42,
    shuffle=True,
    verbose=True,
    early_stopping_rounds=50,
)

print("CV finished.")
print(cv_results.head())

cv_results.to_csv(outfolder / "cv_results.csv", index=False)
print(f"CV results saved to {outfolder / 'cv_results.csv'}")

if 'test-MultiClass-mean' in cv_results.columns:
    best_iter_idx = cv_results['test-MultiClass-mean'].idxmin()
    best_iterations = cv_results.loc[best_iter_idx, 'iterations']
    print(f"\nBest number of iterations found from CV (based on MultiClass): {best_iterations}")

    final_model_params = params.copy()
    final_model_params.pop('iterations', None) 
    final_model_params['n_estimators'] = best_iterations 

    # Optional: Remove eval_metric and custom_metric if they aren't needed for final training output
    final_model_params.pop('eval_metric', None)
    final_model_params.pop('custom_metric', None)


    print("\nTraining final model on full training data...")
    final_model = CatBoostClassifier(
        **final_model_params,
        random_state=42,
        verbose=True
    )

    # Fit the final model on the *entire* encoded training data (using the processed DataFrame)
    final_model.fit(X_train_processed, y_train_encoded)

    print("Final model training finished.")

    final_model_path = outfolder / "final_catboost_model.joblib"
    joblib.dump(final_model, final_model_path)
    print(f"Final model saved to {final_model_path}")
else:
     print("\nCould not determine best iterations from CV results (MultiClass column not found?). Skipping final model training.")


Starting CatBoost CV...


CatBoostError: catboost/private/libs/options/loss_description.cpp:36: Invalid metric description, it should be in the form "metric_name:param1=value1;...;paramN=valueN"

# Plotting cross-validation Logloss with error bands using Plotly

In [None]:
import plotly.graph_objects as go

# Create figure
fig = go.Figure()

# Add mean performance line
fig.add_trace(
    go.Scatter(
        x=cv_results["iterations"], y=cv_results["test-Logloss-mean"], mode="lines", name="Mean logloss", line=dict(color="blue")
    )
)

# Add shaded error region
fig.add_trace(
    go.Scatter(
        x=pd.concat([cv_results["iterations"], cv_results["iterations"][::-1]]),
        y=pd.concat([cv_results["test-Logloss-mean"]+cv_results["test-Logloss-std"], 
                     cv_results["test-Logloss-mean"]-cv_results["test-Logloss-std"]]),
        fill="toself", 
        fillcolor="rgba(0, 0, 255, 0.2)",
        line=dict(color="rgba(255, 255, 255, 0)"),
        showlegend=False
    )
)

# Customize layout
fig.update_layout(
    title="Cross-Validation (N=5) Mean Logloss with Error Bands",
    xaxis_title="Training Steps",
    yaxis_title="Logloss",
    template="plotly_white"
)

fig.show()

fig.write_image(outfolder / "test_logloss.png")

# Final training of the CatBoost model and saving the model and parameters

In [None]:
model.fit(
    X_train,
    y_train,
    verbose_eval=50,
    early_stopping_rounds=50,
    use_best_model=False,
    plot=True
)

model.save_model(outfolder / 'catboost_model_stroke_prediction.cbm')
joblib.dump(params, outfolder / 'model_params.pkl')

# Generating predictions on the test dataset using the trained CatBoost model

In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, log_loss

model.predict(df_test)
preds = model.predict(df_test[X_train.columns])

print("Accuracy:", accuracy_score(y_test, y_pred))

     


# SHAP analysis for feature importance on the test dataset

In [None]:
import shap
import matplotlib.pyplot as plt
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(df_test)

shap.summary_plot(shap_values, df_test, show=False)
plt.savefig(outfolder / "test_shap_overall.png")



     


In [None]:
df_test["target"] = preds
df_test.to_csv(outfolder / "predictions.csv", index=False)



     
