In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from sklearn.metrics import fbeta_score, precision_score, recall_score
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore", category=UserWarning)

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [3]:
df = pd.read_csv("dataset.csv")
df['LUNG_CANCER'] = df['LUNG_CANCER'].map({'NO':0, 'YES':1})
x=df.drop(columns='LUNG_CANCER')
y=df['LUNG_CANCER']
x_train,x_test,y_train,y_test=train_test_split(x,y, test_size=0.2, random_state=42)
info1 = []

lrs = list(np.array([1e-5]).repeat(9) * np.array(range(1, 10)))
lrs += list(np.array([1e-4]).repeat(9) * np.array(range(1, 10)))
lrs += list(np.array([1e-3]).repeat(9) * np.array(range(1, 10)))
lrs += list(np.array([1e-2]).repeat(9) * np.array(range(1, 10)))
lrs += list(np.array([1e-1]).repeat(5) * np.array(range(1, 6)))
lrs = list(np.round(lrs, 5))

for md in range(1, 6):
    for lr in lrs:
        cls = CatBoostClassifier(
            iterations=100000,
            max_depth=md,
            silent=True,
            learning_rate=lr,
            custom_metric=["Accuracy", "Recall", "Precision", "F1"],
            random_seed=123
        )
        cls.fit(
            x_train,
            y_train,
            cat_features=[
                'GENDER',
                'SMOKING',
                'YELLOW_FINGERS',
                'ANXIETY',
                'PEER_PRESSURE',
                'CHRONIC_DISEASE',
                'FATIGUE',
                'ALLERGY',
                'WHEEZING',
                'ALCOHOL_CONSUMING',
                'COUGHING',
                'SHORTNESS_OF_BREATH',
                'SWALLOWING_DIFFICULTY',
                'CHEST_PAIN'
            ],
            eval_set=(x_test, y_test),
            use_best_model=True,
            early_stopping_rounds=2000
        )

        y_pred = cls.predict(x_test)

        accuracy = cls.best_score_['validation']['Accuracy']
        precision = cls.best_score_['validation']['Precision']
        recall = cls.best_score_['validation']['Recall']
        f1 = cls.best_score_['validation']['F1']
        info1.append((accuracy, precision, recall, f1, md, lr))

        print(f"Best Model ({md}, {lr}): Accuracy: {accuracy:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")


Best Model (1, 1e-05): Accuracy: 0.5217 | Precision: 0.5117 | Recall: 0.8758 | F1: 0.6436
Best Model (1, 2e-05): Accuracy: 0.5367 | Precision: 0.5243 | Recall: 0.8725 | F1: 0.6380
Best Model (1, 3e-05): Accuracy: 0.5283 | Precision: 0.5187 | Recall: 0.8826 | F1: 0.6431
Best Model (1, 4e-05): Accuracy: 0.5267 | Precision: 0.5143 | Recall: 0.8456 | F1: 0.6396
Best Model (1, 5e-05): Accuracy: 0.5367 | Precision: 0.5243 | Recall: 0.8725 | F1: 0.6439
Best Model (1, 6e-05): Accuracy: 0.5283 | Precision: 0.5167 | Recall: 0.9530 | F1: 0.6581
Best Model (1, 7e-05): Accuracy: 0.5283 | Precision: 0.5211 | Recall: 0.9195 | F1: 0.6539
Best Model (1, 8e-05): Accuracy: 0.5367 | Precision: 0.5253 | Recall: 0.8960 | F1: 0.6519
Best Model (1, 9e-05): Accuracy: 0.5350 | Precision: 0.5271 | Recall: 0.9262 | F1: 0.6460
Best Model (1, 0.0001): Accuracy: 0.5350 | Precision: 0.5271 | Recall: 0.9698 | F1: 0.6583
Best Model (1, 0.0002): Accuracy: 0.5500 | Precision: 0.5407 | Recall: 0.9094 | F1: 0.6486
Best Mod

In [None]:
# Best Model (4, 0.08): Accuracy: 0.5750 | Precision: 0.5609 | Recall: 0.7651 | F1: 0.6104