# Libraries used in this notebook

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.tree import *
from sklearn.ensemble import *
import optuna
import importlib
import Project_5_moduls as P5
importlib.reload(P5)  # Reload the module to ensure the latest version is used
import warnings
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

# Load the dataset and engineer features

In [8]:
# Single seed reused by the hold-out split, the CV folds and the random forests
# built in the hyperparameter-search cell.
seed = 1

# Both CSV files are read with their first column as the row index.
train = pd.read_csv("train.csv", index_col=0)
test = pd.read_csv("test.csv", index_col=0)

# Apply the same project helper to both frames so they end up with matching columns
# (presumably the feature-engineering step named in the section title -- see Project_5_moduls).
train = P5.useful_features(train)
test = P5.useful_features(test)

# "smoking" is the target; every other column is used as a predictor.
y = train["smoking"]
X = train.drop(columns="smoking")

# Stratified 70/30 hold-out split of the labelled data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=seed,
)

# Shuffled, stratified 5-fold splitter used for cross-validation during the search.
skf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)

# Building the model: hyperparameter search with Optuna

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated ROC-AUC of a random forest.

    Four random-forest hyperparameters are sampled from ``trial``; the resulting
    classifier is scored on ``X_train`` / ``y_train`` with the ``skf`` splitter.

    Parameters
    ----------
    trial : optuna trial object
        Supplies the hyperparameter suggestions via ``suggest_int``.

    Returns
    -------
    float
        Mean of the per-fold ROC-AUC scores (the study maximizes this value).
    """
    # Dict values are evaluated top to bottom, so the suggest calls run in the
    # order n_estimators, max_depth, min_samples_split, min_samples_leaf.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1200),
        "max_depth": trial.suggest_int("max_depth", 2, 40),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    }

    forest = RandomForestClassifier(random_state=seed, n_jobs=-1, **params)

    fold_scores = cross_val_score(
        forest,
        X_train,
        y_train,
        cv=skf,
        scoring="roc_auc",
        n_jobs=-1,
    )
    return fold_scores.mean()

# Seed the sampler so that re-running the cell proposes the same sequence of trials.
# TPESampler is Optuna's default sampler; without a seed the search differs on
# every run even though the split, folds and forests all use `seed`.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=20)

# Print the best parameters and the best mean cross-validated ROC-AUC
print("Best Parameters:", study.best_params)
print("Best ROC-AUC:", study.best_value)

# Refit the best configuration on the training part of the hold-out split
best_rf = RandomForestClassifier(**study.best_params, random_state=seed, n_jobs=-1)
best_rf.fit(X_train, y_train)

# Evaluate on the held-out 30% split (X_test / y_test), not on test.csv.
# Column 1 of predict_proba is the probability of the second class in best_rf.classes_.
test_roc_auc = roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1])
print("Test ROC-AUC:", test_roc_auc)



[I 2024-10-28 17:48:41,893] A new study created in memory with name: no-name-68c46668-3741-49b4-a8f4-6f9036cd34cc
[I 2024-10-28 17:49:00,442] Trial 0 finished with value: 0.8675783955876692 and parameters: {'n_estimators': 736, 'max_depth': 5, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.8675783955876692.
[I 2024-10-28 17:49:28,088] Trial 1 finished with value: 0.8781148074688485 and parameters: {'n_estimators': 460, 'max_depth': 17, 'min_samples_split': 7, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.8781148074688485.
[I 2024-10-28 17:49:48,742] Trial 2 finished with value: 0.8787969427783139 and parameters: {'n_estimators': 417, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.8787969427783139.
[I 2024-10-28 17:50:23,587] Trial 3 finished with value: 0.8787551140545894 and parameters: {'n_estimators': 722, 'max_depth': 27, 'min_samples_split': 5, 'min_samples_leaf': 9}. Best is trial 2 with value:

# Final model: Optuna HPO followed by manual tuning (with overfitting checks), then creating the submission file

In [12]:
# Final random forest with fixed hyperparameters (per the section title: Optuna
# search followed by manual adjustment). Uses the shared `seed` rather than a
# separate hard-coded random_state so the notebook has one source of randomness.
model = RandomForestClassifier(
    n_estimators=869,
    max_depth=39,
    min_samples_split=2,
    min_samples_leaf=8,
    random_state=seed,
    n_jobs=4,
)

# Fit on all labelled rows (X, y), not only on the training part of the hold-out split.
model.fit(X, y)

# Column 1 of predict_proba is the probability of the second class in model.classes_.
y_pred_test = model.predict_proba(test)[:, 1]

# Only the "id" column is needed for the submission, so read just that column
# instead of loading the whole file a second time.
df_submission = pd.read_csv("test.csv", usecols=["id"])
df_submission["smoking"] = y_pred_test
df_submission.to_csv("sample_submission.csv", index=False)