In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from optuna.samplers import TPESampler

In [2]:
# Read the training table and the held-out test table; in both CSV files the
# first column is used as the row index.
df, testset = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('cleaned.csv', 'test.csv')
)

In [3]:
# Work on a copy so the frame loaded from disk stays untouched, and store the
# target as integers before separating it from the feature columns.
df_copy = df.copy()
df_copy["SMOKING"] = df["SMOKING"].astype(int)
X = df_copy.drop('SMOKING', axis=1)
y = df_copy['SMOKING']

# Held-out split. Its target gets the same integer cast as the training target
# so that y and y_test share one label dtype when metrics compare them with
# the model's predictions.
X_test = testset.drop('SMOKING', axis=1)
y_test = testset['SMOKING'].astype(int)

In [4]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np


In [5]:

# Seeded, shuffled 5-fold stratified splitter; the tuning objective below
# iterates over its folds.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def objective(trial, overfit_weight=0.0):
    """Optuna objective: mean cross-validated recall of a random forest.

    Samples one random-forest configuration from ``trial``, fits it on each
    training fold produced by the module-level ``skf`` splitter over the
    module-level ``X`` / ``y``, and scores it on the matching validation fold.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.
    overfit_weight : float, default 0.0
        Weight of an optional penalty equal to the absolute gap between
        training-fold and validation-fold accuracy. With the default of 0 the
        fold score is the recall alone and the training fold is never
        re-predicted, which saves one full ``predict`` call per fold.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    # The suggest_* calls keep their original order: a seeded sampler draws
    # values sequentially, so reordering them would change the sampled trials.
    params = {
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=20),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'max_features': trial.suggest_int('max_features', 3, 10),
    }

    fold_scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # The folds are passed to the forest as-is: no scaling step is
        # applied, so no defensive copies of the fold frames are needed.
        rf = RandomForestClassifier(random_state=42, n_jobs=-1, **params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        score = recall_score(y_val, y_pred)

        if overfit_weight:
            # Only pay for the extra training-fold prediction when the
            # penalty is actually requested.
            acc_val = accuracy_score(y_val, y_pred)
            acc_train = accuracy_score(y_train, rf.predict(X_train))
            score -= overfit_weight * abs(acc_train - acc_val)

        fold_scores.append(score)

    return np.mean(fold_scores)



# Run a 20-trial search that maximises the objective; the TPE sampler is
# seeded with 42.
study = optuna.create_study(
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study.optimize(objective, n_trials=20)

# Report the winning configuration and the score it reached.
print("best_params", study.best_params)
print(f"best_value: {study.best_value:.4f}")

[I 2025-08-16 09:35:55,115] A new study created in memory with name: no-name-d1f9d220-1f6d-4d1c-9729-18b823fb3836
[I 2025-08-16 09:35:55,841] Trial 0 finished with value: 0.8923280423280422 and parameters: {'min_samples_split': 9, 'min_samples_leaf': 10, 'n_estimators': 260, 'max_depth': 8, 'max_features': 4}. Best is trial 0 with value: 0.8923280423280422.
[I 2025-08-16 09:35:56,604] Trial 1 finished with value: 0.8343915343915344 and parameters: {'min_samples_split': 4, 'min_samples_leaf': 1, 'n_estimators': 280, 'max_depth': 8, 'max_features': 8}. Best is trial 0 with value: 0.8923280423280422.
[I 2025-08-16 09:35:57,378] Trial 2 finished with value: 0.8851851851851851 and parameters: {'min_samples_split': 2, 'min_samples_leaf': 10, 'n_estimators': 280, 'max_depth': 6, 'max_features': 4}. Best is trial 0 with value: 0.8923280423280422.
[I 2025-08-16 09:35:57,997] Trial 3 finished with value: 0.8423280423280423 and parameters: {'min_samples_split': 5, 'min_samples_leaf': 4, 'n_estima

best_params {'min_samples_split': 11, 'min_samples_leaf': 10, 'n_estimators': 160, 'max_depth': 7, 'max_features': 3}
best_value: 0.9138


In [6]:
# Manual 5-fold stratified cross-validation of a random forest with hand-set
# hyper-parameters. For every fold it prints train vs. validation accuracy and
# a classification report, then summarises the validation accuracy over folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # The folds are passed to the forest as-is: no scaling step is applied,
    # so no per-fold copies of the frames are made.
    rf = RandomForestClassifier(
        n_estimators=280,
        max_depth=6,
        max_features=4,
        random_state=42,
        n_jobs=-1,
        min_samples_split=10,
        min_samples_leaf=5,
    )
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    acc = accuracy_score(y_val, y_pred)

    y_pred_train = rf.predict(X_train)
    acc_train = accuracy_score(y_train, y_pred_train)

    print(f"train accuracy: {acc_train:.4f}")
    # This is the validation fold, not the held-out test set.
    print(f"validation accuracy: {acc:.4f}")
    scores.append(acc)

    print(classification_report(y_val, y_pred, digits=2))
    print("-" * 40)

# Aggregate the per-fold validation accuracies collected above.
print(f"mean validation accuracy: {np.mean(scores):.4f} (std {np.std(scores):.4f})")

train accuracy: 0.8883
test accuracy: 0.6200
              precision    recall  f1-score   support

           0       0.64      0.32      0.42        22
           1       0.62      0.86      0.72        28

    accuracy                           0.62        50
   macro avg       0.63      0.59      0.57        50
weighted avg       0.62      0.62      0.59        50

----------------------------------------
train accuracy: 0.8985
test accuracy: 0.8200
              precision    recall  f1-score   support

           0       0.81      0.77      0.79        22
           1       0.83      0.86      0.84        28

    accuracy                           0.82        50
   macro avg       0.82      0.81      0.82        50
weighted avg       0.82      0.82      0.82        50

----------------------------------------
train accuracy: 0.8889
test accuracy: 0.7143
              precision    recall  f1-score   support

           0       0.71      0.57      0.63        21
           1       0

In [7]:
from sklearn.model_selection import train_test_split

# Final model: fit on all of X / y, then score once on the held-out test split.
#
# Recorded search result:
# best_params {'min_samples_split': 2, 'min_samples_leaf': 10, 'n_estimators': 280, 'max_depth': 6, 'max_features': 4}
# NOTE(review): the forest below uses min_samples_split=10 and
# min_samples_leaf=5, which differ from the recorded best_params - confirm the
# manual override is intended.
rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    n_estimators=280,
    max_depth=6,
    max_features=4,
    min_samples_split=10,
    min_samples_leaf=5,
).fit(X, y)

# Predictions on the data the model was fitted on and on the unseen test rows.
y_pred_train = rf.predict(X)
y_pred = rf.predict(X_test)

acc_train = accuracy_score(y, y_pred_train)
acc = accuracy_score(y_test, y_pred)

print(f"train accuracy: {acc_train:.4f}")
print(f"test accuracy: {acc:.4f}")

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, digits=4))
print("-" * 40)


train accuracy: 0.9069
test accuracy: 0.8065
              precision    recall  f1-score   support

           0     0.8261    0.7037    0.7600        27
           1     0.7949    0.8857    0.8378        35

    accuracy                         0.8065        62
   macro avg     0.8105    0.7947    0.7989        62
weighted avg     0.8085    0.8065    0.8039        62

----------------------------------------


In [8]:
# Store the test labels next to the forest's predicted probabilities so a ROC
# curve can be drawn from the CSV without the model.
y_true = y_test.values

# Column 1 of predict_proba, i.e. the probability of the class at index 1 of
# rf.classes_.
y_score_rf = rf.predict_proba(X_test)[:, 1]

roc_columns = {"y_true": y_true.flatten(), "rf": y_score_rf}
df_rf = pd.DataFrame(roc_columns)

df_rf.to_csv("roc_scores_rf.csv", index=False)
print("saved:::roc_scores_rf.csv")

saved:::roc_scores_rf.csv
