In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the training data and the held-out test set.
# In both files the first CSV column is used as the row index.
df = pd.read_csv(
    '../balanced_cleaned_trian_dataset.csv',
    index_col=0,
)
test_set = pd.read_csv(
    '../testset.csv',
    index_col=0,
)

In [3]:
# Build a small class-balanced subsample: 150 rows from each `smoking` class,
# drawn with a fixed seed, then shuffled so the classes are interleaved.
grouped = df.groupby("smoking")
df_0, df_1 = (grouped.get_group(label) for label in (0, 1))
df_0_sub, df_1_sub = (
    part.sample(n=150, random_state=42) for part in (df_0, df_1)
)
df_sub = (
    pd.concat([df_0_sub, df_1_sub])
    .sample(frac=1, random_state=42)  # frac=1 -> a full shuffle of the rows
    .reset_index(drop=True)
)

In [4]:
# Sanity check: rows per `smoking` class in the subsample (displayed as the cell output).
df_sub['smoking'].value_counts()

smoking
1    150
0    150
Name: count, dtype: int64

In [5]:
# Training data: drop the identifier column, then separate features from the label.
train_copy = df_sub.drop(columns='id')
X = train_copy.drop(columns='smoking')
y = train_copy['smoking']

# Test data: same treatment. `test_set` is overwritten with its id-free version,
# so on a second execution of this cell the 'id' column is already gone;
# errors='ignore' makes that a no-op instead of a KeyError, keeping the cell
# safe to re-run.
test_set = test_set.drop(columns='id', errors='ignore')
X_test = test_set.drop(columns='smoking')
y_test = test_set['smoking']

In [6]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np


# Shared cross-validation splitter used by the Optuna objective below:
# 5 stratified folds, shuffled with a fixed seed.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def objective(trial):
    """Optuna objective: overfit-penalised cross-validation score of a random forest.

    Samples `n_estimators`, `max_depth` and `max_features` from the trial,
    then fits a RandomForestClassifier on every fold produced by the
    module-level `skf` splitter over the module-level `X` / `y`.

    For each fold the score is

        0.5 * val_accuracy - 0.5 * |train_accuracy - val_accuracy|

    so a large train/validation gap lowers the score.

    Because the returned value is not an accuracy, the plain mean validation
    accuracy and mean macro-F1 are also stored on the trial as the user
    attributes 'cv_accuracy' and 'cv_f1_macro' for later inspection.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the hyperparameters and to store the extra metrics.

    Returns
    -------
    float
        Mean of the per-fold penalised scores (to be maximised).
    """
    n_estimators = trial.suggest_int('n_estimators', 20, 100, step=10)
    max_depth = trial.suggest_int('max_depth', 2, 6)
    max_features = trial.suggest_int('max_features', 2, 10)

    scores = []
    val_accuracies = []
    val_f1_scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
            n_jobs=-1,
        )
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        acc_train = accuracy_score(y_train, rf.predict(X_train))

        # Penalise the absolute train/validation accuracy gap.
        overfit_penalty = abs(acc_train - acc)
        scores.append(0.5 * acc - 0.5 * overfit_penalty)

        val_accuracies.append(acc)
        val_f1_scores.append(f1_score(y_val, y_pred, average='macro'))

    # Keep the unpenalised metrics alongside the optimised value.
    trial.set_user_attr('cv_accuracy', float(np.mean(val_accuracies)))
    trial.set_user_attr('cv_f1_macro', float(np.mean(val_f1_scores)))

    return np.mean(scores)

# Run the hyperparameter search: TPE sampler with a fixed seed, 20 trials,
# maximising the value returned by `objective`.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
)
study.optimize(objective, n_trials=20)

# Report the winning configuration and its objective value.
print("best_params:", study.best_params)
print(f"best_value: {study.best_value:.4f}")

[I 2025-08-14 10:56:52,496] A new study created in memory with name: no-name-197772c5-cb12-4d01-a453-58beb81a7702
[I 2025-08-14 10:56:52,806] Trial 0 finished with value: 0.24833333333333335 and parameters: {'n_estimators': 50, 'max_depth': 6, 'max_features': 8}. Best is trial 0 with value: 0.24833333333333335.
[I 2025-08-14 10:56:53,122] Trial 1 finished with value: 0.3491666666666667 and parameters: {'n_estimators': 70, 'max_depth': 2, 'max_features': 3}. Best is trial 1 with value: 0.3491666666666667.
[I 2025-08-14 10:56:53,344] Trial 2 finished with value: 0.22000000000000003 and parameters: {'n_estimators': 20, 'max_depth': 6, 'max_features': 7}. Best is trial 1 with value: 0.3491666666666667.
[I 2025-08-14 10:56:53,717] Trial 3 finished with value: 0.3270833333333333 and parameters: {'n_estimators': 80, 'max_depth': 2, 'max_features': 10}. Best is trial 1 with value: 0.3491666666666667.
[I 2025-08-14 10:56:54,095] Trial 4 finished with value: 0.3158333333333333 and parameters: {'

best_params: {'n_estimators': 70, 'max_depth': 2, 'max_features': 3}
best_value: 0.3492


In [7]:
# Re-evaluate the tuned random forest with stratified 5-fold cross-validation,
# printing per-fold validation metrics and the train/validation accuracy.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Take the hyperparameters straight from the Optuna study rather than
    # hand-copying them, so this cell cannot drift from the search result.
    rf = RandomForestClassifier(**study.best_params, random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='macro')
    recall = recall_score(y_val, y_pred, average='macro')
    f1 = f1_score(y_val, y_pred, average='macro')

    print(f"Fold {fold + 1}:")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    print("-" * 30)

    # Training accuracy on the same fold, to expose any train/validation gap.
    train_pred = rf.predict(X_train)
    train_acc = accuracy_score(y_train, train_pred)
    print(f"Train Accuracy: {train_acc:.4f}, Val Accuracy: {acc:.4f}")

    scores.append(acc)

# Summarise across folds; previously the mean was computed but never shown.
avg_acc = np.mean(scores)
print(f"Mean CV Accuracy: {avg_acc:.4f} (+/- {np.std(scores):.4f})")

Fold 1:
  Accuracy:  0.7167
  Precision: 0.7381
  Recall:    0.7167
  F1 Score:  0.7101
------------------------------
Train Accuracy: 0.7750, Val Accuracy: 0.7167
Fold 2:
  Accuracy:  0.7667
  Precision: 0.7871
  Recall:    0.7667
  F1 Score:  0.7624
------------------------------
Train Accuracy: 0.7958, Val Accuracy: 0.7667
Fold 3:
  Accuracy:  0.6833
  Precision: 0.7257
  Recall:    0.6833
  F1 Score:  0.6677
------------------------------
Train Accuracy: 0.8208, Val Accuracy: 0.6833
Fold 4:
  Accuracy:  0.7833
  Precision: 0.7862
  Recall:    0.7833
  F1 Score:  0.7828
------------------------------
Train Accuracy: 0.7833, Val Accuracy: 0.7833
Fold 5:
  Accuracy:  0.7667
  Precision: 0.7715
  Recall:    0.7667
  F1 Score:  0.7656
------------------------------
Train Accuracy: 0.7667, Val Accuracy: 0.7667


In [8]:
# Fit the final model on the whole training subsample and evaluate it on the
# held-out test set. Hyperparameters come from the Optuna study so the final
# model always matches the search result instead of a hand-copied set.
rf_final = RandomForestClassifier(**study.best_params, random_state=42)
rf_final.fit(X, y)

y_test_pred = rf_final.predict(X_test)

acc_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='macro')
recall_test = recall_score(y_test, y_test_pred, average='macro')
f1_test = f1_score(y_test, y_test_pred, average='macro')

# Training accuracy next to test accuracy as a quick overfitting check.
train_pred = rf_final.predict(X)
train_acc = accuracy_score(y, train_pred)
print(f"Train Accuracy: {train_acc:.4f}, test Accuracy: {acc_test:.4f}")

print(f"Test Accuracy:  {acc_test:.4f}")
print(f"Test Precision: {precision_test:.4f}")
print(f"Test Recall:    {recall_test:.4f}")
print(f"Test F1 Score:  {f1_test:.4f}")
print("\n RF Test set classification report:")
print(classification_report(y_test, y_test_pred, digits=4))

Train Accuracy: 0.7867, test Accuracy: 0.7590
Test Accuracy:  0.7590
Test Precision: 0.7643
Test Recall:    0.7590
Test F1 Score:  0.7578

 RF Test set classification report:
              precision    recall  f1-score   support

           0     0.8019    0.6880    0.7406      1000
           1     0.7268    0.8300    0.7750      1000

    accuracy                         0.7590      2000
   macro avg     0.7643    0.7590    0.7578      2000
weighted avg     0.7643    0.7590    0.7578      2000

