In [19]:
import pandas as pd
import optuna
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
import warnings
warnings.filterwarnings("ignore")


In [2]:
df = pd.read_csv("../extracted_data/wine.csv")

## Basic Feature Engineering

In [3]:
from src.data_splitter import TrainTestSplitter

In [4]:
data_splitter= TrainTestSplitter()

In [6]:
X_train, X_test, y_train, y_test = data_splitter.split_data(df, "Class")

## Implementing Optuna for HyperParameter tuning

In [14]:
def objective(trial):
    n_estimator = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 10)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    learning_rate = trial.suggest_float("learning_rate", 0.1, 1.0)

    clf = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split, class_weight="balanced", n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    return score

In [18]:
study = optuna.create_study(study_name="Random Forest Classifier", direction="maximize")
study.optimize(objective, n_trials=20)

print(f"Study statistics: {study.best_params}")
print(f"Best trial: {study.best_value}")

[I 2025-07-29 03:40:28,316] A new study created in memory with name: Random Forest Classifier
[I 2025-07-29 03:40:30,316] Trial 0 finished with value: 0.9814814814814815 and parameters: {'n_estimators': 817, 'max_depth': 9, 'min_samples_leaf': 4, 'min_samples_split': 6, 'learning_rate': 0.8889106233203461}. Best is trial 0 with value: 0.9814814814814815.
[I 2025-07-29 03:40:30,768] Trial 1 finished with value: 0.9629629629629629 and parameters: {'n_estimators': 168, 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 8, 'learning_rate': 0.2349388983518697}. Best is trial 0 with value: 0.9814814814814815.
[I 2025-07-29 03:40:31,231] Trial 2 finished with value: 0.9629629629629629 and parameters: {'n_estimators': 156, 'max_depth': 9, 'min_samples_leaf': 5, 'min_samples_split': 6, 'learning_rate': 0.9072292984829743}. Best is trial 0 with value: 0.9814814814814815.
[I 2025-07-29 03:40:31,869] Trial 3 finished with value: 0.9629629629629629 and parameters: {'n_estimators': 254, 'm

Study statistics: {'n_estimators': 745, 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 7, 'learning_rate': 0.9355005717494875}
Best trial: 1.0


## Using GridSearchCV, RandomSearchCV

In [24]:
model = RandomForestClassifier()
params = {
    "n_estimators": np.arange(100, 1000, step=100),
    "max_depth": np.arange(2, 20),
    "min_samples_leaf": np.arange(2, 20),
    "min_samples_split": np.arange(2, 20),
}
grid = GridSearchCV(model, params, cv=6, n_jobs=-1)

In [25]:
grid.fit(X_train, y_train)

KeyboardInterrupt: 

In [27]:
X_train.shape

(124, 13)