In [1]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import optuna

RANDOM_STATE = 1234

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
columns = ["class", "Alcohol", "Malicacid", "Ash", "Alcalinity_of_ash", "Magnesium", 
           "Total_phenols", "Flavanoids", "Nonflavanoid_phenols", "Proanthocyanins", 
           "Color_intensity", "Hue", "0D280_0D315_of_diluted_wines", "Proline"]
df = pd.read_csv("D:/Work/wine/wine/wine.data", names=columns)

In [3]:
df.isnull().sum().values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [4]:
numerical_cols = columns[1:]
df[numerical_cols] = StandardScaler().fit_transform(df[numerical_cols])

In [5]:
X = df.drop(['class'], axis=1)
y = df['class']
le = LabelEncoder()
y = le.fit_transform(y)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=RANDOM_STATE)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (142, 13), y_train: (142,)
X_test: (36, 13), y_test: (36,)


In [7]:
def objective(trial):
  model_name = trial.suggest_categorical("classifier", ["SVC", "RandomForest"])
  if model_name == "SVC":
    xgb_max_depth = trial.suggest_int("xgb_max_depth", 1, 50, log=True)
    xgb_learning_rate = trial.suggest_float("xgb_learning_rate", 1e-5, 1, log=True)
    xgb_n_estimators = trial.suggest_int("xgb_n_estimators", 5, 200, log=True)
    xgb_lambda = trial.suggest_int("xgb_lambda", 1, 100, log=True)
    xgb_gamma = trial.suggest_float("xgb_gamma", 0.001, 1, log=True)
    model = XGBClassifier(max_depth=xgb_max_depth,
                              learning_rate=xgb_learning_rate,
                              n_estimators=xgb_n_estimators, 
                              gamma=xgb_gamma,
                              reg_lambda=xgb_lambda)
  else:
    rf_max_depth = trial.suggest_int("rf_max_depth", 1, 50, log=True)
    rf_min_samples_split = trial.suggest_int("rf_min_samples_split", 2, 30, log=True)
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 1, 200, log=True)
    model = RandomForestClassifier(max_depth=rf_max_depth,
                                      n_estimators=rf_n_estimators,
                                      min_samples_split=rf_min_samples_split)

  #Score is a array of accuracy for cross validation set (Kfold)
  score = cross_val_score(model, X_train, y_train, n_jobs=10, cv=3)
  #Take the mean of the score array for all of the cross validation
  accuracy = score.mean() 
  
  return accuracy


In [8]:
if __name__ == "__main__":
  #Utilize Optuna to maximize the score. In this case it is maximizing accuracy
  study = optuna.create_study(direction="maximize")
  study.optimize(objective, n_trials=100)
  print(study.best_trial)

[I 2023-12-05 21:28:26,365] A new study created in memory with name: no-name-3759404e-94b7-45b3-b965-d7a00103fc12
[I 2023-12-05 21:28:28,546] Trial 0 finished with value: 0.95774231678487 and parameters: {'classifier': 'SVC', 'xgb_max_depth': 39, 'xgb_learning_rate': 0.023160843459821903, 'xgb_n_estimators': 186, 'xgb_lambda': 8, 'xgb_gamma': 0.06364912859046934}. Best is trial 0 with value: 0.95774231678487.
[I 2023-12-05 21:28:29,665] Trial 1 finished with value: 0.9086879432624113 and parameters: {'classifier': 'SVC', 'xgb_max_depth': 6, 'xgb_learning_rate': 0.00040467407122206595, 'xgb_n_estimators': 36, 'xgb_lambda': 1, 'xgb_gamma': 0.008017803942191157}. Best is trial 0 with value: 0.95774231678487.
[I 2023-12-05 21:28:30,858] Trial 2 finished with value: 0.9719267139479907 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 2, 'rf_min_samples_split': 14, 'rf_n_estimators': 26}. Best is trial 2 with value: 0.9719267139479907.
[I 2023-12-05 21:28:31,720] Trial 3 finishe

FrozenTrial(number=13, state=1, values=[0.9930555555555555], datetime_start=datetime.datetime(2023, 12, 5, 21, 28, 33, 462869), datetime_complete=datetime.datetime(2023, 12, 5, 21, 28, 33, 590534), params={'classifier': 'RandomForest', 'rf_max_depth': 38, 'rf_min_samples_split': 12, 'rf_n_estimators': 55}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('SVC', 'RandomForest')), 'rf_max_depth': IntDistribution(high=50, log=True, low=1, step=1), 'rf_min_samples_split': IntDistribution(high=30, log=True, low=2, step=1), 'rf_n_estimators': IntDistribution(high=200, log=True, low=1, step=1)}, trial_id=13, value=None)


In [18]:
study.best_params

{'classifier': 'RandomForest',
 'rf_max_depth': 38,
 'rf_min_samples_split': 12,
 'rf_n_estimators': 55}

In [19]:
tuned_model = RandomForestClassifier(max_depth=38, 
                                     min_samples_split=12,
                                     n_estimators=55)

In [20]:
history = tuned_model.fit(X_train, y_train)

In [21]:
yhat = tuned_model.predict(X_test)

In [22]:
yhat

array([1, 1, 1, 1, 2, 1, 2, 0, 0, 2, 2, 2, 0, 1, 1, 0, 0, 2, 2, 2, 0, 1,
       1, 2, 1, 2, 0, 0, 0, 1, 0, 2, 1, 1, 1, 1], dtype=int64)

In [23]:
y_test

array([1, 1, 1, 1, 2, 1, 2, 0, 0, 2, 2, 1, 0, 1, 1, 0, 0, 2, 2, 2, 0, 1,
       1, 2, 1, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1], dtype=int64)

In [25]:
print(f"Accuracy: {accuracy_score(y_test, yhat)}")

Accuracy: 0.9444444444444444
