In [12]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import optuna

RANDOM_STATE = 1234

In [13]:
columns = ["class", "Alcohol", "Malicacid", "Ash", "Alcalinity_of_ash", "Magnesium", 
           "Total_phenols", "Flavanoids", "Nonflavanoid_phenols", "Proanthocyanins", 
           "Color_intensity", "Hue", "0D280_0D315_of_diluted_wines", "Proline"]
df = pd.read_csv("D:/Work/wine/wine/wine.data", names=columns)

In [14]:
df.isnull().sum().values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [15]:
numerical_cols = columns[1:]
df[numerical_cols] = StandardScaler().fit_transform(df[numerical_cols])

In [16]:
X = df.drop(['class'], axis=1)
y = df['class']
le = LabelEncoder()
y = le.fit_transform(y)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=RANDOM_STATE)

print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

X_train: (142, 13), y_train: (142,)
X_test: (36, 13), y_test: (36,)


In [18]:
def objective(trial):
  model_name = trial.suggest_categorical("classifier", ["XGB", "RandomForest"])
  if model_name == "XGB":
    xgb_max_depth = trial.suggest_int("xgb_max_depth", 1, 50, log=True)
    xgb_learning_rate = trial.suggest_float("xgb_learning_rate", 1e-5, 1, log=True)
    xgb_n_estimators = trial.suggest_int("xgb_n_estimators", 5, 200, log=True)
    xgb_lambda = trial.suggest_int("xgb_lambda", 1, 100, log=True)
    xgb_gamma = trial.suggest_float("xgb_gamma", 0.001, 1, log=True)
    model = XGBClassifier(max_depth=xgb_max_depth,
                              learning_rate=xgb_learning_rate,
                              n_estimators=xgb_n_estimators, 
                              gamma=xgb_gamma,
                              reg_lambda=xgb_lambda)
  else:
    rf_max_depth = trial.suggest_int("rf_max_depth", 1, 50, log=True)
    rf_min_samples_split = trial.suggest_int("rf_min_samples_split", 2, 30, log=True)
    rf_n_estimators = trial.suggest_int("rf_n_estimators", 1, 200, log=True)
    model = RandomForestClassifier(max_depth=rf_max_depth,
                                      n_estimators=rf_n_estimators,
                                      min_samples_split=rf_min_samples_split)

  #Score is a array of accuracy for cross validation set (Kfold)
  score = cross_val_score(model, X_train, y_train, n_jobs=10, cv=3)
  #Take the mean of the score array for all of the cross validation
  accuracy = score.mean() 
  
  return accuracy

In [19]:
if __name__ == "__main__":
  #Utilize Optuna to maximize the score. In this case it is maximizing accuracy
  study = optuna.create_study(direction="maximize") #direction is defaulted to minimize
  study.optimize(objective, n_trials=100) #Pass in a callable function "objective" 

[I 2023-12-07 09:59:06,210] A new study created in memory with name: no-name-c4a20711-13ca-46a2-9aca-21a2a240acef
[I 2023-12-07 09:59:08,180] Trial 0 finished with value: 0.9086879432624113 and parameters: {'classifier': 'XGB', 'xgb_max_depth': 3, 'xgb_learning_rate': 1.4405642989436467e-05, 'xgb_n_estimators': 8, 'xgb_lambda': 1, 'xgb_gamma': 0.08642053547526074}. Best is trial 0 with value: 0.9086879432624113.
[I 2023-12-07 09:59:09,199] Trial 1 finished with value: 0.91548463356974 and parameters: {'classifier': 'XGB', 'xgb_max_depth': 30, 'xgb_learning_rate': 0.014598580373419587, 'xgb_n_estimators': 23, 'xgb_lambda': 88, 'xgb_gamma': 0.6728752650209121}. Best is trial 1 with value: 0.91548463356974.
[I 2023-12-07 09:59:10,291] Trial 2 finished with value: 0.6341607565011821 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 1, 'rf_min_samples_split': 2, 'rf_n_estimators': 1}. Best is trial 1 with value: 0.91548463356974.
[I 2023-12-07 09:59:11,276] Trial 3 finished wit

In [20]:
study.best_params

{'classifier': 'RandomForest',
 'rf_max_depth': 11,
 'rf_min_samples_split': 4,
 'rf_n_estimators': 17}

In [21]:
tuned_model = RandomForestClassifier(max_depth=50, 
                                     min_samples_split=16,
                                     n_estimators=126)

In [22]:
history = tuned_model.fit(X_train, y_train)

In [23]:
yhat = tuned_model.predict(X_test)

In [24]:
yhat

array([1, 1, 1, 1, 2, 1, 2, 0, 0, 2, 2, 2, 0, 1, 1, 0, 0, 2, 2, 2, 0, 1,
       1, 2, 1, 2, 0, 0, 0, 1, 0, 2, 1, 1, 1, 1], dtype=int64)

In [25]:
y_test

array([1, 1, 1, 1, 2, 1, 2, 0, 0, 2, 2, 1, 0, 1, 1, 0, 0, 2, 2, 2, 0, 1,
       1, 2, 1, 2, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1], dtype=int64)

In [26]:
print(f"Accuracy: {accuracy_score(y_test, yhat)}")

Accuracy: 0.9444444444444444
