In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Single seed passed to both train_test_split calls and to XGBClassifier so
# that repeated runs of the notebook produce the same splits and model.
RANDOM_STATE = 1234

In [34]:
# Column names for the header-less wine data file: the class label first,
# followed by the thirteen measured features.
columns = ["class", "Alcohol", "Malicacid", "Ash", "Alcalinity_of_ash", "Magnesium", 
           "Total_phenols", "Flavanoids", "Nonflavanoid_phenols", "Proanthocyanins", 
           "Color_intensity", "Hue", "0D280_0D315_of_diluted_wines", "Proline"]

# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
DATA_PATH = "D:/wine/wine/wine.data"
df = pd.read_csv(DATA_PATH, names=columns)

# Keep an independent copy of the data as loaded. A plain `new_df = df` would
# only bind a second name to the same object, so the in-place feature scaling
# applied to `df` later would silently change `new_df` as well.
new_df = df.copy()

In [35]:
# Count the missing entries in each column and show the counts as a plain array.
df.isna().sum().to_numpy()

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [36]:
# Every column except the leading "class" label is a feature to standardise.
numerical_cols = columns[1:]

# Fit the scaler on the training rows only, so that the mean/variance of the
# held-out (test / cv) rows never influences preprocessing. train_test_split
# draws the same rows for a given number of samples, train_size and
# random_state, so replaying the first split (train_size=0.6, RANDOM_STATE)
# on row positions recovers exactly the rows that end up in X_train.
# Keep train_size and the seed here in sync with that split.
train_positions = train_test_split(
    np.arange(len(df)), train_size=0.6, random_state=RANDOM_STATE
)[0]
scaler = StandardScaler().fit(df.iloc[train_positions][numerical_cols])

# Apply the training-set statistics to all rows; `df` is updated in place
# because the following cells build X from it.
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [37]:
# Feature matrix: every column of the frame apart from the label.
X = df.drop(columns=['class'])

# Target vector: the class labels re-encoded as consecutive integers
# starting at 0. The fitted encoder is kept in `le`.
le = LabelEncoder()
y = le.fit_transform(df['class'])

In [38]:
# Stage 1: 60 % of the rows go to training, the rest is held back.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    train_size=0.6,
    random_state=RANDOM_STATE,
)

# Stage 2: the held-back rows are divided evenly into a test set and a
# cross-validation set.
X_test, X_cv, y_test, y_cv = train_test_split(
    X_holdout,
    y_holdout,
    train_size=0.5,
    random_state=RANDOM_STATE,
)

# The intermediate hold-out is not needed once it has been divided.
del X_holdout, y_holdout

# Report the size of each partition.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print(f"X_cv: {X_cv.shape}, y_cv: {y_cv.shape}")

X_train: (106, 13), y_train: (106,)
X_test: (36, 13), y_test: (36,)
X_cv: (36, 13), y_cv: (36,)


In [39]:
# Base estimator; the grid search below clones it for every candidate.
model = XGBClassifier(random_state=RANDOM_STATE)

# Candidate values per hyper-parameter. The search is exhaustive:
# 8 * 6 * 8 * 8 * 4 = 12288 combinations, each evaluated with 5-fold CV.
param_grid = {
    "n_estimators": [5, 10, 20, 40, 70, 100, 150, 200],
    "max_depth": [1, 2, 3, 4, 5, 10, 20, 50],
    "learning_rate": [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1],
    "gamma": [0.001, 0.01, 0.1, 1],
    "lambda": [0, 1, 4, 6, 10, 20, 50, 100],
}

# Candidates are ranked by accuracy, and the best one is refitted on the
# whole training set so that `grid_model` can be used for prediction.
grid_model = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring='accuracy',
    refit='accuracy',
    return_train_score=True,
    n_jobs=10,
    verbose=0,
)

# `verbose=0` is forwarded to the estimator's own fit call.
grid_result = grid_model.fit(X_train, y_train, verbose=0)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [40]:
# Show the winning hyper-parameter combination and its mean CV accuracy.
print(
    f"Best Parameters:\n\t{grid_model.best_params_}",
    f"Best Score:\n\t{grid_model.best_score_}",
    sep="\n",
)

Best Parameters:
	{'gamma': 0.001, 'lambda': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 70}
Best Score:
	0.9813852813852814


In [44]:
# Tabulate every evaluated hyper-parameter combination, best first.
# The search uses a single metric (scoring='accuracy' given as a string), so
# cv_results_ names its columns with the generic "_score" suffix
# (rank_test_score, mean_test_score, ...). "rank_test_accuracy" would only
# exist with multi-metric scoring, and sorting by it raises KeyError here.
result_df = pd.DataFrame(grid_model.cv_results_)
result_df = result_df.sort_values('rank_test_score')

# Persist the full table for later inspection.
result_df.to_csv('parameter_result_xgboost.csv')

In [45]:
# Predict the held-out test set with the refitted best estimator, which is
# what GridSearchCV.predict delegates to.
ypred = grid_model.best_estimator_.predict(X_test)

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [46]:
# Fraction of test samples whose predicted class matches the true class.
test_report = f"Test Accuracy: \n\t{accuracy_score(y_test, ypred)}"
print(test_report)

Test Accuracy: 
	0.9444444444444444
