In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Single seed reused by the train/test splits and the random forest so that
# the notebook produces the same partitions and model on every run.
RANDOM_STATE = 1234

In [2]:
# Column names for the data file, which carries no header row of its own:
# the class label comes first, followed by the thirteen measured attributes.
columns = [
    "class",
    "Alcohol",
    "Malicacid",
    "Ash",
    "Alcalinity_of_ash",
    "Magnesium",
    "Total_phenols",
    "Flavanoids",
    "Nonflavanoid_phenols",
    "Proanthocyanins",
    "Color_intensity",
    "Hue",
    "0D280_0D315_of_diluted_wines",
    "Proline",
]

# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this exact file exists; consider a configurable data directory.
df = pd.read_csv("D:/Work/wine/wine/wine.data", header=None, names=columns)

In [3]:
# Every column after the leading "class" label is a numeric feature.
numerical_cols = columns[1:]

# Standardise the features WITHOUT leaking held-out data: the scaler's mean and
# standard deviation are learned from the training rows only and then applied
# to every row. Fitting on the full frame (as was done before) lets the test
# and cv rows influence the preprocessing.
#
# The training rows are identified by reproducing the first train/test split
# performed later in the notebook. train_test_split shuffles positions using
# only the sample count, train_size and random_state, so the same arguments
# yield the same training positions. Keep these arguments in sync with that
# later split.
train_positions, _ = train_test_split(
    np.arange(len(df)), train_size=0.6, random_state=RANDOM_STATE
)
scaler = StandardScaler().fit(df.iloc[train_positions][numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

In [4]:
# Feature matrix: all columns except the target label.
X = df.drop(columns=["class"])

# Target vector: class labels mapped by LabelEncoder to integer codes.
# The fitted encoder is kept in `le` so codes can be mapped back later.
le = LabelEncoder()
y = le.fit_transform(df["class"])

In [5]:
# Stage 1: 60% of the rows become the training set; the rest is held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, train_size=0.6, random_state=RANDOM_STATE
)

# Stage 2: the held-out rows are divided evenly into test and cv sets.
X_test, X_cv, y_test, y_cv = train_test_split(
    X_holdout, y_holdout, train_size=0.5, random_state=RANDOM_STATE
)

# The intermediate hold-out pieces are no longer needed once they are split.
del X_holdout, y_holdout

# Report the size of each partition as a sanity check.
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print(f"X_cv: {X_cv.shape}, y_cv: {y_cv.shape}")

X_train: (106, 13), y_train: (106,)
X_test: (36, 13), y_test: (36,)
X_cv: (36, 13), y_cv: (36,)


In [6]:
# Base estimator; the seed makes each forest reproducible.
model = RandomForestClassifier(random_state=RANDOM_STATE)

# Exhaustive grid: 7 x 8 x 8 = 448 candidate settings, each evaluated with
# 5-fold cross-validation on the training set (2240 fits plus the final refit).
param_grid = {
  "min_samples_split": [2, 3, 4, 5, 10, 20, 50],
  "max_depth": [1, 2, 3, 4, 5, 10, 20, 50],
  "n_estimators": [5, 10, 20, 40, 70, 100, 150, 200],
}

grid_model = GridSearchCV(
  estimator=model,
  param_grid=param_grid,
  scoring='accuracy',
  refit='accuracy',          # refit the best candidate on all of X_train
  cv=5,
  n_jobs=-1,                 # use every available core instead of a hard-coded, machine-specific count
  verbose=0,
  return_train_score=True,   # keep train scores in cv_results_ for later inspection
  error_score='raise'        # fail loudly if any candidate cannot be fitted
)

# fit() returns the fitted search object itself, so grid_result and grid_model
# refer to the same instance.
grid_result = grid_model.fit(X_train, y_train)

In [7]:
# Winning hyper-parameters and their mean cross-validated accuracy.
best_params = grid_model.best_params_
best_cv_score = grid_model.best_score_

print(f"Best Parameters:\n\t{best_params}")
print(f"Best Score:\n\t{best_cv_score}")

Best Parameters:
	{'max_depth': 2, 'min_samples_split': 20, 'n_estimators': 200}
Best Score:
	0.990909090909091


In [8]:
# Tabulate every evaluated candidate, best-ranked first, and save the table
# to disk for offline inspection.
result_df = (
    pd.DataFrame(grid_model.cv_results_)
    .sort_values('rank_test_score')
)
result_df.to_csv('parameter_result_randomforest.csv')

In [9]:
# Predict the held-out test rows with the search object, which delegates to
# the estimator refit with the best parameters found.
ypred = grid_model.predict(X_test)

In [10]:
# Fraction of test rows whose predicted class matches the true class.
test_accuracy = accuracy_score(y_test, ypred)
print(f"Test Accuracy: \n\t{test_accuracy}")

Test Accuracy: 
	0.9722222222222222
