In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [2]:
training_data = pd.read_csv('../Datasets/full_scaled_data.csv')
X_scaled = training_data.drop(columns=['label'])
y_scaled = training_data['label']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.25, random_state=42)

In [4]:
classify_type = "binary"

Dùng công cụ `GridSearchCV` de tuning hyperparameter

`GridSearchCV` lấy một từ điển mô tả các tham số có thể được thử trên một mô hình để huấn luyện nó. Lưới tham số được định nghĩa như một từ điển, trong đó các khóa là các tham số và các giá trị là cài đặt cần kiểm tra.

Giải thích các hyperparameters của `GridSearchCV`:

- cv: int, cross-validation generator or an iterable

Determines the cross-validation splitting strategy. Possible inputs for cv are:

>1. None: to use the default 5-fold cross validation,

>2. Integer: to specify the number of folds in a (Stratified) KFold

- n_jobs : int

Number of jobs to run in parallel.

>1. None means 1 unless in a joblib.parallel_backend context.

>2. -1 means using all processors.

In [18]:
knn_params = { 'n_neighbors' : [1, 5, 7, 9, 11, 13, 15, 20],
               'weights' : ['uniform', 'distance'],
               'metric' : ['minkowski', 'euclidean', 'manhattan']}

In [19]:
gs_knn = GridSearchCV(KNeighborsClassifier(), knn_params, scoring = 'accuracy', cv = 5, n_jobs = -1)

In [20]:
g_res_knn = gs_knn.fit(X_train, y_train)

In [21]:
g_res_knn.best_params_

{'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}

In [22]:
gb_params = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.1, 0.5, 1.0],
    'max_depth': [3, 6, 9],
}

In [23]:
gs_gb = GridSearchCV(GradientBoostingClassifier(), gb_params, scoring = 'accuracy', cv=5, n_jobs = -1)

In [24]:
g_res_gb = gs_gb.fit(X_train, y_train)

In [25]:
g_res_gb.best_params_

{'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 200}

In [5]:
lr_params = {
    'penalty' : ['l1', 'l2', 'elasticnet', None],
    'C' : np.array([1, 2, 5, 8, 10.0]),
    'solver' : ['lbfgs','liblinear','sag','saga'],
    'max_iter' : [100, 150, 200]
}

In [6]:
gs_lr = GridSearchCV(LogisticRegression(), lr_params, scoring='accuracy', cv=5, n_jobs=-1)

In [7]:
g_res_lr = gs_lr.fit(X_train, y_train)

525 fits failed out of a total of 1200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
75 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\ADMIN\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\U

In [8]:
g_res_lr.best_params_

{'C': 8.0, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}

In [5]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
dt_params = {
    'max_depth': [5, 10, 20, 30, None],
    'min_samples_split': [2, 5, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4],
    'criterion': ['gini', 'entropy']
}

In [25]:
gs_dt = GridSearchCV(DecisionTreeClassifier(), dt_params, scoring='accuracy', cv=5, n_jobs=-1)

In [26]:
g_res_dt = gs_dt.fit(X_train, y_train)

In [27]:
g_res_dt.best_params_

{'criterion': 'entropy',
 'max_depth': 20,
 'min_samples_leaf': 2,
 'min_samples_split': 2}