In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

import seaborn as sns

# Apply seaborn's default theme to plots drawn after this point.
sns.set_theme()

In [2]:
# NOTE(review): absolute path on the author's machine; repoint it (ideally to a
# path relative to the notebook) before running anywhere else.
BEAN_CSV_PATH = 'D:\\2021 - Fall Semester\\1 - class\\577data\\Dry_Bean_Dataset.csv'

# Integer code assigned to each label found in the 'Class' column.
CLASS_CODES = {'SEKER': 0, 'BARBUNYA': 1, 'BOMBAY': 2, 'CALI': 3,
               'HOROZ': 4, 'SIRA': 5, 'DERMASON': 6}

bean = pd.read_csv(BEAN_CSV_PATH)

# Fail loudly on labels that have no code instead of letting them slip through
# as strings/NaN and surface later as an obscure estimator error.
unmapped_labels = bean.loc[~bean['Class'].isin(list(CLASS_CODES)), 'Class'].unique().tolist()
if unmapped_labels:
    raise ValueError('Unexpected values in Class column: {}'.format(unmapped_labels))

# Encode only the label column. A whole-frame replace() would scan every column
# and depends on pandas silently downcasting the object column to integers.
new_bean = bean.assign(Class=bean['Class'].map(CLASS_CODES).astype('int64'))

# Select features by name so the split does not depend on 'Class' being the
# last column of the file.
X = new_bean.drop(columns=['Class'])
y = new_bean['Class']

In [4]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardized; every other feature is passed through as-is.
SCALED_COLUMNS = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
                  'ConvexArea', 'EquivDiameter']

scaler = StandardScaler()

# Work on a copy: `X_scaled = X` would only alias X, so the assignment below
# would also overwrite the unscaled values in X and write into a slice of
# new_bean (the source of pandas' SettingWithCopyWarning).
X_scaled = X.copy()
X_scaled[SCALED_COLUMNS] = scaler.fit_transform(X_scaled[SCALED_COLUMNS])

# NOTE(review): the scaler is fit on all rows, i.e. before the train/test
# split, so test-set statistics leak into the transform. Tree-based models are
# largely insensitive to per-feature rescaling, but for other estimators fit
# the scaler on the training rows only.

In [5]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.3,
    random_state=0,
)

In [10]:
# Baseline AdaBoost: 10 boosting rounds over depth-1 trees.
# The seed is set on the ensemble as well as the tree: AdaBoost re-seeds each
# cloned base estimator from its own random_state, so seeding only the tree
# does not make the fit repeatable.
# NOTE(review): algorithm="SAMME.R" is deprecated in recent scikit-learn
# releases — confirm against the installed version before upgrading.
ada0 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1, random_state=42),
                          n_estimators=10,
                          algorithm="SAMME.R",
                          learning_rate=0.5,
                          random_state=42)
ada0.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1,
                                                         random_state=42),
                   learning_rate=0.5, n_estimators=10)

In [11]:
# Score the first AdaBoost model on the held-out test rows.
y_true = y_test
ada_y_pred = ada0.predict(X_test)

# Accuracy takes no averaging argument; the other three metrics are macro-averaged.
ada0_scores = {'a': accuracy_score(y_true=y_true, y_pred=ada_y_pred)}
for placeholder, scorer in zip('bcd', (precision_score, recall_score, f1_score)):
    ada0_scores[placeholder] = scorer(y_true=y_true, y_pred=ada_y_pred, average='macro')

print('The accuracy score is: {a}; the precision score (on a macro averaged basis, the same as the following two index) is: {b}; the recall score is: {c}; and the f1 score is {d}'.format(**ada0_scores))

The accuracy score is: 0.5683153770812929; the precision score (on a macro averaged basis, the same as the following two index) is: 0.4782680016952273; the recall score is: 0.5585833992273322; and the f1 score is 0.4836973189433519


  _warn_prf(average, modifier, msg_start, len(result))


#### All four scores show that this AdaBoost model fits the data poorly. We should try increasing max_depth and n_estimators.

In [12]:
# Larger AdaBoost: 100 boosting rounds over depth-10 trees.
# The seed is set on the ensemble as well as the tree: AdaBoost re-seeds each
# cloned base estimator from its own random_state, so seeding only the tree
# does not make the fit repeatable.
# NOTE(review): algorithm="SAMME.R" is deprecated in recent scikit-learn
# releases — confirm against the installed version before upgrading.
ada1 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10, random_state=0),
                          n_estimators=100,
                          algorithm="SAMME.R",
                          learning_rate=0.5,
                          random_state=0)
ada1.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=10,
                                                         random_state=0),
                   learning_rate=0.5, n_estimators=100)

In [13]:
# Score the deeper AdaBoost model on the held-out test rows.
y_true = y_test
ada1_y_pred = ada1.predict(X_test)

# Accuracy takes no averaging argument; the other three metrics are macro-averaged.
ada1_scores = {'a': accuracy_score(y_true=y_true, y_pred=ada1_y_pred)}
for placeholder, scorer in zip('bcd', (precision_score, recall_score, f1_score)):
    ada1_scores[placeholder] = scorer(y_true=y_true, y_pred=ada1_y_pred, average='macro')

print('The accuracy score is: {a}; the precision score (on a macro averaged basis, the same as the following two index) is: {b}; the recall score is: {c}; and the f1 score is {d}'.format(**ada1_scores))

The accuracy score is: 0.9260528893241919; the precision score (on a macro averaged basis, the same as the following two index) is: 0.9406169776184873; the recall score is: 0.9367707537421986; and the f1 score is 0.9384498185117668


#### As expected, increasing the tree depth and the number of boosting iterations gives AdaBoost a much better fit to the data. Next, we will try gradient-boosted decision trees (GBDT) instead.

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

# Same depth, number of estimators, and learning rate as ada1, so the two
# boosting methods can be compared like for like.
gb0_params = {
    'max_depth': 10,
    'n_estimators': 100,
    'learning_rate': 0.5,
    'random_state': 0,
}
gb0 = GradientBoostingClassifier(**gb0_params)

gb0.fit(X_train, y_train)

GradientBoostingClassifier(learning_rate=0.5, max_depth=10, random_state=0)

In [15]:
# Score the gradient boosting model on the held-out test rows.
y_true = y_test
gb_y_pred = gb0.predict(X_test)

# Accuracy takes no averaging argument; the other three metrics are macro-averaged.
gb0_scores = {'a': accuracy_score(y_true=y_true, y_pred=gb_y_pred)}
for placeholder, scorer in zip('bcd', (precision_score, recall_score, f1_score)):
    gb0_scores[placeholder] = scorer(y_true=y_true, y_pred=gb_y_pred, average='macro')

print('The accuracy score is: {a}; the precision score (on a macro averaged basis, the same as the following two index) is: {b}; the recall score is: {c}; and the f1 score is {d}'.format(**gb0_scores))

The accuracy score is: 0.9277668952007836; the precision score (on a macro averaged basis, the same as the following two index) is: 0.9418405368758739; the recall score is: 0.9365335519859496; and the f1 score is 0.9390590132427655


#### With the same max_depth, n_estimators, and learning rate, both models achieve almost the same performance on the test set.