### ガラスに含まれる酸化物データから7種類のガラス分類を行う。

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, GridSearchCV

import xgboost as xgb

In [2]:
glass = pd.read_csv('glass.csv')

In [3]:
glass.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [4]:
# 訓練データとテストデータに分割
train_set, test_set = train_test_split(glass, test_size=0.2, random_state=42)

In [5]:
# 特徴量とターゲットに分割
X_train = train_set.drop('Type', axis=1)
y_train = train_set['Type'].copy()

X_test = test_set.drop('Type', axis=1)
y_test = test_set['Type'].copy()


In [6]:
# ロジスティック回帰
# 訓練
LR = LogisticRegression(random_state=42)
LR.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
# テストデータで推測し評価
LR_test = LR.predict(X_test)

In [8]:
confusion_matrix(y_test, LR_test)

array([[ 9,  2,  0,  0,  0,  0],
       [ 3, 11,  0,  0,  0,  0],
       [ 1,  2,  0,  0,  0,  0],
       [ 0,  3,  0,  1,  0,  0],
       [ 0,  2,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  0,  8]], dtype=int64)

In [9]:
accuracy_score(y_test, LR_test)

0.6744186046511628

In [10]:
# ランダムフォレスト
RF = RandomForestClassifier(random_state=42)

In [11]:
# グリッドサーチでハイパーパラメータの交差検証
param_grid = [{
    'n_estimators': [5, 10, 50, 100],
    'min_samples_split': [2, 5, 10],
    'bootstrap': ['Auto', 'sqrt'],
    'min_samples_leaf': [1, 5 ,10],
    'max_depth': [10, 50, 90],
    'max_features': ['auto', 'sqrt'],
    'random_state': [42]
}]

In [12]:
# グリッドサーチで交差検証
RF_CV = GridSearchCV(estimator=RF, param_grid=param_grid, cv=5)
RF_CV.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [5, 10, 50, 100], 'min_samples_split': [2, 5, 10], 'bootstrap': ['Auto', 'sqrt'], 'min_samples_leaf': [1, 5, 10], 'max_depth': [10, 50, 90], 'max_features': ['auto', 'sqrt'], 'random_state': [42]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
# 最適なパラメータ
RF_CV.best_params_

{'bootstrap': 'Auto',
 'max_depth': 50,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100,
 'random_state': 42}

In [14]:
# テストデータで推測と評価
RF_test = RF_CV.predict(X_test)

In [15]:
confusion_matrix(y_test, RF_test)

array([[11,  0,  0,  0,  0,  0],
       [ 4,  9,  0,  0,  0,  1],
       [ 1,  0,  2,  0,  0,  0],
       [ 0,  1,  0,  3,  0,  0],
       [ 0,  0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  0,  8]], dtype=int64)

In [16]:
accuracy_score(y_test, RF_test)

0.8372093023255814

In [18]:
# SVM
SV = SVC(random_state=42)

In [19]:
# ハイパーパラメータ
param_grid_SV = [{
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf', 'poly', 'linear', 'sigmoid'],
    'random_state': [42]
}]

In [20]:
# グリッドサーチで交差検証
SV_CV = GridSearchCV(estimator=SV, param_grid=param_grid_SV, cv=5)
SV_CV.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=42,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1], 'kernel': ['rbf', 'poly', 'linear', 'sigmoid'], 'random_state': [42]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
SV_CV.best_params_

{'C': 10, 'gamma': 0.1, 'kernel': 'rbf', 'random_state': 42}

In [22]:
# テストデータで推測と評価
SV_test = SV_CV.predict(X_test)

In [23]:
confusion_matrix(y_test, SV_test)

array([[11,  0,  0,  0,  0,  0],
       [ 5,  9,  0,  0,  0,  0],
       [ 3,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  4,  0,  0],
       [ 0,  1,  0,  0,  1,  1],
       [ 0,  0,  0,  0,  1,  7]], dtype=int64)

In [24]:
accuracy_score(y_test, SV_test)

0.7441860465116279

In [25]:
# ナイーブベイズ
NB = GaussianNB()

In [26]:
# 訓練
NB.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [27]:
# テストデータで推測と評価
NB_test = NB.predict(X_test)

In [28]:
confusion_matrix(y_test, NB_test)

array([[7, 1, 3, 0, 0, 0],
       [9, 3, 0, 1, 0, 1],
       [1, 0, 2, 0, 0, 0],
       [0, 3, 0, 1, 0, 0],
       [0, 0, 0, 0, 3, 0],
       [0, 0, 0, 0, 0, 8]], dtype=int64)

In [29]:
accuracy_score(y_test, NB_test)

0.5581395348837209

In [30]:
# 警告を非表示にする
import warnings
warnings.simplefilter('ignore', DeprecationWarning)

In [31]:
# XGBoost
XGB = xgb.XGBClassifier(random_state=42)

In [50]:
# グリッドサーチでハイパーパラメータの交差検証
param_grid_XGB = [{
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 6, 10],
    'min_child_weight': [1, 10],
    'subsample': [0.5, 0.9, 1.0],
    'colsample_bytree': [0.9, 1.0],
    'learning_rate': [0.01, 0.1, 0.3],
    'randam_state': [42]
}]

In [50]:
# グリッドサーチでハイパーパラメータの交差検証
param_grid_XGB2 = [{
    'n_estimators': [300],
    'max_depth': [6],
    'min_child_weight': [1],
    'subsample': [1.0],
    'colsample_bytree': [0.9],
    'learning_rate': [0.3],
    'randam_state': [42]
}]

In [51]:
XGB_CV = GridSearchCV(estimator=XGB, param_grid=param_grid_XGB, cv=5)
XGB_CV.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=42, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'n_estimators': [100, 300, 500], 'max_depth': [3, 5, 6, 10], 'min_child_weight': [1, 10], 'subsample': [0.5, 0.9, 1.0], 'colsample_bytree': [0.9, 1.0], 'learning_rate': [0.01, 0.1, 0.3], 'randam_state': [42]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [52]:
XGB_CV.best_params_

{'colsample_bytree': 0.9,
 'learning_rate': 0.3,
 'max_depth': 5,
 'min_child_weight': 1,
 'n_estimators': 300,
 'randam_state': 42,
 'subsample': 1.0}

In [53]:
XGB_test = XGB_CV.predict(X_test)

In [54]:
confusion_matrix(y_test, XGB_test)

array([[11,  0,  0,  0,  0,  0],
       [ 2, 11,  0,  0,  0,  1],
       [ 1,  0,  2,  0,  0,  0],
       [ 0,  1,  0,  3,  0,  0],
       [ 0,  0,  0,  0,  3,  0],
       [ 0,  0,  0,  0,  0,  8]], dtype=int64)

In [55]:
accuracy_score(y_test, XGB_test)

0.8837209302325582