In [2]:
# importing dependencies
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# not just a specific noisy one — consider a narrower filter.
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
# IPython magic: ask the inline backend for high-resolution ("retina") figures.
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
import pandas as pd
import numpy as np

import seaborn as sns
# NOTE(review): sns.set() applies seaborn's default theme after the 'ggplot'
# style was selected above, so it presumably overrides much of that style —
# confirm which look is intended.
sns.set()

# Splitters and model-selection helpers.
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score

# Metric used to score train/test predictions.
from sklearn.metrics import accuracy_score

# Model under study.
from sklearn.ensemble import GradientBoostingClassifier

# Not used in the cells visible here; presumably needed for the validation
# curves mentioned in the closing notes.
from sklearn.model_selection import learning_curve,validation_curve

In [3]:
# Read the extracted-feature table into a DataFrame. The path is relative,
# so it is resolved against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./Data/Final_18_features_extracted_V2.csv')

- First, drop the 'File Name' column, since we are going to use the default index.
- Build the feature matrix `X` and the target vector `y` (the 'Region' column).

In [5]:
# Target: the 'Region' label.
y = df['Region']
# Features: everything except the target and the 'File Name' column.
# `columns=` already identifies the axis, so no separate `axis` argument is needed.
X = df.drop(columns=['Region', 'File Name'])

- Split the data into training (70%) and test (30%) sets

In [9]:
# Hold out 30% of the rows as a test set; the fixed seed makes the shuffled
# split reproducible.
# NOTE(review): the split is not stratified on y, although the CV splitter
# used later is stratified — consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=17,
)

- Let's initialize our cross-validation splitter (stratified 5-fold)

In [10]:
# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds
# reproducible across runs.
skf = StratifiedKFold(
    n_splits=5,
    random_state=17,
    shuffle=True,
)

- Let's initialize our model with its default parameters:
  - loss: deviance (named `log_loss` in scikit-learn ≥ 1.1)
  - learning_rate: 0.1
  - n_estimators: 100
  - subsample: 1.0 (choosing subsample < 1.0 leads to a reduction of variance and an increase in bias)
  - criterion: friedman_mse
  - min_samples_split: 2
  - min_samples_leaf: 1
  - max_depth: 3
  - max_features: None (all features)
  - max_leaf_nodes: None
  - ccp_alpha: 0.0

In [11]:
# Baseline model: only the seed is set; every other hyperparameter keeps its
# library default.
gbt = GradientBoostingClassifier(
    random_state=17,
)

- Scores with default parameters

In [13]:
# Cross-validated score of the baseline model on the training split, using
# the stratified folds and all available cores.
results = cross_val_score(
    estimator=gbt,
    X=X_train,
    y=y_train,
    cv=skf,
    n_jobs=-1,
)
# Mean fold score, shown as a percentage.
results.mean() * 100

73.69487885092958

In [14]:
# Train set scores
# Fit the baseline model on the whole training split, then score it on those
# same rows (compare with the CV score to judge overfitting).
gbt.fit(X_train, y_train)
pred = gbt.predict(X_train)
# accuracy_score is documented as (y_true, y_pred). Accuracy is symmetric, so
# the value is unchanged, but the documented order stays correct if this call
# is later swapped for an asymmetric metric (precision, recall, ...).
accuracy_score(y_train, pred)

0.8911843557798454

In [15]:
# Test set Scores
# Score the already-fitted baseline model on the held-out test split.
pred = gbt.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; the accuracy value
# is the same either way because the metric is symmetric.
accuracy_score(y_test, pred)

0.7376747608535688

- As some of the hyperparameters of Random Forest and GBT are the same, let's reuse the hyperparameter space we found with validation curves for Random Forest here!

In [16]:
# Search space reused from the Random Forest tuning.
params = dict(
    n_estimators=[10, 15, 20, 25],
    max_features=[4],
    min_samples_leaf=[16, 18, 20],
    max_depth=[7, 10, 13],
    min_samples_split=[20, 25, 30],
)
# Fresh, unfitted estimator for the search (this rebinds `gbt`).
gbt = GradientBoostingClassifier(random_state=17)
# Exhaustive search over all 4 * 1 * 3 * 3 * 3 = 108 combinations, evaluated
# with the same stratified folds as the baseline.
gcv = GridSearchCV(
    estimator=gbt,
    param_grid=params,
    cv=skf,
    n_jobs=-1,
    verbose=1,
)

In [17]:
# Run the grid search on the training split only; the test split stays unseen.
gcv.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=17, shuffle=True),
             estimator=GradientBoostingClassifier(random_state=17), n_jobs=-1,
             param_grid={'max_depth': [7, 10, 13], 'max_features': [4],
                         'min_samples_leaf': [16, 18, 20],
                         'min_samples_split': [20, 25, 30],
                         'n_estimators': [10, 15, 20, 25]},
             verbose=1)

In [18]:
# CV Scores: mean cross-validated score of the best parameter combination
# found by the grid search (a fraction, not a percentage).
gcv.best_score_

0.7708558615666475

In [19]:
# Train set scores
# Score the best estimator found by the grid search on the training split.
# Arguments follow the documented (y_true, y_pred) order; accuracy is
# symmetric, so the value is unchanged.
accuracy_score(y_train, gcv.best_estimator_.predict(X_train))

0.9995268885033907

In [20]:
# Test set scores
# Score the best estimator found by the grid search on the held-out test split.
# Arguments follow the documented (y_true, y_pred) order; accuracy is
# symmetric, so the value is unchanged.
accuracy_score(y_test, gcv.best_estimator_.predict(X_test))

0.7752023546725534

### Notes:
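- So those Random Forest parameter values don't transfer well here: CV and test accuracy improve over the defaults (≈77% vs. ≈74%), but train accuracy jumps to ≈99.95%, so the model overfits heavily.
- Let's make validation curves for this model to find a better hyperparameter space!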