In [37]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
import pandas as pd
import numpy as np

import seaborn as sns
sns.set()

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import learning_curve,validation_curve

from sklearn.preprocessing import PolynomialFeatures

In [38]:
# load the data
df = pd.read_csv('./Data/Final_18_features_extracted_V2.csv')

In [39]:
X = df.drop(columns=["Region","File Name"],axis=1)
y = df['Region']

In [40]:
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.3,random_state=17,shuffle=True
)

In [41]:
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=17)

rfc = RandomForestClassifier(random_state=17,n_jobs=-1)

- Making poly features

In [42]:
poly = PolynomialFeatures(3)
X_train_poly = poly.fit_transform(X_train)

In [43]:
X_train_poly.shape

(6341, 1330)

- Lets check the CV scores

In [44]:
results = cross_val_score(rfc,X_train_poly,y_train,cv=skf)
results.mean()*100

78.39463498668813

In [45]:
rfc.fit(X_train_poly,y_train)

RandomForestClassifier(n_jobs=-1, random_state=17)

In [46]:
rfc_pred = rfc.predict(X_train_poly)
accuracy_score(y_train,rfc_pred)

1.0

In [47]:
# accuracy on the test set
X_test_poly = poly.transform(X_test)
X_test_poly.shape
rfc_pred = rfc.predict(X_test_poly)
accuracy_score(y_test,rfc_pred)

0.7888153053715967

## Final tuning based on validation curves

In [48]:
forest_params = {
    'n_estimators':[10,15,20,25],
    'max_features': [4],
    'min_samples_leaf': [16,18,20],
    'max_depth': [7,10,13],
    'min_samples_split':[20,25,30]

}
rfc = RandomForestClassifier(n_estimators=50,random_state=17,n_jobs=-1)

gcv = GridSearchCV(rfc,forest_params,n_jobs=-1,cv=skf,verbose=1)

In [49]:
gcv.fit(X_train_poly,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=17, shuffle=True),
             estimator=RandomForestClassifier(n_estimators=50, n_jobs=-1,
                                              random_state=17),
             n_jobs=-1,
             param_grid={'max_depth': [7, 10, 13], 'max_features': [4],
                         'min_samples_leaf': [16, 18, 20],
                         'min_samples_split': [20, 25, 30],
                         'n_estimators': [10, 15, 20, 25]},
             verbose=1)

In [50]:
gcv.best_score_

0.6890089566040973

In [51]:
# accuracy on the training set
accuracy_score(gcv.best_estimator_.predict(X_train_poly),y_train)

0.8055511748935499