In [50]:
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%config InlineBackend.figure_format = 'retina'
plt.style.use('ggplot')
import pandas as pd
import numpy as np

import seaborn as sns
sns.set()

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier

In [51]:
# Read the extracted audio-feature table from disk into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='./Data/Final_18_features_extracted_V2.csv',
)

In [52]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,Region,File Name,MFCC1,MFCC2,MFCC3,MFCC4,MFCC5,MFCC6,MFCC7,MFCC8,MFCC9,MFCC10,MFCC11,MFCC12,MFCC13,AMP_ENV,RMSE,ZCR,SPEC_CENT,SPEC_BAND
0,Barisal,br1.wav,-371.58478,97.99681,1.269639,26.940123,-5.517467,-7.585458,-12.512117,-10.121937,-17.117098,-2.98086,-5.495219,-6.685885,-2.246712,0.094423,0.042012,0.082185,1389.44849,1475.018042
1,Barisal,br10.wav,-311.64117,71.8437,-20.855787,17.992466,-9.575283,9.301216,-38.153957,-14.808104,-10.210079,-14.74765,-10.387901,-1.234518,-3.261181,0.165459,0.074938,0.179697,2307.459302,1789.181398
2,Barisal,br100.wav,-314.17575,136.41866,-52.862125,-15.473975,-45.57,-17.382814,-19.885965,-25.647032,-8.209912,-17.986666,-11.351544,-1.021543,-9.109297,0.108455,0.048392,0.07092,1157.114627,1057.42618
3,Barisal,br1000.wav,-411.62485,180.74176,-21.907639,-30.858137,14.73692,-6.635318,-30.891031,-23.473873,-18.24306,-22.557762,-11.455458,0.803584,-9.340458,0.045525,0.023924,0.049118,755.8615,628.356257
4,Barisal,br1001.wav,-395.1891,115.52895,-60.058544,-21.238525,-16.252647,-4.06191,-12.67642,-23.779793,-25.017618,-18.328882,-6.299292,-8.56261,-16.118456,0.043661,0.021089,0.101046,1474.43887,1225.782428


- First drop the 'File Name' column, since we are going to rely on the default integer index to identify rows
- Then build the feature matrix `X` and the label vector `y`

In [53]:
# Target: the region label attached to each row.
y = df["Region"]

# Features: every remaining column once the label and the file name are removed.
X = df.drop(columns=["Region", "File Name"])

In [54]:
# Confirm that the feature matrix and the label vector have the same number of rows.
X.shape, y.shape

((9059, 18), (9059,))

- Make the train/test split (70% train, 30% test)


In [55]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the shuffled
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=17,
)

- Let's now build our Random Forest model with default parameters
  - We also need a `StratifiedKFold` splitter (with `shuffle=True`) to use as the CV strategy

In [56]:
# Random forest with default hyper-parameters; n_jobs=-1 lets scikit-learn
# use every available core, and the seed makes the forest reproducible.
rfc = RandomForestClassifier(n_jobs=-1, random_state=17)

# Five stratified folds, shuffled with a fixed seed, used as the CV strategy.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

- Let's check the cross-validation (CV) score with default parameters

In [57]:
# One score per stratified fold for the default forest (the classifier's
# default scoring, i.e. accuracy).
results = cross_val_score(estimator=rfc, X=X, y=y, cv=skf)

# Mean of the fold scores, expressed as a percentage.
100 * results.mean()

79.57844080143055

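**Note:** This mean CV score can be read as an estimate of the accuracy on unseen data, because every fold is scored on rows the model was not trained on. Since bagging is built into Random Forest, a similar estimate is also available for free as the out-of-bag (OOB) score (`oob_score=True`)!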

- Let's do the same thing the usual way: fit on the train split and score on the held-out test split

In [58]:
# Fit the default-parameter forest on the training split only; the test split
# is kept aside for the evaluation below.
rfc.fit(X_train,y_train)

RandomForestClassifier(n_jobs=-1, random_state=17)

In [60]:
# Score the fitted forest on the same rows it was trained on.
rfc_pred = rfc.predict(X_train)
accuracy_score(y_true=y_train, y_pred=rfc_pred)

1.0

In [61]:
# Score the fitted forest on the held-out rows it has never seen.
rfc_pred = rfc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=rfc_pred)

0.7829286239882266

### One very important thing to note:
- The accuracy on the train set is 1.0 while the accuracy on the test set is only about 0.78! The model fits the training set perfectly but generalizes noticeably worse to unseen data, i.e. it is overfitting. This indicates **high variance** of the model. We might try three things here:
  - Add more training data
  - Reduce model complexity
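  - Bagging! (Random Forest already bags its trees, so in practice this means tuning the forest, e.g. `n_estimators` or `max_samples`)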