In [22]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV

import warnings

In [23]:
# Loading X
X = joblib.load('X.pkl')
X

Unnamed: 0,ENE,ESE,East,NE,NNE,NNW,NW,North,SE,SSE,...,_rain,_snow,_tempm,_thunder,_tornado,_vism,_wdird,_wspdm,year,month
0,0,0,0,0,0,0,0,0,0,0,...,0,0,30.0,0,0,5.000000,280.000000,7.400000,1996,11
1,0,0,0,0,0,0,0,1,0,0,...,0,0,28.0,0,0,2.198981,0.000000,5.052735,1996,11
2,0,0,0,0,0,0,0,1,0,0,...,0,0,24.0,0,0,2.198981,0.000000,5.052735,1996,11
3,0,0,0,0,0,0,0,1,0,0,...,0,0,24.0,0,0,2.000000,0.000000,5.052735,1996,11
4,0,0,0,0,0,0,0,1,0,0,...,0,0,23.0,0,0,1.200000,0.000000,0.000000,1996,11
5,0,0,0,0,0,0,0,1,0,0,...,0,0,21.0,0,0,2.198981,0.000000,5.052735,1996,11
6,0,0,0,0,0,0,0,1,0,0,...,0,0,21.0,0,0,0.800000,0.000000,0.000000,1996,11
7,0,0,0,0,0,0,0,1,0,0,...,0,0,21.0,0,0,2.198981,0.000000,5.052735,1996,11
8,0,0,0,0,0,0,0,1,0,0,...,0,0,19.0,0,0,2.198981,0.000000,5.052735,1996,11
9,0,0,0,0,0,0,0,1,0,0,...,0,0,19.0,0,0,2.198981,0.000000,5.052735,1996,11


In [24]:
# Loading y
y = joblib.load('y.pkl')
y

array([31, 31, 31, ...,  5,  5,  5])

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=0) # test size =0.25 or 25%

In [26]:
print("Shape of X_train", X_train.shape)
print("Shape of X_test", X_test.shape)
print("Shape of y_train", y_train.shape)
print("Shape of y_test", y_test.shape)

Shape of X_train (75742, 32)
Shape of X_test (25248, 32)
Shape of y_train (75742,)
Shape of y_test (25248,)


In [27]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [29]:
y_pred = clf.predict(X_test)
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

Accuracy:  0.8174508871989861


# Using GridSearchCV for tuning the model

In [34]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}

In [35]:
rfc=RandomForestClassifier(random_state=42)
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [36]:
CV_rfc.best_params_

{'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 500}

In [37]:
print("accuracy :",CV_rfc.best_score_)

accuracy : 0.7523962926777745


# GridSearchCV didn't give better score so the earlier model is better for prediction