In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
preprocessed_df = pd.read_csv("C:/Users/Asus/Desktop/Master Thesis/bbdc_2020_public_data/bbdc_2020/preprocessed/train_left.csv")
X_train, X_test, y_train, y_test = train_test_split(preprocessed_df.iloc[:,:-1], preprocessed_df.iloc[:,-1], test_size = 0.70)

In [3]:
param_grid = { 
    'n_estimators': [50, 80, 100, 150, 200,300,400,500,1000],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [40,50,60,70,80,90,100],
    'criterion' :['gini', 'entropy'],
    'max_features' : [10,15,20,30,40,48,57],
    'bootstrap': [True, False]
}

In [4]:
model = RandomForestClassifier()
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [5]:
grid_search.fit(X_train,y_train)

Fitting 5 folds for each of 1764 candidates, totalling 8820 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   34.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 48.5min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed: 94.1min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed: 149.3min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed: 217.2min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 330.7min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 480.5min
[Parallel(n_jobs=-1)]: Done 3265 tasks      | elapsed: 688.3min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed: 933.1min
[Parallel(n_jobs=-1)]: Done 4885 tasks      | elapsed: 1166.3min
[Parallel(n_jobs=-1)]: Done 5816 tasks      | elapsed: 1396.2min
[Parallel(n_jobs=-1)]: Done 6829 tasks      | elapsed: 1688.3min
[Parallel(n_jobs=-1)]: Done 7922 tasks      | elapsed: 2279.8min
[Parallel(n_jobs=-1)]: Done 8820 

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [40, 50, 60, 70, 80, 90, 100],
                         'max_features': [10, 15, 20, 30, 40, 48, 57],
                         'n_estimators': [50, 80, 100, 150, 200, 300, 400, 500,
                                          1000]},
             verbose=2)

In [6]:
grid_search.best_params_

{'bootstrap': False,
 'criterion': 'gini',
 'max_depth': 60,
 'max_features': 10,
 'n_estimators': 1000}

In [7]:
best_grid = grid_search.best_estimator_
print(best_grid)
#grid_accuracy = predict(best_grid, X_test, y_test)

RandomForestClassifier(bootstrap=False, max_depth=60, max_features=10,
                       n_estimators=1000)


In [8]:
y_pred = grid_search.predict(X_test)

In [None]:
print(y_pred)

In [9]:
print("Accuracy score %s" %accuracy_score(y_test,y_pred))
print("Classification report  \n %s" %(classification_report(y_test, y_pred)))

Accuracy score 0.6985542322145086
Classification report  
               precision    recall  f1-score   support

           0       0.81      0.87      0.84      7366
           1       0.79      0.06      0.10       597
           2       0.00      0.00      0.00       257
           3       0.54      0.25      0.35      1458
           4       0.60      0.81      0.69      4659
           5       0.50      0.28      0.36      1364

    accuracy                           0.70     15701
   macro avg       0.54      0.38      0.39     15701
weighted avg       0.68      0.70      0.67     15701



  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
print(grid_search.best_score_)

0.7017388710595072


In [13]:
print("Confusion Matrix %s" %confusion_matrix(y_test,y_pred))

Confusion Matrix [[6394    3    0   70  807   92]
 [ 214   33    0   37  263   50]
 [  45    0    0    7  189   16]
 [ 361    1    0  371  617  108]
 [ 690    1    0   71 3785  112]
 [ 201    4    0  132  642  385]]


In [14]:
sorted(grid_search.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_bootstrap',
 'param_criterion',
 'param_max_depth',
 'param_max_features',
 'param_n_estimators',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [17]:
grid_search.cv_results_['mean_test_score']

array([0.68554163, 0.68806774, 0.69252551, ..., 0.66711457, 0.66964024,
       0.66978839])

In [19]:
grid_search.cv_results_['std_test_score']

array([0.00765502, 0.00757878, 0.00597461, ..., 0.00952567, 0.01033298,
       0.00895138])

In [20]:
grid_search.cv_results_['rank_test_score']

array([1336, 1114,  462, ..., 1685, 1654, 1652])

In [21]:
grid_search.cv_results_['mean_fit_time']

array([   2.94731994,    4.7409256 ,    6.02030668, ...,  579.81210918,
        721.62971506, 1106.27304306])