In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
param_grid = { 
    'n_estimators': [50, 80, 100, 150, 200,300,400,500,1000],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [40,50,60,70,80,90,100],
    'criterion' :['gini', 'entropy'],
    'max_features' : [10,15,20,30,40,48,57],
    'bootstrap': [True, False]
}

In [3]:
preprocessed_df = pd.read_csv("C:/Users/Asus/Desktop/Master Thesis/bbdc_2020_public_data/bbdc_2020/preprocessed/train_left.csv")
X_train, X_test, y_train, y_test = train_test_split(preprocessed_df.iloc[:,:-1], preprocessed_df.iloc[:,-1], test_size = 0.70)

In [4]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [5]:
model = RandomForestClassifier()
rand_search = RandomizedSearchCV(estimator = model, param_distributions= param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2)

In [6]:
rand_search.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 15.0min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [40, 50, 60, 70, 80, 90,
                                                      100],
                                        'max_features': [10, 15, 20, 30, 40, 48,
                                                         57],
                                        'n_estimators': [50, 80, 100, 150, 200,
                                                         300, 400, 500, 1000]},
                   verbose=2)

In [7]:
rand_search.best_params_

{'n_estimators': 1000,
 'max_features': 10,
 'max_depth': 60,
 'criterion': 'entropy',
 'bootstrap': False}

In [8]:
rand_search.best_estimator_

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=60,
                       max_features=10, n_estimators=1000)

In [9]:
y_pred = rand_search.predict(X_test)

In [10]:
rand_search.best_score_

0.6966864232173533

In [11]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[6307   13    0   83  858  131]
 [ 182   65    0   27  256   43]
 [  35    1    0   12  185   14]
 [ 307    8    0  394  609  127]
 [ 644    5    0  132 3832  112]
 [ 135    7    0  107  683  387]]
              precision    recall  f1-score   support

           0       0.83      0.85      0.84      7392
           1       0.66      0.11      0.19       573
           2       0.00      0.00      0.00       247
           3       0.52      0.27      0.36      1445
           4       0.60      0.81      0.69      4725
           5       0.48      0.29      0.36      1319

    accuracy                           0.70     15701
   macro avg       0.51      0.39      0.41     15701
weighted avg       0.68      0.70      0.67     15701

0.6996369657983568


  _warn_prf(average, modifier, msg_start, len(result))
