In [1]:
# Numpy and Pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pre-processing and setup functions
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.utils import to_categorical

# Algorithms
from sklearn.ensemble import RandomForestClassifier

# Report and model validation
from sklearn.metrics import classification_report

# Model persistence
from joblib import dump, load

pd.set_option('display.max_columns', 500)

In [2]:
main = pd.read_csv('main_test.csv')

In [3]:
y = main['response']
X = main.drop('response', 1)

In [4]:
X = pd.get_dummies(X)

In [5]:
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(X)

  return self.partial_fit(X, y)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, y, random_state=42, stratify=y)

In [7]:
forest = RandomForestClassifier()
forest_grid = {
    'n_estimators': [25, 50, 100, 200, 400, 800],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [8]:
grid = GridSearchCV(
    forest, 
    forest_grid, 
    scoring='accuracy', 
    cv=3, 
    n_jobs=-1, 
    verbose=3
)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:    9.1s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [25, 50, 100, 200, 400, 800], 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=3)

In [9]:
print(grid.best_params_)

{'bootstrap': False, 'criterion': 'gini', 'n_estimators': 50}


In [10]:
print(grid.best_score_)

0.6041237113402061


In [11]:
predictions = grid.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

  Collegiate       0.00      0.00      0.00         4
       Lower       0.34      0.31      0.32        36
         NHL       0.27      0.11      0.15        28
  Not Active       0.66      0.83      0.74        94

   micro avg       0.57      0.57      0.57       162
   macro avg       0.32      0.31      0.30       162
weighted avg       0.51      0.57      0.53       162

