In [1]:
# import some library
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
# read data
df = pd.concat([
    pd.read_csv('../data/feature_selection_positive.csv', index_col=0),
    pd.read_csv('../data/decomp_pos.csv', index_col=0).drop('Subclass', axis=1)
], axis=1)

# divide objective and target
objective = df.Subclass
le = preprocessing.LabelEncoder()
objective = le.fit_transform(objective)
features = df.drop('Subclass', axis=1)

# train test split
random_state=np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(
    features, 
    objective,
    test_size=0.2
)

In [3]:
param_grid = {
    'max_depth': [1, 5, 10, None],
    'max_features': [1, 'auto', None],
    'min_samples_leaf': [1, 2, 4,],
    'min_samples_split': [5, 6, 7, 9, 11],
}

grid_search = GridSearchCV(
    estimator = rf(), 
    param_grid = param_grid, 
    cv = 3, 
    n_jobs = -1, 
    verbose=2
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5 
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5 
[CV]  max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5, total=   0.1s
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5 
[CV]  max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5, total=   0.1s
[CV]  max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5, total=   0.1s
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=6 
[CV]  max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=6, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=6 
[CV]  max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=6, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=6 
[CV] max_depth=1, max_features=1, min

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.3s


[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=6 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=6, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=9, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=9, total=   0.0s
[CV] max_d

[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   0.0s
[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   0.1s
[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   0.0s
[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11, total=   0.0s
[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11 
[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11, total=   0.1s
[CV] max_depth=1, max_features=auto, min_samples_leaf=4, min_s

[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=9, total=   0.4s
[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=9, total=   0.3s
[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=5 
[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=9, total=   0.3s
[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=5 
[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=11, total=   0.3s
[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=6 
[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=11, total=   0.4s
[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=11, total=   0.4s
[CV]  max_depth=1, max_features=None, min_samples

[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    8.8s


[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=7, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=7 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=7, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=11 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=11, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=11 
[CV]  m

[CV]  max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=9, total=   0.1s
[CV] max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11, total=   0.1s
[CV] max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11, total=   0.1s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=5 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11, total=   0.1s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=5 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=5, total=   0.1s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=5 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=5, total=   0.1s
[CV] max_depth=5, max_features=auto, min_sample

[CV]  max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=9, total=   2.2s
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=9, total=   2.3s
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=7 
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=11, total=   2.2s
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=11, total=   2.2s
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=5, total=   2.2s
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=5, max_features=None, min_samples_leaf=2, min_sam

[CV]  max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=9, total=  12.0s
[CV] max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=7, total=  12.8s
[CV]  max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=7, total=   0.3s
[CV] max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=7, total=   0.5s
[CV] max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=11, total=  13.6s
[CV]  max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=9, total=   0.3s
[CV] max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=9, total=  14.6s
[CV]  max_depth=10, max_features=1, min_samples_le

[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=7, total=   1.0s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=7, total=   0.8s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=7, total=   1.0s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   0.2s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   1.0s


[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.1min


[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=11 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   0.9s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=11 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=11, total=   0.8s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=11 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=11, total=   1.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=11, total=   0.8s
[CV] max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, total=   0.9s
[CV] max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5 
[CV]  max_depth=10, max_features=auto, min_samples

[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=9, total=  23.1s
[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=9, total=  18.0s
[CV]  max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=9, total=  19.8s
[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=11, total=  17.6s
[CV]  max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=11, total=  16.7s
[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=7 
[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=9 
[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=10, max_features=None, min_samples_lea

[CV] max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=9, total=   3.0s
[CV]  max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=9, total=   3.1s
[CV]  max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=6, total=   0.0s
[CV] max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=7, total=   0.0s
[CV]  max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=11, total=   2.8s
[CV] max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=11, total=   2.7s
[CV]  max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=7, total=   0.0s
[CV]  max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=9, total=   3.0s
[CV] max_depth=

[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=6, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=6, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=6, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=7, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=7, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=None, max_features

[CV] max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=11, total=   3.6s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=6, total=   3.1s
[CV] max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=9 
[CV] max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=11 
[CV] max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=11 
[CV]  max_depth=None, max_features=None, min_samples_leaf=2, min_samples_split=11, total=   3.9s
[CV] max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=11 
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=6, total=   3.4s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=6, total=   3.7s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=7, total=   3.0s
[CV]  max_dept

[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  2.9min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 5, 10, None], 'max_features': [1, 'auto', None], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [5, 6, 7, 9, 11]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [4]:
f = grid_search.best_estimator_

pickle.dump(f, open('../model/rf_gs_fs+decomp_pos.sav', 'wb'))
print('Train score: {}'.format(f.score(X_train, y_train)))
print('Test score: {}'.format(f.score(X_test, y_test)))

Train score: 0.9835820895522388
Test score: 0.7648809523809523


In [5]:
# read data
df = pd.concat([
    pd.read_csv('../data/feature_selection_negative.csv', index_col=0),
    pd.read_csv('../data/decomp_neg.csv', index_col=0).drop('Subclass', axis=1)
], axis=1)

# divide objective and target
objective = df.Subclass
le = preprocessing.LabelEncoder()
objective = le.fit_transform(objective)
features = df.drop('Subclass', axis=1)

# train test split
random_state=np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(
    features, 
    objective,
    test_size=0.2
)

In [6]:
param_grid = {
    'max_depth': [1, 5, 10, None],
    'max_features': [1, 'auto', None],
    'min_samples_leaf': [1, 2, 4,],
    'min_samples_split': [5, 6, 7, 9, 11],
}

grid_search = GridSearchCV(
    estimator = rf(), 
    param_grid = param_grid, 
    cv = 3, 
    n_jobs = -1, 
    verbose=1
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:    6.0s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 5, 10, None], 'max_features': [1, 'auto', None], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [5, 6, 7, 9, 11]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [7]:
f = grid_search.best_estimator_

pickle.dump(f, open('../model/rf_gs_fs_deco_neg.sav', 'wb'))
print('Train score: {}'.format(f.score(X_train, y_train)))
print('Test score: {}'.format(f.score(X_test, y_test)))

Train score: 0.9088471849865952
Test score: 0.7021276595744681


In [8]:
# read data
df = pd.read_csv('../data/feature_selection_positive.csv', index_col=0)

# divide objective and target
objective = df.Subclass
le = preprocessing.LabelEncoder()
objective = le.fit_transform(objective)
features = df.drop('Subclass', axis=1)

# train test split
random_state=np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(
    features, 
    objective,
    test_size=0.2
)

In [9]:
param_grid = {
    'max_depth': [1, 5, 10, None],
    'max_features': [1, 'auto', None],
    'min_samples_leaf': [1, 2, 4,],
    'min_samples_split': [5, 6, 7, 9, 11],
}

grid_search = GridSearchCV(
    estimator = rf(), 
    param_grid = param_grid, 
    cv = 3, 
    n_jobs = -1, 
    verbose=2
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5 
[CV]  max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5, total=   0.1s
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5 
[CV]  max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5 
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=6 
[CV]  max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=5, total=   0.1s
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=6 
[CV]  max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=6, total=   0.1s
[CV]  max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=6, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=1, min_samples_split=6 
[CV]  max_depth=1, max_features=1, mi

[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.2s


[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=6, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=7, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=9, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=9, total=   0.0s
[CV] max_depth=1, max_features=1, min_samples_leaf=2, min_samples_split=9 
[CV]  max_

[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   0.0s
[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   0.0s
[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   0.0s
[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11, total=   0.0s
[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11, total=   0.0s
[CV] max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11 
[CV]  max_depth=1, max_features=auto, min_samples_leaf=2, min_samples_split=11, total=   0.0s
[CV] max_depth=1, max_features=auto, min_sampl

[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=5 
[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=5 
[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=9, total=   0.4s
[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=5 
[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=9, total=   0.4s
[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=11, total=   0.4s
[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=11, total=   0.3s
[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=1, max_features=None, min_samples_leaf=2, min_samples_split=11, total=   0.4s
[CV] max_depth=1, max_features=None, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=1, max_features=None, min_samples_leaf=4, min_sa

[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:    8.5s


[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=7 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=7, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=7 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=7, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=9, total=   0.0s
[CV] max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=11 
[CV]  max_depth=5, max_features=1, min_samples_leaf=1, min_samples_split=11, total=   0.0s
[CV] max

[CV]  max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=9, total=   0.0s
[CV] max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11 
[CV] max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11, total=   0.1s
[CV] max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11, total=   0.1s
[CV]  max_depth=5, max_features=auto, min_samples_leaf=1, min_samples_split=11, total=   0.0s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=5 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=5, total=   0.0s
[CV] max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=5 
[CV]  max_depth=5, max_features=auto, min_samples_leaf=2, min_samples_split=5, total=   0.0s
[CV] max_depth=5, max_features=auto, min_sampl

[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=6 
[CV]  max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=7, total=   2.6s
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=9, total=   2.4s
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=7 
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=11, total=   2.1s
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=11, total=   2.1s
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=5, max_features=None, min_samples_leaf=2, min_samples_split=5, total=   2.1s
[CV] max_depth=5, max_features=None, min_samples_leaf=2, min_samp

[CV]  max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=6, total=   0.0s
[CV]  max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=6, total=   0.0s
[CV] max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=6 
[CV]  max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=11, total=   1.9s
[CV]  max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=6, total=   0.0s
[CV] max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=7, total=   0.0s
[CV]  max_depth=5, max_features=None, min_samples_leaf=4, min_samples_split=11, total=   2.0s
[CV] max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=7, total=   0.0s
[CV] max_depth=10, max_features=1, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=10, max_features=1, min_samples_leaf=

[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=7, total=   0.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=7 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=7, total=   0.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=7, total=   0.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   0.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=9, total=   0.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=11 
[CV]  max_depth=10, max_features=auto, min_samples_leaf

[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   26.6s


[CV] max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=2, min_samples_split=11, total=   0.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, total=   0.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, total=   0.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, total=   0.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=6, total=   0.1s
[CV] max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=10, max_features=auto, min_samples_leaf

[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=9, total=   3.1s
[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=11, total=   2.9s
[CV]  max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=5, total=   2.8s
[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=11, total=   3.0s
[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=5, total=   2.8s
[CV] max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=11 
[CV]  max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=5, total=   2.7s
[CV] max_depth=10, max_features=None, 

[CV] max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=6 
[CV]  max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=6, total=   0.0s
[CV] max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=7, total=   0.0s
[CV] max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=7, total=   0.0s
[CV] max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=7, total=   0.0s
[CV] max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=9, total=   0.0s
[CV] max_depth=None, max_features=1, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=None, max_features=1, min_samples_leaf=4, min_sampl

[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=7, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=7 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=7, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=7, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=9, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=9 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=9, total=   0.1s
[CV] max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=11 
[CV]  max_depth=None, max_features=auto, min_samples_leaf=4, min_samples_split=9, total=   0.1s
[CV] max_depth=Non

[CV] max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=11 
[CV] max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=11 
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=6, total=   3.0s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=6, total=   2.7s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=7, total=   2.6s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=7, total=   2.9s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=7, total=   2.8s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=9, total=   2.5s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=9, total=   2.4s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_samples_split=9, total=   2.5s
[CV]  max_depth=None, max_features=None, min_samples_leaf=4, min_sam

[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:   56.9s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 5, 10, None], 'max_features': [1, 'auto', None], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [5, 6, 7, 9, 11]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [10]:
f = grid_search.best_estimator_

pickle.dump(f, open('../model/rf_gs_pos_fs.sav', 'wb'))
print('Train score: {}'.format(f.score(X_train, y_train)))
print('Test score: {}'.format(f.score(X_test, y_test)))

Train score: 0.9850746268656716
Test score: 0.7738095238095238


In [11]:
# read data
df = pd.read_csv('../data/feature_selection_negative.csv', index_col=0)

# divide objective and target
objective = df.Subclass
le = preprocessing.LabelEncoder()
objective = le.fit_transform(objective)
features = df.drop('Subclass', axis=1)

# train test split
random_state=np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(
    features, 
    objective,
    test_size=0.2
)

In [12]:
param_grid = {
    'max_depth': [1, 5, 10, None],
    'max_features': [1, 'auto', None],
    'min_samples_leaf': [1, 2, 4,],
    'min_samples_split': [5, 6, 7, 9, 11],
}

grid_search = GridSearchCV(
    estimator = rf(), 
    param_grid = param_grid, 
    cv = 3, 
    n_jobs = -1, 
    verbose=1
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 328 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:    6.1s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [1, 5, 10, None], 'max_features': [1, 'auto', None], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [5, 6, 7, 9, 11]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [13]:
f = grid_search.best_estimator_

pickle.dump(f, open('../model/rf_gs_fs_neg.sav', 'wb'))
print('Train score: {}'.format(f.score(X_train, y_train)))
print('Test score: {}'.format(f.score(X_test, y_test)))

Train score: 0.9865951742627346
Test score: 0.6382978723404256
