In [1]:
import pandas as pd

# read the data
X_train = pd.read_csv('X23241.csv').values[:,1:]
y_train = pd.read_csv('Y23241.csv').values[:,1].ravel().astype(int)


In [2]:
from sklearn.model_selection import train_test_split

# split the data
X_train_sub, X_valid, y_train_sub, y_valid = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0])

Train/Valid/Test sizes: 23241 4649


In [5]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

#modeling and grid search
forest = RandomForestClassifier(n_estimators=100,
                                random_state=123)

param_grid =  {
    'max_depth': [1, 5, 10, 15, None],
    'criterion': ["gini", "entropy"]
}

gs = GridSearchCV(estimator=forest,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10)

gs.fit(X_train_sub, y_train_sub)

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=123),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 5, 10, 15, None]},
             scoring='accuracy')

In [6]:
print('Best Params: %s' % gs.best_params_) #best parameters

Best Params: {'criterion': 'entropy', 'max_depth': None}


In [7]:
print(f"Training Accuracy: {gs.best_estimator_.score(X_train_sub, y_train_sub)*100:0.3f}%")
print(f"Validation Accuracy: {gs.best_estimator_.score(X_valid, y_valid)*100:0.3f}%")

Training Accuracy: 100.000%
Validation Accuracy: 83.760%


In [46]:
forest_best = RandomForestClassifier(n_estimators=100,
                                     criterion="entropy",
                                     random_state=123)

%timeit forest_best.fit(X_train_sub, y_train_sub) #training(fitting) time
%timeit forest_best.score(X_valid, y_valid) #test time

342 ms ± 14.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
11.1 ms ± 297 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [47]:
# Compute with reduced data

X_train = pd.read_csv('NewX.csv').values
y_train = pd.read_csv('Y23241.csv').values[:,1].ravel().astype(int)

X_train_sub, X_valid, y_train_sub, y_valid = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

In [35]:
# Compute with reduced data

forest = RandomForestClassifier(n_estimators=100,
                                random_state=123)

param_grid =  {
    'max_depth': [1, 5, 10, 15, None],
    'criterion': ["gini", "entropy"]
}

gs = GridSearchCV(estimator=forest,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10)

gs.fit(X_train_sub, y_train_sub)

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=123),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 5, 10, 15, None]},
             scoring='accuracy')

In [36]:
print('Best Params: %s' % gs.best_params_)
print(f"Training Accuracy: {gs.best_estimator_.score(X_train_sub, y_train_sub)*100:0.3f}%")
print(f"Validation Accuracy: {gs.best_estimator_.score(X_valid, y_valid)*100:0.3f}%")

Best Params: {'criterion': 'entropy', 'max_depth': None}
Training Accuracy: 100.000%
Validation Accuracy: 84.169%


In [48]:
forest_best = RandomForestClassifier(n_estimators=100,
                                     criterion="entropy",
                                     random_state=123)

%timeit forest_best.fit(X_train_sub, y_train_sub) #training time
%timeit forest_best.score(X_valid, y_valid) #test time

404 ms ± 4.62 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
10.3 ms ± 664 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
