In [1]:
#Getting the data from openml
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784')
cifar = fetch_openml('CIFAR_10_small')

In [11]:
#Save dataset in pickle just in case my computer decides to pull a fast one and restart on me
import pickle

pickle.dump(mnist, open( "mnist.p", "wb" ) )
pickle.dump(cifar, open( "cifar.p", "wb" ) )

In [1]:
import pickle

mnist = pickle.load( open("mnist.p", "rb"))
cifar = pickle.load( open("cifar.p", "rb"))

In [2]:
import pandas as pd

X = pd.DataFrame(mnist.data)
Y = pd.DataFrame(mnist.target)
print(X.shape)
print(Y.shape)

(70000, 784)
(70000, 1)


In [3]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.25,random_state=42)
x_reduced, y_reduced = resample(X,Y,replace=False,n_samples=10000,random_state=42)
x_train_reduced, x_test_reduced, y_train_reduced, y_test_reduced = train_test_split(x_reduced,y_reduced,test_size=0.25,random_state=42)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

forest = RandomForestClassifier()
gradientboosting = XGBClassifier()



In [5]:
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
import numpy as np

In [6]:
params1 = {'n_estimators' : np.arange(10,250),
          'criterion' : ['gini', 'entropy'],
          'min_samples_split' : np.arange(2,10),
          'max_depth' : np.arange(3,15),
          }

params2 = {'n_estimators': [10],
              'learning_rate': stats.uniform(0.01, 1),
              'subsample': stats.uniform(0.3, 0.7),
              'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
              'colsample_bytree': stats.uniform(0.5, 0.45),
              'min_child_weight': [1, 2, 3]
             }

In [53]:
clf1 = RandomizedSearchCV(forest,params1,n_iter=100,cv=5,verbose=10,random_state=42,n_jobs=-1)
clf1 = clf1.fit(x_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:  4.9min
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed:  9.5min
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed: 1

In [54]:
print(clf1.best_params_)
print(clf1.score(x_test,y_test)) #MNIST, Random Forest

{'random_state': 42, 'n_estimators': 102, 'min_samples_split': 4, 'max_depth': 14, 'criterion': 'entropy'}
0.9614285714285714


In [8]:
clf2 = RandomizedSearchCV(gradientboosting,params2,n_iter=24,cv=5,verbose=10,random_state=42,n_jobs=-1)
clf2 = clf2.fit(x_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed: 17.7min
[Parallel(n_jobs=-1)]: Done  86 out of 120 | elapsed: 21.4min remaining:  8.4min
[Parallel(n_jobs=-1)]: Done  99 out of 120 | elapsed: 25.0min remaining:  5.3min
[Parallel(n_jobs=-1)]: Done 112 out of 120 | elapsed: 26.2min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 26.9min finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [9]:
print(clf2.best_params_)
print(clf2.score(x_test,y_test)) #MNIST, Grad Boosting

{'colsample_bytree': 0.5024849527056211, 'learning_rate': 0.8254614284548342, 'max_depth': 19, 'min_child_weight': 3, 'n_estimators': 10, 'subsample': 0.8531228783718439}
0.9568571428571429


In [23]:
X = pd.DataFrame(cifar.data)
Y = pd.DataFrame(cifar.target)
x_train, x_test, y_train, y_test = train_test_split(X,Y,test_size=0.25,random_state=42)

In [62]:
clf3 = RandomizedSearchCV(forest,params1,n_iter=100,cv=5,verbose=10,random_state=42,n_jobs=-1)
clf3 = clf3.fit(x_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   12.4s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   28.9s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done  65 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed: 15.3min
[Parallel(n_jobs=-1)]: Done 194 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 217 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done 240 tasks      | elapsed: 2

In [63]:
print(clf3.best_params_)
print(clf3.score(x_test,y_test)) #CIFAR, Random Forest

{'random_state': 42, 'n_estimators': 187, 'min_samples_split': 4, 'max_depth': 14, 'criterion': 'gini'}
0.4384


In [31]:
clf4 = RandomizedSearchCV(gradientboosting,params2,n_iter=1,cv=5,verbose=10,random_state=42,n_jobs=-1)
clf4 = clf4.fit(x_train,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  6.6min remaining: 10.0min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  6.6min remaining:  4.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.7min finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [26]:
print(clf4.best_params_)
print(clf4.score(x_test,y_test)) #CIFAR, Grad boosting

{'colsample_bytree': 0.6236248068455289, 'learning_rate': 0.5712434258477012, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 10, 'subsample': 0.9801984667723727}
0.4066
