CSYE7105 High Performance Parallel Machine Learning and AI

Instructor: Dr. Handan Liu

Example: Running scikit-learn's GridSearchCV in parallel (comparing wall-clock time for different n_jobs settings)

In [1]:
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.datasets import make_classification
from time import time

In [2]:
# Synthetic classification dataset: 10,000 samples with 50 features each.
# random_state=0 fixes the generator seed so the same data is produced on
# every run.
X_train, y_train = make_classification(
    n_samples=10_000,
    n_features=50,
    random_state=0,
)

In [3]:
# Base estimator for the grid search. random_state pins the randomness of the
# tree ensemble so that cross-validation scores are repeatable between runs
# (the original left it unseeded, so every run produced different scores).
model = ExtraTreesClassifier(class_weight='balanced', random_state=0)

# Hyper-parameter grid to search exhaustively:
# 2 criteria x 3 min_samples_split values x 3 max_depth values = 18 candidates.
parameters = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 4, 8],
    'max_depth': [3, 10, 20],
}

In [4]:
# Time a full grid search with 4 parallel workers (n_jobs=4).
#
# The StratifiedKFold splitter is seeded: with shuffle=True and no
# random_state, each run would draw different folds, so scores (and, to a
# lesser degree, timings) would not be comparable between runs or between
# the different n_jobs settings tried in this notebook.
start = time()
clf = GridSearchCV(
    model,
    parameters,
    verbose=3,  # log one line per completed fit (score and fit time)
    scoring='roc_auc',
    cv=StratifiedKFold(shuffle=True, random_state=0),
    n_jobs=4,
)
clf.fit(X_train, y_train)
elapsed = time() - start  # wall-clock seconds for the whole search
print('Elasped time: ', elapsed)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END criterion=gini, max_depth=3, min_samples_split=2;, score=0.914 total time=   1.2s
[CV 2/5] END criterion=gini, max_depth=3, min_samples_split=2;, score=0.909 total time=   1.1s
[CV 4/5] END criterion=gini, max_depth=3, min_samples_split=2;, score=0.896 total time=   1.1s
[CV 3/5] END criterion=gini, max_depth=3, min_samples_split=2;, score=0.903 total time=   1.3s
[CV 5/5] END criterion=gini, max_depth=3, min_samples_split=2;, score=0.905 total time=   1.1s
[CV 2/5] END criterion=gini, max_depth=3, min_samples_split=4;, score=0.889 total time=   1.1s
[CV 1/5] END criterion=gini, max_depth=3, min_samples_split=4;, score=0.915 total time=   1.1s
[CV 3/5] END criterion=gini, max_depth=3, min_samples_split=4;, score=0.894 total time=   1.3s
[CV 4/5] END criterion=gini, max_depth=3, min_samples_split=4;, score=0.910 total time=   1.1s
[CV 5/5] END criterion=gini, max_depth=3, min_samples_split=4;, score=0.925 total ti

In [4]:
# Time the same grid search with 8 parallel workers (n_jobs=8), without
# per-fit logging.
#
# The StratifiedKFold splitter is seeded: with shuffle=True and no
# random_state, each run would draw different folds, so this timing would be
# measured on different splits than the other n_jobs settings.
start = time()
clf = GridSearchCV(
    model,
    parameters,
    scoring='roc_auc',
    cv=StratifiedKFold(shuffle=True, random_state=0),
    n_jobs=8,
)
clf.fit(X_train, y_train)
elapsed = time() - start  # wall-clock seconds for the whole search
print('Elasped time: ', elapsed)

Elasped time:  76.2965099811554


In [None]:
# Time the same grid search without passing n_jobs, so GridSearchCV falls back
# to its default parallelism setting.
# NOTE(review): presumably this is the serial baseline for the n_jobs=4 / 8
# runs — confirm against the scikit-learn docs for the installed version.
#
# The StratifiedKFold splitter is seeded: with shuffle=True and no
# random_state, each run would draw different folds, so this timing would be
# measured on different splits than the parallel runs.
start = time()
clf = GridSearchCV(
    model,
    parameters,
    scoring='roc_auc',
    cv=StratifiedKFold(shuffle=True, random_state=0),
)
clf.fit(X_train, y_train)
elapsed = time() - start  # wall-clock seconds for the whole search
print('Elasped time: ', elapsed)