# Necessary Imports

In [58]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import metrics
from sklearn import preprocessing
from sklearn import ensemble
from dask.distributed import Client
from dask_ml import preprocessing as pre
import dask
import joblib
import xgboost as xgb

# Reading dataset

In [42]:
%%time
# Eagerly parse the whole training CSV into an in-memory pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="train.csv")
# Keep the preview as the cell's last expression so the notebook renders
# the first five rows.
df.head(n=5)

CPU times: user 1.19 s, sys: 42.5 ms, total: 1.24 s
Wall time: 1.59 s


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


## The pandas dataframe takes around 1.6 s of wall time to load the dataset

In [43]:
%%time
# dask.dataframe is imported in this cell (it is not part of the import cell
# at the top of the notebook), so the cell stays runnable on its own.
import dask.dataframe as dd

# Open the same CSV through Dask. Per the Dask documentation, read_csv only
# builds a lazy, partitioned DataFrame; no full parse happens on this line.
dask_df = dd.read_csv("train.csv")
# head() is the step that actually reads data; it is the cell's last
# expression so the first rows are rendered as output.
dask_df.head()

CPU times: user 275 ms, sys: 56.7 ms, total: 332 ms
Wall time: 2.37 s


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


## In this run the Dask dataframe cell takes around 2.37 s of wall time (332 ms of CPU time) to load the dataset and show its first rows

# Converting Categorical Features into Numerical Values

In [44]:
# Flag every column whose pandas dtype is `object`; those are the columns
# treated as categorical features below.
categorical_feature_mask = (df.dtypes == object)
# Walk the column labels alongside their flags and keep the flagged labels.
categorical_cols = [
    name
    for name, is_object in zip(df.columns, categorical_feature_mask)
    if is_object
]
print(categorical_cols)

['bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']


In [45]:
# One LabelEncoder instance is re-fitted for every categorical column in
# turn, so after this cell `lb` only remembers the classes of the last
# column in `categorical_cols`.
lb = preprocessing.LabelEncoder()
for column_name in categorical_cols:
    # Replace the column's values in `df` with their integer codes.
    df[column_name] = lb.fit_transform(df[column_name])
# Preview the encoded columns (first ten rows).
df[categorical_cols].head(10)

Unnamed: 0,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_1,ord_2,ord_3,ord_4,ord_5
0,1,1,1,5,5,3,0,78,120,491,1686,2175,2,1,7,3,136
1,1,1,1,4,3,5,2,159,510,260,650,11635,2,3,0,0,93
2,0,1,0,4,4,5,3,44,14,766,1932,8078,1,4,7,17,31
3,0,1,2,4,5,0,1,209,165,1121,629,6056,2,0,8,3,134
4,0,0,2,4,4,0,1,90,61,34,1760,8231,2,2,0,17,158
5,1,0,0,1,4,2,1,65,86,1213,1574,3832,4,2,9,4,53
6,1,0,1,4,1,1,2,169,110,928,65,5860,2,4,6,15,185
7,1,1,2,5,2,5,1,106,177,188,1318,11922,4,4,9,10,17
8,1,1,0,2,3,0,0,63,131,272,1416,2445,4,0,4,21,160
9,0,1,2,4,4,1,2,154,282,353,385,693,1,2,7,16,11


In [5]:
# Feature matrix: every column of `df` except the label.
# NOTE(review): only "target" is dropped, so the `id` column shown in the
# preview above is still part of the features — confirm this is intended.
x = df.drop(columns=["target"])
# Label vector.
y = df["target"]

# Splitting into Train and Test (Validation) Sets

In [6]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

# Simple Random Forest Model

In [7]:
%%time
# Baseline: a scikit-learn random forest with default hyper-parameters,
# trained without any Dask backend. The seed is fixed (same value as the
# train/test split) so the printed accuracy is reproducible between runs.
random_forest = ensemble.RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred_random_forest = random_forest.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred_random_forest)
print("The accuracy of Random Forest model:", acc)

The accuracy of Random Forest model: 0.7288686868686869
CPU times: user 58.2 s, sys: 236 ms, total: 58.4 s
Wall time: 58.7 s


# Grid Search

In [9]:
%%time
# Candidate values for the two hyper-parameters being tuned.
nEstimator = [1, 2, 3, 4, 5]
depth = [1, 2, 3, 4, 5]

# Seed the forest so every candidate is evaluated on reproducible trees.
RF = ensemble.RandomForestClassifier(random_state=42)
hyperParam = {'n_estimators': nEstimator, 'max_depth': depth}

# Exhaustive search over the 5 x 5 grid; n_jobs=-1 asks joblib to use all
# available local cores. No `scoring` is passed, so the estimator's default
# scorer is used.
grid_search = model_selection.GridSearchCV(RF, hyperParam, n_jobs=-1)
grid_search.fit(X_train, y_train)

24.1 s ± 621 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Dask Approach

In [26]:
# Create a dask.distributed Client with default arguments; the recorded
# output below shows it connected to a local scheduler with 4 workers.
client = Client()
# Bare expression so the notebook renders the client summary
# (scheduler address, dashboard URL, workers / cores / memory).
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 35715 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://127.0.0.1:37459  Dashboard: http://127.0.0.1:35715/status,Cluster  Workers: 4  Cores: 4  Memory: 8.27 GB


## This shows the current cluster configuration that Dask is going to use

In [11]:
%%time
# Same random forest as the baseline cell, but fitted through joblib's Dask
# backend so the tree-building tasks run on the Dask cluster. The seed is
# fixed so the printed accuracy is reproducible between runs.
rf = ensemble.RandomForestClassifier(random_state=42)

# `scatter` sends the training data to the workers ahead of time instead of
# shipping it along with every task.
with joblib.parallel_backend('dask', scatter=[X_train, y_train]):
    rf.fit(X_train, y_train)

# Prediction and scoring happen outside the Dask backend context.
y_pred = rf.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("The accuracy of Random Forest model:", acc)

The accuracy of Random Forest model: 0.7286262626262626
CPU times: user 11.8 s, sys: 1.85 s, total: 13.6 s
Wall time: 38.2 s


In [15]:
%%time
# Candidate values for the two hyper-parameters being tuned (same grid as
# the sequential grid-search cell).
nEstimator = [1, 2, 3, 4, 5]
depth = [1, 2, 3, 4, 5]

# Seed the forest so every candidate is evaluated on reproducible trees.
RF = ensemble.RandomForestClassifier(random_state=42)
hyperParam = [{'n_estimators': nEstimator, 'max_depth': depth}]

# NOTE(review): this search ranks candidates by weighted F1, whereas the
# sequential grid search above passes no `scoring` and therefore uses the
# estimator's default scorer — confirm the two runs are meant to differ.
grid_search = model_selection.GridSearchCV(RF, hyperParam, scoring='f1_weighted')

# Run the cross-validation fits on the Dask cluster; `scatter` sends the
# training data to the workers ahead of time.
with joblib.parallel_backend('dask', scatter=[X_train, y_train]):
    grid_search.fit(X_train, y_train)

CPU times: user 6.89 s, sys: 754 ms, total: 7.65 s
Wall time: 30.3 s
