
-Replicate all the data cleaning and machine learning parts of your supervised learning project using Dask counterparts. 
-Instead of Pandas dataframes, you should use Dask dataframes whenever possible.
-Instead of NumPy arrays, you should use Dask array whenever possible.
-You should parallelize your model trainings using Dask.

In [145]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import roc_auc_score

import joblib
from dask.distributed import Client, progress
from dask_ml.model_selection import train_test_split
import dask.dataframe as dd
import warnings
warnings.filterwarnings("ignore")

In [124]:
client = Client(n_workers=4, threads_per_worker=2, memory_limit='2GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:64128  Dashboard: http://127.0.0.1:64127/status,Cluster  Workers: 4  Cores: 8  Memory: 8.00 GB


# Explore the data

In [125]:
df = dd.read_csv(r'C:\Users\ojiang1\Desktop\Data Science\Capstone2_Supervised learning\PlacementDataFullClass.csv')

In [126]:
df

Unnamed: 0_level_0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,int64,object,float64,object,float64,object,object,float64,object,object,float64,object,float64,object,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [127]:
df.head()

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.0,Others,91.0,Others,Commerce,58.0,Sci&Tech,No,55.0,Mkt&HR,58.8,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.0,Central,68.0,Central,Arts,64.0,Comm&Mgmt,No,75.0,Mkt&Fin,57.8,Placed,250000.0
3,4,M,56.0,Central,52.0,Central,Science,52.0,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.8,Central,73.6,Central,Commerce,73.3,Comm&Mgmt,No,96.8,Mkt&Fin,55.5,Placed,425000.0


In [29]:
df.degree_t.unique().compute()

0     Sci&Tech
1    Comm&Mgmt
2       Others
Name: degree_t, dtype: object

In [128]:
df.groupby(['status']).mean().compute()

Unnamed: 0_level_0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Not Placed,23652.977612,57.54403,58.395522,61.134179,69.58791,61.612836,
Placed,23649.378378,71.721486,69.926554,68.740541,73.238041,62.579392,288655.405405


In [31]:
df_placed = df[(df['status'] == 'Placed')].compute()
df_notplaced = df[(df['status'] == 'Not Placed')].compute()


In [129]:
df.describe().compute()

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p,salary
count,47300.0,47300.0,47300.0,47300.0,47300.0,47300.0,32560.0
mean,23650.5,67.303395,66.333163,66.370186,72.100558,62.278186,288655.405405
std,13654.478203,10.802111,10.872252,7.341688,13.245186,5.819864,93142.612972
min,1.0,40.89,37.0,50.0,50.0,51.21,200000.0
25%,11825.75,60.4,60.8,61.0,60.0,57.9,240000.0
50%,23650.5,67.0,65.0,66.0,71.0,62.0,265000.0
75%,35475.25,76.0,73.0,72.0,84.0,66.28,300000.0
max,47300.0,89.4,97.7,91.0,98.0,77.89,940000.0


In [80]:
df.isnull().sum().compute()

sl_no                 0
gender                0
ssc_p                 0
ssc_b                 0
hsc_p                 0
hsc_b                 0
hsc_s                 0
degree_p              0
degree_t              0
workex                0
etest_p               0
specialisation        0
mba_p                 0
status                0
salary            68608
dtype: int64

the data looks pretty good that only salary has null values.

this affect my question asked earlier-instead of asking:
Which factor influenced a candidate's salary?
i would rather ask:
Which factor influenced a candidate in getting placed?

i will drop salary in the original data set, since it does not affect status.

In [130]:
df=df.drop(['salary'],axis=1)

In [131]:
# Numeric Columns
numeric_columns = df.select_dtypes(['int64', 'float64']).columns
print(numeric_columns)
print(f"The number of numerical columns is {len(numeric_columns)}")

Index(['sl_no', 'ssc_p', 'hsc_p', 'degree_p', 'etest_p', 'mba_p'], dtype='object')
The number of numerical columns is 6


In [132]:
# NON-Numeric Columns
non_numeric_columns = df.select_dtypes(['object']).columns
non_numeric_columns
len(non_numeric_columns) #8
non_numeric_columns

Index(['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex',
       'specialisation', 'status'],
      dtype='object')

In [98]:
features = ['sl_no', 'gender', 'ssc_p', 'ssc_b', 'hsc_p', 'hsc_b', 'hsc_s',
       'degree_p', 'degree_t', 'workex', 'etest_p', 'specialisation', 'mba_p',
      'status']
len(features)


14

In [99]:
df[numeric_columns].corr().compute()

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,etest_p,mba_p
sl_no,1.0,-7.6e-05,-8.4e-05,-8.6e-05,6.2e-05,2.2e-05
ssc_p,-7.6e-05,1.0,0.511472,0.538404,0.261993,0.388478
hsc_p,-8.4e-05,0.511472,1.0,0.434206,0.245113,0.354823
degree_p,-8.6e-05,0.538404,0.434206,1.0,0.22447,0.402364
etest_p,6.2e-05,0.261993,0.245113,0.22447,1.0,0.218055
mba_p,2.2e-05,0.388478,0.354823,0.402364,0.218055,1.0


In [133]:
# transfer non numerical to dummies
from dask_ml.preprocessing import DummyEncoder, Categorizer

non_numeric_columns = ['gender', 'ssc_b', 'hsc_b', 'hsc_s', 'degree_t', 'workex',
                       'specialisation','status']

 
categorizer = Categorizer(columns=non_numeric_columns)
categorizer = categorizer.fit(df)

result = categorizer.transform(df)
result.dtypes


sl_no                int64
gender            category
ssc_p              float64
ssc_b             category
hsc_p              float64
hsc_b             category
hsc_s             category
degree_p           float64
degree_t          category
workex            category
etest_p            float64
specialisation    category
mba_p              float64
status            category
dtype: object

In [134]:
dummyencoder = DummyEncoder(columns=non_numeric_columns)
dummyencoder = dummyencoder.fit(result)
dummy_encoded = dummyencoder.transform(result)

In [135]:
dummy_encoded.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 24 entries, sl_no to status_Not Placed
dtypes: float64(5), int64(1), uint8(18)

In [136]:
dummy_encoded.dtypes

sl_no                       int64
ssc_p                     float64
hsc_p                     float64
degree_p                  float64
etest_p                   float64
mba_p                     float64
gender_M                    uint8
gender_F                    uint8
ssc_b_Others                uint8
ssc_b_Central               uint8
hsc_b_Others                uint8
hsc_b_Central               uint8
hsc_s_Commerce              uint8
hsc_s_Science               uint8
hsc_s_Arts                  uint8
degree_t_Sci&Tech           uint8
degree_t_Comm&Mgmt          uint8
degree_t_Others             uint8
workex_No                   uint8
workex_Yes                  uint8
specialisation_Mkt&HR       uint8
specialisation_Mkt&Fin      uint8
status_Placed               uint8
status_Not Placed           uint8
dtype: object

In [137]:
Y=dummy_encoded.status_Placed 
X=dummy_encoded.drop("status_Placed", axis=1)

X.info()

<class 'dask.dataframe.core.DataFrame'>
Columns: 23 entries, sl_no to status_Not Placed
dtypes: float64(5), int64(1), uint8(17)

# Model prep

In [138]:
# split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)


X_train.persist()
X_test.persist()
y_train.persist()
y_test.persist()

Dask Series Structure:
npartitions=1
    uint8
      ...
Name: status_Placed, dtype: uint8
Dask Name: split, 1 tasks

In [139]:
# define result
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score, accuracy_score

def print_result(model, X_test, y_test):
    
    y_pred = model.predict(X_test.values.compute()) #prediction
    train_score = model.score(X_train.values.compute(),y_train.values.compute())
    test_score = model.score(X_test.values.compute(),y_test.values.compute())
    accuracy = metrics.accuracy_score(y_test.values.compute(), y_pred)
    recall = recall_score(y_test.values.compute(),y_pred) 
    precision = precision_score(y_test.values.compute(), y_pred)
    score = cross_val_score(model, X_train.values.compute(),y_train.values.compute(), cv=5)

    

     
    print('train_score:%.3f'%train_score)
    print('test_score:%.3f'%test_score)
    print('accuracy:%.3f'%accuracy)
    print('recall:%.3f'%recall)
    print('precision:%.3f'%precision)
    print('Cross validation Score is ', score.mean())


# 1. Logistic Regression


In [120]:
lr = LogisticRegression(solver='lbfgs', max_iter=1000, penalty='l2')

with joblib.parallel_backend('dask'):
    lr.fit(X_train.compute(), y_train.compute())

print_result(lr, X_test, y_test)

train_score:0.855
test_score:0.857
accuracy:0.857
recall:0.921
precision:0.877
Cross validation Score is  0.9299733093190182


# 2. KNN

In [140]:
knn = KNeighborsClassifier(n_neighbors=5)

with joblib.parallel_backend('dask'):
    knn.fit(X_train.compute(), y_train.compute())

print_result(knn, X_test, y_test)

train_score:0.862
test_score:0.773
accuracy:0.773
recall:0.901
precision:0.794
Cross validation Score is  0.5007489456127203


In [142]:
knn = KNeighborsClassifier(n_neighbors=15, weights = 'distance')

with joblib.parallel_backend('dask'):
    knn.fit(X_train.compute(), y_train.compute())

print_result(knn, X_test, y_test)

train_score:1.000
test_score:0.775
accuracy:0.775
recall:0.950
precision:0.772
Cross validation Score is  0.5007224940851446


# 3. Decision Tree

In [146]:
decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=1,
    max_depth=5,
    random_state = 1337
)

with joblib.parallel_backend('dask'):
    decision_tree.fit(X_train.compute(), y_train.compute())

print_result(decision_tree, X_test, y_test)


train_score:0.722
test_score:0.717
accuracy:0.717
recall:0.859
precision:0.759
Cross validation Score is  0.6333150866777373


In [147]:
# find best param

param_grid = {'max_depth':[3, 5, 7, 9, 15, 20],'max_features':[1,5,9,15]}

CV_rfc = GridSearchCV(estimator=tree.DecisionTreeClassifier(), param_grid=param_grid, cv= 5)

with joblib.parallel_backend('dask'):
    CV_rfc.fit(X_train.compute(), y_train.compute())

print(CV_rfc.best_params_)

{'max_depth': 3, 'max_features': 15}


In [148]:
# now fit the result from above to the model

decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=15,
    max_depth=3,
    random_state = 1337
)

with joblib.parallel_backend('dask'):
    decision_tree.fit(X_train.compute(), y_train.compute())

print_result(decision_tree, X_test, y_test)


#overfitting

train_score:1.000
test_score:1.000
accuracy:1.000
recall:1.000
precision:1.000
Cross validation Score is  1.0


# 4. Random forest

In [150]:
rfc = RandomForestClassifier(n_estimators = 1000, random_state = 456)

with joblib.parallel_backend('dask'):
    rfc.fit(X_train.compute(), y_train.compute())
    
print_result(rfc, X_test, y_test)

#overfitting

train_score:1.000
test_score:1.000
accuracy:1.000
recall:1.000
precision:1.000
Cross validation Score is  1.0


# 5. Gradient Boosting

In [152]:
params = {'n_estimators': 500,
          'max_depth': 2,
          'loss': 'deviance'}

clf = GradientBoostingClassifier(**params)

with joblib.parallel_backend('dask'):
    clf.fit(X_train.compute(), y_train.compute())
    
print_result(clf, X_test, y_test)

#overfitting

train_score:1.000
test_score:1.000
accuracy:1.000
recall:1.000
precision:1.000
Cross validation Score is  1.0


In [156]:
params = {'n_estimators': 500,
          'max_depth': 3,
          'loss': 'deviance',
         'learning_rate': 0.2}

clf = GradientBoostingClassifier(**params)

with joblib.parallel_backend('dask'):
    clf.fit(X_train.compute(), y_train.compute())

print_result(clf, X_test, y_test)

train_score:1.000
test_score:1.000
accuracy:1.000
recall:1.000
precision:1.000
Cross validation Score is  1.0


Logistic Regression Classifier is the best performer due to:

1. smaller diff of train score and test score, meaning less overfit
2. higher cross validation score, meaning the model is more efficient
3. high accuracy, recall, and precision score