In [1]:
# Importing required Libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

In [2]:
# Loading the dataset

df = pd.read_csv('emp_turn_over.csv').rename(columns={'sales': 'job_type','average_montly_hours': 'average_monthly_hours'})

# Display first 5 rows of data

print("Shape of Dataset :",df.shape)
df.head()

Shape of Dataset : (14999, 10)


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,job_type,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [3]:
# Function returning Categorical columns

def cat_cols(df):
    return list(df.select_dtypes(include='object').columns)

print("Categorical Columns :\n",cat_cols(df))

# Function returning Numerical columns

def num_cols(df):
    return list(df.select_dtypes(include=np.number).columns)

print("\nNumerical Columns :\n",num_cols(df))


Categorical Columns :
 ['job_type', 'salary']

Numerical Columns :
 ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours', 'time_spend_company', 'Work_accident', 'left', 'promotion_last_5years']


In [4]:
# Info about data

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_monthly_hours    14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
job_type                 14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [5]:
# Statistics of the data

df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


# Missing Values

In [6]:
# Function for viewing number of missing values in a column

def missing_values(df):
    missing = df.isnull().sum()
    percent = df.isnull().sum()/len(df)*100
    return pd.DataFrame({'Missing Count':missing,'Percentage':percent})

missing_values(df)

Unnamed: 0,Missing Count,Percentage
satisfaction_level,0,0.0
last_evaluation,0,0.0
number_project,0,0.0
average_monthly_hours,0,0.0
time_spend_company,0,0.0
Work_accident,0,0.0
left,0,0.0
promotion_last_5years,0,0.0
job_type,0,0.0
salary,0,0.0


# Preprocessing

In [7]:
# Checking all departments

df.job_type.value_counts()

sales          4140
technical      2720
support        2229
IT             1227
product_mng     902
marketing       858
RandD           787
accounting      767
hr              739
management      630
Name: job_type, dtype: int64

In [8]:
# Combining support, IT and technical job types into a single type i.e. "Technical"

df['job_type'] = np.where(df['job_type']=='support','technical',df['job_type'])
df['job_type'] = np.where(df['job_type']=='IT','technical',df['job_type'])

df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_monthly_hours,time_spend_company,Work_accident,left,promotion_last_5years,job_type,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [9]:
# Splitting Data into features and target

X = df.drop(columns=['left'])
y = df.left

# Train Test Split of data

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0,stratify=y)

In [10]:
# Encoding training and test features

X_train = pd.get_dummies(X_train,drop_first=True)
X_test = pd.get_dummies(X_test,drop_first=True)

In [11]:
# Scaling the features 

scalar = MinMaxScaler()

X_train_scaled = scalar.fit_transform(X_train)
X_test_scaled = scalar.transform(X_test)

# Model  Building

In [12]:
# Logistic Regression

logistic = LogisticRegression()
logistic.fit(X_train_scaled,y_train)
logistic_pred = logistic.predict(X_test_scaled)
print("Accuracy Score : ",accuracy_score(y_test,logistic_pred))
print("\nF1 Score : ",f1_score(y_test,logistic_pred))
print("\nConfusion Matrix : \n",confusion_matrix(y_test,logistic_pred))


Accuracy Score :  0.7913333333333333

F1 Score :  0.4499121265377856

Confusion Matrix : 
 [[3177  252]
 [ 687  384]]


In [13]:
# List of scorinng parameters available fro cross validation

sorted(sklearn.metrics.SCORERS.keys())

<IPython.core.display.Javascript object>

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_ovr',
 'roc_auc_ovr_weighted',
 'v_measure_score']

In [14]:
cross_val_score(estimator=logistic, X=X_train_scaled, y=y_train, scoring='accuracy', cv=5)

array([0.79666667, 0.79666667, 0.79428571, 0.79619048, 0.80276322])

In [15]:
# SVC with linear kernel

svc = SVC(kernel='linear')
svc.fit(X_train_scaled, y_train)
svc_pred = svc.predict(X_test_scaled)
print("Accuracy Score : ",accuracy_score(y_test, svc_pred))
print("\nF1 Score : ",f1_score(y_test, svc_pred))
print("\nConfusion Matrix : \n",confusion_matrix(y_test, svc_pred))


Accuracy Score :  0.7751111111111111

F1 Score :  0.3487773487773488

Confusion Matrix : 
 [[3217  212]
 [ 800  271]]


In [16]:
# SVC with rbf kernel

svc = SVC(kernel='rbf')
svc.fit(X_train_scaled, y_train)
svc_pred = svc.predict(X_test_scaled)
print("Accuracy Score : ",accuracy_score(y_test, svc_pred))
print("\nF1 Score : ",f1_score(y_test, svc_pred))
print("\nConfusion Matrix : \n",confusion_matrix(y_test, svc_pred))


Accuracy Score :  0.9397777777777778

F1 Score :  0.8722300801508722

Confusion Matrix : 
 [[3304  125]
 [ 146  925]]


In [17]:
# SVC with poly kernel

svc = SVC(kernel='poly')
svc.fit(X_train_scaled, y_train)
svc_pred = svc.predict(X_test_scaled)
print("Accuracy Score : ",accuracy_score(y_test, svc_pred))
print("\nF1 Score : ",f1_score(y_test, svc_pred))
print("\nConfusion Matrix : \n",confusion_matrix(y_test, svc_pred))


Accuracy Score :  0.9446666666666667

F1 Score :  0.882158069096072

Confusion Matrix : 
 [[3319  110]
 [ 139  932]]


In [18]:
# Decision Tree

dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train,y_train)
dt_pred = dt.predict(X_test)
print("Accuracy Score : ",accuracy_score(y_test, dt_pred))
print("\nF1 Score : ",f1_score(y_test, dt_pred))
print("\nConfusion Matrix : \n",confusion_matrix(y_test, dt_pred))


Accuracy Score :  0.9564444444444444

F1 Score :  0.9100091827364555

Confusion Matrix : 
 [[3313  116]
 [  80  991]]


In [19]:
# Trainng Random Forest classifier 

rf = RandomForestClassifier()
%timeit rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print("Accuracy Score : ",accuracy_score(y_test, rf_pred))
print("\nF1 Score : ",f1_score(y_test, rf_pred))
print("\nConfusion Matrix : \n",confusion_matrix(y_test, rf_pred))

895 ms ± 37.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Accuracy Score :  0.9924444444444445

F1 Score :  0.9839319470699434

Confusion Matrix : 
 [[3425    4]
 [  30 1041]]


In [20]:
# Cross Value score for Random Forest

cross_val_score(estimator=rf, X=X_train, y=y_train, scoring='accuracy', cv=5)

array([0.98619048, 0.98714286, 0.99      , 0.98904762, 0.9890424 ])

## Feature Selection

In [21]:
# Feature Selection using RFE

model = RandomForestClassifier()
rfe = RFE(model,10)
rfe.fit(X_train,y_train)
print("RFE Support :", rfe.support_)
selected_columns = pd.Series(index=X_train.columns,data=rfe.ranking_).sort_values()
selected_columns = list(selected_columns[selected_columns==1].index)
print(selected_columns)

RFE Support : [ True  True  True  True  True  True False False False False False False
  True  True  True  True]
['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours', 'time_spend_company', 'Work_accident', 'job_type_sales', 'job_type_technical', 'salary_low', 'salary_medium']


In [22]:
# Trainng Random Forest classifier with selected features from RFE

rf = RandomForestClassifier()
%timeit rf.fit(X_train[selected_columns], y_train)
rf_pred = rf.predict(X_test[selected_columns])
print("Accuracy Score : ",accuracy_score(y_test, rf_pred))
print("\nF1 Score : ",f1_score(y_test, rf_pred))
print("\nConfusion Matrix : \n",confusion_matrix(y_test, rf_pred))

827 ms ± 18.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Accuracy Score :  0.9922222222222222

F1 Score :  0.983451536643026

Confusion Matrix : 
 [[3425    4]
 [  31 1040]]


## Hyperparameter Tuning

In [23]:
# Grid Search/ Randomized Search Parameters

# Creating a Random Forest Classifier Object

random = RandomForestClassifier()

# Parameters for Hyperparameter Tuning

n_est = [i for i in range(100,501,50)]
criteria = ['gini','entropy']
max_depth = [i for i in range(2,15)]
min_samples_split = [2,3,5]
min_samples_leaf = [1,2,3]
bootstrap = [True,False]
max_features = ['auto','sqrt','log2',None]

# Creating a paramter grid for Grid Search

params_grid = {'n_estimators': n_est,
              'criterion': criteria,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf':min_samples_leaf,
              'bootstrap':bootstrap,
              'max_features':max_features,
              'n_jobs':[-1]}


In [24]:
# Randomized Search for Hyperparamter tuning

random_search = RandomizedSearchCV(estimator=random,param_distributions=params_grid,
                                   n_iter=200,scoring='accuracy',cv=3,verbose=2)
random_search.fit(X_train[selected_columns],y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits
[CV] n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=entropy, bootstrap=False 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=entropy, bootstrap=False, total=   2.1s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=entropy, bootstrap=False 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.0s remaining:    0.0s


[CV]  n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=entropy, bootstrap=False, total=   1.0s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=entropy, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=entropy, bootstrap=False, total=   0.9s
[CV] n_jobs=-1, n_estimators=500, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=4, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=500, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=4, criterion=entropy, bootstrap=True, total=   1.4s
[CV] n_jobs=-1, n_estimators=500, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=4, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=500, min_samples_split=5, min_samples_leaf=1, max_features=auto, ma

[CV]  n_jobs=-1, n_estimators=350, min_samples_split=5, min_samples_leaf=3, max_features=log2, max_depth=9, criterion=entropy, bootstrap=True, total=   1.3s
[CV] n_jobs=-1, n_estimators=450, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=13, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=450, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=13, criterion=entropy, bootstrap=True, total=   1.7s
[CV] n_jobs=-1, n_estimators=450, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=13, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=450, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=13, criterion=entropy, bootstrap=True, total=   1.9s
[CV] n_jobs=-1, n_estimators=450, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=13, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=450, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max

[CV]  n_jobs=-1, n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=7, criterion=entropy, bootstrap=True, total=   0.7s
[CV] n_jobs=-1, n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=7, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=7, criterion=entropy, bootstrap=True, total=   0.7s
[CV] n_jobs=-1, n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=7, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=7, criterion=entropy, bootstrap=True, total=   0.7s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=3, max_features=None, max_depth=8, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=3, max_features=None, max_depth=8

[CV]  n_jobs=-1, n_estimators=500, min_samples_split=2, min_samples_leaf=3, max_features=None, max_depth=3, criterion=gini, bootstrap=True, total=   1.7s
[CV] n_jobs=-1, n_estimators=500, min_samples_split=2, min_samples_leaf=3, max_features=None, max_depth=3, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=500, min_samples_split=2, min_samples_leaf=3, max_features=None, max_depth=3, criterion=gini, bootstrap=True, total=   1.8s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=3, max_features=auto, max_depth=12, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=3, max_features=auto, max_depth=12, criterion=entropy, bootstrap=True, total=   0.5s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=3, max_features=auto, max_depth=12, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=3, max_features=auto, max_depth=12, 

[CV]  n_jobs=-1, n_estimators=300, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=4, criterion=gini, bootstrap=False, total=   0.8s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=14, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=14, criterion=gini, bootstrap=False, total=   0.5s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=14, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=14, criterion=gini, bootstrap=False, total=   0.5s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=14, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=14, c

[CV]  n_jobs=-1, n_estimators=350, min_samples_split=3, min_samples_leaf=3, max_features=auto, max_depth=6, criterion=entropy, bootstrap=True, total=   1.2s
[CV] n_jobs=-1, n_estimators=350, min_samples_split=3, min_samples_leaf=3, max_features=auto, max_depth=6, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=350, min_samples_split=3, min_samples_leaf=3, max_features=auto, max_depth=6, criterion=entropy, bootstrap=True, total=   1.2s
[CV] n_jobs=-1, n_estimators=350, min_samples_split=3, min_samples_leaf=3, max_features=auto, max_depth=6, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=350, min_samples_split=3, min_samples_leaf=3, max_features=auto, max_depth=6, criterion=entropy, bootstrap=True, total=   1.2s
[CV] n_jobs=-1, n_estimators=200, min_samples_split=3, min_samples_leaf=3, max_features=auto, max_depth=5, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=200, min_samples_split=3, min_samples_leaf=3, max_features=auto, max_dept

[CV]  n_jobs=-1, n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=14, criterion=entropy, bootstrap=True, total=   0.8s
[CV] n_jobs=-1, n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=14, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=200, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=14, criterion=entropy, bootstrap=True, total=   0.8s
[CV] n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features=None, max_depth=5, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features=None, max_depth=5, criterion=entropy, bootstrap=True, total=   1.8s
[CV] n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features=None, max_depth=5, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features=None, max_d

[CV]  n_jobs=-1, n_estimators=400, min_samples_split=5, min_samples_leaf=2, max_features=None, max_depth=6, criterion=entropy, bootstrap=True, total=   2.1s
[CV] n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=3, max_features=auto, max_depth=3, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=3, max_features=auto, max_depth=3, criterion=gini, bootstrap=True, total=   1.1s
[CV] n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=3, max_features=auto, max_depth=3, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=3, max_features=auto, max_depth=3, criterion=gini, bootstrap=True, total=   1.1s
[CV] n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=3, max_features=auto, max_depth=3, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=3, max_features=auto, max_depth=3, criterion=

[CV]  n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True, total=   0.4s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True, total=   0.4s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=3, criterion=entropy, bootstrap=True, total=   0.4s
[CV] n_jobs=-1, n_estimators=500, min_samples_split=5, min_samples_leaf=3, max_features=None, max_depth=3, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=500, min_samples_split=5, min_samples_leaf=3, max_features=None, max_depth=

[CV]  n_jobs=-1, n_estimators=300, min_samples_split=2, min_samples_leaf=3, max_features=auto, max_depth=10, criterion=entropy, bootstrap=False, total=   1.2s
[CV] n_jobs=-1, n_estimators=300, min_samples_split=2, min_samples_leaf=3, max_features=auto, max_depth=10, criterion=entropy, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=300, min_samples_split=2, min_samples_leaf=3, max_features=auto, max_depth=10, criterion=entropy, bootstrap=False, total=   1.3s
[CV] n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=5, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=5, criterion=gini, bootstrap=True, total=   1.3s
[CV] n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=5, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=5

[CV]  n_jobs=-1, n_estimators=150, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=12, criterion=gini, bootstrap=True, total=   0.7s
[CV] n_jobs=-1, n_estimators=300, min_samples_split=3, min_samples_leaf=2, max_features=auto, max_depth=9, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=300, min_samples_split=3, min_samples_leaf=2, max_features=auto, max_depth=9, criterion=gini, bootstrap=True, total=   1.1s
[CV] n_jobs=-1, n_estimators=300, min_samples_split=3, min_samples_leaf=2, max_features=auto, max_depth=9, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=300, min_samples_split=3, min_samples_leaf=2, max_features=auto, max_depth=9, criterion=gini, bootstrap=True, total=   1.1s
[CV] n_jobs=-1, n_estimators=300, min_samples_split=3, min_samples_leaf=2, max_features=auto, max_depth=9, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=300, min_samples_split=3, min_samples_leaf=2, max_features=auto, max_depth=9, criterion=gi

[CV]  n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=None, max_depth=8, criterion=entropy, bootstrap=False, total=   0.8s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=None, max_depth=8, criterion=entropy, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=None, max_depth=8, criterion=entropy, bootstrap=False, total=   0.8s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=None, max_depth=8, criterion=entropy, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=None, max_depth=8, criterion=entropy, bootstrap=False, total=   0.9s
[CV] n_jobs=-1, n_estimators=200, min_samples_split=3, min_samples_leaf=2, max_features=log2, max_depth=7, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=200, min_samples_split=3, min_samples_leaf=2, max_features=log2, max

[CV]  n_jobs=-1, n_estimators=150, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=6, criterion=entropy, bootstrap=True, total=   0.6s
[CV] n_jobs=-1, n_estimators=150, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=6, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=150, min_samples_split=5, min_samples_leaf=1, max_features=sqrt, max_depth=6, criterion=entropy, bootstrap=True, total=   0.6s
[CV] n_jobs=-1, n_estimators=300, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=8, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=300, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=8, criterion=gini, bootstrap=False, total=   1.1s
[CV] n_jobs=-1, n_estimators=300, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=8, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=300, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=8, c

[CV]  n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=3, max_features=sqrt, max_depth=5, criterion=gini, bootstrap=True, total=   0.4s
[CV] n_jobs=-1, n_estimators=250, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=13, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=250, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=13, criterion=gini, bootstrap=False, total=   1.1s
[CV] n_jobs=-1, n_estimators=250, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=13, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=250, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=13, criterion=gini, bootstrap=False, total=   1.1s
[CV] n_jobs=-1, n_estimators=250, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=13, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=250, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=13, cr

[CV]  n_jobs=-1, n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=log2, max_depth=2, criterion=gini, bootstrap=False, total=   0.4s
[CV] n_jobs=-1, n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=log2, max_depth=2, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=log2, max_depth=2, criterion=gini, bootstrap=False, total=   0.4s
[CV] n_jobs=-1, n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=log2, max_depth=2, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=150, min_samples_split=2, min_samples_leaf=1, max_features=log2, max_depth=2, criterion=gini, bootstrap=False, total=   0.4s
[CV] n_jobs=-1, n_estimators=400, min_samples_split=5, min_samples_leaf=2, max_features=log2, max_depth=10, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=400, min_samples_split=5, min_samples_leaf=2, max_features=log2, max_depth=10, crite

[CV]  n_jobs=-1, n_estimators=150, min_samples_split=3, min_samples_leaf=2, max_features=sqrt, max_depth=8, criterion=gini, bootstrap=False, total=   0.6s
[CV] n_jobs=-1, n_estimators=150, min_samples_split=3, min_samples_leaf=2, max_features=sqrt, max_depth=8, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=150, min_samples_split=3, min_samples_leaf=2, max_features=sqrt, max_depth=8, criterion=gini, bootstrap=False, total=   0.7s
[CV] n_jobs=-1, n_estimators=250, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=5, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=250, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=5, criterion=gini, bootstrap=True, total=   1.1s
[CV] n_jobs=-1, n_estimators=250, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=5, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=250, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=5, criterion=

[CV]  n_jobs=-1, n_estimators=200, min_samples_split=3, min_samples_leaf=3, max_features=None, max_depth=3, criterion=gini, bootstrap=False, total=   0.8s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=log2, max_depth=9, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=log2, max_depth=9, criterion=gini, bootstrap=True, total=   0.4s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=log2, max_depth=9, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=log2, max_depth=9, criterion=gini, bootstrap=True, total=   0.4s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=log2, max_depth=9, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=log2, max_depth=9, criterion=gi

[CV]  n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=5, criterion=gini, bootstrap=False, total=   1.3s
[CV] n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=5, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=5, criterion=gini, bootstrap=False, total=   1.3s
[CV] n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=5, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=400, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=5, criterion=gini, bootstrap=False, total=   1.2s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features=log2, max_depth=3, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_features=log2, max_depth=3, criterio

[CV]  n_jobs=-1, n_estimators=250, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=11, criterion=gini, bootstrap=False, total=   1.1s
[CV] n_jobs=-1, n_estimators=250, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=11, criterion=gini, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=250, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=11, criterion=gini, bootstrap=False, total=   1.1s
[CV] n_jobs=-1, n_estimators=450, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=14, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=450, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=14, criterion=entropy, bootstrap=True, total=   1.8s
[CV] n_jobs=-1, n_estimators=450, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=14, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=450, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_dept

[CV]  n_jobs=-1, n_estimators=450, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=2, criterion=entropy, bootstrap=True, total=   1.2s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=gini, bootstrap=True, total=   0.7s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=gini, bootstrap=True, total=   0.7s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=None, max_depth=10, crit

[CV]  n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=sqrt, max_depth=5, criterion=entropy, bootstrap=False, total=   0.4s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=sqrt, max_depth=5, criterion=entropy, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=sqrt, max_depth=5, criterion=entropy, bootstrap=False, total=   0.4s
[CV] n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=sqrt, max_depth=5, criterion=entropy, bootstrap=False 
[CV]  n_jobs=-1, n_estimators=100, min_samples_split=3, min_samples_leaf=1, max_features=sqrt, max_depth=5, criterion=entropy, bootstrap=False, total=   0.4s
[CV] n_jobs=-1, n_estimators=450, min_samples_split=5, min_samples_leaf=2, max_features=None, max_depth=5, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=450, min_samples_split=5, min_samples_leaf=2, max_features=None, max

[CV]  n_jobs=-1, n_estimators=350, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=9, criterion=gini, bootstrap=True, total=   1.4s
[CV] n_jobs=-1, n_estimators=350, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=9, criterion=gini, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=350, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=9, criterion=gini, bootstrap=True, total=   1.4s
[CV] n_jobs=-1, n_estimators=450, min_samples_split=2, min_samples_leaf=1, max_features=None, max_depth=2, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=450, min_samples_split=2, min_samples_leaf=1, max_features=None, max_depth=2, criterion=entropy, bootstrap=True, total=   1.5s
[CV] n_jobs=-1, n_estimators=450, min_samples_split=2, min_samples_leaf=1, max_features=None, max_depth=2, criterion=entropy, bootstrap=True 
[CV]  n_jobs=-1, n_estimators=450, min_samples_split=2, min_samples_leaf=1, max_features=None, max_depth=2, crit

[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed: 12.2min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [25]:
# Randomized Search Output

print("Best parameters :\n", random_search.best_params_)
print("\nBest Scores :\n", random_search.best_score_)
print("\nBest Estimator :\n", random_search.best_estimator_)

Best parameters :
 {'n_jobs': -1, 'n_estimators': 450, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 14, 'criterion': 'entropy', 'bootstrap': False}

Best Scores :
 0.9871418228337349

Best Estimator :
 RandomForestClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=14, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=450,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)


In [26]:
# Random Forest with Hyperparameter Tuning 

best_random = random_search.best_estimator_
best_random.fit(X_train,y_train)
y_pred = best_random.predict(X_test)
print("Accuracy Score :", accuracy_score(y_test,y_pred))
print("F1 Score :", f1_score(y_test,y_pred))
print("\nConfusion Matrix : \n",confusion_matrix(y_test,y_pred))

Accuracy Score : 0.9891111111111112
F1 Score : 0.9767220902612828

Confusion Matrix : 
 [[3423    6]
 [  43 1028]]


**RandomForestClassifier with default parameters are giving lower number of True Negatives and False Positives compared to the model with tuned parameters**