# Week7 - Decision Tree Lab

* Train-test split
* Train a decison tree model
* Train a random forest model
* Evaluate the models
* Explain findings

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/log_reg/employee-turnover-balanced.csv')
# y = df['left_company']
# X = df.iloc[:, 1:]

In [2]:
df.head()

Unnamed: 0,left_company,age,frequency_of_travel,department,commuting_distance,education,satisfaction_with_environment,gender,seniority_level,position,satisfaction_with_job,married_or_single,last_raise_pct,last_performance_rating,total_years_working,years_at_company,years_in_current_job,years_since_last_promotion,years_with_current_supervisor
0,No,37,Travel_Rarely,Sales,16,4,4,Male,2,Sales Executive,3,Divorced,19,3,9,1,0,0,0
1,No,39,Travel_Rarely,Research & Development,3,2,3,Male,2,Laboratory Technician,3,Divorced,15,3,11,10,8,0,7
2,No,52,Travel_Frequently,Research & Development,25,4,3,Female,4,Manufacturing Director,4,Married,22,4,31,9,8,0,0
3,No,50,Non-Travel,Sales,1,3,4,Female,2,Sales Executive,3,Married,12,3,19,18,7,0,13
4,No,44,Travel_Rarely,Research & Development,4,3,4,Male,2,Healthcare Representative,2,Single,12,3,10,5,2,2,3


### Converting all the categorical values into numeric

In [3]:
dummies_df = pd.get_dummies(df[['frequency_of_travel','department','gender','position','married_or_single',]])
dummies_df

Unnamed: 0,frequency_of_travel_Non-Travel,frequency_of_travel_Travel_Frequently,frequency_of_travel_Travel_Rarely,department_Human Resources,department_Research & Development,department_Sales,gender_Female,gender_Male,position_Healthcare Representative,position_Human Resources,position_Laboratory Technician,position_Manager,position_Manufacturing Director,position_Research Director,position_Research Scientist,position_Sales Executive,position_Sales Representative,married_or_single_Divorced,married_or_single_Married,married_or_single_Single
0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0
1,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0
2,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0
3,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0
4,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0
996,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1
997,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1
998,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0


### Creating a data frame with numerical values and converted categorical values

In [4]:
final_df = pd.concat([df.drop(['frequency_of_travel','department','gender','position','married_or_single','left_company'],axis='columns'),dummies_df],axis=1)
final_df

Unnamed: 0,age,commuting_distance,education,satisfaction_with_environment,seniority_level,satisfaction_with_job,last_raise_pct,last_performance_rating,total_years_working,years_at_company,...,position_Laboratory Technician,position_Manager,position_Manufacturing Director,position_Research Director,position_Research Scientist,position_Sales Executive,position_Sales Representative,married_or_single_Divorced,married_or_single_Married,married_or_single_Single
0,37,16,4,4,2,3,19,3,9,1,...,0,0,0,0,0,1,0,1,0,0
1,39,3,2,3,2,3,15,3,11,10,...,1,0,0,0,0,0,0,1,0,0
2,52,25,4,3,4,4,22,4,31,9,...,0,0,1,0,0,0,0,0,1,0
3,50,1,3,4,2,3,12,3,19,18,...,0,0,0,0,0,1,0,0,1,0
4,44,4,3,4,2,2,12,3,10,5,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,39,3,2,4,2,3,18,3,12,1,...,0,0,0,0,0,1,0,0,1,0
996,26,4,4,4,2,4,12,3,8,8,...,0,0,0,0,0,1,0,0,0,1
997,18,5,3,2,1,2,14,3,0,0,...,0,0,0,0,0,0,1,0,0,1
998,28,2,4,1,1,4,13,3,5,3,...,0,0,0,0,1,0,0,0,1,0


### Spliting the data into Test and Train

In [5]:
X = final_df
y = df['left_company']

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=3)

### DecisionTreeClassifier

In [7]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

DecisionTreeClassifier()

In [8]:
from sklearn.metrics import classification_report
print('Scores on Testing Data of Decision Tree Classifier\n')
predictions_test = tree_model.predict(X_test)
testing_score = classification_report(y_test, predictions_test)
print(testing_score)

Scores on Testing Data of Decision Tree Classifier

              precision    recall  f1-score   support

          No       0.82      0.66      0.73        99
         Yes       0.72      0.86      0.78       101

    accuracy                           0.76       200
   macro avg       0.77      0.76      0.76       200
weighted avg       0.77      0.76      0.76       200



### Applying GridSearchCV to DecisionTreeClassifier

In [9]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [20,40,60],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}
grid_search = GridSearchCV(tree_model, param_grid=param_grid, cv=10)
grid_search.fit(X_train,y_train)

print("Best hyperparameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

Best hyperparameters:  {'criterion': 'gini', 'max_depth': 60, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best accuracy score:  0.81375


In [10]:
print('Scores on Testing Data of Decision Tree Classifier with GridSearchCV \n')
predictions_test_gcv = grid_search.predict(X_test)
testing_score_gcv = classification_report(y_test, predictions_test_gcv)
print(testing_score_gcv)

Scores on Testing Data of Decision Tree Classifier with GridSearchCV 

              precision    recall  f1-score   support

          No       0.83      0.66      0.73        99
         Yes       0.72      0.87      0.79       101

    accuracy                           0.77       200
   macro avg       0.78      0.76      0.76       200
weighted avg       0.78      0.77      0.76       200



In [11]:
# the top freatures used by DecisionTreeClassifier 
tree_model.feature_names_in_[tree_model.feature_importances_ > 0.09]

array(['commuting_distance', 'seniority_level', 'last_raise_pct'],
      dtype=object)

### RandomForestClassifier

In [12]:
from sklearn.ensemble import RandomForestClassifier

Rf =  RandomForestClassifier()
Rf.fit(X_train,y_train)

RandomForestClassifier()

In [13]:
print('Scores on Testing Data of Random Forest Classifier\n')
Rf_predictions_test = Rf.predict(X_test)
Rf_testing_score = classification_report(y_test, Rf_predictions_test)
print(Rf_testing_score)

Scores on Testing Data of Random Forest Classifier

              precision    recall  f1-score   support

          No       0.86      0.86      0.86        99
         Yes       0.86      0.86      0.86       101

    accuracy                           0.86       200
   macro avg       0.86      0.86      0.86       200
weighted avg       0.86      0.86      0.86       200



In [14]:
# the top freatures used by RandomForestClassifier
Rf.feature_names_in_[Rf.feature_importances_ > 0.075]

array(['age', 'commuting_distance', 'total_years_working'], dtype=object)

### Applying GridSearchCV to RandomForestClassifier

In [15]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(Rf, param_grid=param_grid, cv=10)
grid_search.fit(X_train,y_train)

print("Best hyperparameters: ", grid_search.best_params_)
print("Best accuracy score: ", grid_search.best_score_)

Best hyperparameters:  {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 200}
Best accuracy score:  0.8675


In [16]:
print('Scores on Testing Data of Random Forest Classifier with grid search CV\n')
Rf_predictions_test_gcv = grid_search.predict(X_test)
Rf_testing_score_gcv = classification_report(y_test, Rf_predictions_test_gcv)
print(Rf_testing_score_gcv)

Scores on Testing Data of Random Forest Classifier with grid search CV

              precision    recall  f1-score   support

          No       0.85      0.83      0.84        99
         Yes       0.84      0.86      0.85       101

    accuracy                           0.84       200
   macro avg       0.85      0.84      0.84       200
weighted avg       0.85      0.84      0.84       200



- Random Forest Classifier performed better than Decision Tree Classifier
- There are few features that are considered prominent by both  Decision Tree Classifier  and Random Forest Classifier
- Ramdaom forest has given best values for the precision and recall
- In the both classifiers GridSearchCv didn't bring huge difference