# Week7 - Decision Tree Lab

* Train-test split
* Train a decison tree model
* Train a random forest model
* Evaluate the models
* Explain findings

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

df = pd.read_csv('https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/log_reg/employee-turnover-balanced.csv')
features = [x for x in df.columns if x != 'left_company' and x != 'years_in_current_job' and x != 'years_with_current_supervisor']

y = df['left_company']
X = df[features]
df

Unnamed: 0,left_company,age,frequency_of_travel,department,commuting_distance,education,satisfaction_with_environment,gender,seniority_level,position,satisfaction_with_job,married_or_single,last_raise_pct,last_performance_rating,total_years_working,years_at_company,years_in_current_job,years_since_last_promotion,years_with_current_supervisor
0,No,37,Travel_Rarely,Sales,16,4,4,Male,2,Sales Executive,3,Divorced,19,3,9,1,0,0,0
1,No,39,Travel_Rarely,Research & Development,3,2,3,Male,2,Laboratory Technician,3,Divorced,15,3,11,10,8,0,7
2,No,52,Travel_Frequently,Research & Development,25,4,3,Female,4,Manufacturing Director,4,Married,22,4,31,9,8,0,0
3,No,50,Non-Travel,Sales,1,3,4,Female,2,Sales Executive,3,Married,12,3,19,18,7,0,13
4,No,44,Travel_Rarely,Research & Development,4,3,4,Male,2,Healthcare Representative,2,Single,12,3,10,5,2,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Yes,39,Travel_Rarely,Sales,3,2,4,Female,2,Sales Executive,3,Married,18,3,12,1,0,0,0
996,Yes,26,Travel_Rarely,Sales,4,4,4,Male,2,Sales Executive,4,Single,12,3,8,8,7,7,4
997,Yes,18,Travel_Frequently,Sales,5,3,2,Male,1,Sales Representative,2,Single,14,3,0,0,0,0,0
998,Yes,28,Travel_Rarely,Research & Development,2,4,1,Male,1,Research Scientist,4,Married,13,3,5,3,2,2,2


#### Train-test split

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=124)
print(f'Records in training data: {X_train.shape[0]:,}')
print(f'Records in test data: {X_test.shape[0]:,}')

Records in training data: 800
Records in test data: 200


#### Train a decison tree model

In [3]:
numerical_features = []
categorical_features = []
for i in df:
    if i in [ 'frequency_of_travel', 'department','gender','position','satisfaction_with_job','married_or_single','satisfaction_with_environment']:
        categorical_features.append(i)
    else:
        numerical_features.append(i)
numerical_features.remove('left_company')
numerical_features.remove('years_in_current_job')
numerical_features.remove('years_with_current_supervisor')
numerical_features

['age',
 'commuting_distance',
 'education',
 'seniority_level',
 'last_raise_pct',
 'last_performance_rating',
 'total_years_working',
 'years_at_company',
 'years_since_last_promotion']

In [4]:
categorical_features

['frequency_of_travel',
 'department',
 'satisfaction_with_environment',
 'gender',
 'position',
 'satisfaction_with_job',
 'married_or_single']

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([('impute_missing', SimpleImputer(strategy='median')),
                           ('standardize_num', StandardScaler())
                        ])

In [6]:
cat_pipeline = Pipeline([('impute_missing_cats', SimpleImputer(strategy='most_frequent')),
                          ('create_dummies_cats', OneHotEncoder(handle_unknown='ignore', drop='first'))])

In [7]:
processing_pipeline = ColumnTransformer(transformers=[('proc_numeric', num_pipeline, numerical_features),
                                                      ('create_dummies', cat_pipeline, categorical_features)])
print(processing_pipeline)

ColumnTransformer(transformers=[('proc_numeric',
                                 Pipeline(steps=[('impute_missing',
                                                  SimpleImputer(strategy='median')),
                                                 ('standardize_num',
                                                  StandardScaler())]),
                                 ['age', 'commuting_distance', 'education',
                                  'seniority_level', 'last_raise_pct',
                                  'last_performance_rating',
                                  'total_years_working', 'years_at_company',
                                  'years_since_last_promotion']),
                                ('create_dummies',
                                 Pipeline(steps=[('impute_missing_cats',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('create_dummies_cats',
                  

In [8]:
X_train_processed = processing_pipeline.fit_transform(X_train)

# check the shape of the processed dataset
print(X_train_processed.shape)

(800, 30)


In [9]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
tree_model = DecisionTreeClassifier(criterion='gini', random_state=1)

tree_pipeline = Pipeline([('preprocess', processing_pipeline),
                          ('classifier', tree_model)])

tree_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('proc_numeric',
                                                  Pipeline(steps=[('impute_missing',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardize_num',
                                                                   StandardScaler())]),
                                                  ['age', 'commuting_distance',
                                                   'education',
                                                   'seniority_level',
                                                   'last_raise_pct',
                                                   'last_performance_rating',
                                                   'total_years_working',
                                                   'years_at_company',
                                    

In [10]:
tree_pipeline.score(X_train, y_train)

1.0

In [11]:
tree_pipeline.score(X_test, y_test)

0.805

#### Evaluation

In [12]:
y_hat_train = tree_pipeline.predict(X_train)
y_hat_test = tree_pipeline.predict(X_test)



In [13]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_hat_train = le.fit_transform(y_hat_train)
y_hat_test = le.fit_transform(y_hat_test)
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)


In [14]:
# Calculate ROC-AUC score
from sklearn.metrics import roc_auc_score
train_auc = roc_auc_score(y_train, y_hat_train)
test_auc = roc_auc_score(y_test, y_hat_test)

print("Training ROC-AUC score:", train_auc)
print("Test ROC-AUC score:", test_auc)


Training ROC-AUC score: 1.0
Test ROC-AUC score: 0.7994673902120389


#### Explain Findings

* This might be a sign that the model has overfitted the training set and needs more regularization or tuning. It's also important to note that the relatively small difference in ROC-AUC scores between the training and test groups indicates that the model is not significantly overfitting. The model appears to perform reasonably well overall, but additional research and testing might be required to improve its performance.

#### Train a random forest model

In [15]:
from sklearn.ensemble import RandomForestClassifier
RandomForestClassifier_model = RandomForestClassifier(n_estimators=20)

RandomForest_pipeline = Pipeline([('preprocess', processing_pipeline),
                          ('classifier', RandomForestClassifier_model)])

RandomForest_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('proc_numeric',
                                                  Pipeline(steps=[('impute_missing',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardize_num',
                                                                   StandardScaler())]),
                                                  ['age', 'commuting_distance',
                                                   'education',
                                                   'seniority_level',
                                                   'last_raise_pct',
                                                   'last_performance_rating',
                                                   'total_years_working',
                                                   'years_at_company',
                                    

In [16]:
RandomForest_pipeline.score(X_train, y_train)

1.0

In [17]:
RandomForest_pipeline.score(X_test, y_test)

0.845

#### Evaluation

In [18]:
y_hat_train = RandomForest_pipeline.predict(X_train)
y_hat_test = RandomForest_pipeline.predict(X_test)


In [19]:
y_hat_train = le.fit_transform(y_hat_train)
y_hat_test = le.fit_transform(y_hat_test)
y_train = le.fit_transform(y_train)
y_test = le.fit_transform(y_test)


In [20]:
# Calculate ROC-AUC score
from sklearn.metrics import roc_auc_score
train_auc = roc_auc_score(y_train, y_hat_train)
test_auc = roc_auc_score(y_test, y_hat_test)

print("Training ROC-AUC score:", train_auc)
print("Test ROC-AUC score:", test_auc)


Training ROC-AUC score: 1.0
Test ROC-AUC score: 0.842478142900211


#### Explain Findings

* The model can successfully distinguish between positive and negative samples in the training data, as evidenced by the high training score. The model may be overfitting to the training data and not generalizing well to new, unseen data, according to the lower test score, though.