In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the HR employee attrition CSV from the input directory into the working frame.
hr_csv_path = '../input/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(hr_csv_path)


# Data Exploration

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()
# Author's observation from the output: there are no missing data in the dataframe

In [None]:
# Allow up to 50 columns in rendered frames, then preview the first five rows.
pd.options.display.max_columns = 50
df.head(n=5)

# Decision tree without One-Hot-Encoding

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.tree import DecisionTreeClassifier

## Data preparation

In [None]:
# Encode the two-valued text columns as 0/1 integers.
# Series.map turns every value that is missing from the mapping into NaN, so running this
# cell a second time would wipe the already-encoded columns. The dtype guard makes a
# re-run a no-op while the first run behaves exactly as before.
binary_encodings = {
    'Attrition': {'Yes': 1, 'No': 0},
    'Over18': {'Y': 1, 'N': 0},
    'OverTime': {'Yes': 1, 'No': 0},
    'Gender': {'Male': 1, 'Female': 0},
}
for column, encoding in binary_encodings.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(encoding)

# Casting is already idempotent, so it needs no guard.
df['Age'] = df['Age'].astype('int')

## Split the data set into X and y


In [None]:
# Target vector: the encoded attrition flag as integers.
y = df['Attrition'].astype('int')

# The multi-valued categorical columns are left out for now; they are processed later in
# this notebook with one-hot encoding, following this article on working with categorical data:
# https://www.kaggle.com/dansbecker/using-categorical-data-with-one-hot-encoding
excluded_columns = ['Attrition', 'BusinessTravel', 'Department', 'EducationField',
                    'MaritalStatus', 'JobRole']
X = df.drop(columns=excluded_columns)

In [None]:
# Feature matrix and target must have the same number of rows.
(X.shape, y.shape)

In [None]:
# Hold out 30% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [None]:
# Sizes of the training and validation feature matrices.
(X_train.shape, X_valid.shape)

In [None]:
# Baseline: an untuned entropy tree, scored with 5-fold cross-validation on the training part.
dec_tree = DecisionTreeClassifier(criterion='entropy', random_state=17)
baseline_cv_scores = cross_val_score(dec_tree, X_train, y_train, cv=5)
baseline_cv_scores.mean()

## Selecting the best parameters with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the grid search: tree depths 1..10 and three max_features settings
# (scikit-learn interprets float max_features values as a fraction of the features).
tree_params = {
    'max_depth': np.arange(1, 11),
    'max_features': [0.5, 0.7, 1.0],
}

In [None]:
# 5-fold grid search over tree_params, using all available cores.
tree_grid = GridSearchCV(
    estimator=dec_tree,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
)

In [None]:
%%time
tree_grid.fit(X_train, y_train);

In [None]:
# Best mean cross-validation score and the parameter combination that produced it.
(tree_grid.best_score_, tree_grid.best_params_)

# Checking the accuracy on X_valid

In [None]:
# Predict the hold-out rows with the tuned grid-search model.
tree_valid_pred = tree_grid.predict(X=X_valid)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Share of hold-out rows whose attrition flag was predicted correctly.
accuracy_score(y_true=y_valid, y_pred=tree_valid_pred)

# Visualizing the tree

In [None]:
from sklearn.tree import export_graphviz

In [None]:
# Shallow (depth 3) entropy tree that the following cell exports for visualisation.
# The seed is fixed to 17 like every other tree in this notebook; without it the fitted
# tree and its validation score can change from run to run.
best_tree = DecisionTreeClassifier(
    max_depth=3,
    criterion='entropy',
    random_state=17,
).fit(X_train, y_train)
best_tree.score(X_valid, y_valid)

In [None]:
export_graphviz(best_tree, out_file='HR_tree.dot',
               feature_names=X.columns, filled=True)

!dot -Tpng 'HR_tree.dot' -o 'HR_tree.png'
!rm HR_tree.dot
from IPython.display import Image
Image(filename = 'HR_tree.png')

# Decision Tree with One-Hot-Encoding for categorical variables

In [None]:
# Append one indicator column per category level, prefixed with the source column name.
# The original categorical columns are still present here; the next cell drops them.
ohe_source_columns = ['BusinessTravel', 'Department', 'EducationField',
                      'MaritalStatus', 'JobRole']
dummy_frames = [pd.get_dummies(df[column], prefix=column) for column in ohe_source_columns]
X_ohe = pd.concat([df] + dummy_frames, axis=1)

In [None]:
# Pull the target out of the widened frame before removing it from the features.
y_ohe = X_ohe['Attrition'].astype('int')

# Remove the target and the raw categorical columns, leaving only their indicator columns.
raw_columns = ['Attrition', 'BusinessTravel', 'Department', 'EducationField',
               'MaritalStatus', 'JobRole']
X_ohe = X_ohe.drop(columns=raw_columns)

In [None]:
# Use the same hold-out fraction (30%) and seed (17) as the split of the non-encoded
# features. Without them the split changed on every run and used a different validation
# size, so the two experiments could not be compared.
X_ohe_train, X_ohe_valid, y_ohe_train, y_ohe_valid = train_test_split(
    X_ohe,
    y_ohe,
    test_size=0.3,
    random_state=17,
)

In [None]:
# Baseline on the one-hot-encoded features: untuned entropy tree, 5-fold cross-validation.
dec_tree2 = DecisionTreeClassifier(criterion='entropy', random_state=17)
ohe_cv_scores = cross_val_score(dec_tree2, X_ohe_train, y_ohe_train, cv=5)
ohe_cv_scores.mean()

In [None]:
# Same 5-fold grid search as before, now for the tree on the one-hot-encoded features.
tree_grid2 = GridSearchCV(
    estimator=dec_tree2,
    param_grid=tree_params,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the search on the one-hot-encoded training part (the fitted search object is displayed).
tree_grid2.fit(X=X_ohe_train, y=y_ohe_train)

In [None]:
# Best max_depth / max_features combination found by the grid search on the one-hot-encoded features.
tree_grid2.best_params_

In [None]:
# Predict the one-hot-encoded hold-out rows with the tuned model.
tree2_prediction = tree_grid2.predict(X=X_ohe_valid)

In [None]:
# Hold-out accuracy of the tuned tree on the one-hot-encoded features.
accuracy_score(y_true=y_ohe_valid, y_pred=tree2_prediction)

In [None]:
# Depth-3 entropy tree on the one-hot-encoded features, exported for visualisation below.
tree2_optimal = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=17)
# fit() returns the estimator itself; the semicolon keeps the cell output empty as before.
tree2_optimal.fit(X_ohe_train, y_ohe_train);

In [None]:
# Hold-out accuracy of the depth-3 tree.
tree2_optimal.score(X=X_ohe_valid, y=y_ohe_valid)

# Visualizing the tree with One-Hot-Encoding for categorical variables

In [None]:
export_graphviz(tree2_optimal, out_file='HR_tree_OHE.dot',
               feature_names=X_ohe.columns, filled=True)

!dot -Tpng 'HR_tree_OHE.dot' -o 'HR_tree_OHE.png'
!rm HR_tree_OHE.dot
from IPython.display import Image
Image(filename = 'HR_tree_OHE.png')

# EDA based on the output of the Decision Tree with One-Hot-Encoding

### Curious to match the leaves of the tree with some graphs.

In [None]:
# Reload an untouched copy of the CSV for the exploratory plots.
eda_csv_path = '../input/WA_Fn-UseC_-HR-Employee-Attrition.csv'
data = pd.read_csv(eda_csv_path)


In [None]:
# Encode the two-valued text columns of the EDA frame as 0/1 integers.
# Series.map turns values that are missing from the mapping into NaN, so a second run of
# this cell would wipe the encoded columns. The dtype guard makes a re-run a no-op while
# the first run behaves exactly as before.
data_binary_encodings = {
    'Attrition': {'Yes': 1, 'No': 0},
    'Over18': {'Y': 1, 'N': 0},
    'OverTime': {'Yes': 1, 'No': 0},
}
for col_name, value_map in data_binary_encodings.items():
    if not pd.api.types.is_numeric_dtype(data[col_name]):
        data[col_name] = data[col_name].map(value_map)
data['Age'] = data['Age'].astype('int')

In [None]:
# Row counts per overtime flag, split by attrition outcome.
sns.countplot(data=data, y='OverTime', hue='Attrition');

In [None]:
# Row counts per stock option level, split by attrition outcome.
sns.countplot(data=data, y='StockOptionLevel', hue='Attrition');

In [None]:
# Monthly income by attrition outcome, split by overtime flag.
# Reads the reloaded and encoded `data` frame, like the count plots in this section,
# instead of the `df` frame mutated by the modelling cells.
sns.boxplot(data=data, x='Attrition', y='MonthlyIncome', hue='OverTime');

In [None]:
# Monthly income by attrition outcome, split by stock option level.
# Reads the reloaded and encoded `data` frame, like the count plots in this section,
# instead of the `df` frame mutated by the modelling cells.
sns.boxplot(data=data, x='Attrition', y='MonthlyIncome', hue='StockOptionLevel');

In [None]:
# Monthly income by attrition outcome, split by job role.
# Reads the reloaded and encoded `data` frame, like the count plots in this section,
# instead of the `df` frame mutated by the modelling cells.
sns.boxplot(data=data, x='Attrition', y='MonthlyIncome', hue='JobRole');
# Anchor the legend's upper-left corner (loc=2) just outside the right edge of the axes.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

In [None]:
# Version 10:
#   changed the graphs in EDA
# Version 9:
#   fixed Decision Tree graph output
#   added One-Hot-Encoding for categorical variables