In [1]:
# Importing the Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [49]:
# Read the HR dataset from its CSV file into a DataFrame
# NOTE: the path is absolute and machine-specific

csv_path = r"C:\Users\Shashi\Model Building\Decision Tree\HR_DT.csv"
data = pd.read_csv(csv_path)

In [50]:
# import Warnings

import warnings
warnings.filterwarnings('ignore')

In [51]:
# Preview the first five rows of the dataset

data.head(n=5)

Unnamed: 0,Position of the employee,no of Years of Experience of employee,monthly income of employee
0,Business Analyst,1.1,39343
1,Junior Consultant,1.3,46205
2,Senior Consultant,1.5,37731
3,Manager,2.0,43525
4,Country Manager,2.2,39891


In [52]:
# Replace the column labels with snake_case names (assigned by position)

snake_case_names = [
    'Position_of_the_employee',
    'no_of_Years_of_Experience_of_employee',
    'monthly_income_of_employee',
]
data.columns = snake_case_names

In [53]:
# Largest value in the income column
data.loc[:, 'monthly_income_of_employee'].max()

122391

In [54]:
# Count the rows that exactly repeat an earlier row

data.duplicated(keep='first').sum()

38

In [55]:
# Keep only the first occurrence of each fully duplicated row
data = data.drop_duplicates(keep='first')

In [56]:
# Count duplicated rows again
duplicate_mask = data.duplicated()
duplicate_mask.sum()

0

In [57]:
# Checking for zero variance (constant numeric columns)
# numeric_only=True restricts the check to the numeric columns; without it
# DataFrame.var() raises a TypeError on pandas >= 2.0 because the position
# column holds strings.

data.var(numeric_only=True) == 0

no_of_Years_of_Experience_of_employee    False
monthly_income_of_employee               False
dtype: bool

In [58]:
# Count the missing entries in each column

data.isnull().sum()

Position_of_the_employee                 0
no_of_Years_of_Experience_of_employee    0
monthly_income_of_employee               0
dtype: int64

In [59]:
# Discretizing the monthly_income_of_employee column into two classes

bin_Tax = ['No','Yes']  # class labels: 'No' for (30000, 70000], 'Yes' for incomes above 70000

# Overwrite monthly_income_of_employee with its class label.
# The upper bin edge is np.inf rather than the hard-coded dataset maximum (122391),
# so an income above that value is still labelled 'Yes' instead of becoming NaN.
# Incomes of 30000 or less fall outside the bins and become NaN.
data["monthly_income_of_employee"] = pd.cut(data["monthly_income_of_employee"], bins = [30000, 70000, np.inf], labels = bin_Tax)
count = data['monthly_income_of_employee'].value_counts()
count

No     97
Yes    61
Name: monthly_income_of_employee, dtype: int64

In [60]:
# Display the DataFrame (the notebook renders a truncated head/tail view)
data

Unnamed: 0,Position_of_the_employee,no_of_Years_of_Experience_of_employee,monthly_income_of_employee
0,Business Analyst,1.1,No
1,Junior Consultant,1.3,No
2,Senior Consultant,1.5,No
3,Manager,2.0,No
4,Country Manager,2.2,No
...,...,...,...
191,Region Manager,3.9,No
192,Partner,4.0,No
193,Senior Partner,4.0,No
194,C-level,4.1,No


In [62]:
# Number of rows for each job position
position_column = data['Position_of_the_employee']
position_column.value_counts()

Partner              20
Region Manager       19
Senior Partner       19
C-level              19
CEO                  16
Manager              15
Country Manager      15
Senior Consultant    14
Junior Consultant    13
Business Analyst      8
Name: Position_of_the_employee, dtype: int64

In [63]:
# Encoding - LabelEncoder

from sklearn.preprocessing import LabelEncoder

In [64]:
# Fit a LabelEncoder on the position names and store the resulting integer codes
enc = LabelEncoder()
position_codes = enc.fit_transform(data['Position_of_the_employee'])

data['Position_of_the_employee'] = position_codes

In [65]:
# Display the DataFrame (the notebook renders a truncated head/tail view)
data

Unnamed: 0,Position_of_the_employee,no_of_Years_of_Experience_of_employee,monthly_income_of_employee
0,0,1.1,No
1,4,1.3,No
2,8,1.5,No
3,5,2.0,No
4,3,2.2,No
...,...,...,...
191,7,3.9,No
192,6,4.0,No
193,9,4.0,No
194,1,4.1,No


In [67]:
# Integer code assigned by the LabelEncoder to each position:
# Partner              6
# Region Manager       7
# Senior Partner       9
# C-level              1
# CEO                  2
# Manager              5
# Country Manager      3
# Senior Consultant    8
# Junior Consultant    4
# Business Analyst     0

In [68]:
# Separate the predictors (X) from the target column (Y)

target_column = 'monthly_income_of_employee'
X = data.drop(columns=[target_column])
Y = data[target_column]

In [69]:
# Hold out 20% of the rows as a test set (fixed seed for a repeatable split)

from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, Y, test_size = 0.2, random_state = 0)
x_train, x_test, y_train, y_test = split_parts

In [70]:
# Creating the model

from sklearn.tree import DecisionTreeClassifier as DT

In [71]:
# Fit a decision tree on the training split.
# random_state is fixed so that reruns build the same tree.
model = DT(random_state = 0)
model.fit(x_train, y_train)

# Checking the model score on the held-out test split only:
# scoring on the full (X, Y) would include the rows the tree was trained on.

model.score(x_test, y_test)

1.0

In [72]:
# Prediction on Test Data
y_pred = model.predict(x_test)

# A bare expression in the middle of a cell is evaluated and discarded, so the
# table is printed explicitly. Rows are predicted classes, columns are actual classes.
print(pd.crosstab(y_pred, y_test, rownames = ['predicted'], colnames = ['actual']))

np.mean(y_pred == y_test) # Test Data Accuracy

1.0

In [73]:
# Prediction on Train Data
y_preds = model.predict(x_train)

# A bare expression in the middle of a cell is evaluated and discarded, so the
# table is printed explicitly. Rows are predicted classes, columns are actual classes.
print(pd.crosstab(y_preds, y_train, rownames = ['predicted'], colnames = ['actual']))

np.mean(y_preds == y_train) # Train Data Accuracy

1.0

In [74]:
# Creating another model called Random Forest

from sklearn.ensemble import RandomForestClassifier

In [75]:
# Fit a random forest on the training split.
# random_state is fixed because the forest is built with random sampling;
# without a seed every run produces a different model.
model = RandomForestClassifier(random_state = 0)
model.fit(x_train, y_train)

# Checking the model score on the held-out test split only:
# scoring on the full (X, Y) would include the rows the forest was trained on.
model.score(x_test, y_test)

1.0

In [76]:
# Prediction on Test Data
y_pred = model.predict(x_test)

# A bare expression in the middle of a cell is evaluated and discarded, so the
# table is printed explicitly. Rows are predicted classes, columns are actual classes.
print(pd.crosstab(y_pred, y_test, rownames = ['predicted'], colnames = ['actual']))

np.mean(y_pred == y_test) # Test Data Accuracy

1.0

In [77]:
# Prediction on Train Data
y_preds = model.predict(x_train)

# A bare expression in the middle of a cell is evaluated and discarded, so the
# table is printed explicitly. Rows are predicted classes, columns are actual classes.
print(pd.crosstab(y_preds, y_train, rownames = ['predicted'], colnames = ['actual']))

np.mean(y_preds == y_train) # Train Data Accuracy

1.0

In [81]:
# Predict the income class for a CEO with 4.5 years of experience.
# The position code is looked up through the fitted encoder instead of hard-coding 2,
# and the input is a DataFrame carrying the training column names, because the
# model was fitted on a DataFrame and a bare nested list has no feature names.
ceo_code = enc.transform(['CEO'])[0]
model.predict(pd.DataFrame([[ceo_code, 4.5]], columns = x_train.columns))

array(['No'], dtype=object)

In [83]:
# Here 2 is the encoded position (CEO) and 4.5 is no_of_Years_of_Experience_of_employee; the corresponding monthly_income_of_employee is 67938, which is below the 70,000 threshold, so the result is No

In [84]:
# Predict the income class for a Country Manager with 10.3 years of experience.
# The position code is looked up through the fitted encoder instead of hard-coding 3,
# and the input is a DataFrame carrying the training column names, because the
# model was fitted on a DataFrame and a bare nested list has no feature names.
country_manager_code = enc.transform(['Country Manager'])[0]
model.predict(pd.DataFrame([[country_manager_code, 10.3]], columns = x_train.columns))

array(['Yes'], dtype=object)

In [85]:
# Here 3 is the encoded position (Country Manager) and 10.3 is no_of_Years_of_Experience_of_employee; the corresponding monthly_income_of_employee is 120,000, which is above the 70,000 threshold, so the result is Yes

## Result: The model gives 100% correct results on both the training and the test data, as illustrated by the two example predictions explained above, so I conclude that this model is the right fit.