# Decision tree and Random Forest

In [1]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Titanic Dataset

In [3]:
train = pd.read_csv('Dataset\\tintanic_train.csv')
test = pd.read_csv('Dataset\\titanic_test.csv')

In [4]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
cols = ['Sex','Age','Fare']
train_data = train[cols]

In [6]:
train_data.head()

Unnamed: 0,Sex,Age,Fare
0,male,22.0,7.25
1,female,38.0,71.2833
2,female,26.0,7.925
3,female,35.0,53.1
4,male,35.0,8.05


In [7]:
train_y = train['Survived']

In [8]:
train_data.Sex = train_data.Sex.map({'male':1 , 'female':0})

In [9]:
train_data.head()

Unnamed: 0,Sex,Age,Fare
0,1,22.0,7.25
1,0,38.0,71.2833
2,0,26.0,7.925
3,0,35.0,53.1
4,1,35.0,8.05


### Creating the decision tree

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [11]:
dt_model = DecisionTreeClassifier(max_depth=8)
dt_model.fit(X = train_data,y = train_y)

DecisionTreeClassifier(max_depth=8)

In [12]:
dt_model.score(X=train_data,y=train_y)

0.8706411698537683

## Inference

We have created the Decision tree using the columns Sex , Fare and Age from the titanic_train dataset and the accuracy score for the model we created is 87.06%.

To improve accuracy we have to find the more important columns using the random forest algorithm and create a decision tree out of it.

# Attrition analysis

In [13]:
data = pd.read_csv('Dataset\\general_data.csv')
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [14]:
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [15]:
data.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeID                   int64
Gender                      object
JobLevel                     int64
JobRole                     object
MaritalStatus               object
MonthlyIncome                int64
NumCompaniesWorked         float64
Over18                      object
PercentSalaryHike            int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears          float64
TrainingTimesLastYear        int64
YearsAtCompany               int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

In [16]:
data.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [17]:
data['NumCompaniesWorked'].mean() , data['TotalWorkingYears'].mean()

(2.6948303347756775, 11.279936378095888)

In [18]:
data['NumCompaniesWorked'].fillna(3,inplace=True)
data['TotalWorkingYears'].fillna(11,inplace = True)

In [19]:
## Label encoding categorical variables
le = LabelEncoder()
le_cols = ['Attrition','BusinessTravel','Gender','MaritalStatus']
data[le_cols] = data[le_cols].apply(le.fit_transform)

In [20]:
train_cols = ['Age','BusinessTravel','Gender','MaritalStatus','MonthlyIncome','PercentSalaryHike','TotalWorkingYears',
             'YearsAtCompany','YearsWithCurrManager']
attrition = data['Attrition']
att_train = data[train_cols]

In [21]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=1000 , max_features=2,max_depth=8,oob_score=True)
rf_model.fit(X = att_train,y=attrition)

RandomForestClassifier(max_depth=8, max_features=2, n_estimators=1000,
                       oob_score=True)

In [22]:
rf_model.oob_score_

0.8825396825396825

In [23]:
for col,imp in zip(train_cols,rf_model.feature_importances_):
    print(col ," = ",imp)

Age  =  0.18064913928407755
BusinessTravel  =  0.05972701997170863
Gender  =  0.022467096858411063
MaritalStatus  =  0.0852955653939722
MonthlyIncome  =  0.1765684125371445
PercentSalaryHike  =  0.10274976881786392
TotalWorkingYears  =  0.16871907887476376
YearsAtCompany  =  0.1210688414341626
YearsWithCurrManager  =  0.08275507682789587


## Inference


-> We have created a RandomForestClassifier model with the above columns to predict the attrition of the employee.

-> The following are the important features from all the features (Selecting the features which are having importances above 10%)

            -> Age
            -> MonthlyIncome
            -> PercentSalaryHike
            -> TotalWorkingYears
            -> YearsAtCompany
-> Using the above features we are creating the Decision Tree

In [24]:
cols = ['Age','MonthlyIncome','PercentSalaryHike','TotalWorkingYears','YearsAtCompany']
dt_cols_att = data[cols]

In [25]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_depth=8)

In [26]:
dt_clf.fit(X=dt_cols_att,y=attrition)

DecisionTreeClassifier(max_depth=8)

In [27]:
dt_clf.score(X = dt_cols_att , y = attrition)

0.8979591836734694

## Inference

-> By creating a DecisionTree Classifier with the selected features from the randomforest classifier we got the accuracy of 89.79% 

-> To increase the model performance we have find the importance of all the other columns and use the important column in creating the Decisiontree classifier.

# Bank Loan

In [28]:
bank_data = pd.read_excel('Dataset\\Bank_Personal_Loan_Modelling.xlsx',sheet_name='Data')
bank_data.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [29]:
y_data = bank_data['Personal Loan']
bank_data.drop(['ID','Personal Loan','ZIP Code'],inplace = True , axis = 1)

In [30]:
random_forest = RandomForestClassifier(n_estimators=1000 , max_features=2,oob_score=True,max_depth=8)
random_forest.fit(X = bank_data,y=y_data)

RandomForestClassifier(max_depth=8, max_features=2, n_estimators=1000,
                       oob_score=True)

In [31]:
random_forest.oob_score_

0.9844

In [32]:
for col,imp in zip(bank_data.columns , random_forest.feature_importances_):
    print(col,'= ',imp)

Age =  0.02802548888407586
Experience =  0.02693512768772716
Income =  0.3639561683230638
Family =  0.09822746111428239
CCAvg =  0.1935905524174134
Education =  0.1540034266758771
Mortgage =  0.04504427662317889
Securities Account =  0.0049832271046110444
CD Account =  0.07075396460787214
Online =  0.005699739701338121
CreditCard =  0.00878056686056001


## Inference

We have created a RandomForest Classifier algorithm with some columns and the main features for the prediction of bank loan are

    -> Income
    -> CCAvg
    -> Education
    
 -- By using the above columns creating the DecisionTree Classifier.

In [33]:
dec_cols = ['Income','CCAvg','Education']
dec_data = bank_data[dec_cols]

In [34]:
dec_tree = DecisionTreeClassifier(max_depth=8)
dec_tree.fit(X = dec_data , y = y_data)

DecisionTreeClassifier(max_depth=8)

In [35]:
dec_tree.score(X = dec_data, y = y_data)

0.9816

## Inference

The accuracy of 98.14% is achieved by using the columns that have more importance in the RandomForest Classifier

Hence to create a Decision Tree Classifier model for any dataset do the following steps,

    1. Create a RandomForest Classifier with all the columns.
    
    2. Using the feature_importances_ find the important columns.
    
    3. Create a DecisionTree Classifier with the important columns.
    
    4. Review the result.