In [1]:
# Loading the Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [28]:
# Loading the Dataset

data = pd.read_csv(r"C:\Users\Shashi\Model Building\Decision Tree\Fraud_check.csv")

In [29]:
# Checking the Head od data

data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [30]:
data['Taxable.Income'].max()

99619

In [31]:
# Checking the duplicates

data.duplicated().sum()

0

In [32]:
# Checking the Zero Variance

data.var() == 0

  data.var() == 0


Taxable.Income     False
City.Population    False
Work.Experience    False
dtype: bool

In [33]:
# Checking the Missing Values

data.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [34]:
# Discretization the Taxable.Income column

bin_Tax = ['Risky','Good']# list of labels under which countinuos data grouped

#Creating new cols TaxBin and dividing 'Taxable.Income' cols on the basis of [10002,30000,99620] for Risky and Good
data["Taxable.Income"] = pd.cut(data["Taxable.Income"], bins = [10002,30000,99620], labels = bin_Tax)
count= data['Taxable.Income'].value_counts()
count

Good     476
Risky    124
Name: Taxable.Income, dtype: int64

In [35]:
data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,Good,50047,10,YES
1,YES,Divorced,Good,134075,18,YES
2,NO,Married,Good,160205,30,YES
3,YES,Single,Good,193264,15,YES
4,NO,Married,Good,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,Good,39492,7,YES
596,YES,Divorced,Good,55369,2,YES
597,NO,Divorced,Good,154058,0,YES
598,YES,Married,Good,180083,17,NO


In [36]:
# Encoding - LabelEncoder

from sklearn.preprocessing import LabelEncoder

In [37]:
enc = LabelEncoder()

data['Undergrad'] = enc.fit_transform(data['Undergrad'])
data['Marital.Status'] = enc.fit_transform(data['Marital.Status'])
data['Taxable.Income'] = enc.fit_transform(data['Taxable.Income'])


In [38]:
data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,0,50047,10,YES
1,1,0,0,134075,18,YES
2,0,1,0,160205,30,YES
3,1,2,0,193264,15,YES
4,0,1,0,27533,28,NO
...,...,...,...,...,...,...
595,1,0,0,39492,7,YES
596,1,0,0,55369,2,YES
597,0,0,0,154058,0,YES
598,1,1,0,180083,17,NO


In [39]:
# Here in Taxable.Income 1 = Risk, 0 = good

In [41]:
# Spliting the data

X = data.drop('Urban', axis =1)
Y = data.Urban

In [42]:
# Importing the train_test_split

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [43]:
# Creating the model

from sklearn.tree import DecisionTreeClassifier as DT

In [44]:
model = DT()
model.fit(x_train, y_train)

# Checking the model score

model.score(X, Y)

0.8983333333333333

In [46]:
# Prediction on Test Data
y_pred = model.predict(x_test)
pd.crosstab(y_pred, y_test)

np.mean(y_pred == y_test) # Test Data Accuracy 

0.49166666666666664

In [47]:
# Prediction on Train Data
y_preds = model.predict(x_train)
pd.crosstab(y_preds, y_train)

np.mean(y_preds == y_train) # Train Data Accuracy

1.0

In [49]:
# Creating another model called Random Forest

from sklearn.ensemble import RandomForestClassifier

In [50]:
model = RandomForestClassifier(n_estimators=80)
model.fit(x_train, y_train)

# Checking the model Score
model.score(X, Y)

0.9066666666666666

In [51]:
# Prediction on Test Data
y_pred = model.predict(x_test)
pd.crosstab(y_pred, y_test)

np.mean(y_pred == y_test) # Test Data Accuracy 

0.5333333333333333

In [52]:
# Prediction on Train Data
y_preds = model.predict(x_train)
pd.crosstab(y_preds, y_train)

np.mean(y_preds == y_train) # Train Data Accuracy

1.0

In [48]:
# Overfitting so going for Hyper tunning

In [53]:
# Automatic Tuning - Hyperparameters
######
# GridSearchCV

from sklearn.model_selection import GridSearchCV

In [54]:
model2 = DT(criterion = 'entropy')

In [55]:
param_grid = {'min_samples_leaf': [1, 5, 10, 20],
              'max_depth': [2, 4, 6, 8, 10],
              'max_features': ['sqrt']}


In [56]:
grid_search = GridSearchCV(estimator = model2, param_grid = param_grid, 
                                scoring = 'accuracy', n_jobs = -1, cv = 5, 
                                refit=True, return_train_score=True)

In [57]:
grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(criterion='entropy'),
             n_jobs=-1,
             param_grid={'max_depth': [2, 4, 6, 8, 10],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [1, 5, 10, 20]},
             return_train_score=True, scoring='accuracy')

In [58]:
grid_search.best_params_

{'max_depth': 6, 'max_features': 'sqrt', 'min_samples_leaf': 5}

In [59]:
cv_dt_clf_grid = grid_search.best_estimator_

In [60]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [61]:
confusion_matrix(y_test, cv_dt_clf_grid.predict(x_test))
accuracy_score(y_test, cv_dt_clf_grid.predict(x_test))

0.5833333333333334

In [62]:
# Evaluation on Training Data
confusion_matrix(y_train, cv_dt_clf_grid.predict(x_train))
accuracy_score(y_train, cv_dt_clf_grid.predict(x_train))

0.6625

In [63]:
######
# RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV

model3 = DT(criterion = 'entropy')

param_dist = {'min_samples_leaf': list(range(1, 50)),
              'max_depth': list(range(2, 20)),
              'max_features': ['sqrt']}

n_iter = 50

In [64]:
model_random_search = RandomizedSearchCV(estimator = model3,
                                         param_distributions = param_dist,
                                         n_iter = n_iter)

In [65]:
model_random_search.fit(x_train, y_train)

RandomizedSearchCV(estimator=DecisionTreeClassifier(criterion='entropy'),
                   n_iter=50,
                   param_distributions={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19],
                                        'max_features': ['sqrt'],
                                        'min_samples_leaf': [1, 2, 3, 4, 5, 6,
                                                             7, 8, 9, 10, 11,
                                                             12, 13, 14, 15, 16,
                                                             17, 18, 19, 20, 21,
                                                             22, 23, 24, 25, 26,
                                                             27, 28, 29, 30, ...]})

In [66]:
model_random_search.best_params_

{'min_samples_leaf': 27, 'max_features': 'sqrt', 'max_depth': 11}

In [67]:
dT_random = model_random_search.best_estimator_

In [68]:
#prediciton on test data 

pred_random = dT_random.predict(x_test)
pd.crosstab(y_test, pred_random, rownames=['Actual'], colnames=['Predictions'])

np.mean(y_test == pred_random)

0.5583333333333333

In [69]:
#predicition on train data 
pred_random = dT_random.predict(x_train)
pd.crosstab(y_train, pred_random, rownames = ['Actual'], colnames = ['Predictions'])

np.mean(y_train == pred_random)


0.5916666666666667

## Result : In this model the score for DT and Random Tree got overfitting, I used Grid SearchCV and Random to get good score, now the score is more than 55% which means Right Fit.