# Decision tree

Import

In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Import Excel File

In [44]:
df = pd.read_excel('BankNote_Authentication.xlsx')
df

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.62160,8.66610,-2.8073,-0.44699,authentic
1,4.54590,8.16740,-2.4586,-1.46210,authentic
2,3.86600,-2.63830,1.9242,0.10645,authentic
3,3.45660,9.52280,-4.0112,-3.59440,authentic
4,0.32924,-4.45520,4.5718,-0.98880,authentic
...,...,...,...,...,...
1357,0.40614,1.34920,-1.4501,-0.55949,counterfeit
1358,-1.38870,-4.87730,6.4774,0.34179,counterfeit
1359,-3.75030,-13.45860,17.5932,-2.77710,counterfeit
1360,-3.56370,-8.38270,12.3930,-1.28230,counterfeit


## Data Preprocessing

Check whether there are missing data or not

In [45]:
df.isnull().sum()

variance    0
skewness    0
curtosis    0
entropy     0
class       0
dtype: int64

Check whether it is numerical data

In [46]:
df.dtypes

variance    float64
skewness    float64
curtosis    float64
entropy     float64
class        object
dtype: object

Since we have 1 column which is not numerical, we need to change it

In [47]:
# 0 is authentic, 1 is counterfeit

df['class'] = df['class'].str.replace('authentic', '0')
df['class'] = df['class'].str.replace('counterfeit', '1')
df['class'] = df['class'].astype(int)

df

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.62160,8.66610,-2.8073,-0.44699,0
1,4.54590,8.16740,-2.4586,-1.46210,0
2,3.86600,-2.63830,1.9242,0.10645,0
3,3.45660,9.52280,-4.0112,-3.59440,0
4,0.32924,-4.45520,4.5718,-0.98880,0
...,...,...,...,...,...
1357,0.40614,1.34920,-1.4501,-0.55949,1
1358,-1.38870,-4.87730,6.4774,0.34179,1
1359,-3.75030,-13.45860,17.5932,-2.77710,1
1360,-3.56370,-8.38270,12.3930,-1.28230,1


## Training

In [48]:
X = df.iloc[:,0:4]
y = df.iloc[:,-1]

print(X)

      variance  skewness  curtosis  entropy
0      3.62160   8.66610   -2.8073 -0.44699
1      4.54590   8.16740   -2.4586 -1.46210
2      3.86600  -2.63830    1.9242  0.10645
3      3.45660   9.52280   -4.0112 -3.59440
4      0.32924  -4.45520    4.5718 -0.98880
...        ...       ...       ...      ...
1357   0.40614   1.34920   -1.4501 -0.55949
1358  -1.38870  -4.87730    6.4774  0.34179
1359  -3.75030 -13.45860   17.5932 -2.77710
1360  -3.56370  -8.38270   12.3930 -1.28230
1361  -2.54190  -0.65804    2.6842  1.19520

[1362 rows x 4 columns]


Split the data into train and variation

In [49]:
#Preparing the model
from sklearn.model_selection import train_test_split, KFold
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state = 99)

Hyperparameter Tuning

In [50]:
#cv = KFold(n_splits=10, random_state=1, shuffle=True)

from sklearn.tree import DecisionTreeClassifier
dc = DecisionTreeClassifier()
dc = dc.fit(X_train, y_train)

y_pred = dc.predict(X_test)
y_pred


array([1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0])

## Model evaluation

In [51]:
import sklearn.metrics as metrics

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.989010989010989


# New Data

In [52]:
newDf = pd.read_excel('New_data.xlsx')
newDf

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,-2.2153,11.9625,0.078538,-7.7853,authentic
1,2.1265,6.8783,0.44784,-2.2224,authentic
2,0.007125,8.3661,0.50781,-3.8155,authentic
3,2.2893,3.733,0.6312,-0.39786,authentic
4,4.3398,-5.3036,3.8803,-0.70432,authentic
5,-5.4414,7.2363,0.10938,-7.5642,counterfeit
6,-4.1244,3.7909,-0.6532,-4.1802,counterfeit
7,-4.1958,-8.1819,12.1291,-1.6017,counterfeit
8,-0.87834,3.257,-3.6778,-3.2944,counterfeit
9,-0.89809,-4.4862,2.2009,0.50731,counterfeit


In [53]:
X = newDf.iloc[:,0:4]
y = newDf.iloc[:,-1]

print(X)

      variance  skewness  curtosis  entropy
0      3.62160   8.66610   -2.8073 -0.44699
1      4.54590   8.16740   -2.4586 -1.46210
2      3.86600  -2.63830    1.9242  0.10645
3      3.45660   9.52280   -4.0112 -3.59440
4      0.32924  -4.45520    4.5718 -0.98880
...        ...       ...       ...      ...
1357   0.40614   1.34920   -1.4501 -0.55949
1358  -1.38870  -4.87730    6.4774  0.34179
1359  -3.75030 -13.45860   17.5932 -2.77710
1360  -3.56370  -8.38270   12.3930 -1.28230
1361  -2.54190  -0.65804    2.6842  1.19520

[1362 rows x 4 columns]
