In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

### BankNote Data
* Variance of WTI (Continuous).
* Skewness of WTI (Continuous).
* Kurtosis of WTI (Continuous).
* Entropy of image (Continuous).
* Class (Integer).

In [2]:
data = pd.read_csv('data.csv')
data.head()

Unnamed: 0,Variance (WTI),Skewness (WTI),Kurotsis (WTI),Entropy,Class
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


In [3]:
data.shape

(1372, 5)

In [4]:
data['Class'].value_counts()

0    762
1    610
Name: Class, dtype: int64

### Gini Index

$ \large{Gain = H(S) - \sum_{\text{v }\in \text{ values}} \frac{|S_v|}{S}} H(S_v)$


In [5]:
def gini_index(groups, classes):
    totalInstances = 0
    giniIndex = 0.0
     
    for group in groups:
        groupSize = float(len(group))
        totalInstances += groupSize
        
        if groupSize <= 0:
            continue
        
        score = 0.0
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / groupSize
            score += p * p
        giniIndex += (1.0 - score) * (groupSize / totalInstances)
        
    return giniIndex

### Creating a split

This function creates a split in the given group.

The row which has a value smaller than the given value it is put in left group else in the right group.

In [6]:
def test_split(index, value, dataset):
    left, right = list(), list()
    
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    
    return left, right

In [7]:
dataColumns = data.columns
dataColumns

Index(['Variance (WTI)', 'Skewness (WTI)', 'Kurotsis (WTI)', 'Entropy',
       'Class'],
      dtype='object')

### Getting best splits based on Gini Index

In [8]:
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    bestIndex, bestValue, bestScore, bestGroups = 999, 999, 999, None
    
    for index in range(len(dataset[0])-1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            
            if gini < bestScore:
                bestIndex, bestValue, bestScore, bestGroups = index, row[index], gini, groups
    
    return {'colName':dataColumns[bestIndex], 'index':bestIndex, 'value':bestValue, 'groups':bestGroups}

In [9]:
def set_class(group):
    classes = [obs[-1] for obs in group]
    return max(set(classes), key=classes.count)

### Building the nodes

In [10]:
def split(node, maxDepth, minSize, currDepth):
    left, right = node['groups']
    del(node['groups'])
    
    if not left or not right:
        node['left'] = node['right'] = set_class(left + right)
        return
    
    if currDepth >= maxDepth:
        node['left'], node['right'] = set_class(left), set_class(right)
        return
    
    if len(left) <= minSize:
        node['left'] = set_class(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], maxDepth, minSize, currDepth+1)
    
    if len(right) <= minSize:
        node['right'] = set_class(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], maxDepth, minSize, currDepth+1)

### Root Node

In [11]:
def decision_tree(train, maxDepth, minSize):
    rootNode = get_split(train)
    split(rootNode, maxDepth, minSize, 1)
    return rootNode

### Printing the entire Tree

In [12]:
def tree_verbose(currNode, currDepth=0):
    if isinstance(currNode, dict):
        print('%s[%s < %.3f]' % ((currDepth*2*' ', (currNode['colName']), currNode['value'])))
        tree_verbose(currNode['left'], currDepth+1)
        tree_verbose(currNode['right'], currDepth+1)
    else:
        print('%s[%s]' % ((currDepth*2*' ', currNode)))

In [13]:
def predict(currNode, data):
    if data[currNode['index']] < currNode['value']:
        if isinstance(currNode['left'], dict):
            return predict(currNode['left'], data)
        else:
            return currNode['left']
    else:
        if isinstance(currNode['right'], dict):
            return predict(currNode['right'], data)
        else:
            return currNode['right']

In [14]:
def accuracy(predictions, actual):
    acc = 0
    size = len(predictions)
    for i in range(size):
        if predictions[i] == actual[i]:
            acc += 1
    
    return (acc/size)*100

In [15]:
X = data.drop(['Class'],axis=1)
Y = data['Class']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.40, random_state=42)
X_train = pd.concat([X_train, Y_train], axis=1)

### Building the tree

In [16]:
tree = decision_tree(X_train.values, 5, 10)
tree_verbose(tree)

[Variance (WTI) < -0.206]
  [Skewness (WTI) < 8.188]
    [Skewness (WTI) < -4.606]
      [Variance (WTI) < -5.048]
        [1.0]
        [Variance (WTI) < -5.048]
          [1.0]
          [1.0]
      [Kurotsis (WTI) < 6.220]
        [Skewness (WTI) < 7.327]
          [1.0]
          [1.0]
        [Entropy < -0.056]
          [0.0]
          [0.0]
    [Variance (WTI) < -4.286]
      [1.0]
      [Variance (WTI) < -0.382]
        [Variance (WTI) < -0.787]
          [0.0]
          [0.0]
        [0.0]
  [Kurotsis (WTI) < -4.786]
    [1.0]
    [Skewness (WTI) < -4.515]
      [Variance (WTI) < 5.030]
        [Variance (WTI) < 0.706]
          [0.0]
          [0.0]
        [0.0]
      [Skewness (WTI) < -4.470]
        [1.0]
        [Skewness (WTI) < -4.006]
          [0.0]
          [0.0]


In [17]:
prediction = []
for obs in X_test.values:
    prediction.append(predict(tree, obs))

In [18]:
accuracy(prediction, Y_test.values)

90.7103825136612