# Method 2: Decision Trees

We'll be applying the CART-algorithm using the Gini cost-function.

Step 1/6: Import data

In [2]:
import pandas as pd
import numpy as np
# Load the training and test splits from their CSV files.
train = pd.read_csv("Final_project_DATA/df_train.csv")
test = pd.read_csv("Final_project_DATA/df_test.csv")

# Preview the first rows of the training data.
train.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,type
0,1.51839,12.85,3.67,1.24,72.57,0.62,8.68,0.0,0.35,2
1,1.52081,13.78,2.28,1.43,71.99,0.49,9.85,0.0,0.17,2
2,1.51708,13.72,3.68,1.81,72.06,0.64,7.88,0.0,0.0,2
3,1.52739,11.02,0.0,0.75,73.08,0.0,14.96,0.0,0.0,2
4,1.5221,13.73,3.84,0.72,71.76,0.17,9.74,0.0,0.0,1


Find the number of occurrences of every label in the data set.

In [3]:
def count_occ_labels(data):
    """Tally how often each class label appears in ``data``.

    ``data`` must support ``data['type']`` returning an object with a
    ``to_numpy()`` method (e.g. a pandas DataFrame with a ``type`` column).

    Returns a dict mapping each distinct label to its number of occurrences,
    with keys in the sorted order produced by ``np.unique``.
    """
    label_column = data['type'].to_numpy()
    distinct, frequencies = np.unique(label_column, return_counts=True)
    return dict(zip(distinct, frequencies))

In [4]:
# Print the label frequencies of the training set, then show the raw label array.
print(count_occ_labels(train))
train['type'].to_numpy()

{1: 49, 2: 53, 3: 12, 5: 9, 6: 6, 7: 20}


array([2, 2, 2, 2, 1, 2, 2, 5, 2, 7, 1, 2, 7, 1, 1, 1, 7, 2, 2, 1, 2, 7,
       7, 6, 3, 2, 6, 2, 6, 7, 1, 2, 6, 2, 7, 1, 2, 1, 1, 2, 7, 2, 1, 1,
       2, 1, 3, 2, 2, 2, 2, 1, 7, 7, 1, 5, 1, 1, 1, 1, 1, 3, 1, 2, 3, 7,
       2, 2, 2, 1, 3, 6, 5, 2, 1, 6, 2, 2, 7, 2, 1, 1, 3, 5, 2, 1, 1, 1,
       1, 2, 1, 1, 1, 2, 2, 1, 2, 2, 2, 3, 2, 2, 1, 3, 1, 3, 7, 2, 5, 1,
       2, 1, 1, 1, 7, 3, 1, 3, 5, 2, 2, 5, 7, 2, 1, 7, 2, 3, 7, 7, 2, 1,
       7, 1, 1, 2, 2, 2, 5, 2, 5, 1, 2, 1, 1, 2, 7, 1, 1])

Step 2: Create the split condition object class

The split condition should hold the feature (column name) and the threshold/value that will be used as the condition.
It also needs a method to compare the condition's threshold against the corresponding value of the passed sample.

In [5]:
class SplitCondition:
    """A yes/no question about a single feature of a sample.

    Attributes:
        feature: key used to look the value up in a sample
            (``sample[feature]``), e.g. a column name.
        threshold: value the sample's feature value is compared against.
    """

    def __init__(self, feature, threshold):
        self.feature = feature
        self.threshold = threshold

    def __repr__(self):
        return "SplitCondition({!r}, {!r})".format(self.feature, self.threshold)

    def compare(self, sample):
        """Return whether ``sample`` satisfies this condition.

        Numeric values satisfy the condition when they are greater than or
        equal to the threshold; any other value must equal the threshold.
        """
        s_value = sample[self.feature]
        # NumPy integer scalars (e.g. np.int64) and non-float64 floats
        # (e.g. np.float32) are not subclasses of the built-in int/float, so
        # they are listed explicitly; otherwise they would silently fall
        # through to the equality test below.
        if isinstance(s_value, (int, float, np.integer, np.floating)):
            return s_value >= self.threshold
        return s_value == self.threshold

Next we need a method to split the dataset based on the questions.
So we need to divide the samples in X into true and false samples.

In [6]:
def split(data, SplitCondition):
    """Partition the rows of ``data`` by a split condition.

    Parameters:
        data: pandas DataFrame whose rows are the samples.
        SplitCondition: object with a ``compare(row)`` method; each row is
            passed to it as a Series. (The parameter name shadows the class
            of the same name inside this function.)

    Returns:
        A ``(true_rows, false_rows)`` pair of DataFrames holding the rows for
        which ``compare`` was truthy / falsy. Both keep the original row
        order, index labels, columns and column dtypes, even when one of the
        partitions is empty.
    """
    # Evaluate the condition once per row and select with a boolean mask.
    # Rebuilding frames from a list of row Series would upcast column dtypes
    # and drop all columns for an empty partition.
    mask = np.array(
        [bool(SplitCondition.compare(row)) for _, row in data.iterrows()],
        dtype=bool,
    )
    return data.loc[mask], data.loc[~mask]

In [7]:
# Sanity check: split the training set on the condition ('RI', 1.521) and
# inspect one row of the partition for which the condition holds.
trueSamples, falseSamples = split(train, SplitCondition('RI',1.521))
trueSamples.iloc[5]

RI       1.52196
Na      14.36000
Mg       3.85000
Al       0.89000
Si      71.36000
K        0.15000
Ca       9.15000
Ba       0.00000
Fe       0.00000
type     1.00000
Name: 59, dtype: float64

Now we need a cost function of sorts: a way to measure the "purity" of a split condition, i.e. a value indicating the quality of a node, so that we can identify the best question to use at this point.

CART uses the Gini Impurity for this purpose.

In [8]:
def gini(samples):
    """Return the Gini impurity of the labels in ``samples``.

    Starting from 1, the squared relative frequency of every label (as
    counted by ``count_occ_labels``) is subtracted in turn.
    """
    label_counts = count_occ_labels(samples)
    n_samples = float(len(samples))
    impurity = 1
    for count in label_counts.values():
        impurity -= (count / n_samples) ** 2
    return impurity

In [9]:
# Gini impurity of the "true" partition produced by the example split above.
print(gini(trueSamples))

0.7352537722908092


To really use Gini we need to measure the information gain of a question. This way we can actually qualify a question over another, since just the impurity of a single set of samples is not enough. We need to know the uncertainty of the starting/previous node and the impurity of the two new child nodes which will be introduced based on the question.

In [10]:
def InformationGain(true, false, previous):
    """Return how much a split reduces the Gini impurity.

    ``true`` and ``false`` are the two partitions produced by the split and
    ``previous`` is the impurity of the node before splitting. The result is
    ``previous`` minus the size-weighted impurities of the two partitions.
    """
    n_true = len(true)
    n_total = n_true + len(false)
    weight_true = float(n_true) / n_total
    weight_false = 1 - weight_true
    return previous - weight_true * gini(true) - weight_false * gini(false)

So how do we use this "Information Gainzzz" to find our best split condition? Try every possible question and see what works best.

In [11]:
def BestCondition(samples):
    """Exhaustively search for the split with the highest information gain.

    Every column except the last is treated as a feature, and every distinct
    value of a feature is tried as a threshold.

    Returns a ``(gain, condition)`` pair; when no candidate achieves a gain
    above zero the pair is ``(0, None)``.
    """
    baseline = gini(samples)  # impurity before splitting, needed for the gain
    top_gain, top_condition = 0, None

    for col_idx, col_name in enumerate(samples.columns[:-1]):
        for candidate in samples.iloc[:, col_idx].unique():
            question = SplitCondition(col_name, candidate)
            yes_rows, no_rows = split(samples, question)

            # Only splits that actually divide the samples are scored.
            if len(yes_rows.index) > 0 and len(no_rows.index) > 0:
                gain = InformationGain(yes_rows, no_rows, baseline)
                if gain > top_gain:
                    top_gain, top_condition = gain, question

    return top_gain, top_condition

Now all we need are the tree components ("treeponents"): nodes and leaves, plus a tree-builder function.

In [12]:
class Leaf:
    """Terminal node of the tree.

    ``pred`` holds the label counts (as returned by ``count_occ_labels``) of
    the samples that reached this leaf.
    """

    def __init__(self, samples):
        label_counts = count_occ_labels(samples)
        self.pred = label_counts

In [13]:
class Node:
    """Internal decision node of the tree.

    Attributes:
        condition: the split condition tested at this node.
        left: subtree followed when the condition holds for a sample.
        right: subtree followed when the condition does not hold.
    """

    def __init__(self, condition, left, right):
        self.condition, self.left, self.right = condition, left, right

Treebuilder essentially just recursively creates splits in the data until there's no more information gain, at which point it puts a leaf down.

In [16]:
def Treebuilder(samples):
    """Recursively grow a decision tree for ``samples``.

    The best split is looked up with ``BestCondition``. If it yields no
    information gain a ``Leaf`` is returned; otherwise the samples are split
    and a ``Node`` is returned whose left subtree is built from the rows
    satisfying the condition and whose right subtree from the remaining rows.
    """
    best_gain, best_question = BestCondition(samples)

    if best_gain != 0:
        yes_rows, no_rows = split(samples, best_question)
        yes_branch = Treebuilder(yes_rows)
        no_branch = Treebuilder(no_rows)
        return Node(best_question, yes_branch, no_branch)

    # No split improves purity any further: stop here.
    return Leaf(samples)

In [17]:
# Grow the decision tree on the full training set.
tree = Treebuilder(train)

Now we need to classify samples using the nodes and branches of the tree we just built.

In [21]:
def classify(sample, Node):
    """Walk ``sample`` down the tree rooted at ``Node`` and return the leaf's ``pred``.

    At each internal node the left branch is taken when the node's condition
    holds for the sample, the right branch otherwise. (The ``Node`` parameter
    shadows the class of the same name inside this function.)
    """
    current = Node
    while not isinstance(current, Leaf):
        if current.condition.compare(sample):
            current = current.left
        else:
            current = current.right
    return current.pred

In [49]:
# Inspect a single test sample (row 40) and its true label.
#len(test.index)
print(test.iloc[40])
print(test.iloc[40]['type'])

RI       1.52227
Na      14.17000
Mg       3.81000
Al       0.78000
Si      71.35000
K        0.00000
Ca       9.69000
Ba       0.00000
Fe       0.00000
type     1.00000
Name: 40, dtype: float64
1.0


In [56]:
# Evaluate the tree on the test set: `biggus` collects the predicted label of
# every correctly classified sample, so its length is the number of hits.
biggus = []

for _, row in test.iterrows():
    # classify() returns the leaf's {label: count} dict. Predict the most
    # frequent label; the first key is merely the smallest label and is
    # wrong whenever a leaf is impure.
    counts = classify(row, tree)
    pred = max(counts, key=counts.get)

    if pred == row['type']:
        biggus.append(pred)

# Accuracy as a percentage of the test samples.
acc = len(biggus)/len(test.index)*100
print("Accuracy: " , acc)

Accuracy:  67.6923076923077
