# Decision Tree part one, ID3

## First steps
The first step is to set up all the imports and the example data.

In [72]:
import numpy as np
import pandas as pd
import random as rnd

## Math
The ID3 algorithm relies on several mathematical formulas. These include entropy and information gain.

In [5]:
def entropy(xs):
    """Return the Shannon entropy, in bits, of a list of probabilities.

    Args:
        xs: iterable of probabilities (expected to sum to 1).

    Returns:
        ``sum(-p * log2(p))`` over the entries of *xs*; ``0`` for an empty
        iterable.
    """
    # By convention 0 * log2(0) is 0. Evaluating it literally would give
    # 0 * -inf = nan and poison the whole sum, so zero entries are skipped.
    return sum(-p * np.log2(p) for p in xs if p != 0)

def entropy_el(xs):
    """Return the entropy of the distribution obtained by normalising *xs*.

    *xs* is first turned into probabilities with ``probability_el`` and the
    result is then passed to ``entropy``.
    """
    probabilities = probability_el(xs)
    return entropy(probabilities)

def probability_el(xs):
    """Normalise a collection of numbers so that they sum to 1.

    Args:
        xs: iterable of numbers (for example occurrence counts). It may be
            any iterable, including a one-shot iterator or a dict view.

    Returns:
        A list with each element of *xs* divided by the total of *xs*.
        An empty input yields an empty list.

    Raises:
        ZeroDivisionError: if *xs* is non-empty and its plain-number total
            is zero.
    """
    # Materialise once so one-shot iterators are not consumed by sum(),
    # and compute the total a single time instead of once per element.
    values = list(xs)
    total = sum(values)
    return [value / total for value in values]

In [93]:
def labels(xs):
    """Count how often each distinct value occurs in *xs*.

    Returns a dict mapping every value to its number of occurrences; keys
    appear in the order in which the values are first seen.
    """
    counts = {}
    for value in xs:
        counts[value] = counts.get(value, 0) + 1
    return counts

def labels_entropy(xs):
    """Return the entropy of a sequence of labels.

    The labels are converted to a NumPy array, counted with ``labels`` and
    the resulting counts are handed to ``entropy_el``.
    """
    counts = labels(np.asarray(xs))
    return entropy_el(counts.values())

def entropy_given(xs, ys):
    """Return the entropy of the labels *ys* conditioned on the labels *xs*.

    For every distinct value in *xs*, the entropy of the *ys* entries paired
    with that value is computed and weighted by the relative frequency of
    the value in *xs*; the weighted entropies are summed.
    """
    counts_x = labels(xs)
    weights = probability_el(counts_x.values())
    return sum(
        weight * labels_entropy([y for x, y in zip(xs, ys) if x == value])
        for value, weight in zip(counts_x, weights)
    )

def information_gain(xs, ys):
    """Return how much knowing *xs* reduces the entropy of *ys*.

    Computed as ``labels_entropy(ys) - entropy_given(xs, ys)``.
    """
    entropy_before = labels_entropy(ys)
    entropy_after = entropy_given(xs, ys)
    return entropy_before - entropy_after

def information_gain_all(xs, ys):
    """Return the information gain of every column of *xs* with respect to *ys*.

    The result is a dict keyed by column label, in column order.
    """
    column_labels = list(pd.DataFrame(xs))
    # NOTE: the arguments are passed as (ys, column), i.e. swapped relative
    # to information_gain's (xs, ys) signature. The quantity computed is the
    # mutual information, which is symmetric, so the value is the same up to
    # floating-point rounding.
    return {
        label: information_gain(ys, xs[label])
        for label in column_labels
    }

def best_question(xs, ys):
    """Return the label of the column of *xs* with the highest information gain.

    Gains come from ``information_gain_all``; on ties the first column
    encountered wins. Raises ValueError (from ``max``) when there are no
    columns to choose from.
    """
    gains = information_gain_all(xs, ys)
    return max(gains, key=lambda column: gains[column])

## Create the tree objects

In [25]:
class DecitionTree:
    """Decision tree classifier built from ``Node`` objects.

    ``learn`` builds the tree from a table of attributes and a label column;
    ``predict`` walks the tree once for every row of a table.
    """

    def __init__(self):
        # Root of the fitted tree; None until learn() has been called.
        self.rootNode = None

    def learn(self, X, y, impurity_measure='entropy'):
        """Build the tree from the attribute table *X* and the labels *y*.

        Args:
            X: attribute table; it is handed to ``Node`` unchanged.
            y: labels belonging to the rows of *X*.
            impurity_measure: only ``'entropy'`` is supported.

        Raises:
            ValueError: if *impurity_measure* is not ``'entropy'``.
        """
        # A real exception instead of ``assert``: asserts are removed when
        # Python runs with -O, which would silently accept any value.
        if impurity_measure != 'entropy':
            raise ValueError(
                "unsupported impurity_measure %r; only 'entropy' is available"
                % (impurity_measure,)
            )
        self.rootNode = Node(X, y, "root")

    def predict(self, X):
        """Return the predicted label for every row of *X*.

        Args:
            X: table whose rows are iterated with ``iterrows()``.

        Returns:
            A DataFrame with a single column ``"result"`` holding one
            prediction per row, in row order.

        Raises:
            RuntimeError: if ``learn`` has not been called yet.
        """
        if self.rootNode is None:
            raise RuntimeError("predict() called before learn()")
        predictions = [self.rootNode.search(row) for _, row in X.iterrows()]
        return pd.DataFrame({"result": predictions})
    

class Node:
    """One node of the decision tree; the subtree is built on construction.

    A node is a leaf when its labels have zero entropy, or when no attribute
    columns are left to split on. Otherwise it splits on the column chosen
    by ``best_question`` and creates one child per distinct value of that
    column.
    """

    def __init__(self, nX, ny, path):
        """Create the node and, recursively, everything below it.

        Args:
            nX: attribute table for the rows reaching this node (used as a
                pandas DataFrame); it is copied.
            ny: labels for the same rows (used as a pandas Series); copied.
            path: value on the edge leading to this node.
        """
        self.X = nX.copy()
        self.y = ny.copy()
        self.path = path
        self.result = ""      # predicted label; only meaningful on leaves
        self.child = {}       # attribute value -> child Node
        self.question = -1    # column label this node splits on

        if labels_entropy(self.y) == 0:
            # All labels agree: predict that label.
            self.isLeaf = True
            self.result = self.y.iloc[0]
        elif self.X.shape[1] == 0:
            # Labels still disagree but every attribute has been used up.
            # Fall back to the most frequent label instead of crashing in
            # best_question() on an empty set of columns.
            self.isLeaf = True
            self.result = self.y.value_counts().idxmax()
        else:
            self.isLeaf = False
            self.createChildren()

    def createChildren(self):
        """Choose the split column and build one child per distinct value."""
        self.question = best_question(self.X, self.y)
        column = self.question
        for value in np.unique(self.X[column]):
            # Select rows and labels with the same mask rather than gluing
            # the labels onto self.X, which would modify the stored table
            # and overwrite a feature column that happens to be named 'y'.
            mask = self.X[column] == value
            newX = self.X.loc[mask].drop(columns=[column])
            newY = self.y.loc[mask]
            print("creating node " + str(value))
            self.child[value] = Node(newX, newY, value)

    def search(self, rw):
        """Return the prediction for the row *rw*.

        *rw* is indexed with the split column label at each inner node. A
        value with no matching child raises KeyError.
        """
        if self.isLeaf:
            return self.result
        return self.child[rw[self.question]].search(rw)

    def toString(self):
        """Return a nested, parenthesised text rendering of the subtree."""
        if self.isLeaf:
            # Formatting (rather than ``+``) so non-string paths and labels,
            # e.g. integers, do not raise TypeError.
            return "({} - {})".format(self.path, self.result)
        # Iterate over the child nodes, not the dict keys.
        rendered = " ".join(node.toString() for node in self.child.values())
        return "{} {}: ({})".format(self.path, self.question, rendered)

## Insert some test data

In [28]:
# Read the header-less tennis file; every column except the last is an
# attribute and the last column holds the label.
data = pd.read_csv('tennis.data', header=None)
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Sanity check: entropy of the labels, then entropy of the labels
# conditioned on the first attribute column.
print(labels_entropy(y))
print(entropy_given(X[0], y))

0.9402859586706311
0.6935361388961918


In [22]:
# Fit a tree on the X / y currently defined at module level, using entropy
# as the impurity measure.
cf = DecitionTree()
cf.learn(X, y, impurity_measure='entropy')

# Predict on the same rows that were used for fitting. This is the last
# expression of the cell, so its value is what the notebook displays.
cf.predict(X)

creating node overcast
creating node rain
creating node strong
creating node weak
creating node sunny
creating node high
creating node normal


Unnamed: 0,result
0,no
1,no
2,yes
3,yes
4,yes
5,no
6,yes
7,no
8,yes
9,yes


## Import data

In [88]:
# Read the header-less abalone file; every column except the last is an
# attribute and the last column holds the label. This rebinds the
# module-level data, X and y.
data = pd.read_csv('abalone.data', header=None)
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# NOTE(review): fitting on this data set is disabled below. Presumably this
# is because most columns are continuous and the tree creates one branch per
# distinct value; confirm before re-enabling.
#cf = DecitionTree()
#cf.learn(X, y, impurity_measure='entropy')

#cf.predict(X)

In [112]:
# Show the attribute table followed by the label column, one after the other.
print(X, y, sep="\n")

      0      1      2      3       4       5       6       7
0     M  0.455  0.365  0.095  0.5140  0.2245  0.1010  0.1500
1     M  0.350  0.265  0.090  0.2255  0.0995  0.0485  0.0700
2     F  0.530  0.420  0.135  0.6770  0.2565  0.1415  0.2100
3     M  0.440  0.365  0.125  0.5160  0.2155  0.1140  0.1550
4     I  0.330  0.255  0.080  0.2050  0.0895  0.0395  0.0550
...  ..    ...    ...    ...     ...     ...     ...     ...
4172  F  0.565  0.450  0.165  0.8870  0.3700  0.2390  0.2490
4173  M  0.590  0.440  0.135  0.9660  0.4390  0.2145  0.2605
4174  M  0.600  0.475  0.205  1.1760  0.5255  0.2875  0.3080
4175  F  0.625  0.485  0.150  1.0945  0.5310  0.2610  0.2960
4176  M  0.710  0.555  0.195  1.9485  0.9455  0.3765  0.4950

[4177 rows x 8 columns]
0       15
1        7
2        9
3       10
4        7
        ..
4172    11
4173    10
4174     9
4175    10
4176    12
Name: 8, Length: 4177, dtype: int64
