# Beslissingsbomen

## Imports

In [None]:
import pandas as pd
from sklearn import tree
import matplotlib.pyplot as plt
import csv
import id3
import graphviz
import math

## Data inlezen

In [None]:
# Load the original Simpsons dataset from CSV.
simpsons_origineel = pd.read_csv('simpsons_origineel.csv')
# Show a summary of the loaded DataFrame.
simpsons_origineel.info()

In [None]:
# Display the original table.
print(simpsons_origineel)

In [None]:
# Split the numeric columns into labelled categories.
# Work on a copy so that simpsons_origineel keeps its numeric values.
simpsons_categorical = simpsons_origineel.copy()
simpsons_categorical.haarlengte = pd.cut(
    simpsons_categorical.haarlengte,
    bins=[-1, 3, 5, 100],
    labels=['<=3', '3-5', '>5'],
)
simpsons_categorical.gewicht = pd.cut(
    simpsons_categorical.gewicht,
    bins=[0, 160, 1000],
    labels=['<=160', '>160'],
)
simpsons_categorical.leeftijd = pd.cut(
    simpsons_categorical.leeftijd,
    bins=[0, 30, 40, 100],
    labels=['<=30', '30-40', '>40'],
)
# info() prints its report itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
simpsons_categorical.info()
# Persist the categorical version; it is read back later with pd.read_csv.
simpsons_categorical.to_csv('simpsons_categorical.csv', index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)

In [None]:
# Display the table after the numeric columns were binned.
print(simpsons_categorical)

## Hulpmiddelen

In [None]:
# Inspect which categories occur in the `geslacht` column.
print(simpsons_categorical.geslacht.unique())

In [None]:
# Inspect which categories occur in the `haarlengte` column.
print(simpsons_categorical.haarlengte.unique())

In [None]:
# Count how many rows are labelled 'M' and how many 'V'.
aantalM = simpsons_categorical['geslacht'].eq('M').sum()
print(aantalM)
aantalV = simpsons_categorical['geslacht'].eq('V').sum()
print(aantalV)

In [None]:
# kan je dit met absolute frequenties ook vinden?

In [None]:
# Select a column by its positional index (all rows, column at position 1).
simpsons_categorical.iloc[:, 1]

## Het ID3 algoritme

In [None]:
def calculate_entropy(target: pd.Series) -> float:
    """Return the Shannon entropy (in bits) of the class labels in ``target``.

    Args:
        target: Series of class labels.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequencies ``p`` of the
        labels; ``0.0`` for an empty or single-class series.
    """
    freqs = target.value_counts(normalize=True)
    # A categorical series also reports categories that never occur, with
    # frequency 0; log2(0) is undefined and such terms contribute nothing.
    freqs = freqs[freqs > 0]
    entropy = -sum(p * math.log2(p) for p in freqs)
    # Adding 0.0 turns a possible -0.0 (single-class target) into 0.0.
    return entropy + 0.0

In [None]:
# Separate the features from the target and set the names aside.
namen = simpsons_categorical['naam']
target = simpsons_categorical['geslacht']
data = simpsons_categorical.drop(['naam', 'geslacht'], axis=1)

In [None]:
# Entropy of the complete target column.
calculate_entropy(target)

In [None]:
def calculate_information_gain(data: pd.DataFrame, columnName: str, target: pd.Series):
    """Return the information gain of splitting ``target`` on ``data[columnName]``.

    The gain is the entropy of ``target`` minus the weighted entropies of the
    sub-targets obtained for every distinct value of the column; each weight
    is the fraction of rows that carry that value.
    """
    gain = calculate_entropy(target)
    values = data[columnName]
    total = len(target)
    for value in values.unique():
        # Boolean mask of the rows that fall into this branch.
        selected = values == value
        weight = selected.sum() / total
        gain = gain - weight * calculate_entropy(target[selected])
    return gain

In [None]:
# Print the information gain of every column, one value per line.
print(
    *(
        calculate_information_gain(simpsons_categorical, kolom, target)
        for kolom in ('naam', 'haarlengte', 'gewicht', 'leeftijd', 'geslacht')
    ),
    sep='\n',
)

In [None]:
def find_column_with_highest_gain(data: pd.DataFrame, target: pd.Series):
    """Return ``(column_name, gain)`` for the column with the largest gain.

    The first column with gain 0 is the fallback: a later column only
    replaces the current best when its gain is strictly larger.
    """
    best_column = data.columns[0]
    best_gain = 0
    for name in data.columns:
        candidate = calculate_information_gain(data, name, target)
        if candidate > best_gain:
            best_column, best_gain = name, candidate
    return best_column, best_gain

In [None]:
# Column (and its gain) that would be chosen for the root of the tree.
find_column_with_highest_gain(data, target)

## Eenvoudige ID3 implementatie

In [None]:
def ID3(data: pd.DataFrame, target: pd.Series, depth=0):
    """Print a decision tree for ``data``/``target`` built with ID3.

    Each node is printed as ``[ column ]`` followed by one indented
    ``- value :`` line per distinct value of that column; a leaf is printed
    as ``--> label``. Nothing is returned.

    Args:
        data: Feature table; every column is a candidate split attribute.
        target: Class labels, indexed like ``data``.
        depth: Current recursion depth (controls the indentation).
    """
    # A node whose target holds a single value is a leaf. This has to be
    # tested before the depth guard: a pure leaf reached at the maximum
    # depth is a valid result, not an inconsistency.
    uniqueValues = target.unique()
    if len(uniqueValues) == 1:
        print('-->', uniqueValues[0])
        return
    # The recursion depth is capped at the number of columns (columns are
    # never removed from `data`); a still-mixed target at that depth is
    # reported as inconsistent data.
    if depth >= len(data.columns):
        print('ERROR: data is not consistent on lines:')
        print(data.index.tolist())
        return
    column_name, _gain = find_column_with_highest_gain(data, target)
    # Print the name of the node.
    print("[", column_name, "]")
    # Recurse into one branch per value that occurs in the chosen column.
    column = data[column_name]
    for category in column.unique():
        print('    '*depth, '- ', category, ': ', end='')
        # Rows of this branch, used for both the child table and its target.
        mask = column == category
        ID3(data.loc[mask], target[mask], depth + 1)

In [None]:
# Print the decision tree for the categorical Simpsons data.
ID3(data, target)

## ID3 library

In [None]:
# data and target MUST be plain Python lists, otherwise it does not work.
# Keep the column names first, because the list conversion drops them.
column_names = data.columns.tolist()
data = data.to_numpy().tolist()
target = target.to_numpy().tolist()

In [None]:
# Fit the ID3 estimator from the id3 library on the categorical data.
model = id3.Id3Estimator(max_depth=4)
model.fit(data, target)

In [None]:
# Export the fitted tree into a DotTree and render its dot source with graphviz.
model_tree = id3.export.DotTree()
id3.export_graphviz(model.tree_,model_tree,feature_names=column_names)
graphviz.Source(model_tree.dot_tree)

Continue gegevens:

In [None]:
# Use the original (numeric) table: labels in `target`, features in `data`.
target = simpsons_origineel['geslacht']
data = simpsons_origineel.drop(['naam', 'geslacht'], axis=1)
column_names = list(data.columns)

In [None]:
# Fit the ID3 estimator from the id3 library on the numeric data.
model = id3.Id3Estimator(max_depth=4)
model.fit(data, target)

In [None]:
# Export the fitted tree into a DotTree and render its dot source with graphviz.
model_tree = id3.export.DotTree()
id3.export_graphviz(model.tree_,model_tree,feature_names=column_names)
graphviz.Source(model_tree.dot_tree)

## Het CART algoritme

In [None]:
# Features and labels for CART, taken from the original numeric table.
data = simpsons_origineel.drop(['naam', 'geslacht'], axis='columns')
target = simpsons_origineel['geslacht']
column_names = list(data.columns)

In [None]:
# Fit a scikit-learn decision tree with the entropy criterion;
# random_state is fixed so that repeated runs give the same tree.
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=42)
model.fit(data, target)

In [None]:
# Draw the fitted tree on a 100 dpi figure.
fig, ax = plt.subplots()
fig.set_dpi(100)
tree.plot_tree(
    model,
    feature_names=column_names,
    class_names=model.classes_,
    rounded=True,
    fontsize=10,
    ax=ax,
)
# NOTE(review): fig.show() was left disabled; presumably the notebook renders the figure inline.

In [None]:
# Print the fitted tree as text, including the per-class weights of each leaf.
print(tree.export_text(model, feature_names=column_names, show_weights=True, decimals=0))

### Voorspellingen doen

In [None]:
# Two new, unlabelled rows with the same feature columns as the training data.
comics = pd.DataFrame({'haarlengte':[8, 10], 'gewicht':[290, 80], 'leeftijd':[38, 55]})
print(comics)

In [None]:
# Predict the class of each new row and show the result as a plain list.
model.predict(comics).tolist()

## CART met kwalitatieve variabelen

### Ordinale gegevens

In [None]:
# Reload the categorical table and restore the ordered categorical dtypes,
# listing the categories explicitly from low to high.
simpsons_categorical = pd.read_csv('simpsons_categorical.csv')
simpsons_categorical.haarlengte = pd.Categorical(simpsons_categorical.haarlengte, categories=['<=3', '3-5', '>5'], ordered=True)
simpsons_categorical.gewicht = pd.Categorical(simpsons_categorical.gewicht, categories=['<=160', '>160'], ordered=True)
simpsons_categorical.leeftijd = pd.Categorical(simpsons_categorical.leeftijd, categories=['<=30', '30-40', '>40'], ordered=True)
# info() prints its report itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
simpsons_categorical.info()

In [None]:
# Display the reloaded categorical table.
print(simpsons_categorical)

In [None]:
# omzetten naar getallen
simpsons_categorical.haarlengte = simpsons_categorical.haarlengte.cat.codes
simpsons_categorical.gewicht = simpsons_categorical.gewicht.cat.codes
simpsons_categorical.leeftijd = simpsons_categorical.leeftijd.cat.codes
print(simpsons_categorical)

In [None]:
# Features (integer codes) and labels for the ordinal CART model.
target = simpsons_categorical['geslacht']
data = simpsons_categorical.drop(['naam', 'geslacht'], axis=1)
column_names = list(data.columns)

In [None]:
# Fit a decision tree (entropy criterion) on the integer-coded categories.
model = tree.DecisionTreeClassifier(criterion='entropy')
model.fit(data, target)

In [None]:
# Draw the fitted tree on a 100 dpi figure.
fig, ax = plt.subplots()
fig.set_dpi(100)
tree.plot_tree(
    model,
    feature_names=column_names,
    class_names=model.classes_,
    rounded=True,
    fontsize=10,
    ax=ax,
)
# NOTE(review): fig.show() was left disabled; presumably the notebook renders the figure inline.

### Nominale gegevens

In [None]:
# Reload the categorical table and restore the categorical dtypes with an
# explicit category list per column.
simpsons_categorical = pd.read_csv('simpsons_categorical.csv')
simpsons_categorical.haarlengte = pd.Categorical(simpsons_categorical.haarlengte, categories=['<=3', '3-5', '>5'], ordered=True)
simpsons_categorical.gewicht = pd.Categorical(simpsons_categorical.gewicht, categories=['<=160', '>160'], ordered=True)
simpsons_categorical.leeftijd = pd.Categorical(simpsons_categorical.leeftijd, categories=['<=30', '30-40', '>40'], ordered=True)
# info() prints its report itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
simpsons_categorical.info()

In [None]:
# Display the reloaded categorical table.
print(simpsons_categorical)

In [None]:
# One-hot encoding: the feature columns (without `naam` and `geslacht`)
# are expanded into indicator columns by get_dummies.
data = pd.get_dummies(simpsons_categorical.drop(columns=['naam', 'geslacht']))
data

In [None]:
# Take the labels from the same table the one-hot features were built from,
# so that features and target always describe the same rows.
target = simpsons_categorical.geslacht

In [None]:
# Fit a decision tree (entropy criterion) on the one-hot encoded features.
model = tree.DecisionTreeClassifier(criterion='entropy')
model.fit(data, target)

In [None]:
# Draw the fitted tree on a 100 dpi figure; the feature names are the
# one-hot column names.
fig, ax = plt.subplots()
fig.set_dpi(100)
tree.plot_tree(
    model,
    feature_names=data.columns.tolist(),
    class_names=model.classes_,
    rounded=True,
    fontsize=10,
    ax=ax,
)
# NOTE(review): fig.show() was left disabled; presumably the notebook renders the figure inline.