In [3]:
import csv
import numpy as np

filename = "assignments/assignment1/data/ODI-2018_clean_relevant.csv"

with open(filename,'r') as csvfile:
    # Set the right quote character and delimiter
    csv_contents = [{k: v for k, v in row.items()}
        for row in csv.DictReader(csvfile, skipinitialspace=True, quotechar='"', delimiter=',')]


In [4]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction import DictVectorizer

feature_encoder = DictVectorizer()
target_encoder = preprocessing.LabelEncoder()


features = []
targets = []

for row in csv_contents:
    targets.append(row.pop('gender', None))
    features.append(row)


In [5]:
feature_to_index = dict()
index_to_feature = dict()

target_to_index = dict()
index_to_target = dict()

def map_features(features):
    index = 0
    for item in features:
        for k, v in item.items():
            if v.replace('.','',1).isdigit():
                if not k in feature_to_index:
                    feature_to_index[k] = index
                    index_to_feature[index] = k
                    index += 1
            else:
                unique = "=".join([k,v])
                if not unique in feature_to_index:
                    feature_to_index[unique] = index
                    index_to_feature[index] = unique
                    index += 1
                
def map_targets(targets):
    index = 0
    for item in targets:
        unique = item
        if not unique in target_to_index:
            target_to_index[unique] = index
            index_to_target[index] = unique
            index += 1
                
def transform_features(features):
    size = len(feature_to_index)
    result = []
    for item in features:
        feature = np.zeros(size)
        for k, v in item.items():
            if v.replace('.','',1).isdigit():
                feature[feature_to_index[k]] = v
            else:
                unique = "=".join([k,v])
                feature[feature_to_index[unique]] = 1
        result.append(feature.tolist())
    return result

def transform_targets(targets):
    result = []
    for item in targets:
        unique = item
        result.append(target_to_index[unique])
    return result

map_features(features)
map_targets(targets)

transformed_features = transform_features(features)
transformed_targets = transform_targets(targets)

In [10]:
from sklearn import tree
from sklearn.model_selection import cross_val_score

_max = 0
for i in range(9):
    for j in range(9):
        clf = tree.DecisionTreeClassifier(max_depth=i+1, min_samples_leaf=j+1, criterion = "entropy")
            
        scores = cross_val_score(clf, transformed_features, transformed_targets, cv=3)
        if scores.mean() > _max:
            print("Accuracy of Tree: %0.2f (+/- %0.2f), depth: %d, leaf: %d" % (scores.mean(), scores.std() * 2, i+1, j+1))
            _max = scores.mean()

#depth = 5, leaf = 4 best
clf = tree.DecisionTreeClassifier(max_depth=5, min_samples_leaf=4, criterion="entropy")
clf.fit(transformed_features, transformed_targets)

Accuracy of Tree: 0.67 (+/- 0.02), depth: 1, leaf: 1
Accuracy of Tree: 0.68 (+/- 0.04), depth: 2, leaf: 5
Accuracy of Tree: 0.70 (+/- 0.03), depth: 2, leaf: 7
Accuracy of Tree: 0.71 (+/- 0.05), depth: 3, leaf: 7
Accuracy of Tree: 0.72 (+/- 0.04), depth: 5, leaf: 4
Accuracy of Tree: 0.72 (+/- 0.07), depth: 8, leaf: 6


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [11]:
import graphviz 
#dot_data = tree.export_graphviz(clf, out_file=None) 
dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=index_to_feature,
                   class_names=index_to_target,
                               filled=True)  
graph = graphviz.Source(dot_data) 
graph.render(filename="tree") 

'tree.pdf'