In [82]:
import csv
import numpy as np

filename = "assignments/assignment1/task1/data/ODI-2018_clean_relevant.csv"

with open(filename,'r') as csvfile:
    # Set the right quote character and delimiter
    csv_contents = [{k: v for k, v in row.items()}
        for row in csv.DictReader(csvfile, skipinitialspace=True, quotechar='"', delimiter=',')]


In [83]:
features = []
targets = []

for row in csv_contents:
    targets.append(row.pop('gender', None))
    features.append(row)


In [84]:
feature_to_index = dict()
index_to_feature = dict()

target_to_index = dict()
index_to_target = dict()

def map_features(features):
    index = 0
    for item in features:
        for k, v in item.items():
            if v.replace('.','',1).isdigit():
                if not k in feature_to_index:
                    feature_to_index[k] = index
                    index_to_feature[index] = k
                    index += 1
            else:
                unique = "=".join([k,v])
                if not unique in feature_to_index:
                    feature_to_index[unique] = index
                    index_to_feature[index] = unique
                    index += 1
                
def map_targets(targets):
    index = 0
    for item in targets:
        unique = item
        if not unique in target_to_index:
            target_to_index[unique] = index
            index_to_target[index] = unique
            index += 1
                
def transform_features(features):
    size = len(feature_to_index)
    result = []
    for item in features:
        feature = np.zeros(size)
        for k, v in item.items():
            if v.replace('.','',1).isdigit():
                feature[feature_to_index[k]] = v
            else:
                unique = "=".join([k,v])
                feature[feature_to_index[unique]] = 1
        result.append(feature.tolist())
    return result

def transform_targets(targets):
    result = []
    for item in targets:
        unique = item
        result.append(target_to_index[unique])
    return result

map_features(features)
map_targets(targets)

transformed_features = transform_features(features)
transformed_targets = transform_targets(targets)

In [93]:
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth':list(range(1, 10)), 'min_samples_leaf':list(range(1, 10)), 'min_samples_split':list(range(2,10)),'criterion': ('entropy', 'gini')}
tree_classifier = tree.DecisionTreeClassifier()
kf = KFold(n_splits=10)
clf = GridSearchCV(tree_classifier, parameters, cv=kf, return_train_score=False)
clf.fit(transformed_features, transformed_targets)


GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=False),
       error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9], 'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9], 'criterion': ('entropy', 'gini')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring=None, verbose=0)

In [94]:
clf.best_score_
clf.best_params_

{'criterion': 'gini',
 'max_depth': 4,
 'min_samples_leaf': 2,
 'min_samples_split': 8}

In [95]:
import graphviz
dot_data = tree.export_graphviz(clf.best_estimator_, out_file=None, 
                         feature_names=index_to_feature,
                   class_names=index_to_target,
                               filled=True)  
graph = graphviz.Source(dot_data) 
graph.render(filename="tree") 

'tree.pdf'