In [1]:
import pandas as pd
from sklearn import tree
from sklearn import metrics
import numpy as np
import graphviz 

In [17]:
def split_sets(df, frac, random_state=None, drop_columns=None):
    """Randomly split ``df`` into a training part and a disjoint test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least ``home_team_goal`` and ``away_team_goal``
        plus every column listed in ``drop_columns``.
    frac : float
        Fraction of the rows sampled (without replacement) into the training set.
    random_state : int or numpy random state, optional
        Forwarded to ``DataFrame.sample`` so the split can be reproduced.
        ``None`` (the default) keeps the original unseeded behaviour.
    drop_columns : list of str, optional
        Columns removed to build the feature frames. Defaults to the
        notebook-level ``after_match_attributes`` list.

    Returns
    -------
    tuple
        ``(training, training_args, training_answers,
        test, test_args, test_answers)`` where the ``*_args`` frames are the
        features and the ``*_answers`` boolean Series are True when the home
        team scored more goals than the away team.
    """
    if drop_columns is None:
        # Resolved at call time, so the list may be defined after this function.
        drop_columns = after_match_attributes

    training = df.sample(frac=frac, random_state=random_state)
    # Take the complement by index label. The previous NaN-mask + dropna()
    # trick also discarded held-out rows with any missing value and upcast
    # integer columns to float. Assumes index labels are unique.
    test = df.loc[~df.index.isin(training.index)]

    training_args = training.drop(drop_columns, axis=1)
    test_args = test.drop(drop_columns, axis=1)
    training_answers = training['home_team_goal'] > training['away_team_goal']
    test_answers = test['home_team_goal'] > test['away_team_goal']
    return training, training_args, training_answers, test, test_args, test_answers

# Load the match table.
df_match = pd.read_csv('../Project1/dataset.csv')

# Columns removed from the feature frames by split_sets. Going by the list's
# name these are statistics recorded after a match — TODO confirm.
after_match_attributes = ['home_team_goal', 'away_team_goal', 'possession_home', 'shoton_home', 'shoton_away', 
                      'shotoff_home', 'shotoff_away', 'corner_home', 'corner_away', 'cross_home', 'cross_away', 
                      'foulcommit_home', 'foulcommit_away', 'rcard_home', 'rcard_away', 'ycard_home', 'ycard_away', 
                      'throwin_home', 'throwin_away']

# Outer split: 90% of all rows for training, the rest held out.
(outer_train, outer_train_args, outer_train_answers,
 outer_test, outer_test_args, outer_test_answers) = split_sets(df_match, 0.9)

# Inner split: carve a second train/test pair out of the outer training rows.
# The original passed the undefined name `out_train`, which raises NameError
# on a fresh kernel.
(inner_train, inner_train_args, inner_train_answers,
 inner_test, inner_test_args, inner_test_answers) = split_sets(outer_train, 0.9)

In [19]:
def do_prediction(depth, train_args, train_answers, test_args, test_answers):
    """Fit a depth-limited decision tree and score it on a test set.

    Parameters
    ----------
    depth : int
        ``max_depth`` passed to ``DecisionTreeClassifier``.
    train_args, train_answers
        Feature frame and target labels used to fit the classifier.
    test_args, test_answers
        Feature frame and true labels used to evaluate it.

    Returns
    -------
    tuple
        ``(accuracy, clf)``: the fraction of test rows predicted correctly
        and the fitted classifier.
    """
    clf = tree.DecisionTreeClassifier(max_depth = depth)
    clf.fit(train_args, train_answers)
    predictions = clf.predict(test_args)
    # accuracy_score equals (TN + TP) / total for two classes, but also works
    # when only one class occurs. Indexing confusion[1, 1] on the resulting
    # 1x1 confusion matrix would raise IndexError.
    accuracy = metrics.accuracy_score(test_answers, predictions)
    return accuracy, clf

# Inner-test accuracy per candidate depth; the array index is the depth itself.
# Slots start at -inf so the never-evaluated index 0 cannot win argmax.
# np.empty left it holding arbitrary memory, which could select max_depth=0.
correct = np.full(25, -np.inf)
for max_depth in range(1, len(correct)):
    correct[max_depth], _ = do_prediction(max_depth, inner_train_args, inner_train_answers, inner_test_args, inner_test_answers)

# Refit at the best depth on the outer training set and score on the outer test set.
best_depth = int(np.argmax(correct))
outer_correct, clf = do_prediction(best_depth, outer_train_args, outer_train_answers, outer_test_args, outer_test_answers)
print (best_depth, correct[best_depth], outer_correct)

1 0.66511627907 0.665271966527


In [20]:
# Export the fitted tree to DOT and render it to a file named "home_win".
# Feature names come from outer_train_args, the frame clf was fitted on.
# The original used `training_args`, which only exists inside split_sets and
# raises NameError on a fresh kernel.
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=outer_train_args.columns)
graph = graphviz.Source(dot_data)
graph.render("home_win")

'home_win.pdf'