In [1]:
import pandas as pd
from sklearn import tree, neighbors, neural_network
from sklearn import metrics
import numpy as np
import graphviz 

In [2]:
def split_sets(df, frac):
    training = df.sample(frac = frac)
    training_args = training.drop(after_match_attributes, axis=1)
    test = df[~df.isin(training)].dropna()
    test_args = test.drop(after_match_attributes, axis=1)
    training_answers = training['home_team_goal'] > training['away_team_goal']
    test_answers  = test['home_team_goal'] > test['away_team_goal']
    return training, training_args, training_answers, test, test_args, test_answers

df_match = pd.read_csv('../Project1/dataset.csv')
after_match_attributes = ['home_team_goal', 'away_team_goal', 'possession_home', 'shoton_home', 'shoton_away', 
                      'shotoff_home', 'shotoff_away', 'corner_home', 'corner_away', 'cross_home', 'cross_away', 
                      'foulcommit_home', 'foulcommit_away', 'rcard_home', 'rcard_away', 'ycard_home', 'ycard_away', 
                      'throwin_home', 'throwin_away']
outer_train, outer_train_args, outer_train_answers, outer_test, outer_test_args, outer_test_answers = split_sets(df_match, 0.9)
inner_train, inner_train_args, inner_train_answers, inner_test, inner_test_args, inner_test_answers = split_sets(outer_train, 0.9)

In [3]:
def do_prediction(clf, train_args, train_answers, test_args, test_answers):
    clf.fit(train_args, train_answers)
    predictions = clf.predict(test_args)
    confusion = metrics.confusion_matrix(test_answers, predictions)
    return (confusion[0, 0] + confusion[1, 1]) / np.sum(confusion)

In [15]:
# Decision tree
correct_tree = np.zeros(25)
for max_depth in range(1, len(correct_tree)):
    correct_tree[max_depth] = do_prediction(tree.DecisionTreeClassifier(max_depth = max_depth), inner_train_args, inner_train_answers, inner_test_args, inner_test_answers)
best_depth = np.argmax(correct_tree)
clf_tree = tree.DecisionTreeClassifier(max_depth = best_depth)
outer_correct = do_prediction(clf_tree, outer_train_args, outer_train_answers, outer_test_args, outer_test_answers)
print (best_depth, correct_tree[best_depth], outer_correct)
dot_data = tree.export_graphviz(clf_tree, out_file=None, feature_names=outer_train_args.columns)
graph = graphviz.Source(dot_data)
graph.render("tree")

6 0.688372093023 0.627615062762


'tree.pdf'

In [16]:
# KNN
correct_knn = np.zeros(10)
for k in range(1, len(correct_knn)):
    correct_knn[k] = do_prediction(neighbors.KNeighborsClassifier(n_neighbors = k), inner_train_args, inner_train_answers, inner_test_args, inner_test_answers)
best_k = np.argmax(correct_knn)
clf_knn = neighbors.KNeighborsClassifier(n_neighbors = best_k)
outer_correct = do_prediction(clf_knn, outer_train_args, outer_train_answers, outer_test_args, outer_test_answers)
print (best_k, correct_knn[best_k], outer_correct)

9 0.632558139535 0.598326359833


In [17]:
# ANN
correct_ann = np.zeros(10)
n_units = 2 ** (np.arange(len(correct_ann)))
for i in range(len(correct_ann)):
    correct_ann[i] = do_prediction(neural_network.MLPClassifier(hidden_layer_sizes = n_units[i]), outer_train_args, outer_train_answers, outer_test_args, outer_test_answers)
best_i = np.argmax(correct_ann)
clf_ann = neural_network.MLPClassifier(hidden_layer_sizes = n_units[best_i])
outer_correct = do_prediction(clf_ann, outer_train_args, outer_train_answers, outer_test_args, outer_test_answers)
print (n_units[best_i], correct_ann[best_i], outer_correct)

1 0.682008368201 0.510460251046


In [18]:
print (correct_tree)
print (correct_knn)
print (correct_ann)

[ 0.          0.6372093   0.6372093   0.61395349  0.6372093   0.66511628
  0.68837209  0.66976744  0.6         0.65116279  0.61860465  0.60930233
  0.60465116  0.59069767  0.6372093   0.61395349  0.57674419  0.59069767
  0.60930233  0.6372093   0.59534884  0.59069767  0.61395349  0.59534884
  0.55348837]
[ 0.          0.50697674  0.59069767  0.57674419  0.61860465  0.5627907
  0.6         0.60930233  0.61860465  0.63255814]
[ 0.68200837  0.61924686  0.58577406  0.63598326  0.63598326  0.61506276
  0.61506276  0.66108787  0.62343096  0.53138075]
