In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [15]:
# Load the match table; the first CSV column is used as the row index.
data = pd.read_csv('data_with_points.csv', index_col=0)
print(data.columns)
data.head()

Index(['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'HS', 'AS', 'HST',
       'AST', 'HTHG', 'HTAG', 'HF', 'AF', 'FTR', 'HTR', 'HY', 'AY', 'HR', 'AR',
       'WHH', 'WHD', 'WHA', 'Season', 'watch', 'AGS', 'AGA', 'AYC', 'ARC',
       'AWW', 'HGS', 'HGA', 'HYC', 'HRC', 'HWW', 'HP', 'AP'],
      dtype='object')


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,HS,AS,HST,AST,HTHG,...,AYC,ARC,AWW,HGS,HGA,HYC,HRC,HWW,HP,AP
8,2000-08-26,Arsenal,Charlton,5,3,18,7,9,4,1,...,1.0,0.5,0.5,1.0,0.5,1.5,1.0,0.0,3,0
10,2000-08-26,Everton,Derby,2,2,12,7,9,4,2,...,2.0,0.0,1.0,1.5,1.0,2.5,0.0,0.0,3,0
11,2000-08-26,Ipswich,Sunderland,1,0,14,9,5,3,0,...,3.0,0.0,0.5,1.0,2.0,0.5,0.0,0.5,1,0
12,2000-08-26,Man City,Coventry,1,2,14,9,5,8,0,...,4.0,0.5,0.5,2.0,3.0,2.5,0.0,1.0,3,3
14,2000-08-26,Newcastle,Tottenham,2,0,15,10,6,2,1,...,0.5,0.0,0.5,1.5,2.0,0.5,0.5,0.5,3,1


In [16]:
# Keep only the modelling columns, reverse the row order, and renumber the
# rows from 0 (the old index labels are discarded rather than kept as a column).
model_columns = ['Date','WHH','WHD','WHA','HWW','AWW','HP','AP','watch']
numeric_columns = model_columns[1:]  # every kept column except 'Date'

selected = data[model_columns]
data = selected.reindex(index=selected.index[::-1]).reset_index(drop=True)

# Coerce dtypes: 'Date' becomes datetime64, everything else numeric.
data['Date'] = pd.to_datetime(data['Date'])
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric)

In [17]:
# Keep matches strictly between the two cut-off dates (both bounds exclusive).
after_start = data['Date'] > '2010-08-01'
before_end = data['Date'] < '2018-05-13'
recent = data[after_start & before_end]

In [18]:
# Feature matrix and target: every kept numeric column except 'watch' is a
# feature; 'watch' is the label the classifiers predict.
feature_columns = ['WHH','WHD','WHA','HWW','AWW','HP','AP']
X = recent[feature_columns]
y = recent['watch']

In [19]:
# 70% of the rows go to training; the held-back 30% is then halved into a
# validation split and a test split (15% of the rows each).
X_train, X_vt, y_train, y_vt = train_test_split(
    X, y,
    test_size=0.3,
    random_state=101,
)
X_validate, X_test, y_validate, y_test = train_test_split(
    X_vt, y_vt,
    test_size=0.5,
    random_state=101,
)

In [20]:
# Base learners, all seeded with the same random_state for reproducibility.
dtc_clf = DecisionTreeClassifier(random_state=101)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=101)
# NOTE(review): per the scikit-learn SVC docs, random_state only affects the
# probability estimates, so it should have no effect with the default
# probability=False — confirm against the installed version.
svm_clf = SVC(kernel='rbf', C=100, gamma=1, random_state=101)

In [21]:
# Majority-vote ensemble over the three base learners ('hard' voting counts
# predicted class labels rather than averaging probabilities).
ensemble_members = [('dt', dtc_clf), ('rf', rnd_clf), ('sv', svm_clf)]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='hard')

In [22]:
# Fit each model on the training split and report, on the validation split,
# its confusion matrix (rows = true label, columns = predicted label) and the
# share of predicted positives that are truly positive: TP / (FP + TP).
candidate_models = (dtc_clf, rnd_clf, svm_clf, voting_clf)
for clf in candidate_models:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_validate)
    print(type(clf).__name__)
    cf = confusion_matrix(y_validate, y_pred)
    print(cf)
    false_pos, true_pos = cf[0][1], cf[1][1]
    print(true_pos / (false_pos + true_pos))

DecisionTreeClassifier
[[175  98]
 [ 94  56]]
0.36363636363636365
RandomForestClassifier
[[238  35]
 [117  33]]
0.4852941176470588
SVC
[[227  46]
 [120  30]]
0.39473684210526316
VotingClassifier
[[239  34]
 [118  32]]
0.48484848484848486


In [23]:
# Refit the random forest on train + validation combined, then score it once
# on the held-out test split.
# pd.concat replaces DataFrame.append / Series.append, which were deprecated in
# pandas 1.4 and removed in pandas 2.0 (where .append raises AttributeError).
X_train_val = pd.concat([X_train, X_validate])
y_train_val = pd.concat([y_train, y_validate])
rnd_clf.fit(X_train_val, y_train_val)

y_pred = rnd_clf.predict(X_test)
cf = confusion_matrix(y_test, y_pred)
print(cf)
# Share of predicted positives that are truly positive: TP / (FP + TP).
print(cf[1][1] / (cf[0][1] + cf[1][1]))

[[240  41]
 [123  19]]
0.31666666666666665


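This is the best test-set precision of a classifier so far (19 of the 60 games it flagged as exciting really were, about 32%), and encouragingly it predicts about 14% of games (60 of the 423 test matches in the confusion matrix above) to be exciting to watch. So each week in the Premiership it will likely predict at least one game as interesting to watch.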