In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import decomposition
from sklearn import datasets, linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import explained_variance_score, classification_report, r2_score, accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
rankings = pd.read_csv('dataset.csv')
rankings = rankings.drop(['Unnamed: 0'], axis=1)
rankings.head()
print(rankings.columns)

Index(['date', 'away_team', 'home_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral', 'score_difference',
       'home_score_difference_lag', 'home_avg_goal_diff',
       'away_score_difference_lag', 'away_avg_goal_diff', 'home_win',
       'wc_home_wins', 'wc_away_wins', 'home_rank', 'home_country_abrv',
       'home_total_points', 'home_previous_points', 'home_rank_change',
       'home_cur_year_avg', 'home_cur_year_avg_weighted', 'home_last_year_avg',
       'home_last_year_avg_weighted', 'home_two_year_ago_avg',
       'home_two_year_ago_weighted', 'home_three_year_ago_avg',
       'home_three_year_ago_weighted', 'home_confederation', 'away_rank',
       'away_country_abrv', 'away_total_points', 'away_previous_points',
       'away_rank_change', 'away_cur_year_avg', 'away_cur_year_avg_weighted',
       'away_last_year_avg', 'away_last_year_avg_weighted',
       'away_two_year_ago_avg', 'away_two_year_ago_weighted',
       'away_three_year_ago_avg',

In [42]:
feature_columns_classification = ["home_score_difference_lag", "away_score_difference_lag",
                    "home_avg_goal_diff", "away_avg_goal_diff","rank_diff"]

In [43]:
#Classification data
Xtrain = rankings[(rankings['date'] < '2017-01-01')]
ytrain = Xtrain["home_win"]
Xtrain = Xtrain[feature_columns_classification]

Xtest = rankings[(rankings['date'] >= '2017-01-01')]
ytest = Xtest["home_win"]
Xtest = Xtest[feature_columns_classification]

print(len(Xtrain))
print(len(Xtest))

10533
473


In [7]:
rfc = RandomForestClassifier(max_depth=10, random_state=0)
rfc.fit(Xtrain, ytrain)
print(rfc.feature_importances_)

y_rfc = rfc.predict(Xtest)
print(sum(y_rfc == ytest) / len(Xtest))

[ 0.08282102  0.08173847  0.16168425  0.17501236  0.4987439 ]
0.5919661733615222


In [46]:
knn = KNeighborsClassifier(n_neighbors=100)
knn.fit(Xtrain, ytrain) 

y_knn = knn.predict(Xtest)
y_knn_prob = knn.predict_proba(Xtest)
print(sum(y_knn == ytest) / len(Xtest))

0.6046511627906976


In [44]:
knn_poly = make_pipeline(PolynomialFeatures(3), KNeighborsClassifier(n_neighbors=1000))
knn_poly.fit(Xtrain, ytrain)
y_knn2 = knn_poly.predict(Xtest)
print(sum(y_knn2 == ytest) / len(Xtest))

0.6004228329809725


In [10]:
# Regression data
Xtrain = rankings[(rankings['date'] < '2017-01-01')]
ytrain = Xtrain["home_score"]
Xtrain = Xtrain[feature_columns_classification]

Xtest = rankings[(rankings['date'] >= '2017-01-01')]
ytest = Xtest["home_score"]
Xtest = Xtest[feature_columns_classification]

print(len(Xtrain))
print(len(Xtest))

10533
473


In [11]:
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(Xtrain, ytrain)
y_regr = regr.predict(Xtest)

print(explained_variance_score(y_regr, ytest))
print(r2_score(ytest, y_regr))
print(sum(np.around(y_regr) == ytest) / len(Xtest))

-1.59034147855
0.274131132444
0.34460887949260044


In [69]:
feature_columns_statsmodels = ["home_score", "away_score", "home_score_difference_lag", "away_score_difference_lag",
                    "home_avg_goal_diff", "away_avg_goal_diff","rank_diff"]

# Regression data
Xtrain = rankings[(rankings['date'] < '2017-01-01')]
ytrain = Xtrain["home_score"]
Xtrain = Xtrain[feature_columns_statsmodels]

Xtest = rankings[(rankings['date'] >= '2017-01-01')]
ytest = Xtest["home_score"]
Xtest = Xtest[feature_columns_statsmodels]

In [48]:
# importing the tools required for the Poisson regression model
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import poisson


poisson_model_home = smf.glm(formula="home_score ~ home_score_difference_lag + away_score_difference_lag + home_avg_goal_diff + away_avg_goal_diff + rank_diff",
                        data=Xtrain,
                        family=sm.families.Poisson()).fit()

y_poisson_home = poisson_model_home.predict(Xtest)
print(sum(np.around(y_poisson_home) == ytest) / len(Xtest))


poisson_model_away = smf.glm(formula="away_score ~ home_score_difference_lag + away_score_difference_lag + home_avg_goal_diff + away_avg_goal_diff + rank_diff",
                        data=Xtrain,
                        family=sm.families.Poisson()).fit()

y_poisson_away = poisson_model_away.predict(Xtest)
print(sum(np.around(y_poisson_away) == ytest) / len(Xtest))

0.346723044397463
0.2832980972515856


In [54]:
team_pred = [[poisson.pmf(i, team_avg) for i in range(0, 4)] for team_avg in [y_poisson_home.iloc[-1], y_poisson_away.iloc[-1]]]
print("AVG home goal", y_poisson.iloc[-1])
goal_matrix = np.outer(np.array(team_pred[0]), np.array(team_pred[1]))

print("WIN", np.sum(np.tril(goal_matrix, -1)))
print("DRAW", np.sum(np.diag(goal_matrix)))
print("LOSE", np.sum(np.triu(goal_matrix, 1)))
print(y_knn_prob[-1])
print(y_knn[-1])

AVG home goal 1.56597375229
WIN 0.469357666824
DRAW 0.253513924567
LOSE 0.192736219254
[ 0.25  0.31  0.44]
1


In [83]:
print(np.diag(goal_matrix))

a = np.tril(goal_matrix, -1)
home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)

[ 0.08951706  0.118783    0.03940422  0.00580963]


In [101]:
dummyTest = Xtest.iloc[-10:]

game_winner = knn.predict(dummyTest.drop(['home_score', 'away_score'], axis=1))

game_predictions = []

i = 0
for index, row in dummyTest.iterrows():
    home_goals = poisson_model_home.predict(row)
    away_goals = poisson_model_away.predict(row)
    team_pred = [[poisson.pmf(idx, team_avg) for idx in range(0, 4)] for team_avg in [home_goals, away_goals]]
    goal_matrix = np.outer(np.array(team_pred[0]), np.array(team_pred[1]))
    if game_winner[i] == 1:
        a = np.tril(goal_matrix, -1)
        home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)
    elif game_winner[i] == 0:
        a = np.diag(goal_matrix)
        home_goals, away_goals = a.argmax() + 1, a.argmax() + 1
    else:
        a = np.triu(goal_matrix, 1)
        home_goals, away_goals = np.unravel_index(a.argmax(), a.shape)

    game_predictions.append([home_goals, away_goals])
    i += 1


In [112]:
print(pd.DataFrame(game_predictions, columns=["home_score", "away_score"]))
print(Xtest[["home_score", "away_score"]].iloc[-10:])

   home_score  away_score
0           0           1
1           3           0
2           2           0
3           1           0
4           3           0
5           1           0
6           0           1
7           1           0
8           1           0
9           1           0
       home_score  away_score
10996           1           2
10997           7           0
10998           2           1
10999           2           0
11000           2           1
11001           2           1
11002           1           0
11003           2           1
11004           4           0
11005           1           1


In [49]:
from itertools import combinations

opponents = ['First match \nagainst', 'Second match\n against', 'Third match\n against']



world_cup['points'] = 0
world_cup['total_prob'] = 0

for group in set(world_cup['Group']):
    print('___Starting group {}:___'.format(group))
    for home, away in combinations(world_cup.query('Group == "{}"'.format(group)).Team, 2):
        import pdb; pdb.set_trace()
        print("{} vs. {}: ".format(home, away), end='')
        row = pd.DataFrame(np.array([[np.nan] * len(feature_columns)]), columns=feature_columns)
        row['rank_home'] = world_cup_rankings.loc[home, 'rank']
        home_points = world_cup_rankings.loc[home, 'cur_year_avg']
        opp_rank = world_cup_rankings.loc[away, 'rank']
        opp_points = world_cup_rankings.loc[away, 'cur_year_avg']
        row['rank_difference'] = row['rank_home'] - opp_rank
        row['point_difference'] = home_points - opp_points
        
        home_win_prob = model.predict_proba(row)[:,1][0]
        world_cup.loc[home, 'total_prob'] += home_win_prob
        world_cup.loc[away, 'total_prob'] += 1-home_win_prob
        
        points = 0
        if home_win_prob <= 0.5 - margin:
            print("{} wins with {:.2f}".format(away, 1-home_win_prob))
            world_cup.loc[away, 'points'] += 3
        if home_win_prob > 0.5 - margin:
            points = 1
        if home_win_prob >= 0.5 + margin:
            points = 3
            world_cup.loc[home, 'points'] += 3
            print("{} wins with {:.2f}".format(home, home_win_prob))
        if points == 1:
            print("Draw")
            world_cup.loc[home, 'points'] += 1
            world_cup.loc[away, 'points'] += 1

___Starting group C:___
> <ipython-input-49-88e9bfa21f48>(14)<module>()
-> print("{} vs. {}: ".format(home, away), end='')
(Pdb) home
'France'
(Pdb) away
'Australia'
(Pdb) c
France vs. Australia: 

NameError: name 'world_cup_rankings' is not defined