In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

pd.set_option('display.max_columns', None)

In [2]:
def make_query(query, filepath='../database.sqlite'):
    """
    Execute a query on a SQLite database file and return the results as a pandas DataFrame.

    Parameters
    ----------
    query : str
        SQL query to be executed on the database file.
    filepath : str, optional
        Path to the database file, by default 'database.sqlite'.

    Returns
    -------
    pd.DataFrame
        A pandas DataFrame containing the results of the executed query.
    """
    with sqlite3.connect(filepath) as connection:
        cursor = connection.cursor()
        execution = cursor.execute(query)
        result = pd.DataFrame(execution.fetchall(), columns=[i[0] for i in execution.description])
        return result

match = make_query(""" SELECT * FROM match """)

## <font color='blue'><b>1. Probabilities Bookmakers</b></font>

In [3]:
betting = pd.DataFrame()
bookmaker_odds = ['WHH', 'WHD', 'WHA']
for odds in bookmaker_odds: betting[odds] = 100 / match[odds]
betting['total'] = betting.sum(axis=1)

display(betting)

Unnamed: 0,WHH,WHD,WHA,total
0,58.823529,30.303030,23.094688,112.221248
1,54.644809,30.303030,27.777778,112.725617
2,40.000000,30.769231,41.666667,112.435897
3,69.444444,26.666667,16.666667,112.777778
4,23.809524,29.411765,58.823529,112.044818
...,...,...,...,...
25974,,,,0.000000
25975,,,,0.000000
25976,,,,0.000000
25977,,,,0.000000


### Wat valt op?

We hebben de percentages berekend. Wat erg opvalt is dat als je de percentages bij elkaar opteld je een groter getal dan 100 krijgt. Dit kan niet want als je de kans uitdrukt in percentages zul je nooit een kans groter dan 100 kunnen krijgen.
Dit betekent dat dit dus niet echt kansen zijn. Het kan zo zijn dat sommige kansen wat opgeblazen zijn zodat de bookmakers misschien meer geld kunnen verdienen.

## <font color='blue'><b>2. Predict Bookmakers</b></font>

In [4]:
df = pd.read_csv('attributes_difference.csv')
df = pd.concat([betting, df], axis=1)
df = df.dropna(subset=['WHH', 'WHD', 'WHA'])

In [5]:
bookmaker_prediction = []
for row in df.itertuples():
    if row.WHD < row.WHH > row.WHA:
        bookmaker_prediction.append('win')
    elif row.WHH < row.WHA > row.WHD:
        bookmaker_prediction.append('loss')
    else:
        bookmaker_prediction.append('draw')
df['bookmaker_prediction'] = bookmaker_prediction

display(df)

Unnamed: 0,WHH,WHD,WHA,total,result,overall_rating_difference,potential_difference,crossing_difference,finishing_difference,heading_accuracy_difference,short_passing_difference,volleys_difference,dribbling_difference,curve_difference,free_kick_accuracy_difference,long_passing_difference,ball_control_difference,acceleration_difference,sprint_speed_difference,agility_difference,reactions_difference,balance_difference,shot_power_difference,jumping_difference,stamina_difference,strength_difference,long_shots_difference,aggression_difference,interceptions_difference,positioning_difference,vision_difference,penalties_difference,marking_difference,standing_tackle_difference,sliding_tackle_difference,gk_diving_difference,gk_handling_difference,gk_kicking_difference,gk_positioning_difference,gk_reflexes_difference,bookmaker_prediction
0,58.823529,30.303030,23.094688,112.221248,draw,0.836364,1.383333,0.069697,-0.604545,2.593939,-0.957576,-1.398485,0.934848,-1.537879,-2.657576,-0.690909,1.592424,2.884848,2.962121,-1.525758,1.257576,0.204545,-0.769697,3.798485,-0.440909,2.996970,-1.174242,5.187879,1.350000,-0.486364,-2.584848,-1.586364,1.863636,1.953030,2.421212,-1.330303,-2.795455,-8.098485,-2.886364,-2.348485,win
1,54.644809,30.303030,27.777778,112.725617,draw,-0.554545,-0.177273,-2.042424,-6.386364,-0.471212,-0.592424,-7.427273,-4.689394,-6.443939,-5.119697,-1.051515,0.113636,0.854545,1.746970,-0.148485,1.095455,2.134848,-2.651515,4.457576,-1.477273,-1.783333,-8.593939,1.337879,0.843939,-5.078788,-0.212121,-5.295455,1.630303,1.224242,1.700000,-1.650000,-2.139394,-8.183333,-2.074242,-2.340909,win
2,40.000000,30.769231,41.666667,112.435897,win,-1.304545,-1.727273,-2.340909,1.007576,1.662121,-0.006061,-4.340909,-2.875758,-7.104545,-1.687879,-0.634848,-0.683333,-1.336364,0.737879,-3.033333,-0.772727,-0.086364,-2.904545,0.598485,-1.443939,-0.930303,-3.893939,2.474242,2.318182,0.380303,1.815152,-0.819697,1.513636,2.259091,2.087879,-1.880303,-1.883333,-3.345455,-0.862121,-2.889394,loss
3,69.444444,26.666667,16.666667,112.777778,loss,1.987121,1.738636,3.522727,-0.628788,7.068939,2.479545,-3.802273,0.364394,2.046212,-0.748485,3.729545,2.359848,0.815152,0.881818,-0.889394,3.115152,3.377273,2.496212,3.509091,3.254545,5.178788,-2.552273,6.603788,4.287879,1.973485,2.832576,2.268182,3.721970,5.809091,6.203788,-1.730303,-2.219697,-6.884848,-1.578788,-3.271970,win
4,23.809524,29.411765,58.823529,112.044818,loss,-4.192424,-3.103030,-4.975000,-5.484848,-3.889394,-6.358333,-16.517382,-6.375000,-13.791204,-3.639394,-5.990152,-4.550000,-2.343182,-0.792424,-8.065530,-1.187879,-9.802694,-5.662121,-4.789815,-5.181818,0.752273,-4.948485,1.717424,-1.347727,-7.248485,-5.824327,-4.725758,-0.849242,-1.114394,0.294655,-1.688636,-1.321212,-3.396212,0.259091,-1.914394,loss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24552,61.728395,30.303030,14.285714,106.317140,draw,-5.777964,-8.200703,-6.806813,-6.907349,-6.263478,-7.533182,-8.370592,-6.603655,-2.392316,-2.692100,-3.144065,-7.144078,-4.978793,-5.281524,-4.384167,-9.192021,-1.049178,-4.133189,-1.021323,-3.580454,-0.104411,-0.972453,-4.178659,-6.799751,-10.141172,-6.318919,-1.712699,-5.185876,-7.090202,-4.447557,-0.150467,-0.249365,-0.066923,-1.216418,-0.685389,win
24553,42.016807,32.258065,32.258065,106.532936,draw,-4.529412,-4.272727,-4.473262,-0.224599,-7.764706,-4.882353,-1.802139,-4.243316,1.673797,2.839572,-3.962567,-5.868984,-2.518717,-3.387701,-0.505348,-7.751337,1.016043,-5.034759,-4.470588,-6.938503,-7.711230,-0.171123,-7.139037,-4.283422,-8.502674,-5.673797,-0.069519,-3.374332,-5.147059,-3.152406,0.877005,-1.275401,-0.644385,-2.673797,-0.438503,win
24554,63.694268,28.571429,14.285714,106.551410,loss,-6.786096,-10.721925,-6.558824,-0.767380,-6.171123,-7.986631,-5.911765,-5.096257,-0.836898,0.211230,-4.807487,-7.377005,-2.941176,-4.133690,-5.008021,-9.109626,0.283422,-3.513369,-4.518717,-4.780749,-2.810160,0.553476,-9.334225,-7.315508,-7.890374,-9.847594,1.109626,-3.711230,-6.173797,-4.754011,-0.449198,-0.195187,-0.171123,-2.553476,-1.620321,win
24555,41.666667,32.258065,32.258065,106.182796,loss,-3.609626,-5.791444,-1.588235,-4.890374,-4.326203,-4.628342,-6.165775,-3.917112,-1.203209,2.812834,-2.740642,-4.631016,0.034759,-1.962567,-0.946524,-6.890374,1.970588,-5.545455,-6.021390,-2.232620,-3.181818,-0.350267,-5.705882,-3.548128,-8.040107,-6.165775,3.112299,-2.580214,-3.794118,-0.614973,-2.069519,0.906417,-1.716578,-1.582888,-2.438503,win


## <font color='blue'><b>3. Voorspellingen van bookmaker en model</b></font>

In [6]:
not_features = list(betting.columns) + ['result', 'bookmaker_prediction']

X = df.drop(not_features, axis=1)
y = df['result']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

knn = KNeighborsClassifier()
param_grid = {'n_neighbors': range(1, 31), 'weights': ['uniform', 'distance']}

grid_search_knn = GridSearchCV(knn, param_grid, cv=5)
grid_search_knn.fit(X_train, y_train)

df['model_prediction'] = grid_search_knn.predict(X)

display(df)

Unnamed: 0,WHH,WHD,WHA,total,result,overall_rating_difference,potential_difference,crossing_difference,finishing_difference,heading_accuracy_difference,short_passing_difference,volleys_difference,dribbling_difference,curve_difference,free_kick_accuracy_difference,long_passing_difference,ball_control_difference,acceleration_difference,sprint_speed_difference,agility_difference,reactions_difference,balance_difference,shot_power_difference,jumping_difference,stamina_difference,strength_difference,long_shots_difference,aggression_difference,interceptions_difference,positioning_difference,vision_difference,penalties_difference,marking_difference,standing_tackle_difference,sliding_tackle_difference,gk_diving_difference,gk_handling_difference,gk_kicking_difference,gk_positioning_difference,gk_reflexes_difference,bookmaker_prediction,model_prediction
0,58.823529,30.303030,23.094688,112.221248,draw,0.836364,1.383333,0.069697,-0.604545,2.593939,-0.957576,-1.398485,0.934848,-1.537879,-2.657576,-0.690909,1.592424,2.884848,2.962121,-1.525758,1.257576,0.204545,-0.769697,3.798485,-0.440909,2.996970,-1.174242,5.187879,1.350000,-0.486364,-2.584848,-1.586364,1.863636,1.953030,2.421212,-1.330303,-2.795455,-8.098485,-2.886364,-2.348485,win,win
1,54.644809,30.303030,27.777778,112.725617,draw,-0.554545,-0.177273,-2.042424,-6.386364,-0.471212,-0.592424,-7.427273,-4.689394,-6.443939,-5.119697,-1.051515,0.113636,0.854545,1.746970,-0.148485,1.095455,2.134848,-2.651515,4.457576,-1.477273,-1.783333,-8.593939,1.337879,0.843939,-5.078788,-0.212121,-5.295455,1.630303,1.224242,1.700000,-1.650000,-2.139394,-8.183333,-2.074242,-2.340909,win,win
2,40.000000,30.769231,41.666667,112.435897,win,-1.304545,-1.727273,-2.340909,1.007576,1.662121,-0.006061,-4.340909,-2.875758,-7.104545,-1.687879,-0.634848,-0.683333,-1.336364,0.737879,-3.033333,-0.772727,-0.086364,-2.904545,0.598485,-1.443939,-0.930303,-3.893939,2.474242,2.318182,0.380303,1.815152,-0.819697,1.513636,2.259091,2.087879,-1.880303,-1.883333,-3.345455,-0.862121,-2.889394,loss,win
3,69.444444,26.666667,16.666667,112.777778,loss,1.987121,1.738636,3.522727,-0.628788,7.068939,2.479545,-3.802273,0.364394,2.046212,-0.748485,3.729545,2.359848,0.815152,0.881818,-0.889394,3.115152,3.377273,2.496212,3.509091,3.254545,5.178788,-2.552273,6.603788,4.287879,1.973485,2.832576,2.268182,3.721970,5.809091,6.203788,-1.730303,-2.219697,-6.884848,-1.578788,-3.271970,win,win
4,23.809524,29.411765,58.823529,112.044818,loss,-4.192424,-3.103030,-4.975000,-5.484848,-3.889394,-6.358333,-16.517382,-6.375000,-13.791204,-3.639394,-5.990152,-4.550000,-2.343182,-0.792424,-8.065530,-1.187879,-9.802694,-5.662121,-4.789815,-5.181818,0.752273,-4.948485,1.717424,-1.347727,-7.248485,-5.824327,-4.725758,-0.849242,-1.114394,0.294655,-1.688636,-1.321212,-3.396212,0.259091,-1.914394,loss,loss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24552,61.728395,30.303030,14.285714,106.317140,draw,-5.777964,-8.200703,-6.806813,-6.907349,-6.263478,-7.533182,-8.370592,-6.603655,-2.392316,-2.692100,-3.144065,-7.144078,-4.978793,-5.281524,-4.384167,-9.192021,-1.049178,-4.133189,-1.021323,-3.580454,-0.104411,-0.972453,-4.178659,-6.799751,-10.141172,-6.318919,-1.712699,-5.185876,-7.090202,-4.447557,-0.150467,-0.249365,-0.066923,-1.216418,-0.685389,win,loss
24553,42.016807,32.258065,32.258065,106.532936,draw,-4.529412,-4.272727,-4.473262,-0.224599,-7.764706,-4.882353,-1.802139,-4.243316,1.673797,2.839572,-3.962567,-5.868984,-2.518717,-3.387701,-0.505348,-7.751337,1.016043,-5.034759,-4.470588,-6.938503,-7.711230,-0.171123,-7.139037,-4.283422,-8.502674,-5.673797,-0.069519,-3.374332,-5.147059,-3.152406,0.877005,-1.275401,-0.644385,-2.673797,-0.438503,win,loss
24554,63.694268,28.571429,14.285714,106.551410,loss,-6.786096,-10.721925,-6.558824,-0.767380,-6.171123,-7.986631,-5.911765,-5.096257,-0.836898,0.211230,-4.807487,-7.377005,-2.941176,-4.133690,-5.008021,-9.109626,0.283422,-3.513369,-4.518717,-4.780749,-2.810160,0.553476,-9.334225,-7.315508,-7.890374,-9.847594,1.109626,-3.711230,-6.173797,-4.754011,-0.449198,-0.195187,-0.171123,-2.553476,-1.620321,win,loss
24555,41.666667,32.258065,32.258065,106.182796,loss,-3.609626,-5.791444,-1.588235,-4.890374,-4.326203,-4.628342,-6.165775,-3.917112,-1.203209,2.812834,-2.740642,-4.631016,0.034759,-1.962567,-0.946524,-6.890374,1.970588,-5.545455,-6.021390,-2.232620,-3.181818,-0.350267,-5.705882,-3.548128,-8.040107,-6.165775,3.112299,-2.580214,-3.794118,-0.614973,-2.069519,0.906417,-1.716578,-1.582888,-2.438503,win,draw


In [7]:
bookmaker_count = 0
model_count = 0
for row in df.loc[X_test.index].itertuples():
    if row.result == row.bookmaker_prediction:
        bookmaker_count += 1
    if row.result == row.model_prediction:
        model_count += 1

model_count / len(X_test), bookmaker_count / len(X_test)


(0.5021040974529347, 0.4159468438538206)

## <font color='blue'><b>4. Zou je willen gokken?</b></font>