# Logistic Regression
*Daniel Öman & Xander Yoon*

Import Dependencies

In [243]:
import numpy as np
import pandas as pd
import sqlite3
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


Read database

In [117]:
path = "database.sqlite"
connection = sqlite3.connect(path)

# Data Cleaning and Feature Engineering

In [118]:
# Read Match data
matches = pd.read_sql("SELECT * FROM Match;", connection)

In [119]:
for c in matches.columns:
    print(c)

id
country_id
league_id
season
stage
date
match_api_id
home_team_api_id
away_team_api_id
home_team_goal
away_team_goal
home_player_X1
home_player_X2
home_player_X3
home_player_X4
home_player_X5
home_player_X6
home_player_X7
home_player_X8
home_player_X9
home_player_X10
home_player_X11
away_player_X1
away_player_X2
away_player_X3
away_player_X4
away_player_X5
away_player_X6
away_player_X7
away_player_X8
away_player_X9
away_player_X10
away_player_X11
home_player_Y1
home_player_Y2
home_player_Y3
home_player_Y4
home_player_Y5
home_player_Y6
home_player_Y7
home_player_Y8
home_player_Y9
home_player_Y10
home_player_Y11
away_player_Y1
away_player_Y2
away_player_Y3
away_player_Y4
away_player_Y5
away_player_Y6
away_player_Y7
away_player_Y8
away_player_Y9
away_player_Y10
away_player_Y11
home_player_1
home_player_2
home_player_3
home_player_4
home_player_5
home_player_6
home_player_7
home_player_8
home_player_9
home_player_10
home_player_11
away_player_1
away_player_2
away_player_3
away_player_4
a

In [120]:
players = pd.read_sql("SELECT * FROM Player;", connection)

How to get a player in a matches starting lineup

In [121]:
players[players["player_api_id"] == matches["away_player_10"][25977]]

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
1771,1774,92252,Christian Schneuwly,184285,1988-02-07 00:00:00,177.8,161


In [122]:
teams = pd.read_sql("SELECT * FROM Team_Attributes;", connection)

In [123]:
for c in teams.columns:
    print(c)

id
team_fifa_api_id
team_api_id
date
buildUpPlaySpeed
buildUpPlaySpeedClass
buildUpPlayDribbling
buildUpPlayDribblingClass
buildUpPlayPassing
buildUpPlayPassingClass
buildUpPlayPositioningClass
chanceCreationPassing
chanceCreationPassingClass
chanceCreationCrossing
chanceCreationCrossingClass
chanceCreationShooting
chanceCreationShootingClass
chanceCreationPositioningClass
defencePressure
defencePressureClass
defenceAggression
defenceAggressionClass
defenceTeamWidth
defenceTeamWidthClass
defenceDefenderLineClass


### Dataset preprocessing
For the binary classification problem, we will use "home team win" as the positive label and "home team draw/loss" as the negative label. We can extend this to three labels for a multi-class classification problem. Let's introduce this label to our dataset:

In [146]:
binary_class_label = "home_team_win"

In [147]:
matches[binary_class_label] = np.where(matches["home_team_goal"] - matches["away_team_goal"] > 0, 1, 0)

In [148]:
# % of home wins in the dataset
num_home_wins = np.count_nonzero(matches["home_team_win"] == 1)
print(f"Percentage of home wins: {round(num_home_wins / matches.shape[0] * 100, 2)}%")

Percentage of home wins: 45.87%


### Helper functions

In [226]:
def train_test(table, test_size = 0.1):
    """
    Returns (train, test) of matches with a default 10% split
    """
    test = table.sample(frac = test_size, random_state = 200)
    train = table.drop(test.index)
    return train, test

# Model Training
1. Betting Odds
2. Team Match History
3. FIFA Attributes
4. Shape of formation

## 1. Betting Odds

We will use a logistic regression model trained on betting odds to predict match outcomes of a test dataset.

In [227]:
odds_columns = ['B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'LBH', 'LBD', 'LBA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'SJH', 'SJD', 'SJA', 'VCH', 'VCD', 'VCA', 'GBH', 'GBD', 'GBA', 'BSH', 'BSD', 'BSA']

In [228]:
# Clean Data
odds_data = matches[odds_columns + [binary_class_label]].dropna()

In [235]:
# Train model with 10% data split
odds_train, odds_test = train_test(odds_data)
odds_lr = LogisticRegression(solver="newton-cholesky")
odds_lr.fit(odds_train[odds_columns], odds_train[binary_class_label])

In [236]:
# Evaluate using RMSE
odds_predict = odds_lr.predict(odds_test[odds_columns])
mean_squared_error(odds_test[binary_class_label], odds_predict)

0.30434782608695654

QUESTION: What is the best test/train ratio? Let's investigate this using the odds data

In [237]:
def test_split_rmse(data, test_size, features):
    """
    Returns RMSE of training and testing odds data using the test_size split
    """
    train, test = train_test(data, test_size)
    lr = LogisticRegression(solver="newton-cholesky")
    lr.fit(train[features], train[binary_class_label])
    pred = lr.predict(test[features])
    return mean_squared_error(test[binary_class_label], pred)

In [242]:
for i in range(10, 50):
    test_size = i / 100
    rmse = test_split_rmse(odds_data, test_size, odds_columns)
    print(f"TEST SIZE = {round(test_size * 100)}% : RMSE = {round(rmse, 4)}")

TEST SIZE = 10% : RMSE = 0.3043
TEST SIZE = 11% : RMSE = 0.3102
TEST SIZE = 12% : RMSE = 0.2991
TEST SIZE = 13% : RMSE = 0.3101
TEST SIZE = 14% : RMSE = 0.3212
TEST SIZE = 15% : RMSE = 0.3269
TEST SIZE = 16% : RMSE = 0.3288
TEST SIZE = 17% : RMSE = 0.3369
TEST SIZE = 18% : RMSE = 0.3327
TEST SIZE = 19% : RMSE = 0.3282
TEST SIZE = 20% : RMSE = 0.3303
TEST SIZE = 21% : RMSE = 0.3212
TEST SIZE = 22% : RMSE = 0.3234
TEST SIZE = 23% : RMSE = 0.3186
TEST SIZE = 24% : RMSE = 0.3192
TEST SIZE = 25% : RMSE = 0.328
TEST SIZE = 26% : RMSE = 0.325
TEST SIZE = 27% : RMSE = 0.3306
TEST SIZE = 28% : RMSE = 0.3342
TEST SIZE = 29% : RMSE = 0.3417
TEST SIZE = 30% : RMSE = 0.3422
TEST SIZE = 31% : RMSE = 0.3407
TEST SIZE = 32% : RMSE = 0.3424
TEST SIZE = 33% : RMSE = 0.3454
TEST SIZE = 34% : RMSE = 0.3479
TEST SIZE = 35% : RMSE = 0.3534
TEST SIZE = 36% : RMSE = 0.3478
TEST SIZE = 37% : RMSE = 0.3471
TEST SIZE = 38% : RMSE = 0.3457
TEST SIZE = 39% : RMSE = 0.3423
TEST SIZE = 40% : RMSE = 0.3412
TEST SIZE 

## 2. Team Match History

## 3. FIFA Attributes

## 4. Formation shape
Use unsupervised learning to categorize formations into narrow/wide, defensive/attacking, output "formation width index" and "formation aggressiveness index"