In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found there.
# Note: `dirname` and `filename` stay defined at module level after the loop and
# hold the values of the last file that was visited.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# General

With the upcoming European Championship 2021, I was curious how well a model can predict the outcome of a soccer match.
While looking for training datasets I found this [amazing dataset](https://www.kaggle.com/martj42/international-football-results-from-1872-to-2017) by Mart Jürisoo which contains national soccer matches back to 1872.

**Disclaimer:** I am just getting started with Kaggle and this is my first notebook. I would really appreciate recommendations for improvement. Any tips are welcome! Thanks! :)

In [None]:
# Load the match results.
# The input directory can hold more than one file and os.walk() visits them in
# arbitrary order, so the results table is looked up by name instead of reading
# whichever file the listing loop happened to visit last.
results_paths = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
    if name == 'results.csv'
)
# Fall back to the last file seen by the listing loop (the previous behaviour)
# when no file called results.csv exists.
data_path = results_paths[0] if results_paths else os.path.join(dirname, filename)
data = pd.read_csv(data_path)

# Data preparation
For feature selection I decided to drop the location of the match and the "neutral" column, which indicates whether the game was played at a neutral location.

In [None]:
# Drop matches without a final score.
# dropna() is used without inplace=True: the in-place form returns None (so the
# assigned name held nothing useful) and it mutated the raw `data` frame.
dropped_rows_data = data.dropna(axis=0, subset=['home_score', 'away_score'])

# Drop the columns that are not used as features (match location and neutral flag)
dropped_data = dropped_rows_data.drop(["city", "country", "neutral"], axis=1)

dropped_data.head()

I considered differentiating between friendly matches and competitive matches (independent of the tournament though). But after trying it, this did not have any benefit.

In [None]:
# The tournament column was once tried as a boolean "competitive" feature
# (False for "Friendly", True for everything else), e.g.
#   dropped_data["competitive"] = dropped_data.tournament != "Friendly"
# That experiment is disabled; the tournament column is simply removed.
competitive_data = dropped_data.drop(columns=["tournament"])

Predicting the exact score seems to be a pretty ambitious goal. So in addition to that I wanted to try a model which only predicts the tendency of the game, i.e. whether the home team wins, ties or loses. Because the models are not able to handle strings, I labeled the outcomes:
* 0: Tie
* 1: Win
* -1: Loss


In [None]:
# Add evaluation column "game_result_home": 0 = tie, -1 = home loss, 1 = home win.
# np.select evaluates the comparisons on whole columns at once instead of looping
# over the rows with iterrows(); default=1 reproduces the former "else" branch.
home_goals = competitive_data.home_score.to_numpy()
away_goals = competitive_data.away_score.to_numpy()
game_result = np.select(
    [home_goals == away_goals, home_goals < away_goals],
    [0, -1],
    default=1,
)

# The array is assigned positionally, one value per row of competitive_data
competitive_data["game_result_home"] = game_result

competitive_data.head()

To get the date into a type which can be handled by the model I split the date column into 3 columns, containing the year, the month and the day.

In [None]:
# Split Date
# str.split(expand=True) returns the "-"-separated parts directly as a DataFrame
# with columns 0, 1, 2, ... and avoids the slow .apply(pd.Series), which builds
# one Series object per row.
date = competitive_data.date
split_date = date.str.split("-", expand=True)
split_date.head()

In [None]:
# Columns 0, 1 and 2 of split_date hold the year, month and day parts as strings
year = split_date[0]
month = split_date[1]
day = split_date[2]

# assign() returns a new frame. The previous plain assignment
# (processed_date_data = competitive_data) was only an alias, so adding the
# numeric date columns silently modified competitive_data as well.
processed_date_data = competitive_data.assign(
    year=pd.to_numeric(year),
    month=pd.to_numeric(month),
    day=pd.to_numeric(day),
).drop("date", axis=1)

processed_date_data.head()

To transform the home team and away team columns into a form the model can use, I am using the OneHotEncoder instead of the LabelEncoder. This generates one column per team, which is the best fit for this categorical data. 

In [None]:
# Collect the names of all columns stored with the object dtype
object_cols = [
    column
    for column, dtype in processed_date_data.dtypes.items()
    if dtype == 'object'
]

print("Categorical variables:")
print(object_cols)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data.
# The dense-output switch was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so the new name is tried
# first and the old one is kept as a fallback for older installations.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols = pd.DataFrame(OH_encoder.fit_transform(processed_date_data[object_cols]))

# One-hot encoding removed index; put it back
OH_cols.index = processed_date_data.index

# The encoded columns are labelled 0, 1, 2, ... (integers). scikit-learn >= 1.2
# rejects frames whose column labels mix int and str, so use string labels.
OH_cols.columns = OH_cols.columns.astype(str)

# Remove categorical columns (will replace with one-hot encoding)
numerical_data = processed_date_data.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_data = pd.concat([numerical_data, OH_cols], axis=1)

OH_data.head()

# Getting training and testing data

The features to predict are the score for the home team, the away team and the general game result of the home team (tendency: win, loss, tie). 
I will try different models with different strategies and not all features will be predicted by one model.

In [None]:
# Target columns; every remaining column is used as a model input
prediction_features = ["home_score", "away_score", "game_result_home"]
X_data = OH_data.drop(columns=prediction_features)
y_data = OH_data[prediction_features]

For splitting the data into training and testing data I used the very common train_test_split function from sklearn.

In [None]:
from sklearn.model_selection import train_test_split

# Keep 80% of the matches for training and 20% for validation (fixed seed)
split_options = {"train_size": 0.8, "test_size": 0.2, "random_state": 0}
X_train, X_valid, y_train, y_valid = train_test_split(X_data, y_data, **split_options)


# Models

I used the Random Forest Classifier as a first model for all approaches

## First model

For the first model I decided to predict both the home and the away score with the same model.

In [None]:
from sklearn.ensemble import RandomForestClassifier

## Model 1 - a single forest that predicts both final scores at once
score_columns = ["home_score", "away_score"]
y_train_1 = y_train[score_columns]

model_classifier_1 = RandomForestClassifier(random_state=0, n_estimators=250)
model_classifier_1.fit(X_train, y_train_1)

# One row per validation match: [home score, away score]
preds_classifier_1 = model_classifier_1.predict(X_valid)

print(preds_classifier_1)

## Model 1 evaluation

Evaluating this model shows that the MAE for both the home team's and the away team's score is around 1, which means the predicted score is off by about one goal on average. 
Also, the model hits the exact score in 27.92% of the matches for the home score and in 36.67% for the away score. I cannot explain why there is such a big gap between the two. But getting every third to fourth score exactly right is actually a pretty decent result.

In [None]:
from sklearn.metrics import mean_absolute_error

# Evaluate Model 1

# Column 0 of the predictions is the home score, column 1 the away score
pred_classifier_home_scores_1 = list(preds_classifier_1[:, 0])
pred_classifier_away_scores_1 = list(preds_classifier_1[:, 1])

mae_home_scores_classifier_1 = mean_absolute_error(y_valid.home_score, pred_classifier_home_scores_1)
mae_away_scores_classifier_1 = mean_absolute_error(y_valid.away_score, pred_classifier_away_scores_1)

print("MAE home: ", mae_home_scores_classifier_1)
print("MAE away: ", mae_away_scores_classifier_1)

# Hit ratio home: share of matches whose predicted home score is exactly right
score = sum(
    1
    for predicted, actual in zip(pred_classifier_home_scores_1, y_valid.home_score)
    if predicted == actual
)
hit_ratio_classifier_home_1 = score / len(pred_classifier_home_scores_1)

# Hit ratio away: same measure for the away score
score = sum(
    1
    for predicted, actual in zip(pred_classifier_away_scores_1, y_valid.away_score)
    if predicted == actual
)
hit_ratio_classifier_away_1 = score / len(pred_classifier_away_scores_1)

home_score_1_str = f"Hit ratio for home score: {hit_ratio_classifier_home_1 * 100}%"
away_score_1_str = f"Hit ratio for away score: {hit_ratio_classifier_away_1 * 100}%"

print(home_score_1_str)
print(away_score_1_str)

## Model 2

Model 2 also predicts the end score of a game. But instead of using one model for both scores, here I am using a separate model for the home team's score and for the away team's score. My idea was that the optimization for the two different scores would then no longer cancel each other out.

In [None]:
## Model 2 - one forest per team: the home and away scores are predicted separately
forest_options = {"n_estimators": 250, "random_state": 0}
model_classifier_2_home = RandomForestClassifier(**forest_options)
model_classifier_2_away = RandomForestClassifier(**forest_options)

model_classifier_2_home.fit(X_train, y_train.home_score)
preds_classifier_2_home = model_classifier_2_home.predict(X_valid)

model_classifier_2_away.fit(X_train, y_train.away_score)
preds_classifier_2_away = model_classifier_2_away.predict(X_valid)

print(preds_classifier_2_home)
print(preds_classifier_2_away)

## Model 2 evaluation

Model 2 improved the results only slightly compared to model 1. The hit ratio went up by around 1 percentage point, which is probably negligible. So there is no huge jump in performance with this approach.

In [None]:
# Evaluate Model 2

mae_home_scores_classifier_2 = mean_absolute_error(y_valid.home_score, preds_classifier_2_home)
mae_away_scores_classifier_2 = mean_absolute_error(y_valid.away_score, preds_classifier_2_away)

print("MAE home: ", mae_home_scores_classifier_2)
print("MAE away: ", mae_away_scores_classifier_2)

# Hit ratio home: share of matches whose predicted home score is exactly right
score = sum(
    1
    for predicted, actual in zip(preds_classifier_2_home, y_valid.home_score)
    if predicted == actual
)
hit_ratio_classifier_home_2 = score / len(preds_classifier_2_home)

# Hit ratio away: same measure for the away score
score = sum(
    1
    for predicted, actual in zip(preds_classifier_2_away, y_valid.away_score)
    if predicted == actual
)
hit_ratio_classifier_away_2 = score / len(preds_classifier_2_away)

home_score_2_str = f"Hit ratio for home score: {hit_ratio_classifier_home_2 * 100}%"
away_score_2_str = f"Hit ratio for away score: {hit_ratio_classifier_away_2 * 100}%"

print(home_score_2_str)
print(away_score_2_str)

## Model 3

For this approach I used a slightly different method. I thought it might be easier to predict the tendency of the game instead of the exact result, i.e. to just figure out who is going to win instead of trying to predict the exact score. Therefore I only tried to predict the game_result_home column of the dataset.

In [None]:
## Model 3 - classify the match outcome (win, tie or loss) from the home team's view

model_classifier_3 = RandomForestClassifier(random_state=0, n_estimators=150)
model_classifier_3.fit(X_train, y_train.game_result_home)

# One predicted outcome label per validation match
preds_classifier_3 = model_classifier_3.predict(X_valid)

print(preds_classifier_3)

## Model 3 evaluation

This model has a hit ratio of 53.87%. But the result cannot be compared to those of model 1 and model 2, because this model only has 3 possible outcomes instead of (in theory) infinitely many when predicting the exact score. Approx. 54% correct predictions with 3 possible outcomes is at least better than a wild guess.

In [None]:
# Evaluate Model 3

# Share of validation matches whose predicted outcome (win / tie / loss of the
# home team) equals the actual outcome
score = sum(
    1
    for predicted, actual in zip(preds_classifier_3, y_valid.game_result_home)
    if predicted == actual
)
hit_ratio_classifier_3 = score / len(preds_classifier_3)

# This model predicts the game result, not the home score, so the label says so
# (it was previously copied from the score models and read "home score").
score_3_str = "Hit ratio for game result: " + str(hit_ratio_classifier_3 * 100) + "%"

print(score_3_str)

## Comparing all models

I wanted to compare all models. But because model 3 only predicts the tendency and is therefore not directly comparable to the previous models, I derived the tendency from the score predictions of model 1 and model 2 to get a better understanding of their performance.

The results for predicting the tendency of the game are:
* Model 1: 47.37%
* Model 2: 48.38%
* Model 3: 53.87%

So the third model is actually able to predict the tendency better than the models predicting the score (not by a lot, but by roughly 5.5 to 6.5 percentage points). But considering that the first two models provide an actual prediction of the score, their result for hitting at least the correct tendency of the game is not too bad.

In [None]:
def _tendency(home_goals, away_goals):
    """Return 0 for a tie, -1 for a home loss and 1 for a home win."""
    if home_goals == away_goals:
        return 0
    if home_goals < away_goals:
        return -1
    return 1


def _hit_ratio(predicted, actual):
    """Return the fraction of positions at which predicted equals actual."""
    hits = sum(1 for p, a in zip(predicted, actual) if p == a)
    return hits / len(predicted)


# Boolean masks of the actual outcomes (not used further in this cell)
wins_valid = y_valid.home_score > y_valid.away_score
ties_valid = y_valid.home_score == y_valid.away_score
losses_valid = y_valid.home_score < y_valid.away_score

## Get wins, losses, ties
# Model 1: derive the tendency from the predicted scores, then compare it with
# the actual tendency
model_1_general_game_pred = [
    _tendency(home, away)
    for home, away in zip(pred_classifier_home_scores_1, pred_classifier_away_scores_1)
]
hit_ratio_classifier_1 = _hit_ratio(model_1_general_game_pred, y_valid.game_result_home)

model_1_score_str = "Hit ratio model 1 tendency: " + str(hit_ratio_classifier_1 * 100) + "%"
print(model_1_score_str)

# Model 2: same procedure with the two separately predicted scores
model_2_general_game_pred = [
    _tendency(home, away)
    for home, away in zip(preds_classifier_2_home, preds_classifier_2_away)
]
hit_ratio_classifier_2 = _hit_ratio(model_2_general_game_pred, y_valid.game_result_home)

model_2_score_str = "Hit ratio model 2 tendency: " + str(hit_ratio_classifier_2 * 100) + "%"
print(model_2_score_str)

# Predicting upcoming games

The Euro2021 is coming up so I wanted to give it a shot and try out all models. So here are the results for the first couple of games, predicted by the presented models.
Most of the predictions actually have similar outcomes independent of the model.

The predictions below were made before the games started.
The actual results are given after 90 minutes; the result after extra time is shown in brackets.
## Group stage

| Game                    | Prediction Model 1  | Prediction Model 2  | Prediction Model 3  |  Actual Result      |   
|:-----------------------:|:-------------------:|:-------------------:|:-------------------:|:-------------------:|
| Turkey - Italy | 1 - 2 | 1 - 2 | Turkey Looses | 0 - 3  |
| Wales - Switzerland | 1 - 0 | 1 - 0 | Wales Wins | 1 - 1  |
| Denmark - Finland | 2 - 0 | 2 - 1 | Denmark Wins | 0 - 1  |
| Belgium - Russia | 1 - 0 | 1 - 0 | Belgium Wins | 3 - 0  |
| England - Croatia | 2 - 1 | 1 - 1 | England Wins | 1 - 0  |
| Austria - North Macedonia | 1 - 0 | 1 - 0 | Austria Wins | 3 - 1  |
| Netherlands - Ukraine | 3 - 0 | 3 - 0 | Netherlands Wins | 3 - 2  |
| Scotland - Czech Republic | 1 - 0 | 1 - 0 | Scotland Wins | 0 - 2  |
| Poland - Slovakia | 1 - 0 | 1 - 0 | Poland Wins | 1 - 2  |
| Spain - Sweden | 3 - 0 | 3 - 0 | Spain Wins | 0 - 0  |
| Hungary - Portugal | 0 - 3 | 0 - 3 | Hungary Looses | 0 - 3  |
| France - Germany | 0 - 0 | 0 - 0 | France Looses | 1 - 0  |
| Finland - Russia | 0 - 3 | 0 - 3 | Finland Looses | 0 - 1  |
| Turkey - Wales | 0 - 0 | 0 - 0 | Turkey Wins | 0 - 2  |
| Italy - Switzerland | 2 - 0 | 1 - 0 | Italy Wins | 3 - 0  |
| Ukraine - North Macedonia | 1 - 0 | 1 - 0 | Ukraine Wins | 2 - 1  |
| Denmark - Belgium | 0 - 2 | 1 - 2 | Denmark Looses | 1 - 2  |
| Netherlands - Austria | 3 - 1 | 3 - 1 | Netherlands Wins | 2 - 0  |
| Sweden - Slovakia | 1 - 0 | 1 - 0 | Sweden Wins | 1 - 0  |
| Croatia - Czech Republic | 2 - 1 | 1 - 0 | Croatia Wins | 1 - 1  |
| England - Scotland | 2 - 0 | 2 - 0 | England Wins | 0 - 0  |
| Hungary - France | 0 - 0 | 0 - 0 | Hungary Looses | 1 - 1  |
| Portugal - Germany | 2 - 3 | 2 - 3 | Portugal Looses | 2 - 4  |
| Spain - Poland | 1 - 0 | 1 - 0 | Spain Wins | 1 - 1  |
| Italy - Wales | 1 - 0 | 1 - 0 | Italy Wins | 1 - 0  |
| Switzerland - Turkey | 3 - 1 | 3 - 1 | Switzerland Wins | 3 - 1  |
| Ukraine - Austria | 0 - 1 | 0 - 1 | Ukraine Looses | 0 - 1  |
| North Macedonia - Netherlands | 1 - 2 | 1 - 2 | North Macedonia Looses | 0 - 3  |
| Russia - Denmark | 0 - 1 | 0 - 1 | Russia Looses | 1 - 4  |
| Finland - Belgium | 0 - 2 | 0 - 2 | Finland Looses | 0 - 2  |
| Croatia - Scotland | 1 - 1 | 1 - 0 | Croatia Wins | 3 - 1  |
| Czech Republic - England | 0 - 0 | 0 - 0 | Czech Republic Looses | 0 - 1  |
| Slovakia - Spain | 1 - 1 | 1 - 1 | Slovakia Looses | 0 - 5  |
| Sweden - Poland | 3 - 2 | 3 - 2 | Sweden Wins | 3 - 2  |
| Portugal - France | 2 - 2 | 2 - 2 | Portugal Ties | 2 - 2  |
| Germany - Hungary | 2 - 0 | 2 - 0 | Germany Wins | 2 - 2  |

## Round of 16
| Game                    | Prediction Model 1  | Prediction Model 2  | Prediction Model 3  |  Actual Result      |   
|:-----------------------:|:-------------------:|:-------------------:|:-------------------:|:-------------------:|
| Wales - Denmark | 0 - 2 | 1 - 2 | Wales Looses | 0 - 4  |
| Italy - Austria | 2 - 1 | 2 - 1 | Italy Wins | 0 - 0 (2 - 1)  |
| Netherlands - Czech Republic | 2 - 0 | 2 - 0 | Netherlands Wins | 2 - 0  |
| Belgium - Portugal | 0 - 0 | 0 - 0 | Belgium Ties | 1 - 0  |
| Croatia - Spain | 0 - 1 | 0 - 1 | Croatia Looses | 3 - 3 (3 - 5)  |
| France - Switzerland | 0 - 0 | 0 - 0 | France Ties | 3 - 3  |
| England - Germany | 1 - 1 | 1 - 1 | England Ties | 2 - 0  |
| Sweden - Ukraine | 1 - 0 | 1 - 0 | Sweden Looses | 1 - 1 (1 - 2)  |

## Quarter Finals
| Game                    | Prediction Model 1  | Prediction Model 2  | Prediction Model 3  |  Actual Result      |   
|:-----------------------:|:-------------------:|:-------------------:|:-------------------:|:-------------------:|
| Switzerland - Spain | 1 - 1 | 1 - 1 | Switzerland Ties | 1 - 1  |
| Belgium - Italy | 1 - 2 | 1 - 2 | Belgium Looses | 1 - 2  |
| Czech Republic - Denmark | 1 - 2 | 1 - 2 | Czech Republic Looses | 1 - 2  |
| Ukraine - England | 0 - 0 | 0 - 0 | Ukraine Ties | 0 - 4  |

## Semi Finals
| Game                    | Prediction Model 1  | Prediction Model 2  | Prediction Model 3  |  Actual Result      |   
|:-----------------------:|:-------------------:|:-------------------:|:-------------------:|:-------------------:|
| Italy - Spain | 1 - 1 | 1 - 1 | Italy Ties | 1 - 1 |
| England - Denmark | 1 - 0 | 1 - 0 | England Wins | 1 - 1 (2 - 1)|

## Final
| Game                    | Prediction Model 1  | Prediction Model 2  | Prediction Model 3  |  Actual Result      |   
|:-----------------------:|:-------------------:|:-------------------:|:-------------------:|:-------------------:|
| Italy - England | 0 - 0 | 0 - 0 | Italy Ties |   |

In [None]:
# Euro 2021 fixtures in playing order, one tuple per match:
# (home team, away team, month, day). Keeping the fields of a match together
# and deriving the per-column lists from it keeps the lists aligned.
EURO_2021_FIXTURES = [
    # Group stage
    ('Turkey', 'Italy', 6, 11),
    ('Wales', 'Switzerland', 6, 12),
    ('Denmark', 'Finland', 6, 12),
    ('Belgium', 'Russia', 6, 12),
    ('England', 'Croatia', 6, 13),
    ('Austria', 'North Macedonia', 6, 13),
    ('Netherlands', 'Ukraine', 6, 13),
    ('Scotland', 'Czech Republic', 6, 14),
    ('Poland', 'Slovakia', 6, 14),
    ('Spain', 'Sweden', 6, 14),
    ('Hungary', 'Portugal', 6, 15),
    ('France', 'Germany', 6, 15),
    ('Finland', 'Russia', 6, 16),
    ('Turkey', 'Wales', 6, 16),
    ('Italy', 'Switzerland', 6, 16),
    ('Ukraine', 'North Macedonia', 6, 17),
    ('Denmark', 'Belgium', 6, 17),
    ('Netherlands', 'Austria', 6, 17),
    ('Sweden', 'Slovakia', 6, 18),
    ('Croatia', 'Czech Republic', 6, 18),
    ('England', 'Scotland', 6, 18),
    ('Hungary', 'France', 6, 19),
    ('Portugal', 'Germany', 6, 19),
    ('Spain', 'Poland', 6, 19),
    ('Italy', 'Wales', 6, 20),
    ('Switzerland', 'Turkey', 6, 20),
    ('Ukraine', 'Austria', 6, 21),
    ('North Macedonia', 'Netherlands', 6, 21),
    ('Russia', 'Denmark', 6, 21),
    ('Finland', 'Belgium', 6, 21),
    ('Croatia', 'Scotland', 6, 22),
    ('Czech Republic', 'England', 6, 22),
    ('Slovakia', 'Spain', 6, 23),
    ('Sweden', 'Poland', 6, 23),
    ('Portugal', 'France', 6, 23),
    ('Germany', 'Hungary', 6, 23),
    # Round of 16
    ('Wales', 'Denmark', 6, 26),
    ('Italy', 'Austria', 6, 26),
    ('Netherlands', 'Czech Republic', 6, 27),
    ('Belgium', 'Portugal', 6, 27),
    ('Croatia', 'Spain', 6, 28),
    ('France', 'Switzerland', 6, 28),
    ('England', 'Germany', 6, 29),
    ('Sweden', 'Ukraine', 6, 29),
    # Quarter Final
    ('Switzerland', 'Spain', 7, 2),
    ('Belgium', 'Italy', 7, 2),
    ('Czech Republic', 'Denmark', 7, 3),
    ('Ukraine', 'England', 7, 3),
    # Semi Final
    ('Italy', 'Spain', 7, 6),
    ('England', 'Denmark', 7, 7),
    # Final
    ('Italy', 'England', 7, 11),
]

# Parallel per-column lists, one entry per fixture
home_teams = [fixture[0] for fixture in EURO_2021_FIXTURES]
away_teams = [fixture[1] for fixture in EURO_2021_FIXTURES]
years = [2021 for _ in EURO_2021_FIXTURES]
months = [fixture[2] for fixture in EURO_2021_FIXTURES]
days = [fixture[3] for fixture in EURO_2021_FIXTURES]

# Build the feature frame for the fixtures, one row per match
upcoming_games_dict = {'home_team' : home_teams,
                       'away_team' : away_teams,
                       'year' : years,
                       'month': months,
                       'day' : days}

upcoming_games = pd.DataFrame(data=upcoming_games_dict)

# Encode the categorical columns with the already fitted encoder (transform only, no refit)
OH_cols_upcoming_games = pd.DataFrame(OH_encoder.transform(upcoming_games[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_upcoming_games.index = upcoming_games.index

# The encoded columns are labelled 0, 1, 2, ... (integers). scikit-learn >= 1.2
# rejects frames whose column labels mix int and str, so use string labels.
OH_cols_upcoming_games.columns = OH_cols_upcoming_games.columns.astype(str)

# Remove categorical columns (will replace with one-hot encoding)
numerical_upcoming_games = upcoming_games.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
X_predict = pd.concat([numerical_upcoming_games, OH_cols_upcoming_games], axis=1)

## Raw predictions of the three models for every fixture
# Model 1: one [home score, away score] row per match
upcoming_games_prediction_classifier_1 = model_classifier_1.predict(X_predict)

# Model 2: home and away scores come from separate models
upcoming_games_prediction_classifier_2_home = model_classifier_2_home.predict(X_predict)
upcoming_games_prediction_classifier_2_away = model_classifier_2_away.predict(X_predict)

# Model 3: outcome label per match, decoded into words for the table
upcoming_games_prediction_classifier_3 = model_classifier_3.predict(X_predict)
decode_match_result = {1: "Wins", -1: "Looses", 0: "Ties"}

# Print one markdown table row per fixture; the last cell is left empty
fixtures = zip(upcoming_games.home_team, upcoming_games.away_team)
for i, (home_team, away_team) in enumerate(fixtures):
    home_goals_1, away_goals_1 = upcoming_games_prediction_classifier_1[i]
    result_model_1_str = f"{int(home_goals_1)} - {int(away_goals_1)}"
    result_model_2_str = (
        f"{int(upcoming_games_prediction_classifier_2_home[i])}"
        f" - {int(upcoming_games_prediction_classifier_2_away[i])}"
    )
    outcome_word = decode_match_result[upcoming_games_prediction_classifier_3[i]]
    result_model_3_str = f"{home_team} {outcome_word}"
    result_str = (
        f"| {home_team} - {away_team} | {result_model_1_str} | "
        f"{result_model_2_str} | {result_model_3_str} |   |"
    )

    print(result_str)