# Assessment: Analyzing Soccer Data

TODO: add intro

## 0. Set-up

### a. Load linting tool, create spark session, etc.

In [26]:
%load_ext pycodestyle_magic

The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


In [25]:
#%%pycodestyle
from pyspark.sql import SparkSession
import pyspark.sql.functions as func

# Create spark app
spark = (
    SparkSession
    .builder
    .appName("rb assessment app")
    .getOrCreate()
)


### a. Data Engineering for Task 1

In [77]:
#%%pycodestyle
import pyspark.sql.functions as func

bl_results_spark_df = (
    # read matches folder
    spark
    .read
    .json(
        "data/matches/*/",
        multiLine=True
    )
    # filter 1. Bundesliga 2015/16 matches
    .where(
        func.col("competition.competition_name").eqNullSafe("1. Bundesliga")
        & func.col("season.season_name").eqNullSafe("2015/2016")
    )
    # order by date to make sure that the ELO function is used approprietly
    .orderBy(func.to_date("match_date"))
    # perform the mapping to get one row for each team and match
    # the associated result is:
    # - win = 1
    # - draw = 0.5
    # - loss = 0
    .select(
        func.col(
            "home_team.home_team_name"
        ).alias("home_team_name"),
        func.col(
            "away_team.away_team_name"
        ).alias("away_team_name"),
        func.coalesce(
            func.when(func.expr("home_score > away_score"), 1),
            func.when(func.expr("home_score < away_score"), 0),
            func.lit(0.5)
        ).alias("home_team_result"),
        func.coalesce(
            func.when(func.expr("home_score > away_score"), 0),
            func.when(func.expr("home_score < away_score"), 1),
            func.lit(0.5)
        ).alias("away_team_result")
    )
)

# print schema for verification
bl_results_spark_df.printSchema()

# cache result to pandas df
bl_result_df = bl_results_spark_df.toPandas()
# display resulting df
print(bl_result_df)

root
 |-- home_team_name: string (nullable = true)
 |-- away_team_name: string (nullable = true)
 |-- home_team_result: double (nullable = false)
 |-- away_team_result: double (nullable = false)

    home_team_name            away_team_name  home_team_result  \
0    Bayern Munich              Hamburger SV               1.0   
1         Augsburg             Hertha Berlin               0.0   
2    Werder Bremen                Schalke 04               0.0   
3     FSV Mainz 05                Ingolstadt               0.0   
4     Darmstadt 98               Hannover 96               0.5   
..             ...                       ...               ...   
301   Darmstadt 98  Borussia Mönchengladbach               0.0   
302  Bayern Munich               Hannover 96               1.0   
303   FSV Mainz 05             Hertha Berlin               0.5   
304  Werder Bremen       Eintracht Frankfurt               1.0   
305      Wolfsburg             VfB Stuttgart               1.0   

     away_t

## 1. Task 1 - Elo Rating

### a. Develop a function to implement the ELO Rating System with arbitrary K and s.

In [100]:
#%%pycodestyle
import math
import operator

def predict(rating_a, rating_b, s=15):
    return 1 / (1 + math.pow(10, -(rating_a - rating_b) / s))

def update_elo(rating_a, rating_b, outcome, s=15, K=15):
    # expected outcome
    pred = predict(rating_a, rating_b, s=15)
    # step
    return rating_a + K * (outcome - pred)

def elo(results, s=15, K=15, R_0=100):
    # init ratings
    ratings = {team_name: R_0 for team_name in results["home_team_name"].unique()}
    for index, row in results.iterrows():
        ratings[row["home_team_name"]] = update_elo(
            ratings[row["home_team_name"]], 
            ratings[row["away_team_name"]], 
            row["home_team_result"],
            s, K
        )
        ratings[row["away_team_name"]] = update_elo(
            ratings[row["away_team_name"]], 
            ratings[row["home_team_name"]], 
            row["away_team_result"],
            s, K
        )
    return dict(sorted(ratings.items(), key=operator.itemgetter(1), reverse=True))


### b. Apply the rating system to 1. Bundesliga 2015/2016 Season with starting values R0 = 100, s = 15 and K = 15.

In [101]:
elo(bl_result_df)

{'Borussia Mönchengladbach': 128.73684750507124,
 'Bayern Munich': 126.03882090784246,
 'Werder Bremen': 122.53431008967196,
 'Bayer Leverkusen': 119.86744451903735,
 'Schalke 04': 116.7404982618847,
 'Eintracht Frankfurt': 115.13792895836448,
 'Borussia Dortmund': 113.14276236567889,
 'FC Köln': 112.7931080154221,
 'Hannover 96': 112.683637686516,
 'Darmstadt 98': 104.6708369599969,
 'Hoffenheim': 104.14444368636025,
 'Ingolstadt': 104.05781546247819,
 'Hamburger SV': 103.82508894341434,
 'Wolfsburg': 103.54107736006338,
 'Augsburg': 100.16514926109396,
 'FSV Mainz 05': 98.21672472241177,
 'Hertha Berlin': 96.401623690483,
 'VfB Stuttgart': 88.0062841309003}

### c. Develop an approach that finds the optimal values for s and K based on that season and display the final ranking table at the end of the season.

We could take the assumption that the optimal values for parameters s and K are the one that would minimize the discrepancy between the predicted outcome and actual match result for each step of that particular season.

Let's define our loss function as the average brier score of our ELO Rating System.

In [113]:
def brier_score(results, s=15, K=15, R_0=100):
    # init ratings
    ratings = {team_name: R_0 for team_name in results["home_team_name"].unique()}
    agg = 0
    for index, row in results.iterrows():
        agg += (
            (
                predict(ratings[row["home_team_name"]], ratings[row["away_team_name"]], s) \
                - row["home_team_result"]
            ) ** 2 \
            + (
                predict(ratings[row["away_team_name"]], ratings[row["home_team_name"]], s) \
                - row["away_team_result"]
            ) ** 2
        )
        ratings[row["home_team_name"]] = update_elo(
            ratings[row["home_team_name"]], 
            ratings[row["away_team_name"]], 
            row["home_team_result"],
            s, K
        )
        ratings[row["away_team_name"]] = update_elo(
            ratings[row["away_team_name"]], 
            ratings[row["home_team_name"]], 
            row["away_team_result"],
            s, K
        )
    return agg / (2 * results.shape[0])


brier_score(bl_result_df)

0.23717873793811012