In [158]:
import os

import geopandas as gpd
import numpy as np
import pandas as pd
from geopy.distance import geodesic

# Project root that holds the `Results/` data folder. Defaults to the original
# hard-coded location; set the MLS_ROOT environment variable to run elsewhere.
ROOT = os.environ.get('MLS_ROOT', '/home/robert/.config/JetBrains/DataSpell2021.3/projects/MLS')
INITIAL_ELO = 1500  # rating assigned to a team code the first time it is seen
K = 20              # scale factor applied to every rating change in update_elo
HOME_ADV = 100      # rating points added to the home side when computing its expected result

In [159]:
# Load every recorded match, discard the unused Venue column and order the
# rows chronologically so ratings can later be rolled forward match by match.
AllYears = (
    pd.read_csv(f'{ROOT}/Results/AllYears.csv', index_col=0)
    .drop(columns=['Venue'])
    .sort_values(by=['Date'])
)

In [160]:
# Maps each team name as it appears in the results file to
# [short code, home city]. Historical names of one franchise share a code
# (e.g. 'KC Wiz', 'KC Wizards' and 'Sporting KC' are all 'SKC') so that their
# ratings accumulate under a single key.
teams = {'Atlanta Utd': ['ATL', 'Atlanta'],
         'Austin FC': ['ATX', 'Austin'],
         'CF Montréal': ['MTL', 'Montreal'],
         'Charlotte FC': ['CLT', 'Charlotte'],
         'Chicago Fire': ['CHI', 'Chicago'],
         'Chivas USA': ['CHV', 'Los Angeles'],
         'Colorado Rapids': ['COL', 'Denver'],
         'Columbus Crew': ['CLB', 'Columbus'],
         'D.C. United': ['DC', 'Washington DC'],
         'Dallas Burn': ['DAL', 'Dallas'],
         'FC Cincinnati': ['CIN', 'Cincinnati'],
         'FC Dallas': ['DAL', 'Dallas'],
         'Houston Dynamo': ['HOU', 'Houston'],
         'Inter Miami': ['MIA', 'Miami'],
         'KC Wiz': ['SKC', 'Kansas City'],
         'KC Wizards': ['SKC', 'Kansas City'],
         'LA Galaxy': ['LA', 'Los Angeles'],
         'Los Angeles FC': ['LAFC', 'Los Angeles'],
         'MetroStars': ['RBNY', 'New York'],
         'Miami Fusion': ['MIAF', 'Miami'],
         'Minnesota Utd': ['MIN', 'Saint Paul'],
         # City was previously the club name ('Montreal Impact'), which is not
         # a place name and disagreed with the 'CF Montréal' entry.
         'Montreal Impact': ['MTL', 'Montreal'],
         'NY Red Bulls': ['RBNY', 'New York'],
         'NYCFC': ['NYC', 'New York'],
         'Nashville': ['NSH', 'Nashville'],
         'New England': ['NE', 'Foxborough'],
         'Orlando City': ['ORL', 'Orlando'],
         'Philadelphia': ['PHI', 'Chester'],
         'Portland Timbers': ['POR', 'Portland'],
         'Real Salt Lake': ['RSL', 'Sandy'],
         'San Jose': ['SJ', 'San Jose'],
         'Seattle': ['SEA', 'Seattle'],
         'Sporting KC': ['SKC', 'Kansas City'],
         'Tampa Bay': ['TB', 'Tampa Bay'],
         'Toronto FC': ['TOR', 'Toronto'],
         'Vancouver': ['VAN', 'Vancouver'],
         }

# Current rating per team code; populated when the match data is processed.
elo = {}

In [161]:
def get_distance(city1, city2):
    """Return the geodesic distance between two cities in whole miles.

    Each city name is geocoded with ``gpd.tools.geocode`` and the first result
    is used. The distance is truncated (not rounded) to an ``int``.

    Args:
        city1: Place name of the first city.
        city2: Place name of the second city.

    Returns:
        int: Miles between the two geocoded points.
    """
    def get_lat_long(city):
        # Geocode once and read both coordinates from the same result; the
        # lookup presumably hits an external service, so avoid repeating it.
        location = gpd.tools.geocode(city).geometry
        lat = location.y.values[0]
        long = location.x.values[0]
        return (lat, long)

    return int(geodesic(get_lat_long(city1), get_lat_long(city2)).mi)

## Elo Functions

In [162]:
def win_chance(elo1, elo2):
    """Return the expected score (0-1) of the side rated ``elo1`` against ``elo2``.

    Uses the logistic Elo curve with a 400-point scale: equal ratings give
    0.5, and a 400-point edge gives roughly 0.91.
    """
    rating_gap = (elo2 - elo1) / 400
    return 1 / (1 + 10 ** rating_gap)

def get_g(score1, score2):
    """Return the goal-difference multiplier for a result.

    The multiplier depends only on the size of the margin, not on which side
    won: 1 for a draw or one-goal game, 1.5 for a two-goal game, and
    ``(11 + margin) / 8`` for anything larger.
    """
    margin = abs(score1 - score2)
    if margin <= 1:
        return 1
    if margin == 2:
        return 1.5
    return (11 + margin) / 8

def update_elo(home_elo, away_elo, home_score, away_score):
    """Return the post-match ``(home, away)`` ratings as rounded ints.

    The home side's expected result is computed with ``HOME_ADV`` added to its
    rating. The rating change is ``K`` times the goal-difference multiplier
    times (actual - expected), added to the home rating and subtracted from
    the away rating.
    """
    # Actual result from the home side's perspective: 1 win, 0.5 draw, 0 loss.
    if home_score > away_score:
        actual = 1
    elif home_score == away_score:
        actual = 0.5
    else:
        actual = 0

    expected = win_chance(home_elo + HOME_ADV, away_elo)
    shift = K * get_g(home_score, away_score) * (actual - expected)

    return int(round(home_elo + shift, 0)), int(round(away_elo - shift, 0))

# Quick check: two equally rated sides, away team wins 3-1.
update_elo(home_elo=1500, away_elo=1500, home_score=1, away_score=3)

(1481, 1519)

In [163]:
# Replace full team names with their short codes in the Home, Away and Winner
# columns, label matches without a winner as 'Draw', and give every team code
# that appears a starting rating.
name_to_code = {name: info[0] for name, info in teams.items()}

# A missing Winner marks a drawn match.
winner_missing = AllYears['Winner'].isna()

# Validate before touching the frame so that an unknown name cannot leave the
# columns half-converted.
seen_names = (
    set(AllYears['Home'])
    | set(AllYears['Away'])
    | set(AllYears.loc[~winner_missing, 'Winner'])
)
unknown_names = sorted(str(name) for name in seen_names if name not in name_to_code)
if unknown_names:
    raise KeyError(f'No team code defined for: {unknown_names}')

AllYears['Home'] = AllYears['Home'].map(name_to_code)
AllYears['Away'] = AllYears['Away'].map(name_to_code)
# Every non-missing winner is a known name (checked above), so the only NaN
# values left after mapping are the draws.
AllYears['Winner'] = AllYears['Winner'].map(name_to_code).fillna('Draw')

# Seed ratings in order of first appearance (home side before away side).
for home_code, away_code in zip(AllYears['Home'], AllYears['Away']):
    elo.setdefault(home_code, INITIAL_ELO)
    elo.setdefault(away_code, INITIAL_ELO)

In [164]:
def get_regular_season(team_code):
    """Return the team's regular-season matches from ``AllYears``, sorted by Date.

    Home matches and away matches are selected separately, stacked, and then
    ordered by the ``Date`` column.
    """
    def _regular_season_rows(side):
        # Rows where the team appears in the given column ('Home' or 'Away'),
        # restricted to the regular season.
        on_side = AllYears[AllYears[side] == team_code]
        return on_side[on_side.Round == 'Regular Season']

    both_sides = [_regular_season_rows('Home'), _regular_season_rows('Away')]
    return pd.concat(both_sides).sort_values(by=['Date'])

In [165]:
def get_overall_record(team_code):
    """Return the team's regular-season ``(wins, draws, losses)`` tuple.

    Wins are matches whose Winner equals ``team_code``, draws are matches
    whose Winner is ``'Draw'``, and every remaining match counts as a loss.
    """
    games = get_regular_season(team_code)
    outcomes = games.Winner
    wins = int((outcomes == team_code).sum())
    draws = int((outcomes == 'Draw').sum())
    return wins, draws, len(games) - wins - draws

In [166]:
# Placeholder columns for each side's pre-match rating; they are filled in
# when the matches are replayed in chronological order.
for rating_column in ('HomeElo', 'AwayElo'):
    AllYears[rating_column] = np.nan
# Number of distinct team codes that currently hold a rating.
len(elo)

31

In [167]:
# Replay every match in row order: record both sides' ratings as they stood
# before kick-off, then roll the ratings forward with the result.
match_rows = zip(
    AllYears.index,
    AllYears['Home'],
    AllYears['GHome'],
    AllYears['Away'],
    AllYears['GAway'],
)
for row_label, home_code, home_score, away_code, away_score in match_rows:
    home_before = elo[home_code]
    away_before = elo[away_code]

    # Store the pre-match ratings on the match row
    AllYears.at[row_label, 'HomeElo'] = home_before
    AllYears.at[row_label, 'AwayElo'] = away_before

    # Apply the result to both teams' running ratings
    home_after, away_after = update_elo(home_before, away_before, home_score, away_score)
    elo[home_code] = home_after
    elo[away_code] = away_after

## All-time Rolling Elos

Eventually, I will regress the teams to the mean towards the end of the season, but for now this will serve as a rough model on which to develop further features — mainly, a criterion by which the model will predict draws.

In [168]:
# Ratings are whole numbers, so store both columns as integers.
for rating_column in ('HomeElo', 'AwayElo'):
    AllYears[rating_column] = AllYears[rating_column].astype('int64')
AllYears

Unnamed: 0,Season,Round,Date,Home,GHome,GAway,Away,Winner,HomeElo,AwayElo
6848,1996,Regular Season,1996-04-06,SJ,1,0,DC,SJ,1500,1500
6849,1996,Regular Season,1996-04-13,LA,2,1,RBNY,LA,1500,1500
6850,1996,Regular Season,1996-04-13,TB,3,2,NE,TB,1500,1500
6851,1996,Regular Season,1996-04-13,SKC,3,0,COL,SKC,1500,1500
6852,1996,Regular Season,1996-04-13,CLB,4,0,DC,CLB,1500,1493
...,...,...,...,...,...,...,...,...,...,...
6844,2022,Regular Season,2022-02-27,ATL,3,1,SKC,ATL,1527,1566
6845,2022,Regular Season,2022-02-27,ORL,2,0,MTL,ORL,1517,1465
6847,2022,Regular Season,2022-02-27,SEA,0,1,NSH,NSH,1611,1599
6843,2022,Regular Season,2022-02-27,HOU,0,0,RSL,Draw,1379,1503


### Calculate the percentage of games that end in draws

In [169]:
# Share of matches from the 2000 season onward whose Winner is 'Draw'.
PostDraws = AllYears.loc[AllYears['Season'] >= 2000]
Draws = PostDraws.loc[PostDraws['Winner'] == 'Draw']
Draws.shape[0] / PostDraws.shape[0]

0.246286535697173

In [170]:
# Independent copy of the matches from the 2001 season onward, with an empty
# Prediction column to be filled in.
from_2001 = AllYears['Season'] >= 2001
Predictions = AllYears.loc[from_2001].copy(deep=True)
Predictions['Prediction'] = np.nan
Predictions

Unnamed: 0,Season,Round,Date,Home,GHome,GAway,Away,Winner,HomeElo,AwayElo,Prediction
797,2001,Regular Season,2001-04-07,RBNY,2,1,NE,RBNY,1518,1449,
801,2001,Regular Season,2001-04-07,DC,3,2,SKC,DC,1468,1553,
796,2001,Regular Season,2001-04-07,CLB,1,1,CHI,Draw,1468,1595,
799,2001,Regular Season,2001-04-07,LA,2,3,SJ,SJ,1598,1414,
798,2001,Regular Season,2001-04-07,DAL,2,4,TB,TB,1523,1507,
...,...,...,...,...,...,...,...,...,...,...,...
6844,2022,Regular Season,2022-02-27,ATL,3,1,SKC,ATL,1527,1566,
6845,2022,Regular Season,2022-02-27,ORL,2,0,MTL,ORL,1517,1465,
6847,2022,Regular Season,2022-02-27,SEA,0,1,NSH,NSH,1611,1599,
6843,2022,Regular Season,2022-02-27,HOU,0,0,RSL,Draw,1379,1503,


In [171]:
# Predict the side with the higher pre-match rating; ties go to the home team.
# The whole column is assigned at once, so no team-code strings are written
# cell by cell into a float (NaN) column, which newer pandas versions
# deprecate.
# NOTE(review): HOME_ADV is applied when ratings are updated but not here —
# confirm whether the prediction should also credit the home side.
Predictions['Prediction'] = np.where(
    Predictions['HomeElo'] >= Predictions['AwayElo'],
    Predictions['Home'],
    Predictions['Away'],
)

In [172]:
# Display the frame to inspect the filled-in Prediction column.
Predictions

Unnamed: 0,Season,Round,Date,Home,GHome,GAway,Away,Winner,HomeElo,AwayElo,Prediction
797,2001,Regular Season,2001-04-07,RBNY,2,1,NE,RBNY,1518,1449,RBNY
801,2001,Regular Season,2001-04-07,DC,3,2,SKC,DC,1468,1553,SKC
796,2001,Regular Season,2001-04-07,CLB,1,1,CHI,Draw,1468,1595,CHI
799,2001,Regular Season,2001-04-07,LA,2,3,SJ,SJ,1598,1414,LA
798,2001,Regular Season,2001-04-07,DAL,2,4,TB,TB,1523,1507,DAL
...,...,...,...,...,...,...,...,...,...,...,...
6844,2022,Regular Season,2022-02-27,ATL,3,1,SKC,ATL,1527,1566,SKC
6845,2022,Regular Season,2022-02-27,ORL,2,0,MTL,ORL,1517,1465,ORL
6847,2022,Regular Season,2022-02-27,SEA,0,1,NSH,NSH,1611,1599,SEA
6843,2022,Regular Season,2022-02-27,HOU,0,0,RSL,Draw,1379,1503,RSL


In [173]:
# Fraction of matches whose predicted side matches the recorded Winner
# (drawn matches can never match, since a draw is never predicted).
int(Predictions['Winner'].eq(Predictions['Prediction']).sum()) / len(Predictions)

0.43192333113020487

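The very first attempt predicts the correct result 43% of the time, without accounting for draws, which, as shown above, happen about 25% of the time. If the remaining 75% of games were split evenly between the two sides, always picking one side would be right about 75/2 = 37.5% of the time; since 43 > 37.5, I think this shows promise as a predictive measure of Major League Soccer results.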

In [174]:
#this is dumb