## Hi There!

This is just a personal project that I worked on. Was keen on doing some data analysis using pythong and them attempting to predict the Euro 2024 scores.
My Goals:
1. Predict at least 50% of the group stage winners
2. Have the predicted winner make it to atleast the QF

### Extracting tables from the upcoming Euros

In [None]:
#Libraries
import pandas as pd
from string import ascii_uppercase as alphabet
import pickle
from bs4 import BeautifulSoup
import requests
from scipy.stats import poisson

#Import the wikipedia page:
all_tables = pd.read_html('https://en.wikipedia.org/wiki/UEFA_Euro_2024')

#Understanding the correct tables:
all_tables[18]
all_tables[25]
all_tables[32]
all_tables[39]
all_tables[46]
all_tables[53]



In [None]:
#Create dictionary + assign the group letters to the groups
dict_table = {}
for letter, i in zip(alphabet, range(18,60, 7)):
    df = all_tables[i]
    #Rename from "Teamvte" to Team
    df.rename(columns={df.columns[1]: 'Team'}, inplace=True)
    #Remove the qual column from the group
    df.pop("Qualification")
    dict_table[f'Group {letter}'] = df

In [None]:
dict_table['Group F']

In [None]:
#Use pickle to export the dictionary
with open('dict_table', 'wb') as output:
    pickle.dump(dict_table, output)

### Extracting Football Matches

In [None]:
years = [1960, 1964, 1968, 1972, 1976, 1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016, 2020]

def get_matches(year):
    web = f'https://en.wikipedia.org/wiki/UEFA_Euro_{year}'
    response = requests.get(web)
    content = response.text
    soup = BeautifulSoup(content, 'lxml')

    matches = soup.find_all('div', class_= 'footballbox')

    home = []
    score = []
    away = []


    for match in matches:
        home.append(match.find('th', class_='fhome').get_text())
        score.append(match.find('th', class_='fscore').get_text())
        away.append(match.find('th', class_='faway').get_text())

    dict_football = {'home': home, 'score':score, 'away': away}
    df_football = pd.DataFrame(dict_football)
    df_football['year'] = year
    return df_football

In [None]:
euro = [get_matches(year) for year in years]
df_euro = pd.concat(euro, ignore_index=True)
df_euro.to_csv('euro_historical_data.csv', index=False)

In [None]:
#Fixtures for 2024
df_fixture = get_matches(2024)
df_fixture.to_csv('euro_2024_fixtures.csv', index=False)

### Data Cleaning and Transformation

In [None]:
df_historical_data = pd.read_csv('euro_historical_data.csv')
df_fixture = pd.read_csv('euro_2024_fixtures.csv')

In [None]:
#Cleaning
df_fixture['home'] = df_fixture['home'].str.strip()
df_fixture['away'] = df_fixture['away'].str.strip()

df_historical_data['home'] = df_historical_data['home'].str.strip()
df_historical_data['away'] = df_historical_data['away'].str.strip()

#getting rid of the (a.e.t)
df_historical_data['score'] = df_historical_data['score'].str.replace('[^\d–]', '', regex=True)

In [None]:
#Cleaning the scores and giving them to home and away
df_historical_data[['HomeGoal', 'AwayGoal']] = df_historical_data['score'].str.split('–', expand=True)

In [None]:
df_historical_data.drop('score', axis=1, inplace=True)

In [None]:
df_historical_data.drop('HomeGoal', axis=1, inplace=True )
df_historical_data.drop('AwayGoal', axis=1, inplace=True )

In [None]:
#Renaming Columns + Converting goals from object to int
df_historical_data.rename(columns={'home': 'Home Team', 'away': 'Away Team', 'year': 'Year'}, inplace=True)
df_historical_data = df_historical_data.astype({'Home Goal': int, 'Away Goal': int, 'Year': int})

In [None]:
#Creating new column for total goals
df_historical_data['Total Goals'] =df_historical_data['Home Goal'] + df_historical_data['Away Goal']

### Exporting the Cleaned DF

In [None]:
df_historical_data.to_csv('cleaned_euro_historical_data.csv', index=False)

In [None]:
df_fixture.to_csv('cleaned_euro_2024_fixtures.csv', index=False)

### Time to build the model

In [None]:
dict_table = pickle.load(open('dict_table', 'rb'))
df_historical_data = pd.read_csv('cleaned_euro_historical_data.csv')
df_fixture = pd.read_csv('cleaned_euro_2024_fixtures.csv')

Calculating Team Strength

In [None]:
#split df into df_home and df_away
df_home = df_historical_data[['Home Team', 'Home Goal', 'Away Goal']]
df_away = df_historical_data[['Away Team', 'Home Goal', 'Away Goal']]

In [None]:
df_home = df_home.rename(columns={'Home Team': 'Team','Home Goal': 'Goals Scored', 'Away Goal': 'Goals Conceded' })
df_away = df_away.rename(columns={'Away Team': 'Team','Home Goal': 'Goals Conceded', 'Away Goal': 'Goals Scored' })

In [None]:
#Building the team strength
df_team_strength = pd.concat([df_home, df_away ], ignore_index=True).groupby('Team').mean()
df_team_strength


Prediction

Going to be using poisson distribution 

In [None]:
def predict_points(home, away):
    if home in df_team_strength.index and away in df_team_strength.index:
        #goals scored * goals conceced
        lamb_home = df_team_strength.at[home, 'Goals Scored'] * df_team_strength.at[away, 'Goals Conceded']
        lamb_away = df_team_strength.at[away, 'Goals Scored'] * df_team_strength.at[home, 'Goals Conceded']
        prob_home, prob_away, prob_draw = 0, 0, 0
        for x in range(0,11):
            for y in range(0,11):
                p = poisson.pmf(x, lamb_home) * poisson.pmf(y, lamb_away)
                if x == y:
                    prob_draw += p
                elif x  > y:
                    prob_home += p
                else:
                    prob_away += p
        points_home = 3 * prob_home + prob_draw
        points_away = 3* prob_away + prob_draw
        return (points_home, points_away)
    else:
        return (0,0)

Predictions

In [264]:
df_fixture_group36 = df_fixture[:36].copy()
df_fixture_knockout = df_fixture[37:44].copy()
df_fixture_quarter = df_fixture[44:48].copy()
df_fixture_semi = df_fixture[48:50].copy()
df_fixture_final = df_fixture[50:].copy()



KeyError: 0

In [260]:
#running all the games in the group stage
for group in dict_table:
    teams_in_group = dict_table[group]['Team'].values
    df_fixture_group_6 = df_fixture_group36[df_fixture_group36['home'].isin(teams_in_group)]
    for index, row in df_fixture_group_6.iterrows():
        home, away = row['home'], row['away']
        points_home, points_away = predict_points(home, away)
        dict_table[group].loc[dict_table[group]['Team'] == home, 'Pts'] += points_home
        dict_table[group].loc[dict_table[group]['Team'] == away, 'Pts'] += points_away
    dict_table[group] = dict_table[group].sort_values('Pts', ascending=False).reset_index()
    dict_table[group] = dict_table[group][['Team', 'Pts']]
    dict_table = dict_table[group].round(0)

#1:54:17


KeyError: 'Team'