# Web Scraping Project: Euro 2020 Stats

## Import the required libraries

In [None]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from matplotlib import pyplot as plt, cm, colors
import squarify
import time
import import_ipynb
import re
from collections import Counter
from dataclasses import dataclass
from typing import Optional, List
from Euro_2020 import urls

## Results Class

In [None]:
@dataclass
class Results:
    """Outcome of a single match, including any penalty shoot-out.

    ``pen_winner`` and ``pen_score`` stay ``None`` unless a shoot-out was
    recorded for the match.
    """

    home_team: str
    away_team: str
    home_goals: int
    away_goals: int
    pen_winner: Optional[str] = None
    pen_score: Optional[str] = None

    def is_draw(self) -> bool:
        """Return True when the goals are level and no shoot-out winner is set."""
        return self.pen_winner is None and self.home_goals == self.away_goals

    def winner(self) -> Optional[str]:
        """Return the winning team's name, or None when the match was drawn."""
        if self.is_draw():
            return None
        if self.home_goals > self.away_goals:
            return self.home_team
        if self.away_goals > self.home_goals:
            return self.away_team
        # Level on goals but not a draw: the shoot-out decided it.
        return self.pen_winner

    def loser(self) -> Optional[str]:
        """Return the losing team's name, or None when the match was drawn."""
        if self.is_draw():
            return None
        if self.winner() == self.home_team:
            return self.away_team
        return self.home_team

    def goal_scored(self) -> int:
        """Return the total number of goals scored by both teams."""
        return self.home_goals + self.away_goals

    def goal_diff(self) -> int:
        """Return the absolute goal margin between the two teams."""
        return abs(self.home_goals - self.away_goals)

    def __contains__(self, team) -> bool:
        """Support ``team in result`` for either participating team."""
        return team in (self.home_team, self.away_team)

    def __str__(self) -> str:
        """Render the score line as 'Home X - Y Away'."""
        return f'{self.home_team} {self.home_goals} - {self.away_goals} {self.away_team}'


In [None]:
# Shoot-out score such as "3-2" embedded in a fixture's win message.
_PEN_SCORE_RE = re.compile(r'\d+-\d+')


def _parse_shootout(message, home_team, away_team):
    """Extract ``(winner, score)`` from the text of a fixture win message."""
    message = message.strip()
    # Prefer whichever full team name the message starts with: the first word
    # alone truncates a multi-word name, which then matches neither side when
    # the winner is compared with home_team/away_team later on.
    for team in (home_team, away_team):
        if team and message.startswith(team):
            winner = team
            break
    else:
        # Neither name is a prefix: keep the original first-word behaviour.
        winner = message.split(' ')[0]
    score = _PEN_SCORE_RE.search(message)
    # A message without a score yields None instead of crashing on .group().
    return winner, (score.group() if score else None)


def result_class(urls):
    """Scrape every fixture on each page in ``urls`` into ``Results`` objects.

    Args:
        urls: Iterable of page URLs. The last ``/``-separated segment of each
            URL must be a date that ``pd.Timestamp`` can parse; it is used as
            the match date for every fixture on that page.

    Returns:
        A list with one ``Results`` per fixture, in page order. Fixtures dated
        on or after ``KO_START_DATE`` that carry a win message also get
        ``pen_winner`` and ``pen_score`` filled in.

    Raises:
        requests.HTTPError: if a page responds with an error status.
        requests.Timeout: if a page does not respond within 30 seconds.
    """
    results = []
    # Win messages are only read for fixtures from this date onwards.
    # NOTE(review): the round of 16 began on 26 June 2021; the later cut-off
    # presumably avoids win messages that are not shoot-outs - confirm.
    KO_START_DATE = pd.Timestamp('2021-06-28').date()
    for url in urls:
        # A timeout stops one stalled request from hanging the whole scrape.
        r = requests.get(url, timeout=30)
        # Fail loudly instead of parsing an error page into zero fixtures.
        r.raise_for_status()
        match_date = pd.Timestamp(url.split('/')[-1]).date()
        # Pause between requests to go easy on the server.
        time.sleep(1)
        soup = bs(r.text, 'html.parser')
        # Every fixture on the page is an <article class="sp-c-fixture">.
        for match in soup.find_all('article', class_='sp-c-fixture'):
            home_team = match.select_one('.sp-c-fixture__team-name--home .sp-c-fixture__team-name-trunc').text
            away_team = match.select_one('.sp-c-fixture__team-name--away .sp-c-fixture__team-name-trunc').text
            home_goals = match.select_one('.sp-c-fixture__number--home').text
            away_goals = match.select_one('.sp-c-fixture__number--away').text
            pen_win = pen_score = None
            if match_date >= KO_START_DATE:
                pens = match.select_one('.sp-c-fixture__win-message')
                if pens is not None:
                    pen_win, pen_score = _parse_shootout(pens.text, home_team, away_team)
            results.append(Results(
                home_team, away_team,
                int(home_goals), int(away_goals),
                pen_win, pen_score,
            ))
    return results

# Scrape every page listed in `urls` (imported from Euro_2020); this performs
# one HTTP request and a one-second pause per URL.
results = result_class(urls)

In [None]:
# Tournament-wide summary figures derived from the scraped results.
# winner()/loser() return None for drawn matches, so falsy values are dropped.
winners = [side for side in (game.winner() for game in results) if side]
losers = [side for side in (game.loser() for game in results) if side]
draws = [game for game in results if game.is_draw()]
matches_pens = [game for game in results if game.pen_winner]

# (team, count) pairs, most frequent first.
most_wins = Counter(winners).most_common()
most_losses = Counter(losers).most_common()

total_goals = sum(game.goal_scored() for game in results)

# Largest goal margin and the score lines of every match that reached it.
all_goal_diff = [game.goal_diff() for game in results]
biggest_victory = max(all_goal_diff)
biggest_margin_game = [
    str(game)
    for game, margin in zip(results, all_goal_diff)
    if margin == biggest_victory
]

## Creating a table using pandas library

In [None]:
# Build the match table from the Results dataclass fields, which arrive in the
# order home_team, away_team, home_goals, away_goals, pen_winner, pen_score.
headers_matches = ['Home Team','Away Team','Goals (H)', 'Goals (A)', 'Winner', 'Penalties']
# Number the rows from 1, sizing the index from the data rather than assuming
# exactly 51 scraped matches.
df_matches = pd.DataFrame(results, index=list(range(1, len(results) + 1)))
# Assign the labels directly: set_axis(..., inplace=True) no longer works on
# pandas 2.x, where the `inplace` argument was removed.
df_matches.columns = headers_matches
# Reorder so each team sits next to its own goal count.
df_matches = df_matches.iloc[:, [0, 2, 1, 3, 4, 5]]
# The 'Winner' column initially holds pen_winner; overwrite it with the
# overall winner of each match (None for drawn matches).
winner_column = [match.winner() for match in results]
df_matches['Winner'] = winner_column

In [None]:
# Group stage: the first 36 rows of the match table, keeping only the first
# five columns (everything except 'Penalties').
df_group_stage = df_matches.iloc[:36].iloc[:, :5]

In [None]:
# Knockout stage: every row after the 36 group-stage rows.
# The group-stage slice ends at position 35, so this slice must start at
# position 36; starting at 37 silently dropped the first knockout match.
# .copy() makes the frame independent of df_matches, so adding the '#' column
# below neither warns about nor writes through to the parent table.
df_ko_stage = df_matches.iloc[36:, :5].copy()
# Renumber the knockout matches from 1 and use that as the index.
df_ko_stage['#'] = range(1, len(df_ko_stage) + 1)
df_ko_stage.set_index('#', inplace=True)

## Creating graphs using Matplotlib library

In [None]:
# Shared text styles for the charts below.
font_title = dict(family='Comic Sans MS', color='#030300', size=22, weight='bold')
font_legend = dict(family='Comic Sans MS', color='#030300', size=14, weight='bold')
font_axes = dict(family='Comic Sans MS', color='#131312', size=14)

# Headline match counts, in the same order as `labels`.
total_matches = len(results)
matches_drawn = len(draws)
matches_with_pens = len(matches_pens)
matches_with_result = total_matches - matches_drawn
labels = ['Total Matches Played', 'Matches with Outcome', 'Matches Tied', 'Matches went to Penalties ']
match_stat = [total_matches, matches_with_result, matches_drawn, matches_with_pens]

def pie_chart(
    x, labels, font_legend, font_title, title, theme, explode=(0, 0, 0.25, 0.5), pctdist=0.7):
    """Draw and display a labelled pie chart.

    Args:
        x: Wedge sizes.
        labels: One label per wedge.
        font_legend: Text properties applied to the wedge and percentage labels.
        font_title: Font dictionary for the title.
        title: Chart title, placed at the right.
        theme: Matplotlib style name; applied through ``plt.style.use``, so it
            stays active for later plots as well.
        explode: Radial offset per wedge. The default has four entries, so it
            only suits a four-wedge chart.
        pctdist: Distance of the percentage text from the centre, as a
            fraction of the radius.
    """
    plt.style.use(theme)
    plt.pie(x, labels=labels, autopct='%.f %%',
            explode=explode, pctdistance=pctdist, textprops=font_legend)
    plt.title(title, fontdict=font_title, loc='right')
    # Call show(): the bare attribute reference `plt.show` did nothing.
    plt.show()


In [None]:
# Pie chart of the headline match counts.
pie_chart(
    x=match_stat,
    labels=labels,
    font_legend=font_legend,
    font_title=font_title,
    title='BREAKDOWN OF MATCHES',
    theme='ggplot',
    pctdist=0.8,
)

## BAR CHART

In [None]:
# Bar chart comparing the number of group-stage and knockout matches.
# The counts are the row counts of the two stage tables.
grp_stage, ko_stage = len(df_group_stage), len(df_ko_stage)       
# NOTE: this rebinds the module-level `labels` used earlier for the pie chart.
labels = ['Group Stage Games', 'Knockout Games' ]
plt.style.use('ggplot')
# Each series carries one zero-height placeholder category ('a' / 'l') next to
# its real bar; presumably these pad the categorical x-axis so the two real
# bars do not sit flush against the plot edges - confirm by rendering.
plt.bar(['a',labels[0]], [0,grp_stage], width = 0.25)
plt.bar([labels[1],'l'] ,[ko_stage,0], width = 0.25)
plt.title('Group Stage Vs Knock Out Games', fontdict = font_title)
plt.ylabel('Number of Matches', fontdict = font_axes)
# Remove the x ticks (placeholders included); the legend names the two series.
plt.xticks([])
plt.legend(labels)
plt.show()

## Team Statistics

To get insight into how well a particular team performed in the competition, I created a dataclass for team stats.

In [None]:
@dataclass
class TeamStats:
    """Running tournament totals for one team; every counter starts at zero."""

    name: str
    games_played: int = 0
    games_won: int = 0
    games_lost: int = 0
    games_drawn: int = 0
    goals_for: int = 0
    goals_against: int = 0
    pen_games: int = 0

    @property
    def goal_diff(self) -> int:
        """Goals scored minus goals conceded."""
        scored, conceded = self.goals_for, self.goals_against
        return scored - conceded

In [None]:
def parse_team_stats(results: List[Results]) -> List[TeamStats]:
    """Aggregate per-team tournament totals from a list of match results.

    Args:
        results: Match results to aggregate.

    Returns:
        One ``TeamStats`` per distinct team name, ordered by the team's first
        appearance in ``results`` (home side before away side). An empty
        input yields an empty list.
    """
    # A dict keyed by team name gives constant-time lookup per match and keeps
    # insertion order, so the output order is the same on every run (iterating
    # a set of strings is not).
    stats_by_team = {}

    for result in results:
        home, away = result.home_team, result.away_team
        for name in (home, away):
            if name not in stats_by_team:
                stats_by_team[name] = TeamStats(name)
        h_ts, a_ts = stats_by_team[home], stats_by_team[away]

        # Goals and appearances for both sides.
        h_ts.goals_for += result.home_goals
        h_ts.goals_against += result.away_goals
        h_ts.games_played += 1
        a_ts.goals_for += result.away_goals
        a_ts.goals_against += result.home_goals
        a_ts.games_played += 1

        # Win / loss / draw tallies; the winner is resolved once per match.
        winner = result.winner()
        if winner == home:
            h_ts.games_won += 1
            a_ts.games_lost += 1
        elif winner == away:
            h_ts.games_lost += 1
            a_ts.games_won += 1
        else:
            # Any other value, including None for a drawn match, is a draw.
            h_ts.games_drawn += 1
            a_ts.games_drawn += 1

        # A recorded shoot-out winner marks the match as decided on penalties.
        if result.pen_winner is not None:
            h_ts.pen_games += 1
            a_ts.pen_games += 1

    return list(stats_by_team.values())
# Aggregate the scraped match results into one TeamStats record per team.
teams = parse_team_stats(results)

In [None]:
# Column labels in TeamStats field order: name, games_played, games_won,
# games_lost, games_drawn, goals_for, goals_against, pen_games.
headers = ['Team','GP','W','L','D','GF','GA','PG']
# Size the index from the data instead of assuming exactly 24 teams.
index = list(range(1, len(teams) + 1))
df = pd.DataFrame(teams, index=index)
# Assign the labels directly: set_axis(..., inplace=True) no longer works on
# pandas 2.x, where the `inplace` argument was removed.
df.columns = headers
# Rank by most wins, then most games played, then fewest losses and draws.
df.sort_values(by=['W','GP','L','D'], ascending=[False, False, True, True], inplace=True)
# Replace the index with the 1-based rank after sorting.
df['#'] = range(1, len(df) + 1)
df.set_index(['#'], inplace=True)
# NOTE(review): these renames target fixed ranks, and which team lands on rank
# 17 or 23 depends on how the sort breaks ties. Presumably they shorten two
# display names - confirm the scraped names and rename by name instead.
df.loc[17, 'Team'] = 'Russia'
df.loc[23, 'Team'] = 'N. Macedonia'

In [None]:
# Horizontal bar chart of wins per team, in table order from top to bottom.
team_names, win_counts = df['Team'], df['W']
plt.figure(figsize=(10, 6))
plt.barh(team_names, win_counts, align='center')
plt.yticks(team_names)
# barh draws the first row at the bottom; flip the axis so it ends up on top.
plt.gca().invert_yaxis()
plt.title('Number of Wins by Teams in UEFA Euro 2020', fontdict=font_title)
plt.show()

In [None]:
# Treemap of goals scored, limited to teams with a non-zero win count.
font_legend_u = dict(family='Comic Sans MS', color='#030300', size=15, weight='bold')
df_u = df.loc[df['W'] != 0]
goals = df_u['GF']
# Shade each tile on the Reds colormap, scaled between the lowest and highest
# goal totals among the plotted teams.
norm = colors.Normalize(vmin=min(goals), vmax=max(goals))
color = [cm.Reds(norm(scored)) for scored in goals]

plt.figure(figsize=(10, 6))
squarify.plot(
    sizes=goals,
    label=df_u['Team'],
    alpha=0.7,
    pad=1,
    color=color,
    text_kwargs=font_legend_u,
)
plt.title('Most Goals Scored by Teams in UEFA Euro 2020', fontdict=font_title)
plt.axis('off')
plt.show()