# Parsing the Accumulated HTML Data

In [130]:
import os
from io import StringIO

import pandas as pd
from bs4 import BeautifulSoup


In [131]:
# Root folder that holds every scraped HTML page.
DATA_DIR = "data"

# One sub-folder of DATA_DIR per kind of table.
(
    SCORES_DIR,    # scores and fixtures data
    LEAGUE_DIR,    # squad standard stats
    MISC_DIR,      # miscellaneous stats
    PLAYING_DIR,   # playing time stats
    KEEPER_DIR,    # keepers stats
    SHOOTING_DIR,  # shooting stats
) = (
    os.path.join(DATA_DIR, sub)
    for sub in (
        "scores_stats",
        "league_stats",
        "misc_stats",
        "playing_stats",
        "keeper_stats",
        "shooting_stats",
    )
)

# Season labels 2010 through 2021 inclusive.
years = [*range(2010, 2022)]


### Useful stuff:  
If the repeated header rows cannot be removed with bs4, they can be filtered out in pandas instead:
`df = df[~df["Venue"].str.contains("Venue",na=False)]`
The header row is repeated in the middle of the data frame, so this goes to the Venue column and deletes every row that contains the text "Venue".

`df.loc[df["Score_home"].isnull()]` is useful for finding all the rows that have a null value in a given column.

## Parsing the scores and fixtures table
This is the table that lists all the matches played in a season together with their scores.

In [176]:
# Collect the saved scores-and-fixtures pages. os.listdir() returns bare file
# names in arbitrary order, so prefix the directory and sort the result: that
# makes "the first file" the same file on every run.
score_fix = sorted(
    os.path.join(SCORES_DIR, f) for f in os.listdir(SCORES_DIR) if f.endswith(".html")
)

# Read the page as raw bytes and let BeautifulSoup detect the encoding.
# Decoding with 'unicode_escape' garbles every non-ASCII character (accented
# team names, the dash inside the score) and interprets backslashes in the page.
with open(score_fix[0], "rb") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

# Remove every repeated header row and every spacer row before pandas sees the
# table; both would otherwise show up as junk / NaN rows in the data frame.
for row in soup.find_all("tr", class_="thead"):
    row.decompose()
for row in soup.find_all("tr", class_="spacer partial_table result_all"):
    row.decompose()

# read_html expects a file-like object; passing a literal HTML string is deprecated.
df = pd.read_html(StringIO(str(soup)))[0]
# df = pd.read_html(score_fix[0])[0] # not using bs4

# NOTE(review): assumes the first file in sorted order is the years[0] season;
# confirm against the file naming scheme.
df["year"] = years[0]

# Split "home<sep>away" into two numeric columns. The pattern takes the two
# numbers on either side of the separator, so it copes with double-digit scores
# and ignores parenthesised numbers around the score. Rows without a score stay
# missing (<NA>) instead of raising, hence the nullable Int64 dtype.
goals = df["Score"].str.extract(r"(\d+)\s*[^\d\s()]+\s*(\d+)")
df["home_team_score"] = pd.to_numeric(goals[0]).astype("Int64")
df["away_team_score"] = pd.to_numeric(goals[1]).astype("Int64")

# Keep only the teams, the year and the two score columns.
df = df.drop(
    columns=[
        "Score", "Notes", "Match Report", "Attendance", "Round", "Time",
        "Referee", "Wk", "Day", "Date", "Venue",
    ]
)
df.head(10)

Unnamed: 0,Home,Away,year,home_team_score,away_team_score
0,Lyon fr,de Schalke 04,2010,1,0
1,Manchester Utd eng,sct Rangers,2010,0,0
2,Bursaspor tr,es Valencia,2010,0,4
3,Benfica pt,il Hapoel Tel Aviv,2010,2,0
4,FC Copenhagen dk,ru Rubin Kazan,2010,1,0
5,Barcelona es,gr Panathinaikos,2010,5,1
6,Twente nl,it Inter,2010,2,2
7,Werder Bremen de,eng Tottenham,2010,2,2
8,Arsenal eng,pt Braga,2010,6,0
9,Real Madrid es,nl Ajax,2010,2,0


## Parsing the Squad Standard Stats
This is available in the league stats directory as tables.


In [177]:
# Collect the saved squad standard stats pages. os.listdir() returns bare file
# names in arbitrary order, so prefix the directory and sort the result: that
# makes "the first file" the same file on every run.
league_stats = sorted(
    os.path.join(LEAGUE_DIR, f) for f in os.listdir(LEAGUE_DIR) if f.endswith(".html")
)

# Read the page as raw bytes and let BeautifulSoup detect the encoding.
# Decoding with 'unicode_escape' garbles every non-ASCII character and
# interprets backslashes in the page.
with open(league_stats[0], "rb") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

# Remove the header rows that are repeated in the middle of the table.
for row in soup.find_all("tr", class_="thead"):
    row.decompose()

# Remove the grouping row above the real header so pandas reads single-level
# column names. Guarded because find() returns None when the row is absent.
over_header = soup.find("tr", class_="over_header")
if over_header is not None:
    over_header.decompose()

# read_html expects a file-like object; passing a literal HTML string is deprecated.
df = pd.read_html(StringIO(str(soup)))[0]

# Poss, Min, 90s, PrgC and PrgP are all null in this table, so drop them.
df = df.drop(columns=["Poss", "Min", "90s", "PrgC", "PrgP"])
df.head(10)

Unnamed: 0,Squad,# Pl,Age,MP,Starts,Gls,Ast,G+A,G-PK,PK,PKatt,CrdY,CrdR,Gls.1,Ast.1,G+A.1,G-PK.1,G+A-PK
0,nl Ajax,17,24.2,10,66,6,4,10,6,0,0,10,1,0.6,0.4,1.0,0.6,1.0
1,eng Arsenal,25,24.8,8,88,20,13,33,17,3,4,13,2,2.5,1.62,4.12,2.12,3.75
2,fr Auxerre,20,27.3,8,66,3,2,5,3,0,0,8,2,0.37,0.25,0.62,0.37,0.62
3,es Barcelona,25,26.6,13,143,30,24,54,28,2,3,14,1,2.31,1.85,4.15,2.15,4.0
4,ch Basel,18,25.7,10,66,8,6,14,8,0,0,7,1,0.8,0.6,1.4,0.8,1.4
5,de Bayern Munich,23,26.4,8,88,17,11,28,16,1,1,17,0,2.12,1.37,3.5,2.0,3.38
6,pt Benfica,19,25.6,6,66,7,6,13,7,0,0,12,1,1.17,1.0,2.17,1.17,2.17
7,pt Braga,21,27.6,10,66,5,4,9,5,0,0,10,0,0.5,0.4,0.9,0.5,0.9
8,tr Bursaspor,21,28.0,6,66,2,2,4,2,0,0,9,0,0.33,0.33,0.67,0.33,0.67
9,ro CFR Cluj,22,26.5,6,66,6,6,12,6,0,0,12,1,1.0,1.0,2.0,1.0,2.0


## Parsing the Squad Play time stats

In [178]:
# Collect the saved playing time stats pages. os.listdir() returns bare file
# names in arbitrary order, so prefix the directory and sort the result: that
# makes "the first file" the same file on every run.
playing_stats = sorted(
    os.path.join(PLAYING_DIR, f) for f in os.listdir(PLAYING_DIR) if f.endswith(".html")
)

# Read the page as raw bytes and let BeautifulSoup detect the encoding.
# Decoding with 'unicode_escape' garbles every non-ASCII character and
# interprets backslashes in the page.
with open(playing_stats[0], "rb") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

# Remove the header rows that are repeated in the middle of the table.
for row in soup.find_all("tr", class_="thead"):
    row.decompose()

# Remove the grouping row above the real header so pandas reads single-level
# column names. Guarded because find() returns None when the row is absent.
over_header = soup.find("tr", class_="over_header")
if over_header is not None:
    over_header.decompose()

# read_html expects a file-like object; passing a literal HTML string is deprecated.
df = pd.read_html(StringIO(str(soup)))[0]

# Drop only the columns that hold no data at all. Plain dropna(axis=1) would
# also discard a useful column as soon as a single squad has one missing value.
df = df.dropna(axis=1, how="all")
df.head(10)

Unnamed: 0,Squad,# Pl,Age,MP,Min%,Starts,Subs,Mn/Sub,PPM,onG,onGA,+/-
0,nl Ajax,17,24.2,10,100,66,14,0,1.1,13,16,-3
1,eng Arsenal,25,24.8,8,100,88,23,0,1.88,21,11,10
2,fr Auxerre,20,27.3,8,100,66,16,0,0.5,5,13,-8
3,es Barcelona,25,26.6,13,100,143,33,0,2.15,30,9,21
4,ch Basel,18,25.7,10,100,66,15,0,1.6,17,12,5
5,de Bayern Munich,23,26.4,8,100,88,15,0,2.38,19,9,10
6,pt Benfica,19,25.6,6,100,66,18,0,1.0,7,12,-5
7,pt Braga,21,27.6,10,100,66,18,0,1.8,14,16,-2
8,tr Bursaspor,21,28.0,6,100,66,18,0,0.17,2,16,-14
9,ro CFR Cluj,22,26.5,6,100,66,17,0,0.67,6,12,-6


## Parsing the shooting stats 

In [185]:
# Collect the saved shooting stats pages. os.listdir() returns bare file names
# in arbitrary order, so prefix the directory and sort the result: that makes
# "the first file" the same file on every run.
shooting_stats = sorted(
    os.path.join(SHOOTING_DIR, f) for f in os.listdir(SHOOTING_DIR) if f.endswith(".html")
)

# Read the page as raw bytes and let BeautifulSoup detect the encoding.
# Decoding with 'unicode_escape' garbles every non-ASCII character and
# interprets backslashes in the page.
with open(shooting_stats[0], "rb") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

# Remove the header rows that are repeated in the middle of the table.
for row in soup.find_all("tr", class_="thead"):
    row.decompose()

# Remove the grouping row above the real header so pandas reads single-level
# column names. Guarded because find() returns None when the row is absent.
over_header = soup.find("tr", class_="over_header")
if over_header is not None:
    over_header.decompose()

# read_html expects a file-like object; passing a literal HTML string is deprecated.
df = pd.read_html(StringIO(str(soup)))[0]

# Drop only the columns that hold no data at all. Plain dropna(axis=1) would
# also discard a useful column as soon as a single squad has one missing value.
df = df.dropna(axis=1, how="all")
df.head(10)

Unnamed: 0,Squad,# Pl,Gls,SoT,SoT/90,G/SoT,PK,PKatt
0,nl Ajax,17,6,27,2.7,0.22,0,0
1,eng Arsenal,25,20,46,5.75,0.37,3,4
2,fr Auxerre,20,3,20,2.5,0.15,0,0
3,es Barcelona,25,30,98,7.54,0.29,2,3
4,ch Basel,18,8,32,3.2,0.25,0,0
5,de Bayern Munich,23,17,56,7.0,0.29,1,1
6,pt Benfica,19,7,28,4.67,0.25,0,0
7,pt Braga,21,5,25,2.5,0.2,0,0
8,tr Bursaspor,21,2,20,3.33,0.1,0,0
9,ro CFR Cluj,22,6,30,5.0,0.2,0,0


## Parsing the keeper stats

In [188]:
# Collect the saved keeper stats pages. os.listdir() returns bare file names in
# arbitrary order, so prefix the directory and sort the result: that makes
# "the first file" the same file on every run.
keeper_stats = sorted(
    os.path.join(KEEPER_DIR, f) for f in os.listdir(KEEPER_DIR) if f.endswith(".html")
)

# Read the page as raw bytes and let BeautifulSoup detect the encoding.
# Decoding with 'unicode_escape' garbles every non-ASCII character and
# interprets backslashes in the page.
with open(keeper_stats[0], "rb") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

# Remove the header rows that are repeated in the middle of the table.
for row in soup.find_all("tr", class_="thead"):
    row.decompose()

# Remove the grouping row above the real header so pandas reads single-level
# column names. Guarded because find() returns None when the row is absent.
over_header = soup.find("tr", class_="over_header")
if over_header is not None:
    over_header.decompose()

# read_html expects a file-like object; passing a literal HTML string is deprecated.
df = pd.read_html(StringIO(str(soup)))[0]

# Drop only the columns that hold no data at all. Plain dropna(axis=1) would
# also discard a useful column as soon as a single squad has one missing value.
df = df.dropna(axis=1, how="all")
df.head(10)

Unnamed: 0,Squad,# Pl,MP,Starts,Min,GA,GA90,SoTA,Saves,Save%,W,D,L,CS,CS%
0,nl Ajax,1,10,10,540,10,1.67,45,35,77.8,2,5,3,1,10.0
1,eng Arsenal,3,8,8,720,11,1.37,43,32,74.4,5,0,3,1,12.5
2,fr Auxerre,1,8,8,540,12,2.0,29,17,58.6,1,1,6,8,100.0
3,es Barcelona,2,13,13,1170,9,0.69,37,28,75.7,8,4,1,5,38.5
4,ch Basel,1,10,10,540,11,1.83,28,17,60.7,5,1,4,1,10.0
5,de Bayern Munich,2,8,8,720,9,1.13,46,37,80.4,6,1,1,4,50.0
6,pt Benfica,1,6,6,540,12,2.0,24,12,50.0,2,0,4,1,16.7
7,pt Braga,2,10,10,540,11,1.83,30,19,63.3,6,0,4,3,30.0
8,tr Bursaspor,2,6,6,540,16,2.67,36,20,55.6,0,1,5,0,0.0
9,ro CFR Cluj,2,6,6,540,12,2.0,46,34,73.9,1,1,4,0,0.0


## Parsing the Miscellaneous Stats

In [189]:
# Collect the saved miscellaneous stats pages. os.listdir() returns bare file
# names in arbitrary order, so prefix the directory and sort the result: that
# makes "the first file" the same file on every run.
misc_stats = sorted(
    os.path.join(MISC_DIR, f) for f in os.listdir(MISC_DIR) if f.endswith(".html")
)

# Read the page as raw bytes and let BeautifulSoup detect the encoding.
# Decoding with 'unicode_escape' garbles every non-ASCII character and
# interprets backslashes in the page.
with open(misc_stats[0], "rb") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

# Remove the header rows that are repeated in the middle of the table.
for row in soup.find_all("tr", class_="thead"):
    row.decompose()

# Remove the grouping row above the real header so pandas reads single-level
# column names. Guarded because find() returns None when the row is absent.
over_header = soup.find("tr", class_="over_header")
if over_header is not None:
    over_header.decompose()

# read_html expects a file-like object; passing a literal HTML string is deprecated.
df = pd.read_html(StringIO(str(soup)))[0]

# Drop only the columns that hold no data at all. Plain dropna(axis=1) would
# also discard a useful column as soon as a single squad has one missing value.
df = df.dropna(axis=1, how="all")
df.head(10)

Unnamed: 0,Squad,# Pl,CrdY,CrdR,Fls
0,nl Ajax,17,10,1,81
1,eng Arsenal,25,13,2,130
2,fr Auxerre,20,8,2,76
3,es Barcelona,25,14,1,134
4,ch Basel,18,7,1,94
5,de Bayern Munich,23,17,0,113
6,pt Benfica,19,12,1,102
7,pt Braga,21,10,0,102
8,tr Bursaspor,21,9,0,76
9,ro CFR Cluj,22,12,1,101


In [191]:
# Season labels 2010 through 2021 inclusive (the range end is exclusive).
years = [*range(2010, 2022)]

In [206]:
# Scratch check: look up the position of a season inside the `years` list.
a = years.index(2010)

In [208]:
# Scratch check: confirm what type list.index() returned for `a`.
type(a)

int