# Parsing the Accumulated html Data

In [63]:
import os
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np


In [64]:
DATA_DIR = "data"

SCORES_DIR = os.path.join(DATA_DIR, "scores_stats") #scores and fixtures data
LEAGUE_DIR = os.path.join(DATA_DIR, "league_stats") #squad standard stats
MISC_DIR = os.path.join(DATA_DIR, "misc_stats")     #miscellaneous stats
PLAYING_DIR = os.path.join(DATA_DIR, "playing_stats")#playing time stats
KEEPER_DIR = os.path.join(DATA_DIR, "keeper_stats")  #keepers stats
SHOOTING_DIR = os.path.join(DATA_DIR, "shooting_stats") #shooting stats

years = list(range(2010,2022))


### Useful stuff:  
In case we could not do it with bs4. We have that header row in the middle of the data frame. Go to Venue column and delete any row that contains "Venue"

``` df = df[~df["Venue"].str.contains("Venue",na=False)]```


``` df.loc[df["Score_home"].isnull()]``` useful to find all the null values in a column

```df.loc[~df["Score"].str.startswith("("),"away_team_score"] = df["Score"].str[:1]``` i.e. ```df.loc[df['column name'] condition, 'new column name'] = 'value if condition is met'```

```df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'}, inplace=True)```

```games.drop(games[games['score']=="S"].index, inplace=True)``` drop rows inplace if the column contains char "S" in the score column

## Parsing the scores and fixtures table
This is the table that shows all the competitions in a season and the scores.

In [65]:
# Sorting out the directory and file name first:
score_fix = os.listdir(SCORES_DIR) #for scores and fixtures
# score-fix has only the file name, we need to join with the path too.
score_fix = [os.path.join(SCORES_DIR,f) for f in score_fix if f.endswith(".html")] # this is a lit but we pass an element of it to the function


def parse_score(score_fix, year):

    # Parsing the html using beautiful soup
    with open (score_fix, encoding= 'unicode_escape') as f: #open file and read
        html = f.read()

    soup = BeautifulSoup(html,'html.parser') # instantiates bs4 using the file and html parser
    soup.find("tr", class_ = "thead").decompose() # get rid o the middle row that repeats the headers.

    [s.decompose() for s in soup.find_all("tr", class_ = "spacer partial_table result_all")] # deleting the table spacers, they give NaN value in pandas

    df = pd.read_html(str(soup))[0]
    # df = pd.read_html(score_fix[0])[0] # not using bs4

    # Cleaning the df, including splitting the score column and removing the irrelevant columns. We add a year column and remove Wk and Day columns. 
    df['year'] = year

    # some scores follow this formate e.g. 	(3) 1–1 (4)	We need to do proper processing for these. 


    # # score column cleaning
    df.loc[df["Score"].str.startswith("("),"home_team_score"] = df["Score"].str[1:2] # if the Score column starts with "(", then grab the second from left char.
    df.loc[~df["Score"].str.startswith("("),"home_team_score"] = df["Score"].str[:1] #If it does not start with "(" then do your normal thing and grab the left most char

    df.loc[df["Score"].str.startswith("("),"away_team_score"] = df["Score"].str[-2:-1] # if the Score column starts with "(", then grab the second from right char.
    df.loc[~df["Score"].str.startswith("("),"away_team_score"] = df["Score"].str[-1:] #If it does not start with "(" then do your normal thing and grab the right most char


    return(df)



## Parsing the Squad Standard Stats (League Stats)
This is available in the league stats directory as tables.


In [66]:
# Sorting out the directory and file name first:
league_stats = os.listdir(LEAGUE_DIR) #for scores and fixtures
# score-fix has only the file name, we need to join with the path too.
league_stats = [os.path.join(LEAGUE_DIR,f) for f in league_stats if f.endswith(".html")] # this is a lit but we pass an element of it to the function


def parse_league(league_stats,year):
    # Parsing the html using beautiful soup
    with open (league_stats, encoding= 'unicode_escape') as f: #open file and read
        html = f.read()

    soup = BeautifulSoup(html,'html.parser') # instantiates bs4 using the file and html parser
    [s.decompose() for s in soup.find_all("tr", class_ = "thead")] # get rid of the middle row that repeats the headers.
    soup.find("tr", class_ = "over_header").decompose() # remove the over_header

    df = pd.read_html(str(soup))[0] # columns like Poss,Min,90s, PreC, PreP are all null. We drop them

    df['year'] = year

    return (df)


## Parsing the Squad Play time stats

In [67]:
 # Sorting out the directory and file name first:
playing_stats = os.listdir(PLAYING_DIR) #for scores and fixtures
# score-fix has only the file name, we need to join with the path too.
playing_stats = [os.path.join(PLAYING_DIR,f) for f in playing_stats if f.endswith(".html")] # this is a lit but we pass an element of it to the function

def parse_playing(playing_stats,year):   
    # Parsing the html using beautiful soup
    with open (playing_stats, encoding= 'unicode_escape') as f: #open file and read
        html = f.read()

    soup = BeautifulSoup(html,'html.parser') # instantiates bs4 using the file and html parser
    [s.decompose() for s in soup.find_all("tr", class_ = "thead")] # get rid of the middle row that repeats the headers.
    soup.find("tr", class_ = "over_header").decompose() # remove the over_header

    df = pd.read_html(str(soup))[0] # columns like Poss,Min,90s, PreC, PreP are all null. We drop them
    df.dropna(axis=1, inplace=True) #dropping 7 columns because NaN
    df['year'] = year
    return df


## Parsing the shooting stats 

In [68]:
# Sorting out the directory and file name first:
shooting_stats = os.listdir(SHOOTING_DIR) #for scores and fixtures
# score-fix has only the file name, we need to join with the path too.
shooting_stats = [os.path.join(SHOOTING_DIR,f) for f in shooting_stats if f.endswith(".html")]# this is a lit but we pass an element of it to the function

def parse_shooting(shooting_stats,year):
    # Parsing the html using beautiful soup
    with open (shooting_stats, encoding= 'unicode_escape') as f: #open file and read
        html = f.read()

    soup = BeautifulSoup(html,'html.parser') # instantiates bs4 using the file and html parser
    [s.decompose() for s in soup.find_all("tr", class_ = "thead")] # get rid of the middle row that repeats the headers.
    soup.find("tr", class_ = "over_header").decompose() # remove the over_header

    df = pd.read_html(str(soup))[0] # columns like Poss,Min,90s, PreC, PreP are all null. We drop them
    df.dropna(axis=1, inplace=True) #dropping 7 columns because NaN
    df['year'] = year
    return df

## Parsing the keeper stats

In [69]:
# Sorting out the directory and file name first:
keeper_stats = os.listdir(KEEPER_DIR) #for scores and fixtures
# score-fix has only the file name, we need to join with the path too.
keeper_stats = [os.path.join(KEEPER_DIR,f) for f in keeper_stats if f.endswith(".html")] # this is a lit but we pass an element of it to the function

def parse_keeper(keeper_stats,year):
    # Parsing the html using beautiful soup
    with open (keeper_stats[0], encoding= 'unicode_escape') as f: #open file and read
        html = f.read()

    soup = BeautifulSoup(html,'html.parser') # instantiates bs4 using the file and html parser
    [s.decompose() for s in soup.find_all("tr", class_ = "thead")] # get rid of the middle row that repeats the headers.
    soup.find("tr", class_ = "over_header").decompose() # remove the over_header

    df = pd.read_html(str(soup))[0] # columns like Poss,Min,90s, PreC, PreP are all null. We drop them
    df.dropna(axis=1, inplace=True) #dropping 7 columns because NaN
    df['year'] = year
    return df

## Parsing the Miscellaneous Stats

In [70]:
# Sorting out the directory and file name first:
misc_stats = os.listdir(MISC_DIR) #for scores and fixtures
# score-fix has only the file name, we need to join with the path too.
misc_stats = [os.path.join(MISC_DIR,f) for f in misc_stats if f.endswith(".html")]

def parse_misc(misc_stats, year):
    # Parsing the html using beautiful soup
    with open (misc_stats[0], encoding= 'unicode_escape') as f: #open file and read
        html = f.read()

    soup = BeautifulSoup(html,'html.parser') # instantiates bs4 using the file and html parser
    [s.decompose() for s in soup.find_all("tr", class_ = "thead")] # get rid of the middle row that repeats the headers.
    soup.find("tr", class_ = "over_header").decompose() # remove the over_header

    df = pd.read_html(str(soup))[0] # columns like Poss,Min,90s, PreC, PreP are all null. We drop them
    df.dropna(axis=1, inplace=True) #dropping 7 columns because NaN
    df['year'] = year
    return df 

# combining the team statistics

In [71]:
years = list(range(2010,2022))

In [72]:
league_summaries = [] # to append each year's stats
playing_summaries = []
shooting_summaries = []
keeper_summaries = []
misc_summaries = []

for year in years: #getting summary tables for all the yers

    league_df = parse_league(league_stats[years.index(year)], year)
    league_df['year'] = year
    league_summaries.append(league_df)
    

    misc_df = parse_league(misc_stats[years.index(year)], year)
    misc_summaries.append(misc_df)

    keeper_df = parse_league(keeper_stats[years.index(year)], year)
    keeper_summaries.append(keeper_df)

    shooting_df = parse_league(shooting_stats[years.index(year)], year)
    shooting_summaries.append(shooting_df)

    playing_df = parse_league(playing_stats[years.index(year)], year)
    playing_summaries.append(playing_df)


# concatenating and cleaning summary tables for each type of statistic

league_summary = pd.concat(league_summaries, axis = 0)
keeper_summary = pd.concat(keeper_summaries, axis = 0)
shooting_summary = pd.concat(shooting_summaries, axis = 0)
playing_summary = pd.concat(playing_summaries, axis = 0)
misc_summary = pd.concat(misc_summaries, axis = 0)



Out of the six tables we have, five of them are team statistics, and the score_summary is game based. Since the goal is to have stats in the game dataframe, we first combine all the stats per game per year (season) and then merge that with the score_summary, on home team, away team and year. From 2010 to 2022, the row count for all dataframes is 384 (makes sense as 32 teams per year * 12 years). SO the concatenation should go well. 

In [73]:
combined_stats = pd.concat([league_summary,playing_summary,keeper_summary,shooting_summary,misc_summary], axis = 1)

combined_stats.shape

(384, 120)

In [74]:
pd.DataFrame(combined_stats.isnull().sum()).T

Unnamed: 0,Squad,# Pl,Age,Poss,MP,Starts,Min,90s,Gls,Ast,...,Int,TklW,PKwon,PKcon,OG,year,Recov,Won,Lost,Won%
0,0,0,0,128,0,0,128,128,0,0,...,192,192,192,192,192,0,224,224,224,224


In [75]:
combined_stats.dropna(axis = 1, inplace=True) # We drop the null values because the constitute more than a third of the values in all the columns. 
combined_stats.shape

#combined_stats.rename(columns={'Squad': 'team', 'oldName2': 'newName2'}, inplace=True)

(384, 59)

In [76]:
# dropping duplicate columns such as Squad, year, etc. 

combined_stats = combined_stats.T.drop_duplicates().T

combined_stats.columns

Index(['Squad', '# Pl', 'Age', 'MP', 'Starts', 'Gls', 'Ast', 'G+A', 'G-PK',
       'PK', 'PKatt', 'Gls.1', 'Ast.1', 'G+A.1', 'G-PK.1', 'G+A-PK', 'year',
       'Min%', 'Subs', 'Mn/Sub', 'PPM', 'onG', 'onGA', '+/-', '# Pl', 'Min',
       'GA', 'GA90', 'SoTA', 'Saves', 'Save%', 'W', 'D', 'L', 'CS', 'CS%',
       'SoT', 'SoT/90', 'G/SoT', 'Fls'],
      dtype='object')

Encountered an issue. The namings for the teams are not consistent. team, team_opp and Squad have different way of naming. Otherwise it would be a nice simple left merge (or rather two left merges, as we want the opposing team stats too). So we make sure the team names are consistent across all the tables. 

In [77]:
combined_stats['Squad'] = combined_stats['Squad'].str[3:].str.lstrip() #The naming is not consistent, so remove the first 3 chars and then remove white spaces (prefixes such as eng, and sct have three letters as opposed ot two)

In [78]:
combined_stats['Squad'].unique()

array(['Ajax', 'Arsenal', 'Auxerre', 'Barcelona', 'Basel',
       'Bayern Munich', 'Benfica', 'Braga', 'Bursaspor', 'CFR Cluj',
       'Chelsea', 'FC Copenhagen', 'Hapoel Tel Aviv', 'Inter', 'Lyon',
       'Manchester Utd', 'Marseille', 'Milan', 'MÅ\xa0K Å½ilina',
       'Panathinaikos', 'Partizan', 'Rangers', 'Real Madrid', 'Roma',
       'Rubin Kazan', 'Schalke 04', 'Shakhtar', 'Spartak Moscow',
       'Tottenham', 'Twente', 'Valencia', 'Werder Bremen', 'APOEL FC',
       'BATE Borisov', 'CSKA Moscow', 'Dinamo Zagreb', 'Dortmund', 'Genk',
       'Leverkusen', 'Lille', 'Manchester City', 'Napoli', 'Olympiacos',
       'OÈ\x9belul GalaÈ\x9bi', 'Porto', 'Trabzonspor',
       'Viktoria PlzeÅ\x88', 'Villarreal', 'Zenit', 'Anderlecht',
       'Celtic', 'Dynamo Kyiv', 'Galatasaray', 'Juventus', 'MÃ¡laga',
       'Montpellier', 'NordsjÃ¦lland', 'Paris S-G', 'AtlÃ©tico Madrid',
       'Austria Wien', 'Real Sociedad', 'Steaua', 'Athletic Club',
       'Liverpool', 'Ludogorets', 'MalmÃ¶', 'Mona

In [79]:
combined_stats

Unnamed: 0,Squad,# Pl,Age,MP,Starts,Gls,Ast,G+A,G-PK,PK,...,Save%,W,D,L,CS,CS%,SoT,SoT/90,G/SoT,Fls
0,Ajax,17,24.2,10,66,6,4,10,6,0,...,77.8,2,5,3,1,10.0,27,2.7,0.22,81
1,Arsenal,25,24.8,8,88,20,13,33,17,3,...,74.4,5,0,3,1,12.5,46,5.75,0.37,130
2,Auxerre,20,27.3,8,66,3,2,5,3,0,...,58.6,1,1,6,8,100.0,20,2.5,0.15,76
3,Barcelona,25,26.6,13,143,30,24,54,28,2,...,75.7,8,4,1,5,38.5,98,7.54,0.29,134
4,Basel,18,25.7,10,66,8,6,14,8,0,...,60.7,5,1,4,1,10.0,32,3.2,0.25,94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,Sporting CP,27,26.7,8,88,14,10,24,12,2,...,55.6,3,1,4,2,25.0,30,3.75,0.4,85
28,Villarreal,25,28.0,12,132,20,16,36,18,2,...,70.9,5,3,4,3,25.0,44,3.67,0.41,115
29,Wolfsburg,21,26.1,6,66,5,4,9,5,0,...,72.7,1,2,3,1,16.7,11,1.83,0.45,89
30,Young Boys,24,25.6,6,66,7,4,11,7,0,...,62.5,1,2,3,0,0.0,25,4.17,0.28,87


# Parsing Game Data from Scores and Fixtures

In [80]:
games_list = []


for year in years:
    
    game = parse_score(score_fix[years.index(year)], year) # passing the index of year to correctly get the stats from html files
    game['year'] = year
    games_list.append(game)

games = pd.concat(games_list, axis = 0)

In [81]:
games.drop(['Notes','Score', 'Match Report', "Attendance","Round","Time","Referee","Wk","Day","Venue","xG","xG.1"], axis = 1, inplace=True) # need data rich numeric values. 
games.drop(games[games['Date'].str.len() < 9].index, inplace = True) # 3 rouge rows with no data in them. Dropping those. 
games['Date'] = pd.to_datetime(games['Date'])

#renaming columns
games.rename(columns={'Home': 'team', 'Away': 'team_opp', 'home_team_score':'score', 'away_team_score':'score_opp'}, inplace=True)

# changing scores to numeric as opposed to string
games.drop(games[games['score']=="S"].index, inplace=True) #cleaning the rows where score is not a number
games[['score','score_opp']] = games[['score','score_opp']].astype("int").copy()

# using string methods on the team name column to get consistent team names across all the columns. 
games['team'] = games['team'].str[:-3].str.rstrip() #making all the team names consistent to allow merging later
games['team_opp'] = games['team_opp'].str[3:].str.lstrip() #removing spaces form the left side  because of the naming inconsistency


In [82]:
#Sorting out the home and away flags
games["home"] = 1


In [83]:
games_opp = games.copy() # need to make each game into two rows for home and away teams, and we flip team and opposing team. Also copy otherwise they BOTH change!
games_opp.rename(columns={'team': 'team_opp', 'team_opp': 'team', "score":"score_opp","score_opp":"score"}, inplace=True)
games_opp["home"] =0

In [84]:
games[0:11]

Unnamed: 0,Date,team,team_opp,year,score,score_opp,home
0,2010-09-14,Lyon,Schalke 04,2010,1,0,1
1,2010-09-14,Manchester Utd,Rangers,2010,0,0,1
2,2010-09-14,Bursaspor,Valencia,2010,0,4,1
3,2010-09-14,Benfica,Hapoel Tel Aviv,2010,2,0,1
4,2010-09-14,FC Copenhagen,Rubin Kazan,2010,1,0,1
5,2010-09-14,Barcelona,Panathinaikos,2010,5,1,1
6,2010-09-14,Twente,Inter,2010,2,2,1
7,2010-09-14,Werder Bremen,Tottenham,2010,2,2,1
8,2010-09-15,Arsenal,Braga,2010,6,0,1
9,2010-09-15,Real Madrid,Ajax,2010,2,0,1


In [85]:
games_opp[0:11]

Unnamed: 0,Date,team_opp,team,year,score_opp,score,home
0,2010-09-14,Lyon,Schalke 04,2010,1,0,0
1,2010-09-14,Manchester Utd,Rangers,2010,0,0,0
2,2010-09-14,Bursaspor,Valencia,2010,0,4,0
3,2010-09-14,Benfica,Hapoel Tel Aviv,2010,2,0,0
4,2010-09-14,FC Copenhagen,Rubin Kazan,2010,1,0,0
5,2010-09-14,Barcelona,Panathinaikos,2010,5,1,0
6,2010-09-14,Twente,Inter,2010,2,2,0
7,2010-09-14,Werder Bremen,Tottenham,2010,2,2,0
8,2010-09-15,Arsenal,Braga,2010,6,0,0
9,2010-09-15,Real Madrid,Ajax,2010,2,0,0


In [86]:
combined_games = pd.concat([games, games_opp],axis = 0) #putting the dataframes together 

combined_games.sort_values(by="Date", inplace= True) # we now have datetime object. 

In [87]:
# we need to add a result column, to show won, lost, and draw. (win as a flag = 1,0,2 for won, lost and drew respectively). We use numpy for this
conditions = [(combined_games['score']>combined_games['score_opp']),
              (combined_games['score']<combined_games['score_opp']),
              (combined_games['score']==combined_games['score_opp']),]

values = [1,0,2] # 1 is team wn, 0 is team lost, 2 is a draw

combined_games["result"] = np.select(conditions,values)

In [88]:
combined_games.head(20)

Unnamed: 0,Date,team,team_opp,year,score,score_opp,home,result
0,2010-09-14,Lyon,Schalke 04,2010,1,0,1,1
5,2010-09-14,Panathinaikos,Barcelona,2010,1,5,0,0
4,2010-09-14,Rubin Kazan,FC Copenhagen,2010,0,1,0,0
0,2010-09-14,Schalke 04,Lyon,2010,0,1,0,0
3,2010-09-14,Hapoel Tel Aviv,Benfica,2010,0,2,0,0
2,2010-09-14,Valencia,Bursaspor,2010,4,0,0,1
1,2010-09-14,Rangers,Manchester Utd,2010,0,0,0,2
7,2010-09-14,Tottenham,Werder Bremen,2010,2,2,0,2
6,2010-09-14,Inter,Twente,2010,2,2,0,2
2,2010-09-14,Bursaspor,Valencia,2010,0,4,1,0


In [89]:
combined_stats['year']

0     2010
1     2010
2     2010
3     2010
4     2010
      ... 
27    2021
28    2021
29    2021
30    2021
31    2021
Name: year, Length: 384, dtype: object

# Merging the games data frame with the combines statistics

Game table is where the scores live
combined stats contains the statistics per season for each team. 

In [90]:
df_temp = combined_games.merge(combined_stats, how='left', left_on=['year','team'],right_on=["year","Squad"])

In [91]:
df_temp

Unnamed: 0,Date,team,team_opp,year,score,score_opp,home,result,Squad,# Pl,...,Save%,W,D,L,CS,CS%,SoT,SoT/90,G/SoT,Fls
0,2010-09-14,Lyon,Schalke 04,2010,1,0,1,1,Lyon,20,...,68.2,3,2,3,2,25.0,44,5.5,0.25,105
1,2010-09-14,Panathinaikos,Barcelona,2010,1,5,0,0,Panathinaikos,21,...,67.5,0,2,4,2,33.3,16,2.67,0.13,81
2,2010-09-14,Rubin Kazan,FC Copenhagen,2010,0,1,0,0,Rubin Kazan,20,...,77.8,1,3,2,3,50.0,21,3.5,0.0,91
3,2010-09-14,Schalke 04,Lyon,2010,0,1,0,0,Schalke 04,27,...,78.1,5,4,3,3,25.0,65,5.42,0.32,195
4,2010-09-14,Hapoel Tel Aviv,Benfica,2010,0,2,0,0,Hapoel Tel Aviv,19,...,79.6,4,4,4,2,16.7,28,2.33,0.21,76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2917,2022-05-03,Villarreal,Liverpool,2021,2,3,1,0,Villarreal,25,...,70.9,5,3,4,3,25.0,44,3.67,0.41,115
2918,2022-05-04,Manchester City,Real Madrid,2021,1,3,0,0,Manchester City,26,...,58.8,7,2,3,4,33.3,65,5.27,0.4,126
2919,2022-05-04,Real Madrid,Manchester City,2021,3,1,1,1,Real Madrid,25,...,80.6,9,0,4,5,38.5,66,4.83,0.38,120
2920,2022-05-28,Liverpool,Real Madrid,2021,0,1,1,0,Liverpool,28,...,50.0,10,1,2,4,30.8,79,6.08,0.34,151


In [92]:
#another merge for the opposing team stats: 

combined_stats_opp = combined_stats
combined_stats_opp.columns += "_opp"

In [93]:
df = df_temp.merge(combined_stats_opp, how='left', left_on=['year','team_opp'],right_on=["year_opp","Squad_opp"])

In [94]:
df # and this is the final product after all the cleaning, 1461 games, where we have the scores, and stats for the team and the opposing team. 

Unnamed: 0,Date,team,team_opp,year,score,score_opp,home,result,Squad,# Pl,...,Save%_opp,W_opp,D_opp,L_opp,CS_opp,CS%_opp,SoT_opp,SoT/90_opp,G/SoT_opp,Fls_opp
0,2010-09-14,Lyon,Schalke 04,2010,1,0,1,1,Lyon,20,...,78.1,5,4,3,3,25.0,65,5.42,0.32,195
1,2010-09-14,Panathinaikos,Barcelona,2010,1,5,0,0,Panathinaikos,21,...,75.7,8,4,1,5,38.5,98,7.54,0.29,134
2,2010-09-14,Rubin Kazan,FC Copenhagen,2010,0,1,0,0,Rubin Kazan,20,...,82.5,3,5,4,3,25.0,21,1.75,0.24,114
3,2010-09-14,Schalke 04,Lyon,2010,0,1,0,0,Schalke 04,27,...,68.2,3,2,3,2,25.0,44,5.5,0.25,105
4,2010-09-14,Hapoel Tel Aviv,Benfica,2010,0,2,0,0,Hapoel Tel Aviv,19,...,50.0,2,0,4,1,16.7,28,4.67,0.25,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2917,2022-05-03,Villarreal,Liverpool,2021,2,3,1,0,Villarreal,25,...,50.0,10,1,2,4,30.8,79,6.08,0.34,151
2918,2022-05-04,Manchester City,Real Madrid,2021,1,3,0,0,Manchester City,26,...,80.6,9,0,4,5,38.5,66,4.83,0.38,120
2919,2022-05-04,Real Madrid,Manchester City,2021,3,1,1,1,Real Madrid,25,...,58.8,7,2,3,4,33.3,65,5.27,0.4,126
2920,2022-05-28,Liverpool,Real Madrid,2021,0,1,1,0,Liverpool,28,...,80.6,9,0,4,5,38.5,66,4.83,0.38,120


In [95]:
df[["Date","team","team_opp","Squad","Squad_opp"]][0:20] # Checking to see if we have done this correctly. We now have twice as many game rows. 

Unnamed: 0,Date,team,team_opp,Squad,Squad_opp
0,2010-09-14,Lyon,Schalke 04,Lyon,Schalke 04
1,2010-09-14,Panathinaikos,Barcelona,Panathinaikos,Barcelona
2,2010-09-14,Rubin Kazan,FC Copenhagen,Rubin Kazan,FC Copenhagen
3,2010-09-14,Schalke 04,Lyon,Schalke 04,Lyon
4,2010-09-14,Hapoel Tel Aviv,Benfica,Hapoel Tel Aviv,Benfica
5,2010-09-14,Valencia,Bursaspor,Valencia,Bursaspor
6,2010-09-14,Rangers,Manchester Utd,Rangers,Manchester Utd
7,2010-09-14,Tottenham,Werder Bremen,Tottenham,Werder Bremen
8,2010-09-14,Inter,Twente,Inter,Twente
9,2010-09-14,Bursaspor,Valencia,Bursaspor,Valencia
