In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import csv
import numpy as np

import os

from unidecode import unidecode

In [None]:
def clean_space(x):
    """Remove whitespace-delimited, all-lowercase words from ``x``.

    A space is prepended before matching so that a lowercase word at the very
    start of the text (the ``eng`` in ``"eng ENG"``) is delimited on both
    sides and is removed as well.

    Args:
        x: Text to clean (must be a ``str``).

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    padded = ' ' + x
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    cleaned = re.sub(r'\s[a-z]+\s', '', padded)
    # If nothing matched at the start, the padding space is still present;
    # strip it so a value such as "ENG" is not returned as " ENG".
    return cleaned.strip()

def convert_numeric_cols(df):
    """Convert every column that parses as numeric; leave the others untouched.

    The frame is modified in place and is also returned for convenience.

    Args:
        df: DataFrame whose columns should be converted where possible.

    Returns:
        The same ``df`` object.
    """
    for column in df:
        try:
            df[column] = pd.to_numeric(df[column])
        except (ValueError, TypeError):
            # The column holds values that are not numbers (e.g. names);
            # keep it as it is. Anything else (KeyboardInterrupt, genuine
            # bugs) is no longer swallowed.
            continue

    return df

def standardize(string):
    """Turn a snake_case label into a space-separated, title-cased one."""
    return string.replace('_', ' ').title()

def standarize_columns(df):
    """Rename all columns of ``df`` in place through ``standardize``.

    Returns:
        The same ``df`` object, with its column labels rewritten.
    """
    label_map = {label: standardize(label) for label in df}
    df.rename(columns=label_map, inplace=True)
    return df

def age_clean(x):
    """Remove every hyphen-followed-by-digits run from ``x``.

    Example: ``"23-145"`` becomes ``"23"``.

    Args:
        x: Text to clean (must be a ``str``).

    Returns:
        The text without the ``-<digits>`` parts.
    """
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    return re.sub(r'-\d+', '', x)

def clean(df, column, function):
    """Apply ``function`` to each value of ``df[column]`` in place.

    Does nothing when ``column`` is not one of the frame's columns.
    """
    if column not in df.columns:
        return
    df[column] = df[column].apply(function)
        
def add_comp_level(df, league):
    """Add a 'Comp Level' column holding ``league`` on every row.

    The frame is left untouched when it already has a 'Comp Level' column.
    """
    if 'Comp Level' in df:
        return
    df['Comp Level'] = [league] * df.shape[0]

def table_scraper(url, columns):
    """Download a stats page and return its player table as a DataFrame.

    The second ``<tbody>`` of the page is treated as the player table. For
    every data row, the ``<td>`` cells whose ``data-stat`` attribute is listed
    in ``columns`` are collected. The result is then cleaned, converted to
    numeric dtypes where possible, given title-cased column names and has its
    missing values replaced by 0.

    Args:
        url: Address of the page to scrape.
        columns: ``data-stat`` keys to extract. Keys that match no cell in
            any row are simply absent from the result.

    Returns:
        A DataFrame with one row per player row found in the table.

    Raises:
        requests.HTTPError: The server answered with an error status.
        ValueError: The page contains fewer than two ``<tbody>`` elements.
    """
    # A timeout keeps a stalled connection from hanging the notebook, and an
    # error status is reported here instead of surfacing later as a confusing
    # parsing failure.
    res = requests.get(url, timeout=30)
    res.raise_for_status()

    # Strip HTML comment delimiters so markup wrapped in comments is parsed
    # like regular markup.
    comm = re.compile("<!--|-->")
    soup = BeautifulSoup(comm.sub("", res.text), 'lxml')
    all_tables = soup.find_all("tbody")
    if len(all_tables) < 2:
        raise ValueError(
            f"Expected at least 2 <tbody> elements at {url}, found {len(all_tables)}"
        )
    player_table = all_tables[1]

    # Build one record per row (instead of one list per column) so that a cell
    # missing from a single row cannot shift the values of the rows below it.
    records = []
    for row in player_table.find_all('tr'):
        # Only rows that carry a <th scope="row"> are processed.
        if row.find('th', {"scope": "row"}) is None:
            continue
        record = {}
        for feature in columns:
            cell = row.find("td", {"data-stat": feature})
            if cell is not None:
                record[feature] = cell.text.strip()
        if record:
            records.append(record)

    # Cells missing from only some rows become '' (like an empty cell on the
    # page); they turn into NaN during numeric conversion and 0 at the end.
    df_player = pd.DataFrame(records).fillna('')

    # Each cleaner runs exactly once per column; running clean_space a second
    # time on 'nationality' re-added a leading space to already-clean values.
    clean(df_player, 'nationality', clean_space)
    clean(df_player, 'comp_level', clean_space)
    clean(df_player, 'age', age_clean)

    # convert numeric columns from objects to floats/integers
    df_player = convert_numeric_cols(df_player)

    # standardize columns
    df_player = standarize_columns(df_player)

    df_player = df_player.fillna(0)

    return df_player

def combine_df(current_df, addition_df):
    """Combine two player tables into one.

    When ``current_df`` has no rows, the result is just the rows of
    ``addition_df``. Otherwise the frames are outer-merged on every column
    label they share.

    Args:
        current_df: Table accumulated so far (may be empty).
        addition_df: Table to add.

    Returns:
        A new DataFrame; neither input is modified.
    """
    if current_df.shape[0] == 0:
        # DataFrame.append no longer exists in pandas 2.x.
        if current_df.shape[1] == 0:
            # Nothing to carry over from a completely empty frame.
            return addition_df.copy()
        return pd.concat([current_df, addition_df])

    # Take the shared labels in the left frame's column order rather than from
    # a set: set order for strings varies between interpreter runs, and the
    # key order influences the row order of an outer merge.
    addition_labels = set(addition_df.columns)
    common_cols = list(dict.fromkeys(
        label for label in current_df.columns if label in addition_labels
    ))

    return pd.merge(current_df, addition_df, on=common_cols, how='outer')

In [None]:
# `data-stat` keys to pull from each stats table. Every entry must be a single
# key: table_scraper matches each string against a cell's `data-stat`
# attribute, so several keys fused into one string ("a, b, c") match nothing
# and the columns are dropped without any error.
standard_stats_columns = [
    'player', 'nationality', 'team', 'position', 'comp_level', 'age',
    'games', 'games_starts', 'minutes', 'minutes_90s',
    'goals', 'assists', 'goals_pens', 'pens_made', 'pens_att',
    'cards_yellow', 'cards_red',
    'goals_per90', 'assists_per90', 'goals_assists_per90',
    'goals_pens_per90', 'goals_assists_pens_per90',
    'xg', 'npxg', 'xa', 'npxg_xa',
    'xg_per90', 'xa_per90', 'xg_xa_per90', 'npxg_per90', 'npxg_xa_per90',
]
passing_columns = [
    'player', 'nationality', 'position', 'team', 'comp_level', 'age', 'minutes_90s',
    'passes_completed', 'passes', 'passes_pct',
    'passes_total_distance', 'passes_progressive_distance',
    'passes_completed_short', 'passes_short', 'passes_pct_short',
    'passes_completed_medium', 'passes_medium', 'passes_pct_medium',
    'passes_completed_long', 'passes_long', 'passes_pct_long',
    'assists',
    # NOTE(review): 'right' does not look like a stat key (possibly a stray
    # token from the page markup); it is kept because an unmatched key is
    # simply ignored by table_scraper. Confirm and remove.
    'right',
    'xa', 'xa_net', 'assisted_shots',
    'passes_into_final_third', 'passes_into_penalty_area',
    'crosses_into_penalty_area', 'progressive_passes',
]
shooting_columns = [
    'player', 'nationality', 'position', 'team', 'comp_level', 'age', 'minutes_90s',
    'goals', 'shots_total', 'shots_on_target', 'shots_on_target_pct',
    'shots_total_per90', 'shots_on_target_per90',
    'goals_per_shot', 'goals_per_shot_on_target', 'average_shot_distance',
    'shots_free_kicks', 'pens_made', 'pens_att',
    'xg', 'npxg', 'npxg_per_shot', 'xg_net', 'npxg_net',
]
pass_types_columns = [
    'player', 'nationality', 'position', 'team', 'comp_level', 'age', 'minutes_90s',
    'passes', 'passes_live', 'passes_dead', 'passes_free_kicks',
    'through_balls', 'passes_pressure', 'passes_switches', 'crosses',
    'corner_kicks', 'corner_kicks_in', 'corner_kicks_out', 'corner_kicks_straight',
    'passes_ground', 'passes_low', 'passes_high',
    'passes_left_foot', 'passes_right_foot', 'passes_head',
    'throw_ins', 'passes_other_body',
    'passes_completed', 'passes_offsides', 'passes_oob',
    'passes_intercepted', 'passes_blocked',
]
shot_creation_columns = [
    'player', 'nationality', 'position', 'team', 'comp_level', 'age', 'minutes_90s',
    'sca', 'sca_per90', 'sca_passes_live', 'sca_passes_dead',
    'sca_dribbles', 'sca_shots', 'sca_fouled', 'sca_defense',
    'gca', 'gca_per90', 'gca_passes_live', 'gca_passes_dead',
    'gca_dribbles', 'gca_shots', 'gca_fouled', 'gca_defense',
]
defense_columns = [
    'player', 'nationality', 'position', 'team', 'comp_level', 'age', 'minutes_90s',
    'tackles', 'tackles_won', 'tackles_def_3rd', 'tackles_mid_3rd', 'tackles_att_3rd',
    'dribble_tackles', 'dribbles_vs', 'dribble_tackles_pct', 'dribbled_past',
    'pressures', 'pressure_regains', 'pressure_regain_pct',
    'pressures_def_3rd', 'pressures_mid_3rd', 'pressures_att_3rd',
    'blocks', 'blocked_shots', 'blocked_shots_saves', 'blocked_passes',
    'interceptions', 'tackles_interceptions', 'clearances', 'errors',
]
possession_columns = [
    'player', 'nationality', 'position', 'team', 'comp_level', 'age', 'minutes_90s',
    'touches', 'touches_def_pen_area', 'touches_def_3rd', 'touches_mid_3rd',
    'touches_att_3rd', 'touches_att_pen_area', 'touches_live_ball',
    'dribbles_completed', 'dribbles', 'dribbles_completed_pct',
    'players_dribbled_past', 'nutmegs',
    'carries', 'carry_distance', 'carry_progressive_distance', 'progressive_carries',
    'carries_into_final_third', 'carries_into_penalty_area',
    'miscontrols', 'dispossessed',
    'pass_targets', 'passes_received', 'passes_received_pct',
    'progressive_passes_received',
]

In [None]:
# Player-stat pages to scrape, all built from one address template.
# The season appears twice in each address; the section and the trailing
# anchor change from page to page.
_SEASON = '2021-2022'
_PAGE_TEMPLATE = (
    'https://fbref.com/en/comps/Big5/{season}/{section}/players/'
    '{season}-Big-5-European-Leagues-Stats#{anchor}'
)

standard = _PAGE_TEMPLATE.format(season=_SEASON, section='stats', anchor='stats_standard')
shooting = _PAGE_TEMPLATE.format(season=_SEASON, section='shooting', anchor='stats_shooting')
passing = _PAGE_TEMPLATE.format(season=_SEASON, section='passing', anchor='stats_passing')
pass_types = _PAGE_TEMPLATE.format(season=_SEASON, section='passing_types', anchor='stats_passing_types')
gca = _PAGE_TEMPLATE.format(season=_SEASON, section='gca', anchor='stats_gca')
defending = _PAGE_TEMPLATE.format(season=_SEASON, section='defense', anchor='stats_defense')
posession = _PAGE_TEMPLATE.format(season=_SEASON, section='possession', anchor='compare_possession')

# Order matters: it is paired position-by-position with the column lists.
urls = [standard, passing, shooting, pass_types, gca, defending, posession]

In [None]:
# Column lists in the same order as `urls`, so position i of one belongs to
# position i of the other.
cols = [
    standard_stats_columns,
    passing_columns,
    shooting_columns,
    pass_types_columns,
    shot_creation_columns,
    defense_columns,
    possession_columns,
]

# Map every page address to the list of stat keys wanted from it.
urls_cols = {page: cols[position] for position, page in enumerate(urls)}

In [None]:
# Scrape each page in turn and fold its table into one combined player frame.
combined_df = pd.DataFrame()
for url, columns in urls_cols.items():
    df = table_scraper(url, columns)
    combined_df = combine_df(combined_df, df)

In [None]:
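# Optional: when the scraped table has no 'comp_level' column, tag every row with a league instead, e.g.
# add_comp_level(combined_df, 'Liga MX')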

In [None]:
identifiers = ['Player', 'Nationality', 'Team', 'Position', 'Comp Level', 'Age', 'Minutes']

# Collapse all rows sharing a 'Player' value into one row: identifier columns
# keep the value of the first row, every other column is summed.
# NOTE(review): 'Minutes' is listed as an identifier (first row wins) while
# 'Minutes 90S' is summed -- confirm that this asymmetry is intended.
agg_functions = {
    col: ('first' if col in identifiers else 'sum')
    for col in combined_df.columns
    if col != 'Player'
}

combined_df = combined_df.groupby(combined_df['Player']).aggregate(agg_functions).reset_index()

# Discard the scraped per-90 columns in a single call.
per90_cols = [col for col in combined_df.columns if 'Per90' in col]
combined_df = combined_df.drop(columns=per90_cols)

In [None]:
identifiers = ['Player', 'Nationality', 'Team', 'Position', 'Games', 'Games Starts', 'Comp Level', 'Age', 'Minutes', 'Minutes 90S']

# Replace every non-identifier column X by 'X_Per90' = X / 'Minutes 90S'.
# Columns that already end in '_Per90' are skipped, so running this cell a
# second time is a no-op instead of dividing the values again.
stat_cols = [
    col for col in combined_df.columns
    if col not in identifiers and not col.endswith('_Per90')
]

# One vectorised row-wise division instead of inserting and dropping columns
# one at a time.
per90_df = (
    combined_df[stat_cols]
    .div(combined_df['Minutes 90S'], axis=0)
    .add_suffix('_Per90')
)
combined_df = pd.concat([combined_df.drop(columns=stat_cols), per90_df], axis=1)

In [None]:
# Store infinite and missing values (e.g. from dividing by a zero
# 'Minutes 90S') as 0, renumber the rows and keep two decimals.
combined_df = (
    combined_df
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
    .reset_index(drop=True)
    .round(2)
)

In [None]:
# Save the combined frame to '21_22.csv' in the working directory;
# index=False leaves the row index out of the file.
combined_df.to_csv('21_22.csv', index=False)

In [None]:
# Read the saved file back; as the last expression of the cell, the
# resulting frame is what the notebook displays.
pd.read_csv('21_22.csv')

## Combine Leagues

In [None]:
folder_path = r'leagues'

# os.listdir returns every entry of the folder in arbitrary order. Keep only
# the CSV files (so entries such as '.ipynb_checkpoints' are not passed to
# read_csv) and sort them so the combined output is the same on every run.
data_dir = sorted(
    entry for entry in os.listdir(folder_path)
    if entry.lower().endswith('.csv')
)

In [None]:
# Read every league file, then concatenate once at the end: concatenating
# inside the loop copies all previously read rows on each iteration.
league_frames = []
for league in data_dir:
    league_csv_path = folder_path + '/' + league
    print(league_csv_path)
    new_df = pd.read_csv(league_csv_path)
    print('League Shape:', new_df.shape)
    league_frames.append(new_df)

# pd.concat raises on an empty list, so an empty folder still yields an
# empty frame as before.
combined_df = pd.concat(league_frames) if league_frames else pd.DataFrame()

In [None]:
# Save the concatenated league data to 'test.csv' in the working directory;
# index=False leaves the row index out of the file.
combined_df.to_csv('test.csv', index=False)