In [1]:
import os
import pandas as pd

In [2]:
def clean_std_stats(data):
    """Clean one squad standard-stats table and return the cleaned copy.

    The input frame is not modified; all work happens on copies, so the
    function can be called repeatedly on the same raw frame.

    Steps:
      * drop the 'Matches' column;
      * strip lowercase letters and surrounding whitespace from 'Nation'
        (a value like 'br BRA' becomes 'BRA') and fill missing nations;
      * remove the 'Squad Total' / 'Opponent Total' summary rows;
      * reduce 'Age' to the integer part before the first '-<digits>' suffix,
        filling unparseable ages with the (truncated) mean age;
      * rename 'Playing Time_MP' to 'MP' when present;
      * replace missing values with 0 in the columns at positions 4..32.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Matches', 'Nation', 'Player' and 'Age'.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If the frame is non-empty but no age can be parsed: the mean is then
        NaN and the final integer cast cannot represent it.
    """
    # drop() without inplace returns a new frame, so the caller's data stays intact.
    data = data.drop(columns=['Matches'])

    nation = data['Nation'].str.replace(r'[a-z]+', '', regex=True).str.strip()
    # fillna returns a new Series; the result has to be assigned to take effect.
    # NOTE(review): 'Unknow' is probably a typo for 'Unknown'; the literal is kept
    # as written in case other project code matches on it -- confirm.
    data['Nation'] = nation.fillna('Unknow')

    # drop total lines (na=False keeps rows whose player name is missing
    # instead of failing on a NaN mask)
    is_total = data['Player'].str.contains('Squad Total|Opponent Total', na=False)
    # Explicit copy so the column assignments below write to an independent frame
    # rather than to a slice of the previous one.
    data = data[~is_total].copy()

    # cleaning age column; astype(str) lets the .str accessor work even when
    # the column was parsed as numbers
    age_years = (
        data['Age'].astype(str)
        .str.replace(r'-[0-9]+', '', regex=True)
        .str.strip()
    )
    data['Age'] = pd.to_numeric(age_years, errors='coerce')
    mean = data['Age'].mean()
    data['Age'] = data['Age'].fillna(mean).astype(int)

    if 'Playing Time_MP' in data.columns:
        data = data.rename(columns={'Playing Time_MP': 'MP'})

    # Positional range: presumably the numeric stat columns of the table --
    # TODO confirm against the CSV schema.
    data.iloc[:, 4:33] = data.iloc[:, 4:33].fillna(0)

    return data

def add_team_championship(data, team, championship):
    """Tag a player table with its team and championship.

    Adds three columns to ``data`` in place and returns the same object:
    'Team', 'Championship' and 'Player_Team', the latter formatted as
    '<Player> (<Team>)'.
    """
    for column, value in (('Team', team), ('Championship', championship)):
        data[column] = value

    # Built from the freshly written 'Team' column, row by row.
    team_suffix = ' (' + data['Team'] + ')'
    data['Player_Team'] = data['Player'] + team_suffix
    return data

In [3]:
def directories(folder_path):
    """Return the names of the immediate sub-directories of ``folder_path``.

    Plain files are skipped. The names are returned sorted, because
    ``os.listdir`` yields entries in arbitrary, platform-dependent order and
    the callers' results (team order, row order of the combined table) should
    be reproducible.

    Parameters
    ----------
    folder_path : str
        Directory to scan.

    Returns
    -------
    list of str
        Sub-directory names (not full paths), in ascending order.
    """
    return sorted(
        entry
        for entry in os.listdir(folder_path)
        if os.path.isdir(os.path.join(folder_path, entry))
    )

# Data layout, relative to the notebook's working directory:
#   ../data/raw/2023    -> SEASON_DIR (input read by this notebook)
#   ../data/engineered  -> ENGINEERED_DIR (output written by this notebook)
BASE_DIR = os.path.join('..', 'data')
ENGINEERED_DIR = os.path.join(BASE_DIR, 'engineered')
RAW_DIR = os.path.join(BASE_DIR, 'raw')
SEASON_DIR = os.path.join(RAW_DIR, '2023')

# List the sub-folders of the season directory, then display the part of the
# first folder name that precedes the first underscore.
names = directories(SEASON_DIR)
names[0].split('_')[0]

'america-mg'

In [4]:
def read_files(output_path, filename):
    """Read the CSV called ``filename`` from every sub-folder of ``output_path``.

    Parameters
    ----------
    output_path : str
        Directory whose immediate sub-folders are scanned.
    filename : str
        Name of the CSV file expected inside each sub-folder.

    Returns
    -------
    tuple of (list of pandas.DataFrame, list of str)
        The loaded frames and, in the same order, the part of each folder
        name before its first underscore.
    """
    frames, team_names = [], []
    for folder_name in directories(output_path):
        team_names.append(folder_name.split('_')[0])
        csv_path = os.path.join(output_path, folder_name, filename)
        frames.append(pd.read_csv(csv_path))
    return frames, team_names

In [5]:
# Load 'squad_std_stats.csv' from every folder of the season directory.
datas, teams = read_files(SEASON_DIR, 'squad_std_stats.csv')

# Display the names derived from the folder names.
teams

['america-mg',
 'athletico-pr',
 'atletico-mg',
 'bahia',
 'botafogo',
 'bragantino',
 'corinthians',
 'coritiba',
 'cruzeiro',
 'cuiaba',
 'flamengo',
 'fluminense',
 'fortaleza',
 'goias',
 'gremio',
 'internacional',
 'palmeiras',
 'santos',
 'sao-paulo',
 'vasco']

In [6]:
def transform(datas, teams, champ='Brasileiro'):
    """Clean every frame and tag it with its team and championship.

    ``datas[i]`` is paired with ``teams[i]``; each frame goes through
    ``clean_std_stats`` and then ``add_team_championship``.

    Parameters
    ----------
    datas : list of pandas.DataFrame
        Raw tables, one per team.
    teams : list of str
        Team names, aligned with ``datas`` by position.
    champ : str, optional
        Championship label written to every row.

    Returns
    -------
    list of pandas.DataFrame
        The processed frames, in input order.
    """
    new_datas = []
    for position, frame in enumerate(datas):
        # Look the team up first so a too-short ``teams`` list fails before any cleaning.
        team_name = teams[position]
        cleaned = clean_std_stats(frame)
        new_datas.append(add_team_championship(cleaned, team_name, champ))
    return new_datas

In [7]:
def concat_dfs(datas):
    """Stack the given DataFrames vertically into one frame with a fresh 0..n-1 index."""
    return pd.concat(datas, ignore_index=True)

In [8]:
# Process every team's frame, then combine them into a single table.
new = transform(datas, teams)
std_stats_df = concat_dfs(new)

In [9]:
# Target of the CSV export: <ENGINEERED_DIR>/<championship>/<championship>.csv
championship_name = 'Brasileiro'
filename = f'{championship_name}.csv'
file = os.path.join(ENGINEERED_DIR, championship_name, filename)

In [10]:
# The CSV is written into the championship sub-folder of ENGINEERED_DIR, so the
# whole path to that folder must exist, not just ENGINEERED_DIR itself.
# makedirs also creates missing parents, and exist_ok keeps re-runs harmless.
os.makedirs(os.path.dirname(file), exist_ok=True)

std_stats_df.to_csv(file, index=False)

In [11]:
# pandas 2.0 removed the xlwt writer, so a '.xls' target can no longer be
# written; export as '.xlsx' instead (needs an installed xlsx engine such as
# openpyxl).
filename = championship_name + '.xlsx'
file = os.path.join(ENGINEERED_DIR, championship_name, filename)
# Make sure the target folder exists even when this cell is run on its own.
os.makedirs(os.path.dirname(file), exist_ok=True)
std_stats_df.to_excel(file, index=False)

  std_stats_df.to_excel(file, index=False)
