In [1]:
import glob
import re

import numpy as np
import pandas as pd
import tqdm

# Lift the column display limit so wide DataFrames are shown with every column.
pd.set_option("display.max_columns", None)


In [11]:
def join_games(file_list):
    """Load every match workbook in ``file_list`` into one player-stats table.

    Each path is expected to end in ``<year>/<matchday dir>/<Local> vs
    <Visit>.xlsx`` (forward or back slashes are both accepted): the year is
    the grandparent directory name, the matchday number is the last
    whitespace-separated token of the parent directory name, and the two team
    names come from the file name. The workbook must contain one sheet named
    after each team.

    Parameters
    ----------
    file_list : iterable of str or path-like
        Paths of the match workbooks to read.

    Returns
    -------
    pandas.DataFrame
        One row per sheet row, with the 22 sheet columns renamed positionally
        and five added columns: ``Equipo`` (team name), ``Siglas`` (the
        upper-case ASCII letters of the team name), ``Jornada``, ``Anio`` and
        ``Game`` (file name without ``.xlsx``). The statistic columns
        (``M`` .. ``FS``) are converted to ``float64``; values that cannot be
        parsed become ``NaN``. An empty ``file_list`` yields an empty frame
        that still carries all 27 columns.

    Raises
    ------
    ValueError
        If the year or matchday cannot be parsed as an integer, or if the
        combined table does not end up with exactly 27 columns.
    IndexError
        If a path has fewer than three components or the file name does not
        contain ``" vs "``.
    """
    num_columns = ['M', 'V', 'AG', 'AST', 'A', 'R', 'PENALTI', 'PP', 'PF',
                   'D', 'DP', 'D1', 'DS', 'Rob', 'P1', 'PC', 'PPC', 'D2', 'FC', 'FS']
    column_names = (['Jugador', 'P'] + num_columns
                    + ['Equipo', 'Siglas', 'Jornada', 'Anio', 'Game'])

    # Materialise once so generators work and the progress bar knows its total.
    file_list = list(file_list)
    if not file_list:
        # Nothing to read: return the expected schema instead of failing on
        # the column assignment further down.
        return pd.DataFrame(columns=column_names)

    frames = []
    for match in tqdm.tqdm(file_list, ncols=100):
        # glob yields "/" on POSIX and a mix of "/" and "\" on Windows, so
        # split on either separator instead of assuming Windows paths.
        parts = re.split(r"[\\/]+", str(match))
        year = int(parts[-3])
        jornada = int(parts[-2].split()[-1])
        game = parts[-1].replace(".xlsx", "")
        teams = game.split(" vs ")
        local, visit = teams[0], teams[1]

        # A list of sheet names opens the workbook once and returns a dict
        # keyed by sheet name.
        sheets = pd.read_excel(match, sheet_name=[local, visit])
        for team in (local, visit):
            team_df = sheets[team]
            team_df["Equipo"] = team
            team_df["Siglas"] = re.sub("[^A-Z]", "", team)
            team_df["Jornada"] = jornada
            team_df["Anio"] = year
            team_df["Game"] = game
            frames.append(team_df)

    # Concatenate once at the end; growing a frame inside the loop copies all
    # previous rows on every iteration.
    df = pd.concat(frames, ignore_index=True)

    # Positional rename: relies on the sheets sharing one column layout.
    df.columns = column_names

    df["PPC"] = df["PPC"].replace({'%': ''}, regex=True)

    # Convert column by column (not row by row) and pin the dtype so every
    # season produces the same schema whether or not it contains gaps.
    df[num_columns] = (
        df[num_columns].apply(pd.to_numeric, errors='coerce').astype("float64")
    )

    return df


In [12]:
# One workbook per match, two directory levels below each season folder.
# glob returns paths in a filesystem-dependent order, so sort them to keep the
# row order of the combined tables reproducible across machines.
excels_2020 = sorted(glob.glob("data/2020/*/*.xlsx"))
excels_2021 = sorted(glob.glob("data/2021/*/*.xlsx"))


In [13]:
# Build one combined player-stats table per season (2020 first, then 2021).
df_2020, df_2021 = (join_games(paths) for paths in (excels_2020, excels_2021))

100%|█████████████████████████████████████████████████████████████| 190/190 [00:05<00:00, 36.10it/s]
100%|█████████████████████████████████████████████████████████████| 153/153 [00:04<00:00, 36.72it/s]


# EDA 2020

In [14]:
# Preview the first rows of the combined 2020 table.
df_2020.head()

Unnamed: 0,Jugador,P,M,V,AG,AST,A,R,PENALTI,PP,PF,D,DP,D1,DS,Rob,P1,PC,PPC,D2,FC,FS,Equipo,Siglas,Jornada,Anio,Game
0,Érick Delgado,P,85.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,19.0,7.0,37.0,3.0,0.0,1.0,Academia Cantolao,AC,1,2020,Academia Cantolao vs Cienciano
1,José Ramírez,D,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,33.0,28.0,85.0,0.0,0.0,0.0,Academia Cantolao,AC,1,2020,Academia Cantolao vs Cienciano
2,Orlando Núñez,D,43.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,8.0,89.0,0.0,3.0,2.0,Academia Cantolao,AC,1,2020,Academia Cantolao vs Cienciano
3,Arón Sánchez,D,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,16.0,33.0,30.0,91.0,0.0,0.0,2.0,Academia Cantolao,AC,1,2020,Academia Cantolao vs Cienciano
4,Christian Sánchez,D,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,5.0,24.0,17.0,71.0,0.0,0.0,0.0,Academia Cantolao,AC,1,2020,Academia Cantolao vs Cienciano


In [15]:
# Column dtypes and non-null counts, to spot columns with missing values.
df_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6781 entries, 0 to 6780
Data columns (total 27 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Jugador  6781 non-null   object 
 1   P        6781 non-null   object 
 2   M        6781 non-null   float64
 3   V        6781 non-null   float64
 4   AG       6781 non-null   float64
 5   AST      6781 non-null   float64
 6   A        6781 non-null   float64
 7   R        6781 non-null   float64
 8   PENALTI  6781 non-null   float64
 9   PP       6781 non-null   float64
 10  PF       6781 non-null   float64
 11  D        6277 non-null   float64
 12  DP       6277 non-null   float64
 13  D1       6277 non-null   float64
 14  DS       6277 non-null   float64
 15  Rob      6277 non-null   float64
 16  P1       6277 non-null   float64
 17  PC       6277 non-null   float64
 18  PPC      6277 non-null   float64
 19  D2       6277 non-null   float64
 20  FC       6277 non-null   float64
 21  FS       6277 

## Eliminando partidos con registros nulos

In [16]:
# Rows with at least one missing value, and the distinct games they belong to.
nan_rows = df_2020.loc[lambda frame: frame.isna().any(axis="columns")].copy()
drop_games_2020 = nan_rows["Game"].unique()
drop_games_2020

array(['FC Carlos Stein vs Carlos A. Mannucci',
       'Alianza Huánuco vs Cusco FC',
       'FC Carlos Stein vs Deportivo Llacuabamba',
       'Alianza Huánuco vs FC Carlos Stein',
       'Atlético Grau vs Univ. César Vallejo',
       'Deportivo Llacuabamba vs Academia Cantolao',
       'FC Carlos Stein vs Universitario', 'Alianza Huánuco vs Cienciano',
       'Atlético Grau vs Sport Boys Callao',
       'Deportivo Llacuabamba vs Univ. San Martín',
       'Alianza Huánuco vs Deportivo Llacuabamba',
       'FC Carlos Stein vs Binacional',
       'Atlético Grau vs Univ. San Martín',
       'Cusco FC vs Sport Boys Callao'], dtype=object)

In [19]:
# List the column labels of the 2020 table.
df_2020.columns

Index(['Jugador', 'P', 'M', 'V', 'AG', 'AST', 'A', 'R', 'PENALTI', 'PP', 'PF',
       'D', 'DP', 'D1', 'DS', 'Rob', 'P1', 'PC', 'PPC', 'D2', 'FC', 'FS',
       'Equipo', 'Siglas', 'Jornada', 'Anio', 'Game'],
      dtype='object')

In [18]:
# Discard every row of a game listed in drop_games_2020, then renumber the index.
df_2020 = (
    df_2020
    .loc[lambda frame: ~frame["Game"].isin(drop_games_2020)]
    .reset_index(drop=True)
)
df_2020

Unnamed: 0,Jugador,P,M,V,AG,AST,A,R,PENALTI,PP,PF,D,DP,D1,DS,Rob,P1,PC,PPC,D2,FC,FS,Equipo,Siglas,Jornada,Anio,Game
0,Érick Delgado,P,85.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,19.0,7.0,37.0,3.0,0.0,1.0,Academia Cantolao,AC,1,2020,Academia Cantolao vs Cienciano
1,José Ramírez,D,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,33.0,28.0,85.0,0.0,0.0,0.0,Academia Cantolao,AC,1,2020,Academia Cantolao vs Cienciano
2,Orlando Núñez,D,43.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,8.0,89.0,0.0,3.0,2.0,Academia Cantolao,AC,1,2020,Academia Cantolao vs Cienciano
3,Arón Sánchez,D,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,16.0,33.0,30.0,91.0,0.0,0.0,2.0,Academia Cantolao,AC,1,2020,Academia Cantolao vs Cienciano
4,Christian Sánchez,D,90.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,5.0,24.0,17.0,71.0,0.0,0.0,0.0,Academia Cantolao,AC,1,2020,Academia Cantolao vs Cienciano
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6272,Ángel Romero,C,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Binacional,B,9,2020,Univ. San Martín vs Binacional
6273,Omar Reyes,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Binacional,B,9,2020,Univ. San Martín vs Binacional
6274,Joaquín Astorga,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Binacional,B,9,2020,Univ. San Martín vs Binacional
6275,Sebastián Gularte,D,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,50.0,0.0,0.0,1.0,Binacional,B,9,2020,Univ. San Martín vs Binacional
