In [2]:
#Import necessary libraries

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os

In [3]:
#Transform all csv into Pandas Dataframes
palmeiras25_df = pd.read_csv('../data/Palmeiras_2025.csv', encoding='windows-1252')
botafogo_df = pd.read_csv('../data/Botafogo_2024.csv', encoding='windows-1252')
fluminense_df = pd.read_csv('../data/Fluminense_2023.csv', encoding='windows-1252')
flamengo22_df = pd.read_csv('../data/Flamengo_2022.csv', encoding='windows-1252')
palmeiras21_df = pd.read_csv('../data/Palmeiras_2021.csv', encoding='windows-1252')
palmeiras20_df = pd.read_csv('../data/Palmeiras_2020.csv', encoding='windows-1252')
flamengo19_df = pd.read_csv('../data/Flamengo_2019.csv', encoding='windows-1252')
river18_df = pd.read_csv('../data/River_2018.csv', encoding='windows-1252')
gremio17_df = pd.read_csv('../data/Gremio_2017.csv', encoding='windows-1252')
an16_df = pd.read_csv('../data/AtleticoNacional_2016.csv', encoding='windows-1252')
river15_df = pd.read_csv('../data/River_2015.csv', encoding='windows-1252')

In [4]:
#Create list with all imported data frames for looping and concatenating
all_dfs = [river15_df, an16_df, gremio17_df, river18_df, flamengo19_df, palmeiras20_df, palmeiras21_df, flamengo22_df,
              fluminense_df, botafogo_df, palmeiras25_df]

In [5]:
# 1) Mandatory base columns
mandatory = ['Date','Round','Venue','Result','GF','GA','Opponent']

# 2) Gather union of columns and check whether any dataframe has a real (non-NA) value for each column
all_columns = set()
for df in all_dfs:
    all_columns.update(df.columns)

#Candidate optional columns
candidates = ['xG','xGA','Poss']

# 3) Determine which candidate columns are actually present with at least one non-NA value across all dfs
keep_optional = []
for col in candidates:
    # check across dfs: is there at least one non-null entry in any df for this column?
    has_value = any((col in df.columns) and df[col].notna().any() for df in all_dfs)
    if has_value:
        keep_optional.append(col)

# 4) Final columns
final_columns = mandatory + keep_optional

# 5) Normalize column names related to possession variants (rename in-place)
for i, df in enumerate(all_dfs):
    df = df.copy()
    # a few known variants — rename to canonical 'Poss'
    if 'Poss.' in df.columns:
        df = df.rename(columns={'Poss.':'Poss'})
    if 'Poss*' in df.columns:
        df = df.rename(columns={'Poss*':'Poss'})
    all_dfs[i] = df

# 6) For each df, keep only the final_columns (add pd.NA for missing ones)
cleaned = []
for df in all_dfs:
    df = df.copy()
    for col in final_columns:
        if col not in df.columns:
            df[col] = pd.NA
    # keep only final columns and preserve order
    cleaned.append(df[final_columns])

# 7) Concatenate all dfs with minimal N/A columns
combined_df = pd.concat(
    [df.dropna(axis=1, how='all') for df in cleaned],
    ignore_index=True
)

# 8) Quick sanity checks (dates, dtypes, missingness)
combined_df['Date'] = pd.to_datetime(combined_df['Date'], errors='coerce')  # parse dates if needed
print(combined_df.info())
print(combined_df[final_columns].isna().mean().round(3))  # fraction missing per column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Date      125 non-null    datetime64[ns]
 1   Round     150 non-null    object        
 2   Venue     150 non-null    object        
 3   Result    150 non-null    object        
 4   GF        150 non-null    object        
 5   GA        150 non-null    object        
 6   Opponent  150 non-null    object        
 7   Poss      122 non-null    float64       
 8   xG        94 non-null     float64       
 9   xGA       94 non-null     float64       
dtypes: datetime64[ns](1), float64(3), object(6)
memory usage: 12.2+ KB
None
Date        0.188
Round       0.026
Venue       0.026
Result      0.026
GF          0.026
GA          0.026
Opponent    0.026
xG          0.390
xGA         0.390
Poss        0.208
dtype: float64


In [6]:
combined_df.head(70)

Unnamed: 0,Date,Round,Venue,Result,GF,GA,Opponent,Poss,xG,xGA
0,2015-02-19,Second stage,Away,L,0,2,bo San José,,,
1,2015-03-05,Second stage,Home,D,1,1,mx UANL,,,
2,2015-03-12,Second stage,Away,D,1,1,pe Juan Aurich,,,
3,2015-03-19,Second stage,Home,D,1,1,pe Juan Aurich,,,
4,2015-04-08,Second stage,Away,D,2,2,mx UANL,,,
...,...,...,...,...,...,...,...,...,...,...
65,2019-08-28,Quarter-finals,Away,D,1,1,br Internacional,45.0,1.5,0.5
66,2019-10-02,Semi-finals,Away,D,1,1,br Grêmio,58.0,1.3,1.2
67,2019-10-23,Semi-finals,Home,W,5,0,br Grêmio,56.0,3.0,0.9
68,2019-11-23,Final,Neutral,W,2,1,ar River Plate,60.0,2.2,0.5


In [7]:
#Export combined df to parquet
combined_df.to_csv('libertadores_champs.csv', index=False)