In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [57]:
pal2020 = pd.read_csv("../data/Palmeiras_2020.csv", encoding='latin-1')
pal2021 = pd.read_csv("../data/Palmeiras_2021.csv", encoding='latin-1')
fla2019 = pd.read_csv("../data/Flamengo_2019.csv", encoding='latin-1')
fla2022 = pd.read_csv("../data/Flamengo_2022.csv", encoding='latin-1')
flu2023 =  pd.read_csv("../data/Fluminense_2023.csv", encoding='latin-1')
bot2024 = pd.read_csv("../data/Botafogo_2024.csv", encoding='latin-1')

In [58]:
fla2022 = fla2022.dropna()

In [59]:
import pandas as pd

def calculate_performance_filtered(df, team_name):
    # Data Cleaning and Preparation 
    df['GF'] = pd.to_numeric(df['GF'], errors='coerce')
    df['GA'] = pd.to_numeric(df['GA'], errors='coerce')

    # Convert date column and sort (using 'df' correctly)
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
        df = df.sort_values(['Date']).reset_index(drop=True)
    
    # === NEW FILTERING STEP ===
    # Filter to only Group, R16, QF, and SF stages
    stages_to_include = ["Group stage", "Round of 16", "Quarter-finals", "Semi-finals"]
    
    if 'Round' in df.columns:
        # Use .copy() to avoid SettingWithCopyWarning
        # This creates a new DataFrame containing only the selected stages.
        df_filtered = df.query('Stage in @stages_to_include').copy()
    else:
        # If 'Stage' column is missing, analyze the whole DataFrame
        print(f"[NOTE: 'Stage' column not found in {team_name}. Analyzing all rows.]")
        df_filtered = df.copy() # Use a copy of the original DF
        
    # --- Performance Calculations (using the filtered DataFrame) ---
    # Rename the DataFrame used for calculations for clarity
    df_calc = df_filtered
    
    # 1. Offensive Overperformance
    goals_real = df_calc['GF'].sum()
    xg_total = df_calc['xG'].sum()
    overperf = goals_real - xg_total

    # 2. Defensive Performance
    xga_total = df_calc['xGA'].sum()
    ga_real = df_calc['GA'].sum()
    defensive_overperf = xga_total - ga_real

    # 3. Possession average
    poss_avg = df_calc['Poss'].mean()

    # 4. Summary Output
    # ... (Summary printing logic remains the same)
    print("\n" + "="*60)
    print(f"RESUMO {team_name} (Fase de Grupos at√© Semi-finais):")
    print("="*60)
    print(f"‚öΩ Gols Marcados: **{goals_real}** | xG Esperado: **{xg_total:.1f}**")
    print(f"üìà Overperformance Ofensiva: **{overperf:+.1f}**")
    print(f"üõ°Ô∏è Gols Sofridos: **{ga_real}** | xGA Esperado: **{xga_total:.1f}**")
    print(f"üìâ Defesa vs xGA: **{defensive_overperf:+.1f}**")
    print(f" Possession M√©dia: **{poss_avg:.1f}%**")
    print("="*60)
    
    # Return results
    return {
        'Team': team_name,
        'Goals_Real': goals_real,
        'xG_Total': xg_total,
        'Overperf_Offense': overperf,
        'Poss_Avg': poss_avg,
        'Overperf_Defense': defensive_overperf
    }

In [60]:
dfs_to_process = {
    "PALMEIRAS 2020": pal2020,
    "PALMEIRAS 2021": pal2021,
    "FLAMENGO 2019": fla2019,
    "FLAMENGO 2022": fla2022,
    "FLUMINENSE 2023": flu2023,
    "BOTAFOGO 2024": bot2024,
}

In [64]:
# List to collect all the results for a final summary table
all_results = []

# Loop through the dictionary
for name, df in dfs_to_process.items():
    result = calculate_performance(df, name)
    all_results.append(result)

# Convert the collected results into a single summary DataFrame
summary_df = pd.DataFrame(all_results)
# print("\n\n--- GLOBAL SUMMARY TABLE ---")
# print(summary_df)


RESUMO PALMEIRAS 2020:
‚öΩ Gols Marcados: **33** | xG Esperado: **23.8**
üìà Overperformance Ofensiva: **+9.2**
üõ°Ô∏è Gols Sofridos: **6** | xGA Esperado: **13.5**
üìâ Defesa vs xGA: **+7.5** (Positivo = melhor que o esperado)
 Possession M√©dia: **52.2%**

RESUMO PALMEIRAS 2021:
‚öΩ Gols Marcados: **29** | xG Esperado: **21.0**
üìà Overperformance Ofensiva: **+8.0**
üõ°Ô∏è Gols Sofridos: **10** | xGA Esperado: **15.5**
üìâ Defesa vs xGA: **+5.5** (Positivo = melhor que o esperado)
 Possession M√©dia: **44.0%**

RESUMO FLAMENGO 2019:
‚öΩ Gols Marcados: **22.0** | xG Esperado: **24.9**
üìà Overperformance Ofensiva: **-2.9**
üõ°Ô∏è Gols Sofridos: **10.0** | xGA Esperado: **12.3**
üìâ Defesa vs xGA: **+2.3** (Positivo = melhor que o esperado)
 Possession M√©dia: **58.4%**

RESUMO FLAMENGO 2022:
‚öΩ Gols Marcados: **33.0** | xG Esperado: **24.4**
üìà Overperformance Ofensiva: **+8.6**
üõ°Ô∏è Gols Sofridos: **8.0** | xGA Esperado: **9.8**
üìâ Defesa vs xGA: **+1.8** (Positivo 

In [36]:
fla2022.head(17)

Unnamed: 0,Date,Round,Venue,Result,GF,GA,Opponent,xG,xGA,Poss
0,2022-04-05,Group stage,Away,W,2.0,0.0,pe¬†Sporting Cristal,0.7,1.1,48.0
1,2022-04-12,Group stage,Home,W,3.0,1.0,ar¬†Talleres,1.3,0.8,56.0
2,2022-04-28,Group stage,Away,W,3.0,2.0,cl¬†Univ Cat√≥lica,2.9,0.9,41.0
3,2022-05-04,Group stage,Away,D,2.0,2.0,ar¬†Talleres,0.5,1.0,59.0
4,2022-05-17,Group stage,Home,W,3.0,0.0,cl¬†Univ Cat√≥lica,3.8,1.1,45.0
5,2022-05-24,Group stage,Home,W,2.0,1.0,pe¬†Sporting Cristal,1.8,0.8,58.0
7,2022-06-29,Round of 16,Away,W,1.0,0.0,co¬†Tolima,0.3,1.0,48.0
8,2022-07-06,Round of 16,Home,W,7.0,1.0,co¬†Tolima,4.5,0.6,47.0
10,2022-08-02,Quarter-finals,Away,W,2.0,0.0,br¬†Corinthians,0.9,0.5,57.0
11,2022-08-09,Quarter-finals,Home,W,1.0,0.0,br¬†Corinthians,2.1,0.5,57.0


In [50]:
print("--- Unique Stages in DataFrame ---")
print(fla2019['Round'].unique())

--- Unique Stages in DataFrame ---
['Group stage' 'Round of 16' 'Quarter-finals' 'Semi-finals' 'Final']


In [62]:
df_test = flu2023.copy()
stages_to_include = ["Group stage", "Round of 16", "Quarter-finals", "Semi-finals"]

# 1. Clean the 'Round' column (Crucial step for hidden spaces)
if 'Round' in df_test.columns:
    df_test['Round'] = df_test['Round'].astype(str).str.strip()
else:
    print("Column 'Round' not found in test DataFrame.")

# 2. Filter for only the stages you want to EXCLUDE
# This line finds all rows where 'Round' is NOT in your desired list
df_excluded = df_test.query('Round not in @stages_to_include').copy()

print("\n--- Check for Excluded Rows ---")
print(f"Total rows to be excluded: {len(df_excluded)}")
print("Unique 'Round' values in the excluded set:")
print(df_excluded['Round'].unique())


--- Check for Excluded Rows ---
Total rows to be excluded: 1
Unique 'Round' values in the excluded set:
['Final']


In [63]:
# Apply the cleaning fix
df_test['Round'] = df_test['Round'].astype(str).str.strip()

# Apply the filter
df_filtered = df_test.query('Round in @stages_to_include').copy()

print(f"Original Row Count: {len(df_test)}")
print(f"Filtered Row Count: {len(df_filtered)}")

Original Row Count: 13
Filtered Row Count: 12
