In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [25]:
# ---------------------------------------
# 1. LOAD MAIN DATA (2025 combined dataset)
# ---------------------------------------
final_df = pd.read_csv("../data/final_liberta.csv", encoding="latin-1")

# Convert Date to datetime
final_df['Date'] = pd.to_datetime(final_df['Date'], errors='coerce')

# Drop rows with no date
final_df = final_df.dropna(subset=['Date'])

# Sort chronologically (VERY IMPORTANT)
final_df = final_df.sort_values(by="Date").reset_index(drop=True)

In [26]:
# ---------------------------------------
# 2. FILTER TEAMS FOR 2025 CAMPAIGN
# ---------------------------------------
paldf = final_df[final_df['Team'] == 'Palmeiras'].copy()
fladf = final_df[final_df['Team'] == 'Flamengo'].copy()

# Compute cumulative xG
paldf['xG_cum'] = paldf['xG'].cumsum()
fladf['xG_cum'] = fladf['xG'].cumsum()

In [27]:
# ---------------------------------------
# 3. LOAD HISTORICAL CSVs
# ---------------------------------------
pal2020 = pd.read_csv("../data/Palmeiras_2020.csv", encoding='latin-1')
pal2021 = pd.read_csv("../data/Palmeiras_2021.csv", encoding='latin-1')
fla2019 = pd.read_csv("../data/Flamengo_2019.csv", encoding='latin-1')
fla2022 = pd.read_csv("../data/Flamengo_2022.csv", encoding='latin-1')

In [28]:
# ---------------------------------------
# 4. FIX DATES + SORT HISTORICAL DATA
# ---------------------------------------
for df in [pal2020, pal2021, fla2019, fla2022]:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df.dropna(subset=['Date'], inplace=True)
    df.sort_values(by="Date", inplace=True)
    df.reset_index(drop=True, inplace=True)

In [29]:
# ---------------------------------------
# 5. COMPUTE CUMULATIVE xG FOR HISTORICAL DATA
# ---------------------------------------
pal2020['xG_cum'] = pal2020['xG'].cumsum()
pal2021['xG_cum'] = pal2021['xG'].cumsum()
fla2019['xG_cum'] = fla2019['xG'].cumsum()
fla2022['xG_cum'] = fla2022['xG'].cumsum()

# Clean any fully empty rows (happens in some old CSVs)
fla2022 = fla2022.dropna(how="all")

In [40]:
def fix_encoding(s):
    if pd.isna(s):
        return s
    try:
        return s.encode('latin1').decode('utf-8')
    except:
        try:
            return s.encode('utf-8').decode('latin1')
        except:
            return s

cols_to_fix = ["Opponent", "Round", "Venue", "Team"]  # adicione o que precisar

for c in cols_to_fix:
    df[c] = df[c].astype(str).apply(fix_encoding)

In [43]:
# -----------------------------------------------------------
# 5) FUNÇÃO PARA CRIAR LABELS (versão robusta)
# -----------------------------------------------------------

def build_label(row):
    # Garantir datetime
    date_obj = pd.to_datetime(row['Date'], errors='coerce')
    date = date_obj.strftime('%d/%m') if not pd.isna(date_obj) else ""

    # Venue
    venue_value = str(row.get('Venue', '')).lower()
    venue = 'H' if venue_value.startswith('h') or venue_value == 'home' else 'A'

    # Score
    score = ""
    if pd.notna(row.get('GF')) and pd.notna(row.get('GA')):
        score = f"{int(row['GF'])}-{int(row['GA'])}"

    opponent = str(row.get('Opponent', ''))
    round_name = str(row.get('Round', ''))

    return f"{date} • vs {opponent} ({venue}) • {score} • {round_name}"


# -----------------------------------------------------------
# 6) APLICAR LABELS NOS DFS
# -----------------------------------------------------------

for df in [paldf, fladf]:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')  # Garantir datetime
    df['MatchNumber'] = range(1, len(df) + 1)
    df['MatchLabel'] = df.apply(build_label, axis=1)


# -----------------------------------------------------------
# 7) OPCIONAL: FORMATAR DATE PARA ISO PARA EXPORTAR AO FLOURISH
# -----------------------------------------------------------

for df in [paldf, fladf]:
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')


# -----------------------------------------------------------
# 8) EXIBIR AMOSTRA PARA VERIFICAR LABELS
# -----------------------------------------------------------

fladf[['MatchNumber', 'MatchLabel']].head(10)

Unnamed: 0,MatchNumber,MatchLabel
1,1,03/04 • vs Deportivo TÃ¡chira (A) • 1-0 • Grou...
3,2,09/04 • vs Central Cordoba (H) • 1-2 • Group s...
4,3,22/04 • vs LDU (A) • 0-0 • Group stage
7,4,07/05 • vs Central Cordoba (A) • 1-1 • Group s...
9,5,15/05 • vs LDU (H) • 2-0 • Group stage
11,6,28/05 • vs Deportivo TÃ¡chira (H) • 1-0 • Grou...
12,7,13/08 • vs Internacional (H) • 1-0 • Round of 16
14,8,20/08 • vs Internacional (A) • 2-0 • Round of 16
17,9,18/09 • vs Estudiantes (H) • 2-1 • Quarter-finals
19,10,25/09 • vs Estudiantes (A) • 0-1 • Quarter-finals


In [44]:
paldf.to_csv("palmeiras_2025_flourish.csv", index=False, encoding="utf-8")
fladf.to_csv("flamengo_2025_flourish.csv", index=False, encoding="utf-8")

In [45]:
combined = pd.concat([paldf, fladf], ignore_index=True)
combined.to_csv("../data/exports_flourish/combined_flourish.csv", index=False, encoding="utf-8")

In [46]:
test_df = pd.read_csv('../data/exports_flourish/combined_flourish.csv')

In [47]:
test_df.head()

Unnamed: 0,Date,Round,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Team,xG_cum,MatchNumber,MatchLabel
0,2025-04-03,Group stage,Away,W,3,2,Sporting Cristal,3.4,0.9,60,Palmeiras,3.4,1,03/04 • vs Sporting Cristal (A) • 3-2 • Group ...
1,2025-04-09,Group stage,Home,W,1,0,Cerro PorteÃ±o,3.0,0.2,57,Palmeiras,6.4,2,09/04 • vs Cerro PorteÃ±o (H) • 1-0 • Group stage
2,2025-04-24,Group stage,Away,W,3,2,Bolivar,2.5,1.5,25,Palmeiras,8.9,3,24/04 • vs Bolivar (A) • 3-2 • Group stage
3,2025-05-07,Group stage,Away,W,2,0,Cerro PorteÃ±o,1.6,0.7,54,Palmeiras,10.5,4,07/05 • vs Cerro PorteÃ±o (A) • 2-0 • Group stage
4,2025-05-15,Group stage,Home,W,2,0,Bolivar,1.4,0.2,51,Palmeiras,11.9,5,15/05 • vs Bolivar (H) • 2-0 • Group stage
