![ESPN](img/ESPN.png)

In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Number of most recent home fixtures kept per home team.
LastGames = 10
print(f"LastGames: {LastGames}")
# Thresholds r compared against the 'But' column further down.
score_ranges = list(range(0, 7))

History = pd.read_csv('History.csv')

# 1. Clean and Sort
# Rebind instead of inplace=True so re-running the cell never mutates a frame
# that something else may still reference.
History = History.drop_duplicates()
# Sorting by Date (newest first) is what makes iloc[:LastGames] below pick the
# most recent fixtures of each team.
if 'Date' in History.columns:
    History['Date'] = pd.to_datetime(History['Date'])
    History = History.sort_values('Date', ascending=False)

# 2. Filter for teams with enough games
# Keep the first LastGames rows of every home team that has at least
# LastGames home fixtures; teams with fewer are dropped entirely.
groups = History.groupby('Home')
recent_home_games = [g.iloc[:LastGames] for _, g in groups if len(g) >= LastGames]
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the same columns when no team qualifies.
h = pd.concat(recent_home_games) if recent_home_games else History.iloc[0:0].copy()

# 3. Vectorized Column Creation
# For every threshold r, add 0/1 indicator columns telling whether 'But' is
# strictly above / strictly below r, plus the same indicators restricted to
# rows where the DH, HA or DA flag equals 1.
for r in tqdm(score_ranges, desc="Calculating Score Stats"):
    above = h['But'] > r
    below = h['But'] < r

    h[f'B>{r}'] = above.astype(int)
    h[f'B<{r}'] = below.astype(int)

    # Column creation order is kept as DH, HA, DA so the frame layout is stable.
    for flag in ('DH', 'HA', 'DA'):
        flag_set = h[flag] == 1
        h[f'{flag}B>{r}'] = (above & flag_set).astype(int)
        h[f'{flag}B<{r}'] = (below & flag_set).astype(int)

# Row counter: summing 'n' later yields the number of games per group.
h['n'] = 1

# 4. Define Columns to Aggregate
# Names of the indicator columns created per threshold, in creation order:
# B>r, B<r, DHB>r, DHB<r, HAB>r, HAB<r, DAB>r, DAB<r for each r.
dynamic_cols = [
    f'{flag}B{op}{r}'
    for r in score_ranges
    for flag in ('', 'DH', 'HA', 'DA')
    for op in ('>', '<')
]

# Columns expected to exist in History already, plus the 'n' row counter.
static_cols = [
    'Ds', 'Hs', 'As',
    'DA', 'DH', 'HA',
    'LDEMs', 'LDEMPs',
    'DALDEMs', 'DHLDEMs', 'HALDEMs',
    'DALDEMPs', 'DHLDEMPs', 'HALDEMPs',
    'n',
]

all_cols = [*dynamic_cols, *static_cols]

# 5./6. Generate HomeStats and AwayStats
# Minimum number of rows of `h` a team must have to be kept in the stats.
MIN_GAMES = 6


def _side_stats(games, side, prefix, min_games=MIN_GAMES):
    """Sum `all_cols` per team for one side of the fixture.

    Parameters
    ----------
    games : DataFrame holding `side` and every column in `all_cols`.
    side : column to group by ('Home' or 'Away').
    prefix : string prepended to every aggregated column name.
    min_games : teams whose summed 'n' is below this value are dropped.

    Returns
    -------
    DataFrame with one row per team: the `side` column followed by the
    prefixed totals.
    """
    totals = games.groupby(side)[all_cols].sum()
    totals = totals[totals["n"] >= min_games]
    # add_prefix only relabels columns; the team name sits in the index until
    # reset_index(), so it comes back as an un-prefixed `side` column and no
    # rename is needed afterwards.
    return totals.add_prefix(prefix).reset_index()


HomeStats = _side_stats(h, 'Home', '1')

# NOTE(review): `h` contains only the last `LastGames` home fixtures of each
# home team, so the away totals cover just the away games that fall inside
# those windows (which is why 'n' varies here). Confirm this is intended
# rather than each team's own most recent away games.
AwayStats = _side_stats(h, 'Away', '2')

# Display result
print(f"Home stats shape: {HomeStats.shape}")
print(f"Away stats shape: {AwayStats.shape}")

Calculating Score Stats: 100%|███████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 77.94it/s]

Home stats shape: (444, 72)
Away stats shape: (414, 72)





In [11]:
import pandas as pd
import numpy as np
import re
from datetime import datetime

# 1. Data Preparation
# Build a "long" frame from History in which every fixture appears twice:
# once with Team = Home and once with Team = Away.
home_side = History.assign(Team=History['Home'])
away_side = History.assign(Team=History['Away'])
HistoryHA = pd.concat([home_side, away_side], ignore_index=True)

# Normalise dates; values that cannot be parsed become NaT and are dropped below.
HistoryHA["Date"] = pd.to_datetime(HistoryHA["Date"], format="%a, %d %b %Y", errors='coerce')

# Keep only usable rows, then order each team's games newest-first.
HistoryHA = (
    HistoryHA
    .dropna(subset=['Team', 'Date', 'Score'])
    .sort_values(by=["Team", "Date"], ascending=False)
)

# 2. Robust Score Extraction
def extract_scores(score_str):
    """Parse a score string into a two-element Series of goals.

    The first two runs of digits found in ``str(score_str)`` are returned as
    integers, in order of appearance. A missing value, or a string holding
    fewer than two numbers, yields ``[0, 0]``.

    NOTE(review): the ``[0, 0]`` fallback is indistinguishable from a real
    0-0 result for downstream code - confirm that is acceptable.
    """
    if pd.isna(score_str):
        return pd.Series([0, 0])

    digit_runs = re.findall(r'\d+', str(score_str))
    if len(digit_runs) < 2:
        return pd.Series([0, 0])

    first, second = map(int, digit_runs[:2])
    return pd.Series([first, second])

# Parse each score string, store the two goal counts, and keep their sum in 'But'.
score_pairs = HistoryHA['Score'].apply(extract_scores)
HistoryHA[['ScoreH', 'ScoreA']] = score_pairs
HistoryHA['But'] = HistoryHA['ScoreH'].add(HistoryHA['ScoreA'])

# 3. Determine Result Logic
def determine_result(row):
    """Return 'W', 'D' or 'L' for the fixture in `row`, seen from `row['Team']`.

    `row` must provide 'ScoreH' and 'ScoreA'; 'Team' and 'Home' are only read
    when the two scores differ. The team counts as playing at home when
    `row['Team'] == row['Home']`, otherwise it is treated as the away side.
    """
    home_goals = row['ScoreH']
    away_goals = row['ScoreA']
    if home_goals == away_goals:
        return 'D'

    # The 'Team' column was added when Home and Away rows were stacked.
    team_is_home = row['Team'] == row['Home']
    home_side_won = home_goals > away_goals
    away_side_won = home_goals < away_goals

    team_won = (home_side_won and team_is_home) or (away_side_won and not team_is_home)
    return 'W' if team_won else 'L'

# Label every stacked row with W/D/L as returned by determine_result.
team_results = HistoryHA.apply(determine_result, axis=1)
HistoryHA['Result'] = team_results

# 4. Optimized Streak Function (Fixed KeyError)
def calculate_streaks(group):
    """Count streaks over the rows of one team's group, in row order.

    `group` is the per-team frame handed over by ``groupby('Team').apply``;
    the team name is read from ``group.name`` and the 'Home' and 'Result'
    columns are read for every row. Rows are walked from first to last, so
    the caller decides the direction through the sort order.

    Six streak kinds are tracked: W, L, D (one outcome only) and WD, LD, WL
    (either of two outcomes). Each is counted three times: over all rows,
    over rows where the team is the home side ('H' prefix) and over rows
    where it is not ('A' prefix). A counter stops for good at the first row
    that does not fit it; home rows never touch 'A' counters and vice versa.

    Returns a Series of 18 integer counts indexed
    W, L, D, WD, LD, WL, HW, ..., HWL, AW, ..., AWL.
    """
    team = group.name  # group key, i.e. the 'Team' value of this group

    kinds = ('W', 'L', 'D', 'WD', 'LD', 'WL')
    # Streak kinds that a game with the given result keeps alive.
    extends = {
        'W': {'W', 'WD', 'WL'},
        'D': {'D', 'WD', 'LD'},
        'L': {'L', 'LD', 'WL'},
    }

    # '' = overall, 'H' = home rows only, 'A' = away rows only.
    counts = {scope + kind: 0 for scope in ('', 'H', 'A') for kind in kinds}
    alive = set(counts)

    for home_team, result in zip(group['Home'], group['Result']):
        fitting = extends[result]
        venue = 'H' if team == home_team else 'A'

        # Every row updates the overall counters and those of its own venue.
        for scope in ('', venue):
            for kind in kinds:
                key = scope + kind
                if key not in alive:
                    continue
                if kind in fitting:
                    counts[key] += 1
                else:
                    alive.discard(key)

        # Nothing left to extend: the remaining rows cannot change any count.
        if not alive:
            break

    return pd.Series(counts)

# 5. Apply and Aggregate
print("Calculating current streaks...")
# include_groups=False (available from pandas 2.2) keeps the grouping column
# out of the frame passed to calculate_streaks; the team is read from group.name.
Streak = HistoryHA.groupby('Team').apply(calculate_streaks, include_groups=False)

# Longest streak of any kind: overall, at home, and away.
streak_families = {
    'max': ['W', 'L', 'D', 'WD', 'LD', 'WL'],
    'maxH': ['HW', 'HL', 'HD', 'HWD', 'HLD', 'HWL'],
    'maxA': ['AW', 'AL', 'AD', 'AWD', 'ALD', 'AWL'],
}
Streak = Streak.assign(
    **{label: Streak[members].max(axis=1) for label, members in streak_families.items()}
)

# Longest overall streak first, ties broken by the longest home streak;
# reset_index turns the Team index back into a regular column.
Streak = Streak.sort_values(['max', 'maxH'], ascending=False).reset_index()

# Display
print("Streaks calculated successfully.")
Streak.head(10)

Calculating current streaks...
Streaks calculated successfully.


Unnamed: 0,Team,W,L,D,WD,LD,WL,HW,HL,HD,...,HWL,AW,AL,AD,AWD,ALD,AWL,max,maxH,maxA
0,FC Salzburg,3,0,0,28,0,3,2,0,0,...,2,1,0,0,14,0,1,28,15,14
1,Barcelona,0,1,0,0,1,22,13,0,0,...,22,0,1,0,0,1,12,22,22,12
2,Lugo,0,2,0,0,20,2,0,1,0,...,1,0,4,0,0,11,4,20,10,11
3,Lens,3,0,0,3,0,18,11,0,0,...,18,1,0,0,1,0,9,18,18,9
4,Feyenoord Rotterdam,0,3,0,0,3,18,0,1,0,...,8,0,3,0,0,3,10,18,8,10
5,Sheffield United,1,0,0,1,0,17,5,0,0,...,15,1,0,0,1,0,8,17,15,8
6,Real Madrid,2,0,0,2,0,16,5,0,0,...,18,1,0,0,1,0,9,16,18,9
7,Gazisehir Gaziantep,0,12,0,0,12,16,0,7,0,...,10,0,5,0,0,5,6,16,10,6
8,Hatayspor,0,12,0,0,12,16,0,6,0,...,8,0,8,0,0,10,8,16,8,10
9,Lyon,12,0,0,12,0,15,8,0,0,...,20,5,0,0,5,0,7,15,20,7


In [14]:
# Newcastle United
# HistoryHA[(HistoryHA["Away"] == "Lens")].head(10)

Unnamed: 0,Date,Home,Score,Away,Competition,ScoreH,ScoreA,But,Result,Ds,...,LDEMs,LDEMPs,DALDEMs,DHLDEMs,HALDEMs,DALDEMPs,DHLDEMPs,HALDEMPs,IdGame,Team
90,2026-02-04,Troyes,2 - 4,Lens,fra.2,2,4,6,L,0,...,1,0,1,0,1,0,0,0,2026-02-04TroyesLens,Troyes
6429,2023-01-28,Troyes,1 - 1,Lens,ger.1,1,1,2,D,1,...,1,0,1,1,0,0,0,0,2023-01-28TroyesLens,Troyes
478,2026-01-02,Toulouse,0 - 3,Lens,fra.1,0,3,3,L,0,...,0,1,0,0,0,1,0,1,2026-01-02ToulouseLens,Toulouse
2846,2023-05-02,Toulouse,0 - 1,Lens,ger.1,0,1,1,L,0,...,0,1,0,0,0,1,0,1,2023-05-02ToulouseLens,Toulouse
6885,2023-01-11,Strasbourg,2 - 2,Lens,ger.1,2,2,4,D,1,...,1,0,1,1,0,0,0,0,2023-01-11StrasbourgLens,Strasbourg
9459,2022-09-04,Stade de Reims,1 - 1,Lens,ger.1,1,1,2,D,1,...,1,0,1,1,0,0,0,0,2022-09-04Stade de ReimsLens,Stade de Reims
1378,2025-09-28,Stade Rennais,0 - 0,Lens,fra.1,0,0,0,D,1,...,0,1,0,0,0,1,1,0,2025-09-28Stade RennaisLens,Stade Rennais
4354,2023-04-01,Stade Rennais,0 - 1,Lens,ger.1,0,1,1,L,0,...,0,1,0,0,0,1,0,1,2023-04-01Stade RennaisLens,Stade Rennais
368,2026-01-11,Sochaux,0 - 3,Lens,fra.1,0,3,3,L,0,...,0,1,0,0,0,1,0,1,2026-01-11SochauxLens,Sochaux
3706,2023-04-15,Paris Saint-Germain,3 - 1,Lens,ger.1,3,1,4,W,0,...,1,0,0,1,1,0,0,0,2023-04-15Paris Saint-Germain Lens,Paris Saint-Germain
