# Soccer Analytics: Polymarket + StatsBomb Integration

## Objectives
1. **Connect Polymarket betting data with StatsBomb match data**

---

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

DATA_DIR = Path("..") / "data"
POLYMARKET_DIR = DATA_DIR / "Polymarket"
STATSBOMB_DIR = DATA_DIR / "StatsBomb"



### Load Polymarket Data

In [14]:
# Polymarket files
pm_markets = pd.read_parquet(DATA_DIR / "Polymarket" / "soccer_markets.parquet")
pm_tokens = pd.read_parquet(DATA_DIR / "Polymarket" / "soccer_tokens.parquet")
pm_odds_history = pd.read_parquet(DATA_DIR / "Polymarket" / "soccer_odds_history.parquet")
pm_summary = pd.read_parquet(DATA_DIR / "Polymarket" / "soccer_summary.parquet")
pm_event_stats = pd.read_parquet(DATA_DIR / "Polymarket" / "soccer_event_stats.parquet")

print("Polymarket Data Loaded:")
print(f"  Markets: {len(pm_markets):,} rows")
print(f"  Tokens: {len(pm_tokens):,} rows")
print(f"  Odds History: {len(pm_odds_history):,} rows")
print(f"  Event Stats: {len(pm_event_stats):,} rows")
print(f"  Summary: {len(pm_summary):,} rows")

Polymarket Data Loaded:
  Markets: 8,549 rows
  Tokens: 17,096 rows
  Odds History: 666,837 rows
  Event Stats: 2,640 rows
  Summary: 8,549 rows


In [15]:
# Quick preview of markets
print("\nSample Market Questions:")
pm_markets[['question', 'event_slug', 'volume', 'created_at']].head(10)


Sample Market Questions:


Unnamed: 0,question,event_slug,volume,created_at
0,Will Ukraine qualify for the 2022 FIFA World Cup?,will-ukraine-qualify-to-the-2022-fifa-world-cup,4766.88,2022-04-06 07:51:48
1,UEFA Europa League final: Who will win Eintrac...,uefa-europa-league-final-who-will-win-eintrach...,1543.29,2022-05-18 14:16:53
2,Soccer: Who will win the United States vs. Uru...,soccer-who-will-win-the-united-states-vs-urugu...,1363.07,2022-06-05 12:45:16
3,UEFA Nations League: Who will win the Germany ...,uefa-nations-league-who-will-win-the-germany-v...,1031.58,2022-06-06 17:09:19
4,2022 Wimbledon Championships: Who will win Kyr...,2022-wimbledon,3098.29,2022-07-06 19:33:08
5,2022 Wimbledon Championships: Who will win Djo...,2022-wimbledon-championships-who-will-win-djok...,0.0,2022-07-07 18:37:01
6,2022 Wimbledon Championships: Who will win Djo...,2022-wimbledon-championships-who-will-win-djok...,0.0,2022-07-08 20:41:06
7,Who will win Anthony Joshua vs. Oleksandr Usyk 2?,who-will-win-anthony-joshua-vs-oleksandr-usyk-2,1737.27,2022-08-09 22:03:11
8,EFL Cup: Manchester United vs. Newcastle,efl-cup-manchester-united-vs-newcastle,27.68,2023-02-23 22:51:48
9,Will Manchester City win the 2023 Champions Le...,champions-league-winner,51976.25,2023-02-24 08:44:36


### Load StatsBomb Data

In [16]:
# StatsBomb files
sb_matches = pd.read_parquet(STATSBOMB_DIR / "matches.parquet")
sb_events = pd.read_parquet(STATSBOMB_DIR / "events.parquet")
sb_lineups = pd.read_parquet(STATSBOMB_DIR / "lineups.parquet")

print("StatsBomb Data Loaded:")
print(f"  Matches: {len(sb_matches):,} rows")
print(f"  Events: {len(sb_events):,} rows")
print(f"  Lineups: {len(sb_lineups):,} rows")

StatsBomb Data Loaded:
  Matches: 3,464 rows
  Events: 12,188,949 rows
  Lineups: 165,820 rows


In [17]:
# Quick preview of matches
print("\nSample Matches:")
sb_matches[['match_date', 'home_team', 'away_team', 'home_score', 'away_score', 'competition_name']].head(10)


Sample Matches:


Unnamed: 0,match_date,home_team,away_team,home_score,away_score,competition_name
0,2018-04-14,Barcelona,Valencia,2,1,La Liga
1,2018-04-29,RC Deportivo La Coruña,Barcelona,2,4,La Liga
2,2018-05-06,Barcelona,Real Madrid,2,2,La Liga
3,2018-03-18,Barcelona,Athletic Club,2,0,La Liga
4,2018-03-01,Las Palmas,Barcelona,1,1,La Liga
5,2018-02-17,Eibar,Barcelona,0,2,La Liga
6,2017-10-01,Barcelona,Las Palmas,3,0,La Liga
7,2017-09-19,Barcelona,Eibar,6,1,La Liga
8,2017-08-20,Barcelona,Real Betis,2,0,La Liga
9,2018-05-09,Barcelona,Villarreal,5,1,La Liga


---

### Connecting Polymarket to StatsBomb, Strategy for Joining the Datasets

The key challenge is that Polymarket and StatsBomb use different identifiers. Here's how we'll connect them:

**Matching Strategy:**
1. **Team Names**: Extract team names from Polymarket `event_slug` and `question`
2. **Match Date**: Use Polymarket `created_at` or `end_date` to narrow down StatsBomb `match_date`
3. **Fuzzy Matching**: Handle team name variations (e.g., "Arsenal" vs "Arsenal FC")

**Example Polymarket event_slug:** `epl-ars-che-2025` → Arsenal vs Chelsea, 2025

### Step 1: Parse Polymarket Event Slugs

In [18]:
def parse_event_slug(slug):
    """
    Parse event slug to extract team abbreviations and year.
    Example: 'epl-ars-che-2025' -> ('ars', 'che', '2025', 'epl')
    """
    if pd.isna(slug):
        return None, None, None, None
    
    parts = slug.lower().split('-')
    if len(parts) >= 4:
        league = parts[0]
        team1 = parts[1]
        team2 = parts[2]
        year = parts[3] if parts[3].isdigit() else None
        return team1, team2, year, league
    return None, None, None, None

# Apply parsing to markets
pm_markets[['team1_abbr', 'team2_abbr', 'year', 'league']] = pm_markets['event_slug'].apply(
    lambda x: pd.Series(parse_event_slug(x))
)

print("Sample parsed event slugs:")
pm_markets[['event_slug', 'team1_abbr', 'team2_abbr', 'year', 'league']].head(10)

Sample parsed event slugs:


Unnamed: 0,event_slug,team1_abbr,team2_abbr,year,league
0,will-ukraine-qualify-to-the-2022-fifa-world-cup,ukraine,qualify,,will
1,uefa-europa-league-final-who-will-win-eintrach...,europa,league,,uefa
2,soccer-who-will-win-the-united-states-vs-urugu...,who,will,,soccer
3,uefa-nations-league-who-will-win-the-germany-v...,nations,league,,uefa
4,2022-wimbledon,,,,
5,2022-wimbledon-championships-who-will-win-djok...,wimbledon,championships,,2022
6,2022-wimbledon-championships-who-will-win-djok...,wimbledon,championships,,2022
7,who-will-win-anthony-joshua-vs-oleksandr-usyk-2,will,win,,who
8,efl-cup-manchester-united-vs-newcastle,cup,manchester,,efl
9,champions-league-winner,,,,


### Step 2: Create Team Abbreviation Mapping

We need to map Polymarket abbreviations to StatsBomb full team names.

In [19]:
# Common team abbreviation mappings (extend this as needed)
TEAM_ABBR_MAP = {
    'ars': 'Arsenal',
    'che': 'Chelsea',
    'liv': 'Liverpool',
    'mci': 'Manchester City',
    'mun': 'Manchester United',
    'tot': 'Tottenham Hotspur',
    'new': 'Newcastle United',
    'avl': 'Aston Villa',
    'whu': 'West Ham United',
    'lei': 'Leicester City',
    'wol': 'Wolverhampton Wanderers',
    'eve': 'Everton',
    'cry': 'Crystal Palace',
    'sou': 'Southampton',
    'bha': 'Brighton & Hove Albion',
    'bur': 'Burnley',
    'ful': 'Fulham',
    'bre': 'Brentford',
    'nor': 'Norwich City',
    'wat': 'Watford',
    # La Liga
    'fcb': 'Barcelona',
    'rma': 'Real Madrid',
    'atm': 'Atlético Madrid',
    'sev': 'Sevilla',
    'val': 'Valencia',
    # Add more as you discover them in your data
}

def map_abbr_to_team(abbr):
    """Convert abbreviation to full team name."""
    if pd.isna(abbr):
        return None
    return TEAM_ABBR_MAP.get(abbr.lower(), None)

pm_markets['team1_full'] = pm_markets['team1_abbr'].apply(map_abbr_to_team)
pm_markets['team2_full'] = pm_markets['team2_abbr'].apply(map_abbr_to_team)

print("Sample team mappings:")
pm_markets[['event_slug', 'team1_abbr', 'team1_full', 'team2_abbr', 'team2_full']].dropna().head(10)

Sample team mappings:


Unnamed: 0,event_slug,team1_abbr,team1_full,team2_abbr,team2_full
287,epl-new-ars-2024-11-02,new,Newcastle United,ars,Arsenal
291,epl-sou-eve-2024-11-02,sou,Southampton,eve,Everton
292,epl-sou-eve-2024-11-02,sou,Southampton,eve,Everton
293,epl-sou-eve-2024-11-02,sou,Southampton,eve,Everton
770,epl-mun-bre-2024-10-19,mun,Manchester United,bre,Brentford
771,epl-mun-bre-2024-10-19,mun,Manchester United,bre,Brentford
772,epl-mun-bre-2024-10-19,mun,Manchester United,bre,Brentford
779,epl-sou-lei-2024-10-19,sou,Southampton,lei,Leicester City
780,epl-sou-lei-2024-10-19,sou,Southampton,lei,Leicester City
781,epl-sou-lei-2024-10-19,sou,Southampton,lei,Leicester City


### Step 3: Extract Match Dates from Polymarket

Polymarket `end_date` often corresponds to when the match happens.

In [20]:
# Convert to datetime and handle NaN values
pm_markets['end_date'] = pd.to_datetime(pm_markets['end_date'], errors='coerce')
pm_markets['match_date_est'] = pm_markets['end_date'].dt.date

# Also convert StatsBomb match_date
sb_matches['match_date'] = pd.to_datetime(sb_matches['match_date'], errors='coerce')
# Keep as datetime for easier date math, convert to date only when needed

# Print date ranges, filtering out NaN values
print("Date ranges:")
pm_dates_valid = pm_markets['match_date_est'].dropna()
sb_dates_valid = sb_matches['match_date'].dropna()

if len(pm_dates_valid) > 0:
    print(f"Polymarket: {pm_dates_valid.min()} to {pm_dates_valid.max()}")
else:
    print("Polymarket: No valid dates found")
    
if len(sb_dates_valid) > 0:
    print(f"StatsBomb: {sb_dates_valid.min().date()} to {sb_dates_valid.max().date()}")
else:
    print("StatsBomb: No valid dates found")

# Show how many records have valid dates
print(f"\nPolymarket records with valid dates: {len(pm_dates_valid):,} / {len(pm_markets):,}")
print(f"StatsBomb records with valid dates: {len(sb_dates_valid):,} / {len(sb_matches):,}")

Date ranges:
Polymarket: 2021-04-13 to 2026-07-20
StatsBomb: 1958-06-24 to 2025-07-27

Polymarket records with valid dates: 8,534 / 8,549
StatsBomb records with valid dates: 3,464 / 3,464


### Step 4: Join Polymarket to StatsBomb

We'll create a join based on:
- Team names (home/away combinations)
- Match date (with tolerance for ±3 days)

In [21]:
def find_matching_statsbomb_match(pm_row, sb_matches_df, date_tolerance_days=3):
    """
    Find StatsBomb match that corresponds to a Polymarket market.
    
    Returns: match_id or None
    """
    team1 = pm_row['team1_full']
    team2 = pm_row['team2_full']
    pm_date = pm_row['end_date']  # Use datetime instead of date object
    
    # Check for missing values
    if pd.isna(team1) or pd.isna(team2) or pd.isna(pm_date):
        return None
    
    # Filter StatsBomb matches by date range
    date_min = pm_date - pd.Timedelta(days=date_tolerance_days)
    date_max = pm_date + pd.Timedelta(days=date_tolerance_days)
    
    # Find matches with these teams (in either home/away order)
    matches = sb_matches_df[
        (sb_matches_df['match_date'] >= date_min) &
        (sb_matches_df['match_date'] <= date_max) &
        (
            ((sb_matches_df['home_team'] == team1) & (sb_matches_df['away_team'] == team2)) |
            ((sb_matches_df['home_team'] == team2) & (sb_matches_df['away_team'] == team1))
        )
    ]
    
    if len(matches) == 0:
        return None
    elif len(matches) == 1:
        return matches.iloc[0]['match_id']
    else:
        # Multiple matches found - return closest by date
        matches = matches.copy()
        matches['date_diff'] = abs((matches['match_date'] - pm_date).dt.total_seconds())
        return matches.nsmallest(1, 'date_diff').iloc[0]['match_id']

# Apply matching (this may take a minute for large datasets)
print("Matching Polymarket markets to StatsBomb matches...")
print("This may take 1-2 minutes depending on dataset size...\n")

pm_markets['statsbomb_match_id'] = pm_markets.apply(
    lambda row: find_matching_statsbomb_match(row, sb_matches),
    axis=1
)

# Summary
matched_count = pm_markets['statsbomb_match_id'].notna().sum()
total_count = len(pm_markets)
print(f"\nMatched {matched_count:,} / {total_count:,} markets ({matched_count/total_count*100:.1f}%)")

if matched_count == 0:
    print("\n⚠️ No matches found! This could be because:")
    print("   1. Date ranges don't overlap between datasets")
    print("   2. Team abbreviations need to be added to TEAM_ABBR_MAP")
    print("   3. Different competitions/leagues in the two datasets")
    print("\nDebugging tips:")
    print("   - Check unique team abbreviations: pm_markets['team1_abbr'].unique()")
    print("   - Check StatsBomb teams: sb_matches['home_team'].unique()")
    print("   - Verify date overlap between datasets")
elif matched_count < total_count * 0.3:
    print("\n⚠️ Low match rate. Consider:")
    print("   - Expanding TEAM_ABBR_MAP with missing abbreviations")
    print("   - Increasing date_tolerance_days parameter")
    print("   - Checking if datasets cover different time periods")

Matching Polymarket markets to StatsBomb matches...
This may take 1-2 minutes depending on dataset size...


Matched 0 / 8,549 markets (0.0%)

⚠️ No matches found! This could be because:
   1. Date ranges don't overlap between datasets
   2. Team abbreviations need to be added to TEAM_ABBR_MAP
   3. Different competitions/leagues in the two datasets

Debugging tips:
   - Check unique team abbreviations: pm_markets['team1_abbr'].unique()
   - Check StatsBomb teams: sb_matches['home_team'].unique()
   - Verify date overlap between datasets


### Whoa, no matches, we need to examine these datasets a bit more. 

Let's start by trying to determine if the datasets will allow for matching. For example, 
it could be the case the datasets have different date ranges with no overlaps. Or, temporal 
overlaps may exist, but not for all competitions. 

In [22]:
# Get unique competitions from StatsBomb
sb_competitions = sb_matches['competition_name'].unique()

# Map Polymarket league abbreviations to competition names
# This helps us compare apples-to-apples
league_mapping = {
    'epl': 'Premier League',
    'la': 'La Liga',
    'liga': 'La Liga',
    'laliga': 'La Liga',
    'ucl': 'Champions League',
    'champions': 'Champions League',
    'bl1': '1. Bundesliga',
    'bundesliga': '1. Bundesliga',
    'serie': 'Serie A',
    'ligue': 'Ligue 1',
    'fl1': 'Ligue 1',
    'uefa': 'UEFA Euro',
    'euro': 'UEFA Euro',
    'fifa': 'FIFA World Cup',
    'mls': 'Major League Soccer',
}

print("=" * 80)
print("DATE OVERLAP ANALYSIS: Polymarket vs StatsBomb")
print("=" * 80)

# Check each competition
overlap_results = []

for comp in sb_competitions:
    # Get StatsBomb date range for this competition
    sb_comp = sb_matches[sb_matches['competition_name'] == comp]
    sb_min = sb_comp['match_date'].min()
    sb_max = sb_comp['match_date'].max()
    
    # Find corresponding Polymarket league(s)
    pm_leagues = [league for league, name in league_mapping.items() if name == comp]
    
    if pm_leagues:
        # Get Polymarket data for matching leagues
        pm_comp = pm_markets[pm_markets['league'].isin(pm_leagues)]
        
        if len(pm_comp) > 0:
            pm_dates = pd.to_datetime(pm_comp['end_date'], errors='coerce').dropna()
            
            if len(pm_dates) > 0:
                pm_min = pm_dates.min()
                pm_max = pm_dates.max()
                
                # Check for overlap
                has_overlap = not (sb_max < pm_min or pm_max < sb_min)
                
                if has_overlap:
                    overlap_start = max(sb_min, pm_min)
                    overlap_end = min(sb_max, pm_max)
                    overlap_days = (overlap_end - overlap_start).days
                else:
                    overlap_days = 0
                    
                gap_days = (pm_min - sb_max).days if sb_max < pm_min else (sb_min - pm_max).days
                
                overlap_results.append({
                    'Competition': comp,
                    'StatsBomb_Start': sb_min,
                    'StatsBomb_End': sb_max,
                    'Polymarket_Start': pm_min,
                    'Polymarket_End': pm_max,
                    'Has_Overlap': has_overlap,
                    'Overlap_Days': overlap_days if has_overlap else 0,
                    'Gap_Days': gap_days if not has_overlap else 0,
                    'StatsBomb_Matches': len(sb_comp),
                    'Polymarket_Markets': len(pm_comp)
                })

# Create DataFrame for analysis
overlap_df = pd.DataFrame(overlap_results)

# Sort by StatsBomb matches (most data first)
if len(overlap_df) > 0:
    overlap_df = overlap_df.sort_values('StatsBomb_Matches', ascending=False)
    
    print("\nCOMPETITIONS WITH DATE RANGE COMPARISON:")
    print("-" * 80)
    
    for idx, row in overlap_df.iterrows():
        print(f"\n{row['Competition']}:")
        print(f"  StatsBomb:  {row['StatsBomb_Start'].strftime('%Y-%m-%d')} to {row['StatsBomb_End'].strftime('%Y-%m-%d')} ({row['StatsBomb_Matches']} matches)")
        print(f"  Polymarket: {row['Polymarket_Start'].strftime('%Y-%m-%d')} to {row['Polymarket_End'].strftime('%Y-%m-%d')} ({row['Polymarket_Markets']} markets)")
        
        if row['Has_Overlap']:
            print(f"  ✅ OVERLAP: {row['Overlap_Days']} days")
        else:
            print(f"  ❌ NO OVERLAP - Gap of {row['Gap_Days']} days")
    
    # Summary statistics
    print("\n" + "=" * 80)
    print("SUMMARY:")
    print("-" * 80)
    total_comps = len(overlap_df)
    overlap_comps = overlap_df['Has_Overlap'].sum()
    
    print(f"Total competitions checked: {total_comps}")
    print(f"Competitions with overlap: {overlap_comps}")
    print(f"Competitions with NO overlap: {total_comps - overlap_comps}")
    
    if overlap_comps > 0:
        print("\n✅ MATCHING POSSIBLE in these competitions:")
        overlap_df[overlap_df['Has_Overlap']].apply(
            lambda x: print(f"   - {x['Competition']} ({x['Overlap_Days']} days overlap)"), 
            axis=1
        )
    else:
        print("\n❌ NO TEMPORAL OVERLAP found in any competition")
        
else:
    print("\n❌ No matching competitions found between datasets")

print("\n" + "=" * 80)

DATE OVERLAP ANALYSIS: Polymarket vs StatsBomb

COMPETITIONS WITH DATE RANGE COMPARISON:
--------------------------------------------------------------------------------

La Liga:
  StatsBomb:  1974-02-17 to 2021-05-16 (868 matches)
  Polymarket: 2024-09-29 to 2026-05-30 (103 markets)
  ❌ NO OVERLAP - Gap of 1232 days

Ligue 1:
  StatsBomb:  2015-08-07 to 2023-06-03 (435 matches)
  Polymarket: 2025-04-23 to 2026-05-30 (735 markets)
  ❌ NO OVERLAP - Gap of 690 days

Premier League:
  StatsBomb:  2003-08-16 to 2016-05-17 (418 matches)
  Polymarket: 2023-09-30 to 2026-05-27 (2107 markets)
  ❌ NO OVERLAP - Gap of 2692 days

Serie A:
  StatsBomb:  1986-11-09 to 2016-05-15 (381 matches)
  Polymarket: 2021-12-04 to 2026-05-28 (91 markets)
  ❌ NO OVERLAP - Gap of 2029 days

1. Bundesliga:
  StatsBomb:  2015-08-14 to 2024-05-18 (340 matches)
  Polymarket: 2024-09-28 to 2026-05-28 (38 markets)
  ❌ NO OVERLAP - Gap of 133 days

FIFA World Cup:
  StatsBomb:  1958-06-24 to 2022-12-18 (147 matches)


So, we have some teporal overlaps for two competitions, let's check what Polymarkets exist.

In [23]:
# Check FIFA World Cup overlap period
sb_wc_overlap = sb_matches[
    (sb_matches['competition_name'] == 'FIFA World Cup') &
    (sb_matches['match_date'] >= '2022-03-30') &
    (sb_matches['match_date'] <= '2022-12-18')
]
print(f"StatsBomb World Cup matches in overlap period: {len(sb_wc_overlap)}")
print(sb_wc_overlap[['match_date', 'home_team', 'away_team']].head(10))

# Check UEFA Euro overlap period  
sb_euro_overlap = sb_matches[
    (sb_matches['competition_name'] == 'UEFA Euro') &
    (sb_matches['match_date'] >= '2021-11-03') &
    (sb_matches['match_date'] <= '2024-07-14')
]
print(f"\nStatsBomb Euro matches in overlap period: {len(sb_euro_overlap)}")
print(sb_euro_overlap[['match_date', 'home_team', 'away_team']].head(10))

# Check Polymarket World Cup markets in overlap period
pm_wc_overlap = pm_markets[
    (pm_markets['league'] == 'fifa') &
    (pm_markets['end_date'] >= '2022-03-30') &
    (pm_markets['end_date'] <= '2022-12-18')
]
print(f"\nPolymarket World Cup markets in overlap period: {len(pm_wc_overlap)}")
print(pm_wc_overlap[['question', 'end_date']].head(10))

# Check Polymarket Euro markets in overlap period
pm_euro_overlap = pm_markets[
    (pm_markets['league'] == 'uefa') &
    (pm_markets['end_date'] >= '2021-11-03') &
    (pm_markets['end_date'] <= '2024-07-14')
]
print(f"\nPolymarket Euro markets in overlap period: {len(pm_euro_overlap)}")
print(pm_euro_overlap[['question', 'end_date']].head(10))

StatsBomb World Cup matches in overlap period: 64
     match_date    home_team    away_team
2215 2022-12-02       Serbia  Switzerland
2216 2022-12-03    Argentina    Australia
2217 2022-11-30    Australia      Denmark
2218 2022-11-24       Brazil       Serbia
2219 2022-11-26      Tunisia    Australia
2220 2022-11-29      Ecuador      Senegal
2221 2022-12-09  Netherlands    Argentina
2222 2022-11-24      Uruguay  South Korea
2223 2022-12-10      Morocco     Portugal
2224 2022-12-18    Argentina       France

StatsBomb Euro matches in overlap period: 51
     match_date    home_team    away_team
2466 2024-07-10  Netherlands      England
2467 2024-07-14        Spain      England
2468 2024-07-09        Spain       France
2469 2024-07-06  Netherlands       Turkey
2470 2024-07-05     Portugal       France
2471 2024-06-25  Netherlands      Austria
2472 2024-06-20      Denmark      England
2473 2024-07-06      England  Switzerland
2474 2024-07-05        Spain      Germany
2475 2024-06-26      U

In [24]:
print("\n" + "=" * 80)
print("VERIFICATION: Checking actual matches in overlap periods")
print("=" * 80)

# World Cup verification
sb_wc_overlap = sb_matches[
    (sb_matches['competition_name'] == 'FIFA World Cup') &
    (sb_matches['match_date'] >= '2022-03-30') &
    (sb_matches['match_date'] <= '2022-12-18')
]
pm_wc_overlap = pm_markets[
    (pm_markets['league'] == 'fifa') &
    (pm_markets['end_date'] >= '2022-03-30') &
    (pm_markets['end_date'] <= '2022-12-18')
]

print(f"\nFIFA World Cup overlap period (Mar-Dec 2022):")
print(f"  StatsBomb matches: {len(sb_wc_overlap)}")
print(f"  Polymarket markets: {len(pm_wc_overlap)}")

# Euro verification
sb_euro_overlap = sb_matches[
    (sb_matches['competition_name'] == 'UEFA Euro') &
    (sb_matches['match_date'] >= '2021-11-03') &
    (sb_matches['match_date'] <= '2024-07-14')
]
pm_euro_overlap = pm_markets[
    (pm_markets['league'] == 'uefa') &
    (pm_markets['end_date'] >= '2021-11-03') &
    (pm_markets['end_date'] <= '2024-07-14')
]

print(f"\nUEFA Euro overlap period (Nov 2021-Jul 2024):")
print(f"  StatsBomb matches: {len(sb_euro_overlap)}")
print(f"  Polymarket markets: {len(pm_euro_overlap)}")
print(f"  Sample Polymarket questions:")
for q in pm_euro_overlap['question'].head(3):
    print(f"    - {q}")



VERIFICATION: Checking actual matches in overlap periods

FIFA World Cup overlap period (Mar-Dec 2022):
  StatsBomb matches: 64
  Polymarket markets: 1

UEFA Euro overlap period (Nov 2021-Jul 2024):
  StatsBomb matches: 51
  Polymarket markets: 10
  Sample Polymarket questions:
    - UEFA Europa League final: Who will win Eintracht vs. Rangers?
    - UEFA Nations League: Who will win the Germany vs. England game on June 7?
    - UEFA Champions League: Will there be 3 or more goals in the Liverpool v. Atletico game on November 3?


The current datasets do not have a meaningful overlap. Therefore, any Polymarket activities will need to be done with a different dataset than the current Statsbomb data. One possibility would be to gather the data necessary to correlate with Polymarket data. 