In [12]:
import polars as pl
from pathlib import Path

# Root of the project's data folder, relative to the notebook's working directory.
DATA_DIR = Path("..", "data")
# Sub-folder that holds the downloaded Polymarket files.
POLYMARKET_DIR = DATA_DIR.joinpath("Polymarket")

## Example: Manchester United vs Arsenal (Three-Outcome Market)

Before kickoff, Polymarket lists a **three-outcome market** (a market with more than two mutually exclusive results) for *Manchester United vs Arsenal*.  

Each possible result is represented by an **outcome share** (a tradable token that pays out $1 if that outcome happens):  
- **Manchester United win** trading at **$0.41**  
- **Draw** trading at **$0.30**  
- **Arsenal win** trading at **$0.29**  

Here, each **price** represents the **market-implied probability** of that outcome.

The market is identified by a **slug** (a human-readable identifier such as  
`manchester-united-vs-arsenal-match-winner` used in URLs and API queries).

As team news arrives, traders buy and sell shares, causing **price movement** (changes in implied probability driven by new information), for example pushing the United-win share up to **$0.47** after strong lineups are announced.

After the final whistle, the **market resolves** (the official outcome is declared), the winning outcome’s shares **settle** (pay out **$1 per share**), and all other outcome shares settle at **$0**.

---

## Mini Glossary

- **Market**: A single question being asked (e.g. *Who will win this match?*).  
- **Outcome**: One possible answer to a market (United win, Draw, Arsenal win).  
- **Share**: A tradable token tied to an outcome that pays $1 if that outcome occurs.  
- **Price**: The cost of a share; interpreted as the crowd-implied probability.  
- **Slug**: A human-readable market identifier used in URLs and APIs.  
- **Resolve / Settlement**: The process of declaring the true outcome and paying out shares.


### what data do we actually have downloaded?

In [None]:
# Inventory of the downloaded Polymarket files. iterdir() yields entries in
# arbitrary (filesystem) order, so the order of this listing carries no meaning.
[*POLYMARKET_DIR.iterdir()]

[PosixPath('../data/Polymarket/soccer_markets.parquet'),
 PosixPath('../data/Polymarket/soccer_odds_history.parquet'),
 PosixPath('../data/Polymarket/soccer_trades.parquet'),
 PosixPath('../data/Polymarket/soccer_tokens.parquet'),
 PosixPath('../data/Polymarket/polymarket_soccer_analytics_schema.md'),
 PosixPath('../data/Polymarket/soccer_summary.parquet'),
 PosixPath('../data/Polymarket/soccer_event_stats.parquet')]

### explore markets data

In [19]:
# Load the market metadata table. Reuse POLYMARKET_DIR so the Polymarket folder
# is spelled out in exactly one place (the config cell).
markets = pl.read_parquet(POLYMARKET_DIR / "soccer_markets.parquet")

In [20]:
# (rows, columns) of the markets table
markets.shape

(8549, 10)

In [21]:
# Preview the first rows to see which columns exist and what their dtypes are
markets.head()

market_id,question,slug,event_slug,category,volume,active,closed,created_at,end_date
str,str,str,str,str,f64,bool,bool,datetime[μs],datetime[μs]
"""242920""","""Will Ukraine qualify for the 2…","""will-ukraine-qualify-to-the-20…","""will-ukraine-qualify-to-the-20…","""Sports""",4766.88,True,True,2022-04-06 07:51:48,2022-06-30 00:00:00
"""244963""","""UEFA Europa League final: Who …","""uefa-europa-league-final-who-w…","""uefa-europa-league-final-who-w…","""Sports""",1543.29,True,True,2022-05-18 14:16:53,2022-05-18 00:00:00
"""246443""","""Soccer: Who will win the Unite…","""soccer-who-will-win-the-united…","""soccer-who-will-win-the-united…","""Sports""",1363.07,True,True,2022-06-05 12:45:16,2022-06-05 00:00:00
"""246490""","""UEFA Nations League: Who will …","""uefa-nations-league-who-will-w…","""uefa-nations-league-who-will-w…","""Sports""",1031.58,True,True,2022-06-06 17:09:19,2022-06-07 00:00:00
"""246661""","""2022 Wimbledon Championships: …","""2022-wimbledon-championships-w…","""2022-wimbledon""","""Sports""",3098.29,True,True,2022-07-06 19:33:08,2022-07-08 00:00:00


In [30]:
# Frequency of each category label, most common first.
(
    markets.get_column("category")
    .value_counts()
    .sort(by="count", descending=True)
)

category,count
str,u32
"""""",8397
"""Sports""",150
"""Olympics""",2


In [31]:
# First ten question texts, to get a feel for how markets are worded.
# NOTE(review): the preview includes non-soccer questions (Wimbledon, Anthony Joshua),
# so this extract is not purely soccer - confirm whether those need filtering out.
markets.head(10).select("question")

question
str
"""Will Ukraine qualify for the 2…"
"""UEFA Europa League final: Who …"
"""Soccer: Who will win the Unite…"
"""UEFA Nations League: Who will …"
"""2022 Wimbledon Championships: …"
"""2022 Wimbledon Championships: …"
"""2022 Wimbledon Championships: …"
"""Who will win Anthony Joshua vs…"
"""EFL Cup: Manchester United vs.…"
"""Will Manchester City win the 2…"


### explore the tokens data

In [32]:
# Load the outcome-token table via the shared POLYMARKET_DIR constant (instead of
# rebuilding the folder path by hand) and report its (rows, columns).
tokens = pl.read_parquet(POLYMARKET_DIR / "soccer_tokens.parquet")
tokens.shape

(17096, 3)

In [33]:
# How often each outcome label occurs across all tokens, most common first.
(
    tokens.get_column("outcome")
    .value_counts()
    .sort(by="count", descending=True)
)

outcome,count
str,u32
"""Yes""",7375
"""No""",7375
"""Under""",572
"""Over""",572
"""Liverpool""",15
…,…
"""BHA/Draw""",1
"""Anderlecht""",1
"""FOR""",1
"""MAN/Draw""",1


In [None]:
# Step 1: count tokens per market (column "n_outcomes").
# Step 2: tally how many markets share each outcome count.
(
    tokens.get_column("market_id")
    .value_counts(name="n_outcomes")
    .get_column("n_outcomes")
    .value_counts()
    .sort(by="count")
)
# Takeaway: nearly every market is binary (two outcomes) rather than three-way.

n_outcomes,count
u32,u32
3,2
2,8545


In [40]:
# Alphabetical list of the distinct outcome labels.
# The displayed values show stray whitespace on some labels (e.g. " PSG"),
# so labels would need stripping before being matched against anything.
(
    tokens.select(pl.col("outcome"))
    .unique()
    .sort(by="outcome")
)

outcome
str
""" Eintracht Frankfurt"""
""" PSG"""
""" Rangers"""
"""-other-"""
"""AC Milan"""
…
"""Wolves +0.5"""
"""Yes"""
"""Zenit"""
"""Zverev"""


In [41]:
# Outcome label frequencies once more, most common first (rare labels show up in the tail).
(
    tokens.select("outcome")
    .to_series()
    .value_counts()
    .sort("count", descending=True)
)

outcome,count
str,u32
"""Yes""",7375
"""No""",7375
"""Under""",572
"""Over""",572
"""Liverpool""",15
…,…
"""Canada """,1
"""Usyk""",1
"""West Ham +0.5""",1
"""Madrid""",1


### explore soccer summary data

In [42]:
# Load the per-market summary table via the shared POLYMARKET_DIR constant (instead
# of rebuilding the folder path by hand) and report its (rows, columns).
summary = pl.read_parquet(POLYMARKET_DIR / "soccer_summary.parquet")
summary.shape

(8549, 9)

In [43]:
# Preview three rows of the summary table.
# NOTE(review): first_trade / last_trade render as January 1970 even though the slugs
# reference 2025 fixtures - looks like an epoch-unit mismatch in the stored timestamps;
# confirm and convert before relying on these two columns.
summary.head(3)

market_id,question,slug,volume,active,token_count,trade_count,first_trade,last_trade
str,str,str,f64,bool,i64,i64,datetime[μs],datetime[μs]
"""580986""","""MLS: Will St. Louis City SC be…","""mls-will-st-louis-city-sc-beat…",1312.879974,True,2,31,1970-01-21 07:54:02.835,1970-01-21 07:56:53.760
"""794127""","""Spread: Paris Saint-Germain FC…","""fl1-met-psg-2025-12-13-spread-…",39.473683,True,2,1,1970-01-21 10:18:46.735,1970-01-21 10:18:46.735
"""718033""","""Will Rangers FC win on 2025-12…","""uel-ftc-ran1-2025-12-11-ran1""",0.0,True,2,87,1970-01-21 10:18:58.991,1970-01-21 10:21:23.967


In [47]:
# How many markets have each trade count, most common count first.
(
    summary.get_column("trade_count")
    .value_counts()
    .sort(by="count", descending=True)
)

trade_count,count
i64,u32
0,5304
1,95
2,89
4,75
8,53
…,…
550,1
1752,1
2149,1
774,1


In [48]:
# trade count being 0 means that nobody ever bet in that market, and that's a large part of the data, so we need to cut down to where we have at least 10 or so

In [50]:
# Keep only markets that were traded at least once and check what is left:
# roughly 38% of the original rows survive this filter.
summary.filter(pl.col("trade_count").gt(0)).shape

(3245, 9)

In [51]:
# Distribution of trades per market, restricted to markets with at least one trade.
(
    summary.filter(pl.col("trade_count").gt(0))
    .get_column("trade_count")
    .describe()
)

statistic,value
str,f64
"""count""",3245.0
"""null_count""",0.0
"""mean""",350.975039
"""std""",936.898568
"""min""",1.0
"""25%""",19.0
"""50%""",70.0
"""75%""",279.0
"""max""",29335.0


In [53]:
# Number of distinct markets that have ten or more trades.
(
    summary.filter(pl.col("trade_count").ge(10))
    .select("market_id")
    .n_unique()
)

2735

In [None]:
# Working set for the rest of the notebook: markets with at least ten trades.
usable = summary.filter(pl.col("trade_count").ge(10))
# Distinct market ids vs. distinct question texts: the two counts are close, so
# almost every market asks its own question and only a few questions repeat.
(
    usable.select("market_id").n_unique(),
    usable.select("question").n_unique(),
)

(2735, 2684)

In [55]:
# Attach each usable market's event_slug, then count the distinct events.
(
    usable.join(
        markets.select("market_id", "event_slug"),
        on="market_id",
        how="inner",
    )
    .select("event_slug")
    .n_unique()
)

1215

In [56]:
# so, there are 2735 markets that map to 1215 events... meaning multiple markets exist per a single real-world match/event

In [57]:
# Markets per event: attach event_slug, count rows per event, summarise the counts.
(
    usable.join(markets.select("market_id", "event_slug"), on="market_id", how="inner")
    .group_by("event_slug")
    .len()
    .describe()
)

statistic,event_slug,len
str,str,f64
"""count""","""1215""",1215.0
"""null_count""","""0""",0.0
"""mean""",,2.251029
"""std""",,2.921888
"""min""","""2025-uefa-european-u21-champio…",1.0
"""25%""",,1.0
"""50%""",,2.0
"""75%""",,2.0
"""max""","""will-olympique-lyon-play-in-li…",43.0


In [58]:
# so , we are seeing about 2 (2.25) markets per event.

### lets look at the odds data

In [59]:
# Load the odds-history table via the shared POLYMARKET_DIR constant (instead of
# rebuilding the folder path by hand) and report its (rows, columns).
odds = pl.read_parquet(POLYMARKET_DIR / "soccer_odds_history.parquet")
odds.shape

(666837, 4)

In [None]:
# Re-read the raw integer behind `timestamp` as epoch milliseconds: the cast to Int64
# exposes the stored number, the cast to Datetime("ms") interprets it as ms since epoch.
# NOTE(review): presumably the file stores millisecond values under a different declared
# time unit - confirm against the schema file in the Polymarket folder.
odds = odds.with_columns(
    pl.col("timestamp").cast(pl.Int64).cast(pl.Datetime("ms"))
)

# Sanity check: earliest and latest snapshot time after the conversion.
odds.select(
    pl.col("timestamp").min().alias("min_ts"),
    pl.col("timestamp").max().alias("max_ts"),
)

# Without this explicit reinterpretation the timestamps come out as dates in the 1970s.
# With correct timestamps in place, pre-match prices can be extracted.

min_ts,max_ts
datetime[ms],datetime[ms]
2023-02-23 00:00:35,2026-01-05 01:00:09


In [64]:
# Number of odds snapshots per market, summarised across markets.
(
    odds.group_by("market_id")
    .len()
    .describe()
)

statistic,market_id,len
str,str,f64
"""count""","""8322""",8322.0
"""null_count""","""0""",0.0
"""mean""",,80.129416
"""std""",,134.421926
"""min""","""248844""",2.0
"""25%""",,8.0
"""50%""",,26.0
"""75%""",,50.0
"""max""","""902161""",668.0


In [65]:
# okay, wow, the median market has about 26 odds snapshots (the mean is ~80).. that's amazing

In [None]:
# How many of the usable markets have any odds history at all?
# Only the join key is needed, so narrow both sides to market_id before joining.
(
    odds.select("market_id")
    .join(usable.select("market_id"), on="market_id", how="inner")
    .n_unique()
)

# perfect: pre-match odds are available for almost all usable markets

2715

In [69]:
# Open question: with so many snapshots per market, which one is the pre-match price?
# First idea: the earliest snapshot. Look at how those earliest timestamps are distributed.
(
    odds.group_by("market_id")
    .agg(first_odds_ts=pl.col("timestamp").min())
    .describe()
)


statistic,market_id,first_odds_ts
str,str,str
"""count""","""8322""","""8322"""
"""null_count""","""0""","""0"""
"""mean""",,"""2025-06-27 01:09:45.883000"""
"""std""",,
"""min""","""248844""","""2023-02-23 00:00:35"""
"""25%""",,"""2025-03-20 00:00:05"""
"""50%""",,"""2025-08-16 00:00:04"""
"""75%""",,"""2025-11-04 00:00:12"""
"""max""","""902161""","""2025-12-09 20:00:15"""


In [70]:
# alternatively, we can use the kickoff field from the Statsbomb data, so we're good, we can use that also

### explore trades data

In [71]:
# Load the trade-level table via the shared POLYMARKET_DIR constant (instead of
# rebuilding the folder path by hand) and report its (rows, columns).
trades = pl.read_parquet(POLYMARKET_DIR / "soccer_trades.parquet")
trades.shape

(1138914, 9)

In [73]:
# Range of traded prices (should stay strictly between 0 and 1 for $1-payout shares).
trades.select(
    min_price=pl.col("price").min(),
    max_price=pl.col("price").max(),
)


min_price,max_price
f64,f64
0.001,0.999


In [74]:
# Smallest and largest trade size in the data.
trades.select(
    pl.min("size").alias("min_size"),
    pl.max("size").alias("max_size"),
)

min_size,max_size
f64,f64
0.005,522169.68


In [76]:
# Number of trades on each side (BUY / SELL), larger group first.
(
    trades.group_by("side")
    .len()
    .sort(by="len", descending=True)
)

side,len
str,u32
"""BUY""",935599
"""SELL""",203315


In [77]:
# Trades per market, summarised across all markets that appear in the trades table.
(
    trades.group_by("market_id")
    .len()
    .describe()
)

statistic,market_id,len
str,str,f64
"""count""","""3245""",3245.0
"""null_count""","""0""",0.0
"""mean""",,350.975039
"""std""",,936.898568
"""min""","""506741""",1.0
"""25%""",,19.0
"""50%""",,70.0
"""75%""",,279.0
"""max""","""902160""",29335.0


In [78]:
# good amount of trades per market

### lets explore soccer events stats

In [79]:
# Load the per-event statistics table via the shared POLYMARKET_DIR constant (instead
# of rebuilding the folder path by hand) and report its (rows, columns).
events = pl.read_parquet(POLYMARKET_DIR / "soccer_event_stats.parquet")
events.shape

(2640, 5)

In [80]:
# Preview three rows of the event-level table to see its columns and dtypes
events.head(3)

event_slug,market_count,total_volume,first_market_start,last_market_end
str,i64,f64,datetime[μs],datetime[μs]
"""real-madrid-vs-bayern-munich-u…",1,1794.81,2024-05-06 16:06:33,2024-05-08 12:00:00
"""champions-league-winner-2025""",36,1001700000.0,2024-09-16 22:17:10,2025-05-31 12:00:00
"""epl-ips-mac-2025-01-19""",3,163470.299061,2025-01-07 01:31:12,2025-01-19 16:30:00


In [81]:
# Summary statistics for the number of markets attached to each event.
events.select(pl.col("market_count")).describe()

statistic,market_count
str,f64
"""count""",2640.0
"""null_count""",0.0
"""mean""",3.238258
"""std""",3.687239
"""min""",1.0
"""25%""",3.0
"""50%""",3.0
"""75%""",3.0
"""max""",64.0


### Polymarket Data – EDA Takeaways

- Polymarket dataset is mostly binary markets (Yes/No) rather than three-way win/draw/loss markets.
- A large fraction of markets have fewer than 10 trades (about 62% have no trades at all).
- After filtering to markets with at least 10 trades, ~2.7k markets remain, mapping to ~1.2k distinct events (matches).
- Each event typically has 1–3 associated markets.
- Odds history provides multiple timestamped snapshots per market; these have to be matched against the kickoff time from the Statsbomb data to isolate pre-match prices.
- Timestamps in Polymarket odds require explicit casting to proper datetimes.