In [1]:
from initial import *

import pandas as pd

In [2]:
# from gamma import gamma_api

# gamma_api()

## 下载原始event数据

In [3]:
# 目标 URL
url = "https://gamma-api.polymarket.com/events?active=true&closed=false&archived=false&limit=1000000000000000000"
# url = 'https://gamma-api.polymarket.com/events?closed=false'

# 发送 GET 请求
response = requests.get(url)
response.raise_for_status()  

# 将返回的 JSON 数据解析成 Python 对象
data = response.json()


if isinstance(data, list):
    df = pd.DataFrame(data)
elif isinstance(data, dict):
    key = "events"  
    if key in data:
        df = pd.DataFrame(data[key])
    else:
        df = pd.DataFrame(data)
else:
    raise ValueError("未知的数据结构")

df_market = df[['id','title','markets']]

df_market

Unnamed: 0,id,title,markets
0,12483,Premier League Winner,"[{'id': '506728', 'question': 'Manchester City..."
1,12585,Champions League Winner,"[{'id': '507300', 'question': 'Will Inter Mila..."
2,12672,La Liga Winner,"[{'id': '507395', 'question': 'Will Real Madri..."
3,12756,English Premier League Top Scorer,"[{'id': '507683', 'question': 'Will Luis Diaz ..."
4,12815,NBA Champion,"[{'id': '507869', 'question': 'Will the LA Cli..."
...,...,...,...
495,18396,Will Russia capture Kupiansk by...?,"[{'id': '522894', 'question': 'Will Russia cap..."
496,18400,"Will Trump rename Greenland 'Red, White, and B...","[{'id': '522922', 'question': 'Will Trump rena..."
497,18402,Will an FBI agent be charged for leaking ICE p...,"[{'id': '522932', 'question': 'Will an FBI age..."
498,18413,Who will be named in Epstein files by June 30?,"[{'id': '525315', 'question': 'Will Anderson C..."


## 提取 ID，title，markets，volume24hr_list

In [4]:
market_volumes = []

for _, row in df_market.iterrows():
    each_market = row['markets']
    
    if isinstance(each_market, list):  # Ensure it’s a list
        volumes = [(
                    entry.get('question', 'Unknown'), 
                    entry.get('volume24hrClob', 'Unknown'),
                    entry.get('clobTokenIds', 'Unknown'),
                    entry.get('conditionId', 'Unknown'), 
                    entry.get('spread', 'Unknown'),
                    entry.get('startDate','Unknown'),
                    entry.get('endDate','Unknown')
                    )
                    for entry in each_market if isinstance(entry, dict)]
        market_volumes.append(volumes)
    else:
        market_volumes.append([])  # Handle cases where 'markets' is not a list

df_market['volume24hr_spread_list'] = market_volumes

df_market

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_market['volume24hr_spread_list'] = market_volumes


Unnamed: 0,id,title,markets,volume24hr_spread_list
0,12483,Premier League Winner,"[{'id': '506728', 'question': 'Manchester City...","[(Manchester City wins the Premier League?, 10..."
1,12585,Champions League Winner,"[{'id': '507300', 'question': 'Will Inter Mila...",[(Will Inter Milan win the UEFA Champions Leag...
2,12672,La Liga Winner,"[{'id': '507395', 'question': 'Will Real Madri...","[(Will Real Madrid win La Liga?, 1723.621166, ..."
3,12756,English Premier League Top Scorer,"[{'id': '507683', 'question': 'Will Luis Diaz ...",[(Will Luis Diaz be the top goalscorer in the ...
4,12815,NBA Champion,"[{'id': '507869', 'question': 'Will the LA Cli...",[(Will the LA Clippers win the 2025 NBA Finals...
...,...,...,...,...
495,18396,Will Russia capture Kupiansk by...?,"[{'id': '522894', 'question': 'Will Russia cap...","[(Will Russia capture Kupiansk by April 30?, 8..."
496,18400,"Will Trump rename Greenland 'Red, White, and B...","[{'id': '522922', 'question': 'Will Trump rena...","[(Will Trump rename Greenland 'Red, White, and..."
497,18402,Will an FBI agent be charged for leaking ICE p...,"[{'id': '522932', 'question': 'Will an FBI age...",[(Will an FBI agent be charged for leaking ICE...
498,18413,Who will be named in Epstein files by June 30?,"[{'id': '525315', 'question': 'Will Anderson C...",[(Will Anderson Cooper be named in Epstein fil...


## 基于df_market，提取每个market的yes和no的token id

In [5]:
# Step 2: 创建一个新的 DataFrame，
# 其中包含满足条件的交易量事件及其 market id，同时添加 spread 列
filtered_data = [
    {
        'id': row['id'], 
        'question': entry[0], 
        'volume24hrClob': entry[1],
        'TokenID': entry[2],
        'conditionId': entry[3],
        'spread': entry[4],
        'startDate': entry[5],
        'endDate': entry[6]
    }
    for _, row in df_market.iterrows()
    for entry in row['volume24hr_spread_list']
    # 如果需要过滤：if lower_threshold <= entry[1] <= upper_threshold
]

df_filtered_volume = pd.DataFrame(filtered_data)
df_filtered_volume

Unnamed: 0,id,question,volume24hrClob,TokenID,conditionId,spread,startDate,endDate
0,12483,Manchester City wins the Premier League?,101861.994991,"[""88643218703041538921477211574402135868068849...",0xeba50b6a5e1a5a682c8aaf34152d4eb91e0410d59117...,0.001,2024-09-09T20:24:52.495018Z,2025-05-25T12:00:00Z
1,12483,Arsenal wins the Premier League?,39759.880513,"[""73195466598131388170821776880758003788512546...",0x320045be25e331375755d9126cbe20a319ace7c7d925...,0.001,2024-09-09T20:28:43.533604Z,2025-05-25T12:00:00Z
2,12483,Aston Villa wins the Premier League?,105332.26,"[""83156492313921775878087837698449837132415407...",0x951ead358ab84c4314b8a619593471d80c687cc02720...,0.001,2024-09-09T20:30:02.775002Z,2025-05-25T12:00:00Z
3,12483,Bournemouth wins the Premier League?,163988.361,"[""20043121189468219599437255198027106640631597...",0xd35868afa9257d08411f4c7d61601213fc3d9fa1ebbc...,0.001,2024-09-09T20:30:39.18419Z,2025-05-25T12:00:00Z
4,12483,Everton wins the Premier League?,Unknown,"[""28179090282093807467382718742019031223670663...",0xa5bfa29f27399fce767abbff2d875bb5ad7a236748fb...,0.001,2024-09-09T20:55:43.375953Z,2025-05-25T12:00:00Z
...,...,...,...,...,...,...,...,...
2036,18413,Will Michael Jackson be named in Epstein files?,Unknown,"[""13903852899791791768774287014862422511014893...",0xb893d2b5bfa03f1edf16f11b70ed49944884db162821...,0.001,2025-02-11T22:59:31.647Z,2025-06-30T12:00:00Z
2037,18413,Will Bill Gates be named in Epstein files?,3418.780812,"[""28582195134731523840207476474967643776971082...",0x6571a94807018a6a3eef609970519b98a401219ec508...,0.020,2025-02-11T22:55:56.803Z,2025-06-30T12:00:00Z
2038,18413,Will Bill Clinton be named in Epstein files?,1250.961112,"[""22729481157690156894947185070287650678670985...",0x5eea74e5f882260097cdf0cfc61af03865630560d294...,0.027,2025-02-21T19:30:35.123Z,2025-06-30T12:00:00Z
2039,18413,Will Hillary Clinton be named in Epstein files?,699.739526,"[""27564416029236591509072533596074763864489970...",0x357bea56adb8a468f2a2b3067257b7756225bfeeeb14...,0.020,2025-02-21T19:30:51.452Z,2025-06-30T12:00:00Z


## 提取yes和no的token id, spread, market id

In [6]:
import ast
import pandas as pd

def extract_tokens(token_str):
    try:
        tokens = ast.literal_eval(token_str)
        if isinstance(tokens, list):
            yes_token = str(tokens[0]) if len(tokens) > 0 and tokens[0] is not None else None
            no_token = str(tokens[1]) if len(tokens) > 1 and tokens[1] is not None else None
        else:
            yes_token, no_token = None, None
        return [yes_token, no_token]
    except Exception:
        return [None, None]

# 示例：假设 df_filtered_volume 已存在且包含 'TokenID' 列
# 拆分为两个新列，并确保原始的 TokenID 列也被转换为列表，每个元素均为字符串
df_filtered_volume[['yes_token_id', 'no_token_id']] = df_filtered_volume['TokenID'].apply(lambda x: pd.Series(extract_tokens(x)))
df_filtered_volume['TokenID'] = df_filtered_volume['TokenID'].apply(lambda x: extract_tokens(x))

df_filtered_volume

Unnamed: 0,id,question,volume24hrClob,TokenID,conditionId,spread,startDate,endDate,yes_token_id,no_token_id
0,12483,Manchester City wins the Premier League?,101861.994991,[886432187030415389214772115744021358680688493...,0xeba50b6a5e1a5a682c8aaf34152d4eb91e0410d59117...,0.001,2024-09-09T20:24:52.495018Z,2025-05-25T12:00:00Z,8864321870304153892147721157440213586806884930...,5202270048105592915861981052115390705541146660...
1,12483,Arsenal wins the Premier League?,39759.880513,[731954665981313881708217768807580037885125462...,0x320045be25e331375755d9126cbe20a319ace7c7d925...,0.001,2024-09-09T20:28:43.533604Z,2025-05-25T12:00:00Z,7319546659813138817082177688075800378851254626...,2380758099274361309614515288149550771276495124...
2,12483,Aston Villa wins the Premier League?,105332.26,[831564923139217758780878376984498371324154078...,0x951ead358ab84c4314b8a619593471d80c687cc02720...,0.001,2024-09-09T20:30:02.775002Z,2025-05-25T12:00:00Z,8315649231392177587808783769844983713241540781...,4124576035317661748767784573536443551908988807...
3,12483,Bournemouth wins the Premier League?,163988.361,[200431211894682195994372551980271066406315978...,0xd35868afa9257d08411f4c7d61601213fc3d9fa1ebbc...,0.001,2024-09-09T20:30:39.18419Z,2025-05-25T12:00:00Z,2004312118946821959943725519802710664063159781...,7530822654051987323923362314024017098865350041...
4,12483,Everton wins the Premier League?,Unknown,[281790902820938074673827187420190312236706635...,0xa5bfa29f27399fce767abbff2d875bb5ad7a236748fb...,0.001,2024-09-09T20:55:43.375953Z,2025-05-25T12:00:00Z,2817909028209380746738271874201903122367066354...,2156099967661373030229856509463824941534153655...
...,...,...,...,...,...,...,...,...,...,...
2036,18413,Will Michael Jackson be named in Epstein files?,Unknown,[139038528997917917687742870148624225110148933...,0xb893d2b5bfa03f1edf16f11b70ed49944884db162821...,0.001,2025-02-11T22:59:31.647Z,2025-06-30T12:00:00Z,1390385289979179176877428701486242251101489331...,1035874496761483131953793653384338010043667848...
2037,18413,Will Bill Gates be named in Epstein files?,3418.780812,[285821951347315238402074764749676437769710823...,0x6571a94807018a6a3eef609970519b98a401219ec508...,0.020,2025-02-11T22:55:56.803Z,2025-06-30T12:00:00Z,2858219513473152384020747647496764377697108234...,5897157442993849393395087591688594409892656551...
2038,18413,Will Bill Clinton be named in Epstein files?,1250.961112,[227294811576901568949471850702876506786709855...,0x5eea74e5f882260097cdf0cfc61af03865630560d294...,0.027,2025-02-21T19:30:35.123Z,2025-06-30T12:00:00Z,2272948115769015689494718507028765067867098558...,2134264467713910708871256164350781633886753836...
2039,18413,Will Hillary Clinton be named in Epstein files?,699.739526,[275644160292365915090725335960747638644899706...,0x357bea56adb8a468f2a2b3067257b7756225bfeeeb14...,0.020,2025-02-21T19:30:51.452Z,2025-06-30T12:00:00Z,2756441602923659150907253359607476386448997065...,8103151863883472820402324860543621067316146905...


## 设置threshold过滤事件

In [7]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def calculate_threshold(idx, row):
    volume24h = row['volume24hrClob']
    yes_token = row["yes_token_id"]
    threshold = None
    try:
        # 获取订单簿数据
        order_book_yes = client.get_order_book(yes_token)
        
        # 计算最佳买价和最佳卖价
        best_bid_yes = max(float(order.price) for order in order_book_yes.bids)
        best_ask_yes = min(float(order.price) for order in order_book_yes.asks)
        
        size_at_best_bid_price = next(float(order.size) for order in order_book_yes.bids if float(order.price) == best_bid_yes)
        size_at_best_ask_price = next(float(order.size) for order in order_book_yes.asks if float(order.price) == best_ask_yes)
        
        threshold = volume24h / (size_at_best_bid_price + size_at_best_ask_price)
    except Exception as e:
        pass

    # 返回时保留原始索引，方便后续排序或合并
    new_row = dict(row)
    new_row["threshold"] = threshold
    new_row["_idx"] = idx  # 添加原始索引
    
    return new_row

filtered_rows = []
with ThreadPoolExecutor(max_workers=30) as executor:

    # 提交任务时同时传入索引和值
    futures = [executor.submit(calculate_threshold, idx, row_data) for idx, row_data in df_filtered_volume.iterrows()]
    
    # as_completed(futures)：这是一个生成器，它会按照任务完成的顺序依次返回 Future 对象，而不一定按照原始提交顺序。
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing Order Books"):
        # print(future.result())
        filtered_rows.append(future.result())

# 将结果转换为 DataFrame
results_df = pd.DataFrame(filtered_rows)

# 根据原始索引排序
results_df.sort_values(by="_idx", inplace=True)



# 把 threshold 加入到 df_filtered_volume
df_filtered_volume["threshold"] = results_df["threshold"].values

df_filtered_volume.sort_values(by="threshold", ascending=False, inplace=True)

Processing Order Books: 100%|██████████| 2041/2041 [04:26<00:00,  7.65it/s]


In [9]:
# 转换日期列，无法解析的将变为 NaT
df_filtered_volume['startDate'] = pd.to_datetime(df_filtered_volume['startDate'], errors='coerce')
df_filtered_volume['endDate'] = pd.to_datetime(df_filtered_volume['endDate'], errors='coerce')


# 获取今天的日期（带utc时区）
today = pd.Timestamp.today(tz='UTC')

# 筛选符合 threshold、spread 和时间差大于7天条件的记录
df_top_10_threshold = df_filtered_volume[
    (df_filtered_volume['threshold'] >= df_filtered_volume['threshold'].quantile(0.95)) & # volume 前 24 h 最高的top percentile
    (df_filtered_volume['spread'] >= 0.03) & # spread 大于 。。。。
    ((df_filtered_volume['endDate'] - today) > pd.Timedelta(days=30)) # 距离事件结束大于n天
].sort_values(by='threshold', ascending=False).reset_index(drop=True)

# 构造 question 到 TokenID、conditionId 和 spread 的字典
question_to_tokenid = df_top_10_threshold.set_index("question")[["TokenID", "conditionId", "spread"]].to_dict("index")

question_to_tokenid

{'Will Russia recapture Sudzha by March 31?': {'TokenID': ['40001158490980829331315202760805197309678407198352511114569399283632572307422',
   '22752183918002848078277748029145662550669719932108767430775139318472934671798'],
  'conditionId': '0x515b15f2d8fb8a3af69e2dcb7283a3078914f3ee6429e8c8806017b39bef190b',
  'spread': 0.04},
 'Will George Simion win the most votes in the 1st round of the Romanian presidential election?': {'TokenID': ['88452921474581314067876584765704791390820283118652110520565872529003608690280',
   '14465463482473387434149752047335597794226118395834187653939186725766811829543'],
  'conditionId': '0x3a369bc286924ad5ec7b8a6c906ebd7e79739d8b97fc1bc0ca0f629d71963a94',
  'spread': 0.17}}