In [None]:
pip install py_clob_client

In [None]:
import os
#from dotenv import load_dotenv
import py_clob_client as clob
from py_clob_client.client import ClobClient
import urllib.parse
import time
import json
import numpy as np
import csv
from tqdm import tqdm
import matplotlib.pyplot as plt

# Setup

In [None]:
# Gamma API endpoint that lists markets.
gammaUrl = "https://gamma-api.polymarket.com/markets"
# Base URL of the CLOB REST API. Deliberately written WITHOUT a trailing slash:
# request URLs are assembled as host + "/<path>", so a trailing slash here
# would produce "https://clob.polymarket.com//<path>".
host = "https://clob.polymarket.com"

# Client created with only a host and the Polygon chain id (no key/credentials passed).
client = ClobClient(host, chain_id=clob.constants.POLYGON)

In [None]:
def build_query_string(base_url, params):
    """Return ``base_url`` with ``params`` appended as a URL query string.

    Args:
        base_url: URL to extend. If it already contains a ``?`` the new
            parameters are appended with ``&`` instead of a second ``?``.
        params: Mapping of parameter name to value. A list or tuple value is
            expanded into one ``key=value`` pair per element; every other value
            is converted with ``str()``.

    Returns:
        The combined URL as a string. Keys and values are both percent-encoded
        with ``urllib.parse.quote``. With an empty mapping the separator is
        still appended (``base_url + "?"``), as callers may rely on it.
    """
    pairs = []
    for key, value in params.items():
        # Keys must be escaped too: a raw "&", "=" or space in a key would
        # otherwise corrupt the query string.
        encoded_key = urllib.parse.quote(str(key))
        values = value if isinstance(value, (list, tuple)) else [value]
        for item in values:
            pairs.append(f"{encoded_key}={urllib.parse.quote(str(item))}")

    separator = "&" if "?" in base_url else "?"
    return base_url + separator + "&".join(pairs)

# Get all Markets

In [None]:
# Load the full market list from the local JSON cache when it exists and is
# non-empty; otherwise page through client.get_markets() and create the cache.
file_path = "./markets_data.json"
all_results = []
if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
    print("file exists")
    with open(file_path, "r", encoding="utf-8") as f:
        all_results = json.load(f)  # Load the JSON file into a Python list
    print(len(all_results))
else:
    print("file doesn't exist")
    end_cursor = "LTE="  # cursor value that terminates pagination
    next_cursor = ""  # Start from the beginning
    counter = 0
    while True:
        response = client.get_markets(next_cursor=next_cursor)
        all_results.extend(response["data"])  # Assuming data is inside "data" key
        counter += len(response["data"])
        print(counter)

        previous_cursor = next_cursor
        next_cursor = response.get("next_cursor", end_cursor)
        # Stop at the end marker. Also stop on an empty or non-advancing
        # cursor, which would otherwise re-request the same page forever.
        if next_cursor == end_cursor or not next_cursor or next_cursor == previous_cursor:
            break
        time.sleep(0.1)  # Avoid rate limits

    # Write to a temporary file and rename it into place, so an interrupted
    # dump never leaves a truncated cache that the next run would try to load.
    # The encoding matches the one used when reading the cache back.
    tmp_path = file_path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=4)
    os.replace(tmp_path, file_path)

# Filter Markets

In [None]:
# Narrow the downloaded markets down to closed two-token markets that have
# exactly one winning and one losing token, printing the count after each step.
all_markets = all_results.copy()

# Diagnostic only (nothing is removed here): markets whose condition_id is
# missing, None or empty. The previous test `x == "" or None` only ever
# matched the empty string because `or None` bound to the whole comparison.
noConditionId = [m for m in all_markets if not m.get("condition_id")]
print(f"No ConditionId: {len(noConditionId)}")

print(f"All markets: {len(all_markets)}")

all_markets = [m for m in all_markets if m["closed"] == True]
print(f"Closed: {len(all_markets)}")

# Keep markets where every token has a usable id (missing/None/"" all count as absent).
all_markets = [m for m in all_markets if all(t.get("token_id") for t in m.get("tokens", []))]
print(f"With existing tokens: {len(all_markets)}")
all_markets = [m for m in all_markets if len(m.get("tokens", [])) == 2]
print(f"With 2 tokens: {len(all_markets)}")

# "winner" is compared explicitly against True / False below because a token
# may carry neither value (e.g. None or no key), which must count as neither.
doubleTrue = [m for m in all_markets if all(t.get("winner") == True for t in m.get("tokens", []))]
print(f"Both options true: {len(doubleTrue)}")
#print(json.dumps(doubleTrue))

doubleFalse = [m for m in all_markets if all(t.get("winner") == False for t in m.get("tokens", []))]
print(f"Both options false: {len(doubleFalse)}")

all_markets = [m for m in all_markets if sum(t.get("winner") == True for t in m.get("tokens", [])) == 1]
print(f"With one winner: {len(all_markets)}")

all_markets = [m for m in all_markets if sum(t.get("winner") == False for t in m.get("tokens", [])) == 1]
print(f"With one loser: {len(all_markets)}")

In [None]:
from collections import Counter

# Outcome label of the first token of every market in all_markets that has
# a non-empty token list.
markets_with_tokens = filter(lambda m: m.get("tokens"), all_markets)
first_outcomes = [m["tokens"][0]["outcome"] for m in markets_with_tokens]

# Tally of how often each label occurs in first position.
counts = Counter()
counts.update(first_outcomes)

# One "label: count" line per distinct label, in first-seen order.
for label in counts:
    print(f"{label}: {counts[label]}")

# Get Prices

In [None]:
def _fetch_price_history(tokenId, fidelity):
    """Request the price history of one token at the given fidelity (minutes)."""
    params = {
        "market": tokenId,  # the CLOB token id for which to fetch price history
        "startTs": 1,  # the start time, a unix timestamp in UTC
        "fidelity": fidelity,  # the resolution of the data, in minutes
    }
    # Normalise host so the path is joined with exactly one slash, whether or
    # not host was configured with a trailing "/".
    url = build_query_string(host.rstrip("/") + "/prices-history", params)
    response = clob.http_helpers.helpers.get(url)
    time.sleep(0.205)  # Respect rate limits
    # A response without a "history" entry is treated as "no data".
    return response.get("history") or []


def GetHistoricalPrice(tokenId):
    """Fetch the price history of a token, aiming for at least 100 points.

    A first request uses a 10-minute fidelity. If it returns fewer than 100
    points, the fidelity that would spread about 100 samples over the observed
    time span is computed and, when it is finer than 10 minutes, the history
    is requested once more at that resolution.

    Args:
        tokenId: CLOB token id to query.

    Returns:
        A list of history points (dicts with a timestamp under "t"; the
        timestamps are treated as seconds), or an empty list when the API
        returned no history.
    """
    fidelity = 10  # Start with default 10-minute fidelity
    target_points = 100

    history = _fetch_price_history(tokenId, fidelity)
    if not history or len(history) >= target_points:
        return history

    timestamps = [point["t"] for point in history]
    # Convert timespan from seconds to minutes for fidelity calculation
    timespan_minutes = (max(timestamps) - min(timestamps)) / 60
    ideal_fidelity = max(int(timespan_minutes / target_points), 1)

    if ideal_fidelity >= fidelity:
        # Sampling at the same or a coarser interval cannot yield more points
        # than we already have, so skip the extra request.
        return history

    finer_history = _fetch_price_history(tokenId, ideal_fidelity)
    # Keep the first result if the finer request unexpectedly comes back empty.
    return finer_history or history

In [None]:
def resample(prices, num_points=100):
    """Resample a price history onto evenly spaced timestamps.

    Each output sample takes the most recent price at or before its timestamp
    (step-wise "last known value" sampling, no interpolation).

    Args:
        prices: Sequence of dicts with a timestamp under "t" and a price under
            "p"; it does not need to be sorted.
        num_points: Number of evenly spaced samples between the first and the
            last timestamp. Defaults to 100.

    Returns:
        A numpy array of ``num_points`` prices, or an empty array when
        ``prices`` is empty.
    """
    if len(prices) == 0:
        # Nothing to sample from; avoids indexing into empty arrays below.
        return np.array([], dtype=float)

    # sort ascending for timestamp
    ordered = sorted(prices, key=lambda p: p["t"])

    timestamps = np.array([p["t"] for p in ordered])
    price_values = np.array([p["p"] for p in ordered])

    # Evenly spaced timestamps between first and last observation
    even_timestamps = np.linspace(timestamps[0], timestamps[-1], num_points)

    # Index of the last observation whose timestamp is <= each sample time
    indices = np.searchsorted(timestamps, even_timestamps, side="right") - 1
    # Guard against an index of -1 should a sample fall before the first observation
    indices = np.clip(indices, 0, len(ordered) - 1)

    return price_values[indices]

In [None]:
# Build / extend processed_markets.csv with one row per market: metadata plus
# 100 resampled prices of the market's first token. The cell is resumable:
# markets whose condition_id is already in the CSV are skipped.
filename = "processed_markets.csv"
fieldnames = [
    "condition_id", "market_slug", "tags", "start", "end", "yes_token_id", "winner_token",
    *[f"price_{i}" for i in range(100)]
]

# Condition ids written by earlier runs (empty when the file does not exist yet).
existing_condition_ids = set()
if os.path.exists(filename):
    with open(filename, "r", newline="", encoding="utf-8") as csvfile:
        for row in csv.DictReader(csvfile):
            existing_condition_ids.add(row['condition_id'])

print(f"existing: {len(existing_condition_ids)}")
noPriceFoundMarkets = []
invalidPricesFoundMarkets = []
writtenToFile = 0
pricesLength = []

# Markets still to fetch: not in the CSV yet and, if a condition_id occurs
# more than once in all_markets, only its first occurrence. This prevents
# duplicate rows and duplicate API requests within a single run.
pending_markets = []
seen_condition_ids = set(existing_condition_ids)
for market_data in all_markets:
    condition_id = str(market_data["condition_id"])  # Convert to string for comparison
    if condition_id in seen_condition_ids:
        continue
    seen_condition_ids.add(condition_id)
    pending_markets.append(market_data)

with open(filename, "a", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Only write the header if the file is empty
    csvfile.seek(0, 2)
    if csvfile.tell() == 0:
        writer.writeheader()

    # The bar starts at the number of markets of this run that are skipped,
    # so it reaches exactly 100% once every pending market was handled.
    with tqdm(total=len(all_markets), desc="Processing markets",
              initial=len(all_markets) - len(pending_markets)) as progress_bar:
        for market_data in pending_markets:
            condition_id = str(market_data["condition_id"])
            tokens = market_data["tokens"]

            # The first token is the one whose prices are recorded.
            yes_token_id = tokens[0]["token_id"]
            winner_token = tokens[0]["winner"]
            market_slug = market_data["market_slug"]
            tags = str(market_data.get("tags", []) or [])

            price_history = GetHistoricalPrice(yes_token_id)
            if not price_history:
                noPriceFoundMarkets.append(market_data)
                progress_bar.update(1)
                continue

            resampled_prices = resample(price_history)

            # Prices outside [0, 1] are treated as invalid; the market is not written.
            if any(price < 0 or price > 1 for price in resampled_prices):
                invalidPricesFoundMarkets.append(market_data)
                progress_bar.update(1)
                continue

            pricesLength.append(len(price_history))

            timestamps = [entry["t"] for entry in price_history]

            # Prepare market data to write to CSV
            processed_market = {
                "condition_id": condition_id,
                "market_slug": market_slug,
                "tags": tags,
                "start": min(timestamps),
                "end": max(timestamps),
                "yes_token_id": yes_token_id,
                "winner_token": winner_token,
            }

            # Add the price columns: price_0, price_1, ..., price_99
            for i, price in enumerate(resampled_prices):
                processed_market[f"price_{i}"] = price

            writer.writerow(processed_market)
            # Flush each row so an interrupted run keeps everything processed
            # so far; the cost is negligible next to the per-request sleeps.
            csvfile.flush()
            existing_condition_ids.add(condition_id)
            writtenToFile += 1

            progress_bar.update(1)
print(f"Wrote {writtenToFile} markets to file")
print(f"Markets with no prices: {len(noPriceFoundMarkets)}")
print(f"Markets with invalid prices: {len(invalidPricesFoundMarkets)}")

In [None]:
# Histogram of the values collected in pricesLength, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(pricesLength, bins='auto', edgecolor='black')  # 'auto' picks the bin count

# Axis labels and title
ax.set_xlabel('Length of Price Lists')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Price List Lengths')

# Render the figure
plt.show()