In [214]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from tqdm import tqdm

## Load motions from calico tabs

In [215]:
motions_suburl = f"motions/statistics"

calico_tabs = [
    ('nsdc25', 'NordicSDC2025'),
    ('eurosdc25', 'eurosdc-25'),
    # ('esdc', 'esdc2025'),
    ('argument', 'eco2025'),
    ('argument', 'ECO2024'),
    ('nordics2023', 'nordics2023')
]

In [216]:
def parse_motion(div): 
    """
    Given a motion top-level div, parse out the relevant details.
    """
    round = div.find('span', class_="badge").get_text(strip=True)
    motion = div.find('h4').find(string=True, recursive=False).strip()

    info_slide = div.find(string=re.compile("View Info Slide"))
    info_slide_text = None

    if info_slide is not None:
        info_slide_text = info_slide.find_next('div', class_='modal-body').get_text(strip=True)

    # regex match the wins for prop and opp
    text = div.get_text()

    re_prop = re.compile(r"(\d+) (Prop|Gov) win")
    re_opp = re.compile(r"(\d+) Opp win")

    prop_wins = re.findall(re_prop, text)[0][0]
    opp_wins = re.findall(re_opp, text)[0][0]

    balance = div.find(string=re.compile('balance')).get_text(strip=True)

    return motion, round, info_slide_text, prop_wins, opp_wins, balance

In [217]:
data = {
    "Tournament": [],
    "Motion": [],
    "Round": [],
    "Info Slide": [],

    "Prop wins": [],
    "Opp wins": [],
    "Balance": [],
}

for subdomain, slug in calico_tabs:
    calico_url = f"https://{subdomain}.calicotab.com/{slug}"
    motions_url = f"{calico_url}/{motions_suburl}"

    response = requests.get(motions_url) 
    try: 
        assert response.status_code == 200, f"Failed to fetch {motions_url}: {response.status_code}"
    except: 
        continue

    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    motion_divs = soup.find_all('div', class_="list-group mt-3")
    print(f"Found {len(motion_divs)} motions from {slug}.")

    for start in tqdm(range(len(motion_divs))):
        div = motion_divs[start]
        motion, round, info_slide_text, prop_wins, opp_wins, balance = parse_motion(div)
        

        data["Motion"].append(motion)
        data["Round"].append(round)
        data["Info Slide"].append(info_slide_text)
        data["Prop wins"].append(int(prop_wins))
        data["Opp wins"].append(int(opp_wins))
        data["Balance"].append(balance)
        data["Tournament"].append(slug)

Found 11 motions from NordicSDC2025.


100%|██████████| 11/11 [00:00<00:00, 4057.46it/s]


Found 10 motions from eurosdc-25.


100%|██████████| 10/10 [00:00<00:00, 4921.16it/s]


Found 10 motions from eco2025.


100%|██████████| 10/10 [00:00<00:00, 6632.36it/s]


Found 10 motions from ECO2024.


100%|██████████| 10/10 [00:00<00:00, 8628.48it/s]


Found 9 motions from nordics2023.


100%|██████████| 9/9 [00:00<00:00, 3824.59it/s]


In [218]:
df1 = pd.DataFrame(data)
df1.tail()

Unnamed: 0,Tournament,Motion,Round,Info Slide,Prop wins,Opp wins,Balance
45,nordics2023,THBT the European Union should abolish the req...,Round 5,On issues considered sensitive to the EU (incl...,6,7,probably balanced
46,nordics2023,THR the expectation that committed romantic re...,Round 6,"Just like a lot of you, we were very tired and...",10,3,imbalanced at 10% level
47,nordics2023,THR the aesthetisation of suffering,Quarterfinals,Aesthetisation is a depiction or glorification...,4,0,balance inconclusive
48,nordics2023,THBT criminal justice policy should be decided...,Semifinals,Technocrats are decision-makers who are select...,1,1,balance inconclusive
49,nordics2023,THR religious faith being predominantly experi...,Grand Final,,1,0,balance inconclusive


## Parse WSDC motions from .txt on GitHub
https://github.com/tokyodebate/motions/blob/main/International/WSDC.txt

In [219]:
import requests

url = 'https://raw.githubusercontent.com/tokyodebate/motions/refs/heads/main/International/WSDC.txt'

response = requests.get(url)
wsdc_motions_raw = response.text

In [220]:
def count_tabs(line: str):
    """
    Count the number of tabs in a line.
    """
    return line.count('\t')

In [221]:
data2 = {
    "Tournament": [],
    "Motion": [],
    "Round": [],
    "Info Slide": [],

    # "Prop wins": [],
    # "Opp wins": [],
    # "Balance": [],
}

def parse_wsdc_motions():
    lines = wsdc_motions_raw.split('\n')
    lines = [line.strip('\n') for line in lines]
    lines = lines[1:] # Skip the header

    tournament = None
    round = None

    for i in range(len(lines)):
        curr = lines[i]

        if count_tabs(curr) == 1: 
            tournament = curr.strip()
        elif count_tabs(curr) == 2:
            round = curr.strip()
        elif count_tabs(curr) == 3:
            next_line = lines[i+1] if i+1 < len(lines) else None
            info_text = next_line.strip() if next_line and count_tabs(next_line) == 4 else None

            motion = curr.strip()   
            data2["Motion"].append(motion)
            data2["Round"].append(round)
            data2["Info Slide"].append(info_text)
            data2["Tournament"].append(tournament)

        else:
            continue

parse_wsdc_motions()

In [222]:
df2 = pd.DataFrame(data2)
df2.value_counts('Tournament')

Tournament
Netherlands WSDC 2022                                                           21
WSDC Mexico 2020                                                                20
24th World Schools Debating Championships 2012, South Africa                    13
21st World Schools Debating Championships 2009 in Athens, Greece                13
Vietnam WSDC 2023                                                               13
30th World Schools Debating Championships 2018, Croatia and Slovenia            13
29th World Schools Debating Championships 2017, Indonesia                       13
28th World Schools Debating Championships 2016, Germany                         13
27th World Schools Debating Championships 2015, Singapore                       13
WSDC Thailand 2019                                                              13
16th World Schools Debating Championships 2004 in Stuttgart, Germany            13
15th World Schools Debating Championships 2003 in Lima, Peru                

In [223]:
df = pd.concat([df1, df2], ignore_index=True)
df.head()

Unnamed: 0,Tournament,Motion,Round,Info Slide,Prop wins,Opp wins,Balance
0,NordicSDC2025,This House Supports gentle parenting becoming ...,Round 1,Gentle parenting is a parenting style that say...,16.0,1.0,imbalanced at 50% level
1,NordicSDC2025,THR the creation of the series and film industry,Round 2,,13.0,1.0,probably balanced
2,NordicSDC2025,This House Believes That it is in the interest...,Round 3,Democratic backsliding is a process of regime ...,15.0,1.0,probably balanced
3,NordicSDC2025,This House Opposes the Globalization of the Ma...,Round 4,“Major European Football Leagues” include the ...,10.0,1.0,imbalanced at 50% level
4,NordicSDC2025,This House Would implement a weekly 'blackout ...,Round 5,A 'blackout day' refers to a day where all soc...,13.0,1.0,probably balanced


## Data polishing 

In [224]:
df['Motion'] = df['Motion'].astype("string")

### Duplicates

In [225]:
# check for duplicates on motion and round 
# motions can be duplicated (if same is used in two different rounds)
duplicates = df[df.duplicated(subset=['Motion', 'Round'], keep=False)]

if len(duplicates) > 0:
    raise ValueError(f"Found {len(duplicates)} duplicates in the data. Please check the data for inconsistencies.")

### Tournament year

In [226]:
def get_year(tournament: str):
    pattern = re.compile(r'\d{4}')
    match = pattern.search(tournament)

    if tournament == 'eurosdc-25':
        return 2025

    if match:
        return int(match.group(0))
    else:
        return None

df['Year'] = df['Tournament'].apply(get_year)
df.value_counts('Year')

Year
2025    31
2023    22
2022    21
2020    20
2012    13
2019    13
2018    13
2003    13
2004    13
2005    13
2017    13
2007    13
2016    13
2009    13
2015    13
2013    12
2011    12
2010    12
2008    12
2006    12
2002    12
2001    12
1999    12
2014    11
2021    11
2024    10
1994     9
1997     5
1998     5
2000     4
Name: count, dtype: int64

### Motion types

In [227]:
def fix_motions(motion: str):
    if motion == "This House as the environmental movement would support the use of extremist tactics":
        return "This House, as the environmental movement, would support the use of extremist tactics."
    
    if motion == "TH, as the average 25 year old Romanian, would wait for their 'statistical soulmate'(josh is sad and lonely)":
        return motion.replace('would', "W")

    if motion.startswith('That'):
        motion = 'This House believes that' + motion[4:]
    
    return motion

df['Motion'] = df['Motion'].apply(fix_motions)

In [228]:
def extract_motion_header(motion: str): 
    # https://regex101.com/r/XB1k1w/1
    pattern = re.compile(r"(?:This (h|H)ouse?|TH)[ ,]?(?:,.*?, )?(?P<type>\w+)\s?")

    match = re.search(pattern, motion)
    return match

In [229]:
full_to_abbr = {
    'BT': "believes",
    'B': "believes",
    "S": "supports",
    "O": "opposes",
    "W": "would",
    "P": "prefers",
    "R": "regrets",
}


def get_motion_type(motion: str): 
    match = extract_motion_header(motion)

    if match: 
        motion_type = match.group('type')
        all_caps = all(map(str.isupper, motion_type))

        if all_caps:
            return f'{full_to_abbr[motion_type]}'
        else: 
            return f'{motion_type.lower()}'
    else:
        raise ValueError(f"Could not determine motion type for: {motion}")

df["Motion Type"] = df["Motion"].apply(get_motion_type)

## Vectorization
NB! This step requires access to a Google Cloud project with billing enabled. If desired, you can sign up for a $300 free trial that will be plenty to cover the costs of vectorization.

Alternatively, all vector space information is already included in the attached `motions.csv` file.

In [157]:
%pip install --upgrade -q google-genai vertexai dotenv

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [158]:
# only run this once
# !gcloud auth application-default login

In [230]:
import vertexai
from vertexai.language_models import TextEmbeddingModel
from dotenv import load_dotenv
import os

load_dotenv()
vertexai.init(project=os.environ['VERTEXAI_PROJECT_ID'])

In [235]:
def clean_motion(motion: str):
    match = extract_motion_header(motion)

    if match is None:
        print(f'No header found for motion: {motion}')
        return motion

    header_end = match.span()[1]
    chopped = motion[header_end:].lower()

    filler_words = ['that', 'the', 'it', 'is']

    flag = True
    while flag: 
        flag = False

        for w in filler_words: 
            if chopped.startswith(w): 
                chopped = chopped.replace(w, '')
                flag=True
    
    punctuations = ['.', ',' ';']
    for p in punctuations: 
        chopped = chopped.replace(p, '')

    return chopped


In [236]:
df['Motion (cleaned)'] = df['Motion'].map(clean_motion)
df.head()

Unnamed: 0,Tournament,Motion,Round,Info Slide,Prop wins,Opp wins,Balance,Year,Motion Type,Motion (cleaned)
0,NordicSDC2025,This House Supports gentle parenting becoming ...,Round 1,Gentle parenting is a parenting style that say...,16.0,1.0,imbalanced at 50% level,2025,supports,gentle parenting becoming the norm
1,NordicSDC2025,THR the creation of the series and film industry,Round 2,,13.0,1.0,probably balanced,2025,regrets,creation of series and film industry
2,NordicSDC2025,This House Believes That it is in the interest...,Round 3,Democratic backsliding is a process of regime ...,15.0,1.0,probably balanced,2025,believes,it is in the interest of georgian government ...
3,NordicSDC2025,This House Opposes the Globalization of the Ma...,Round 4,“Major European Football Leagues” include the ...,10.0,1.0,imbalanced at 50% level,2025,opposes,globalization of major european football lea...
4,NordicSDC2025,This House Would implement a weekly 'blackout ...,Round 5,A 'blackout day' refers to a day where all soc...,13.0,1.0,probably balanced,2025,would,implement a weekly 'blackout day'


In [237]:
from typing import Optional


def embed(texts, output_dims: Optional[int] = None):
  model_id = 'text-embedding-005'
  # model_id = 'gemini-embedding-001'
  model = TextEmbeddingModel.from_pretrained(model_id)
  embeddings = model.get_embeddings(texts, output_dimensionality=output_dims)

  return embeddings

In [243]:
# vertexai allows a max of 250 input elements at once
chunk_size = 100

for start in tqdm(range(0, len(df), chunk_size)):
    end = start + chunk_size
    motion_strings = df['Motion (cleaned)'].values[start:end]

    embeddings = embed(list(motion_strings), 20)
    df.loc[start:end-1, 'Embedding'] = embeddings #NB! pandas uses an inclusive end index (hence the end-1)

    # if start > 100:
    #     break

100%|██████████| 4/4 [00:08<00:00,  2.14s/it]


In [244]:
#FIXME: the beginning is cut-off
df.loc[89]

Tournament                                     WSDC Macau Online 2021
Motion              In the event of a Chinese invasion of of Taiwa...
Round                                               SF: Invade Taiwan
Info Slide                                                       None
Prop wins                                                         NaN
Opp wins                                                          NaN
Balance                                                           NaN
Year                                                             2021
Motion Type                                                     would
Motion (cleaned)                                 intervene militarily
Embedding           TextEmbedding(values=[-0.041190557181835175, -...
Name: 89, dtype: object

## Save data locally

In [245]:
#TODO: maybe better to store each value in its own column? or just use a diff file format specifically for floats
from vertexai.language_models import TextEmbedding

df['Embedding'] = df['Embedding'].map(lambda e: e.values if isinstance(e, TextEmbedding) else None) #get only the numbers

In [246]:
import os

data_dir = 'data'

os.makedirs(data_dir, exist_ok=True)
df.to_csv(os.path.join(data_dir, 'motions_d20.csv'))