In [2]:
import pandas as pd
import plotly.express as px
from datetime import datetime
import os

## Load data 

In [628]:
data_dir = 'data'
motions_data_path = os.path.join(data_dir, 'motions_d20.csv')

if not os.path.exists(motions_data_path): 
    raise FileNotFoundError('Local data file not found. Please run the data collection notebook first.')

df = pd.read_csv(motions_data_path, index_col=0)
df.head()

Unnamed: 0,Tournament,Motion,Round,Info Slide,Prop wins,Opp wins,Balance,Year,Motion Type,Motion (cleaned),Embedding
0,NordicSDC2025,This House Supports gentle parenting becoming the norm,Round 1,"Gentle parenting is a parenting style that says that parents should be non-confrontational. The style states that parents should focus on deliberation with (young) children, have high levels of patience and avoid punishments and ultimatums.",16.0,1.0,imbalanced at 50% level,2025,supports,gentle parenting becoming the norm,"[-0.07073014974594116, -0.017622996121644974, 0.023448124527931213, -0.010209158062934875, 0.006813352461904287, 0.039929646998643875, 0.02401009015738964, 0.007765816058963537, -0.023403815925121307, -0.008534936234354973, 0.008615216240286827, -0.021525176241993904, -0.004473365843296051, -0.01983877643942833, -0.03936992958188057, 0.044233355671167374, -0.007928065024316311, -0.09341483563184738, 0.006272569764405489, -0.023473497480154037]"
1,NordicSDC2025,THR the creation of the series and film industry,Round 2,,13.0,1.0,probably balanced,2025,regrets,creation of series and film industry,"[0.046621181070804596, -0.0058633433654904366, 0.008297571912407875, -0.06645385175943375, -0.03304266184568405, 0.00014940612891223282, 0.0053434525616467, 0.02122129127383232, -0.005419842898845673, -0.009445874020457268, -0.03535092622041702, -0.009264633990824223, 0.033422693610191345, -0.03075386956334114, 0.016625789925456047, 0.04602214694023132, 0.03123985044658184, -0.030950099229812622, -0.008966410532593727, 0.04880686104297638]"
2,NordicSDC2025,This House Believes That it is in the interest of Georgian government to democratically backslide,Round 3,"Democratic backsliding is a process of regime change toward autocracy in which the votes of individuals have less power and political power becomes repressive. Examples include limiting free speech/press, corruption, nepotism or gaining control over state institutions.",15.0,1.0,probably balanced,2025,believes,it is in the interest of georgian government to democratically backslide,"[-0.012221300974488258, -0.029059894382953644, -0.0028698991518467665, 0.016281524673104286, 0.057276975363492966, 0.009461416862905025, 0.008052892051637173, -0.05019576475024223, 0.004408178851008415, 0.0186170544475317, -0.04507594555616379, -0.026625478640198708, 0.05502500385046005, -0.007176734507083893, -0.0031587223056703806, -0.021722041070461273, 0.07130083441734314, 0.023778816685080528, -0.0006534297135658562, 0.05382782220840454]"
3,NordicSDC2025,This House Opposes the Globalization of the Major European Football Leagues,Round 4,"“Major European Football Leagues” include the English Premier League, the German Bundesliga, the Italian Seria A, the French Ligue 1 and the Spanish La Liga.For the purpose of this debate, globalization of football leagues refers to increasing the number of international players, creating ownership franchises and actively promoting the league in foreign countries.",10.0,1.0,imbalanced at 50% level,2025,opposes,globalization of major european football leagues,"[0.04815811291337013, 0.001889492617920041, 0.03153463080525398, -0.016297733411192894, 0.016674082726240158, 0.02057991735637188, -0.02087770774960518, 0.04858285188674927, 0.022735200822353363, 0.0076142181642353535, 0.00871062558144331, -0.008508776314556599, 0.0246577151119709, 0.01915111020207405, -0.021667951717972755, 0.03133944422006607, 0.014839502982795238, -0.021487513557076454, -0.005065857898443937, 0.004158458206802607]"
4,NordicSDC2025,This House Would implement a weekly 'blackout day',Round 5,"A 'blackout day' refers to a day where all social media sites, streaming services, TV, internet, radio, and similar technologies are inaccessible for personal, non-emergency use. For example, one would still be able to call hospitals and banks but would not be able to watch downloaded Netflix series",13.0,1.0,probably balanced,2025,would,implement a weekly 'blackout day',"[-0.058066967874765396, -0.016530131921172142, -0.0174716804176569, 0.012600692920386791, 0.018199829384684563, -0.03894367814064026, -0.026138579472899437, -0.004512433893978596, -0.0009739733650349081, -0.006850811652839184, -0.04922435060143471, -0.07136769592761993, -0.019539140164852142, -0.044131238013505936, 0.019940920174121857, 0.029648175463080406, 0.048657748848199844, -0.06078634411096573, -0.014944484457373619, -0.014256168156862259]"


### Parse the embedding field as float[]

In [629]:
import json
df['Embedding'] = df['Embedding'].map(json.loads)

## Analysis

### All-time motion types

In [630]:
counts = df.value_counts('Motion Type')
print(counts)

px.bar(
    counts, 
    y='count', 
    title='Motion Types',
    labels={'Motion Type': 'Motion Type'},
).show()

Motion Type
believes       174
would          136
supports        26
regrets         19
prefers         18
opposes         10
condemns         2
disapproves      1
fears            1
refuses          1
Name: count, dtype: int64


### This year's motion types

In [631]:
current_year = datetime.now().year

df_this_year = df[df['Year'] == current_year]
df_this_year.tail()

Unnamed: 0,Tournament,Motion,Round,Info Slide,Prop wins,Opp wins,Balance,Year,Motion Type,Motion (cleaned),Embedding
26,eco2025,THS the development of genetic engineering,Round 6,"Genetic engineering (also called genetic modification) is a process that uses laboratory-based technologies to alter the DNA makeup of an organism. This may involve changing a single base pair, deleting a region of DNA or adding a new segment of DNA. For example, genetic engineering may involve adding a gene from one species to an organism from a different species to produce a desired trait. Used in research and industry, genetic engineering has been applied to the production of cancer therapies, brewing yeasts, genetically modified plants and livestock, and more. Currently, genetic engineering is being worked on by scientists in order to be applied on humans in the womb.",10.0,5.0,imbalanced at 50% level,2025,supports,development of genetic engineering,"[-0.0051087564788758755, -0.012383205816149712, 0.07001379132270813, -0.048676300793886185, 0.013805512338876724, 0.017069922760128975, -0.05945022031664848, 0.03229012340307236, 0.016017422080039978, 0.034026212990283966, -0.016650844365358353, -0.06639625877141953, -0.00690110120922327, -0.0016901224153116345, 0.020256318151950836, 0.012823861092329025, -0.004870959557592869, -0.011242911219596863, 0.021194642409682274, -0.02345607988536358]"
27,eco2025,THP a world in which all people have chronic Pinocchiosis,Novice Grand Final,"Chronic Pinocchiosis is a condition that causes an individual’s nose to horizontally grow by a millimetre every time they knowingly lie. Once it grows, it does not shrink back, and it is magically immune to rhinoplasty or any other attempt at artificial shrinkage. Long noses do not cause any significant health issues and nose growth caused by Chronic Pinocchiosis is not hereditary.",1.0,0.0,balance inconclusive,2025,prefers,a world in which all people have chronic pinocchiosis,"[-0.03956710919737816, -0.011850863695144653, 0.05029597505927086, -0.005007102154195309, -0.0010839354945346713, 0.04243796318769455, 0.02984938956797123, -0.059253714978694916, -0.03023284487426281, -0.005569561384618282, -0.03202180936932564, -0.03771503269672394, 0.030181247740983963, 0.006252242252230644, -0.02372823841869831, 0.0013990127481520176, 0.08946572989225388, -0.03871817886829376, -0.03875712305307388, -0.01736418344080448]"
28,eco2025,"This House, as the environmental movement, would support the use of extremist tactics.",Quarterfinals,<div>Extremist tactics includes the destruction of property and large-scale civil disobedience i.e blocking roads and buildings</div><div><br></div>,0.0,4.0,balance inconclusive,2025,would,support the use of extremist tactics,"[-0.1085641011595726, 0.003305028425529599, 0.009076996706426144, 0.009243869222700596, 0.024098727852106094, -0.01934395730495453, -0.010543419048190117, 0.005745511502027512, 0.002622008789330721, -0.00637624179944396, -0.05909115821123123, 0.000886576424818486, -0.029527511447668076, -0.01871384307742119, 0.0033700289204716682, -0.0006114280549809337, 0.05873996764421463, -0.04958033934235573, 0.017453866079449654, -0.003998314496129751]"
29,eco2025,"THBT the heads of central banks (e.g. the Federal Reserve, the European Central Bank) should be democratically elected",Semifinals,,0.0,2.0,balance inconclusive,2025,believes,"heads of central banks (eg federal reserve, european central bank) should be democratically elected","[-0.014936745166778564, 0.030797796323895454, 0.03431565687060356, -0.001864211750216782, 0.040309611707925797, 0.056442636996507645, -0.04370389133691788, 0.054735761135816574, -0.004131869412958622, 0.013895363546907902, -0.03360489010810852, -0.07527333498001099, 0.003735985141247511, 0.013430689461529255, 0.008548382669687271, -0.008154243230819702, 0.057836730033159256, -0.01164541020989418, 0.0018745653796941042, 0.03999968245625496]"
30,eco2025,THW ban pet ownership,Grand Final,,1.0,0.0,balance inconclusive,2025,would,ban pet ownership,"[-0.021138371899724007, 0.004608393646776676, 0.06737670302391052, -0.028685418888926506, 0.022726871073246002, -0.00576575705781579, 0.006046940106898546, 0.003200804116204381, -0.03038228675723076, 0.033630549907684326, -0.0052011036314070225, -0.04331322759389877, -0.09585375338792801, -0.011081529781222343, -0.034504909068346024, 0.023286692798137665, 0.026898939162492752, -0.014448635280132294, 0.02899855375289917, 0.006256431341171265]"


In [632]:
counts = df_this_year.value_counts('Motion Type')

px.bar(
    counts, 
    y='count', 
    title=f'Motion Types for {current_year}',
    labels={'Motion Type': 'Motion Type'},
).show()

### Semantic analysis

In [633]:
%pip install -q scikit-learn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [634]:
# spread embeddings across multiple columns

df_embeddings = pd.DataFrame(df['Embedding'].to_list())
df_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.07073,-0.017623,0.023448,-0.010209,0.006813,0.03993,0.02401,0.007766,-0.023404,-0.008535,0.008615,-0.021525,-0.004473,-0.019839,-0.03937,0.044233,-0.007928,-0.093415,0.006273,-0.023473
1,0.046621,-0.005863,0.008298,-0.066454,-0.033043,0.000149,0.005343,0.021221,-0.00542,-0.009446,-0.035351,-0.009265,0.033423,-0.030754,0.016626,0.046022,0.03124,-0.03095,-0.008966,0.048807
2,-0.012221,-0.02906,-0.00287,0.016282,0.057277,0.009461,0.008053,-0.050196,0.004408,0.018617,-0.045076,-0.026625,0.055025,-0.007177,-0.003159,-0.021722,0.071301,0.023779,-0.000653,0.053828
3,0.048158,0.001889,0.031535,-0.016298,0.016674,0.02058,-0.020878,0.048583,0.022735,0.007614,0.008711,-0.008509,0.024658,0.019151,-0.021668,0.031339,0.01484,-0.021488,-0.005066,0.004158
4,-0.058067,-0.01653,-0.017472,0.012601,0.0182,-0.038944,-0.026139,-0.004512,-0.000974,-0.006851,-0.049224,-0.071368,-0.019539,-0.044131,0.019941,0.029648,0.048658,-0.060786,-0.014944,-0.014256


### Standardize features

In [637]:
def check_distr(df: pd.DataFrame):
    mse_mean = ((df.mean() - 0) ** 2).mean()
    mse_std = ((df.std() - 1) ** 2).mean()

    print(f"Mean: {df.values.mean()}, mse: {mse_mean}")
    print(f"St.d.: {df.values.std()}, mse: {mse_std}")

check_distr(df_embeddings)

Mean: -1.6023837468816693e-18, mse: 1.962143731799393e-33
St.d.: 1.0, mse: 1.6670841314218946e-06


In [638]:
from sklearn.preprocessing import StandardScaler

scaled = StandardScaler().fit_transform(df_embeddings)
df_embeddings = pd.DataFrame(scaled)

check_distr(df_embeddings)

Mean: 1.6023837468816693e-18, mse: 2.297770133246185e-34
St.d.: 1.0, mse: 1.6670841314220378e-06


### KMeans

In [639]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans

silhouettes = []

def find_best_n_clusters(start: int = 2, end: int = len(df_embeddings)):
    best_n = dict(
        n=0.0,
        score=-1.0
    )

    for n_clusters in range(start, end):
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        kmeans.fit(df_embeddings)

        df_kmeans = pd.DataFrame(kmeans.labels_, columns=['centroid'])

        silhouette_avg = float(silhouette_score(df_embeddings, df_kmeans.values.ravel()))
        silhouettes.append(silhouette_avg)

        if silhouette_avg > best_n['score']:
            best_n['score'] = silhouette_avg
            best_n['n'] = n_clusters
    
    return best_n

best_n_clusters = find_best_n_clusters()


Number of distinct clusters (378) found smaller than n_clusters (379). Possibly due to duplicate points in X.


Number of distinct clusters (378) found smaller than n_clusters (380). Possibly due to duplicate points in X.


Number of distinct clusters (378) found smaller than n_clusters (381). Possibly due to duplicate points in X.


Number of distinct clusters (378) found smaller than n_clusters (382). Possibly due to duplicate points in X.


Number of distinct clusters (378) found smaller than n_clusters (383). Possibly due to duplicate points in X.


Number of distinct clusters (378) found smaller than n_clusters (384). Possibly due to duplicate points in X.


Number of distinct clusters (378) found smaller than n_clusters (385). Possibly due to duplicate points in X.


Number of distinct clusters (378) found smaller than n_clusters (386). Possibly due to duplicate points in X.


Number of distinct clusters (378) found smaller than n_clusters (387). Possibly due to duplicate points

In [640]:
print(f"Best n clusters = {best_n_clusters['n']} with a silhouette score of {best_n_clusters['score']}")
px.line(x=range(2, len(df_embeddings)), y=silhouettes)

Best n clusters = 266 with a silhouette score of 0.10219155019522985


In [648]:
n_clusters = 45

kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(df_embeddings)

labels = kmeans.labels_
centroids = kmeans.cluster_centers_

df_kmeans = pd.DataFrame(labels, columns=['centroid'])
df_kmeans.head()

Unnamed: 0,centroid
0,28
1,42
2,41
3,11
4,7


In [649]:
# import matplotlib.pyplot as plt
# import matplotlib.cm as cm
# import numpy as np
# from sklearn.metrics import silhouette_samples

# sample_silhouette_values = silhouette_samples(df_embeddings, df_kmeans.values.ravel())

# ax1 = plt.subplot()

# y_lower = 10
# for i in range(n_clusters):
#     # Aggregate the silhouette scores for samples belonging to
#     # cluster i, and sort them
#     ith_cluster_silhouette_values = sample_silhouette_values[df_kmeans.values.ravel() == i]

#     ith_cluster_silhouette_values.sort()

#     size_cluster_i = ith_cluster_silhouette_values.shape[0]
#     y_upper = y_lower + size_cluster_i

#     color = cm.nipy_spectral(float(i) / n_clusters)
#     ax1.fill_betweenx(
#         np.arange(y_lower, y_upper),
#         0,
#         ith_cluster_silhouette_values,
#         facecolor=color,
#         edgecolor=color,
#         alpha=0.7,
#     )

#     # Label the silhouette plots with their cluster numbers at the middle
#     ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

#     # Compute the new y_lower for next plot
#     y_lower = y_upper + 10  # 10 for the 0 samples

# ax1.set_title("The silhouette plot for the various clusters.")
# ax1.set_xlabel("The silhouette coefficient values")
# ax1.set_ylabel("Cluster label")

# # The vertical line for average silhouette score of all the values
# ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

# ax1.set_yticks([])  # Clear the yaxis labels / ticks
# # ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

### PCA

In [650]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca_embeddings = pca.fit_transform(df_embeddings)
df_pca = pd.DataFrame(pca_embeddings, columns=['pca_1', 'pca_2'])

df_pca.head(), pca_embeddings.shape, pca.explained_variance_ratio_

(      pca_1     pca_2
 0  0.855470  0.472238
 1  2.526407 -0.609624
 2  0.696321 -1.157138
 3  2.542346  1.900799
 4 -0.172645 -1.956667,
 (388, 2),
 array([0.11069209, 0.08704453]))

### Visualize

In [651]:
# centroids through PCA
pca_centroids = pca.transform(centroids)
pca_centroids.shape

(45, 2)

In [652]:
df_with_emb = pd.concat([df.drop(columns=['Embedding']), df_pca, df_kmeans], axis=1)
df_with_emb.head()

Unnamed: 0,Tournament,Motion,Round,Info Slide,Prop wins,Opp wins,Balance,Year,Motion Type,Motion (cleaned),pca_1,pca_2,centroid
0,NordicSDC2025,This House Supports gentle parenting becoming the norm,Round 1,"Gentle parenting is a parenting style that says that parents should be non-confrontational. The style states that parents should focus on deliberation with (young) children, have high levels of patience and avoid punishments and ultimatums.",16.0,1.0,imbalanced at 50% level,2025,supports,gentle parenting becoming the norm,0.85547,0.472238,28
1,NordicSDC2025,THR the creation of the series and film industry,Round 2,,13.0,1.0,probably balanced,2025,regrets,creation of series and film industry,2.526407,-0.609624,42
2,NordicSDC2025,This House Believes That it is in the interest of Georgian government to democratically backslide,Round 3,"Democratic backsliding is a process of regime change toward autocracy in which the votes of individuals have less power and political power becomes repressive. Examples include limiting free speech/press, corruption, nepotism or gaining control over state institutions.",15.0,1.0,probably balanced,2025,believes,it is in the interest of georgian government to democratically backslide,0.696321,-1.157138,41
3,NordicSDC2025,This House Opposes the Globalization of the Major European Football Leagues,Round 4,"“Major European Football Leagues” include the English Premier League, the German Bundesliga, the Italian Seria A, the French Ligue 1 and the Spanish La Liga.For the purpose of this debate, globalization of football leagues refers to increasing the number of international players, creating ownership franchises and actively promoting the league in foreign countries.",10.0,1.0,imbalanced at 50% level,2025,opposes,globalization of major european football leagues,2.542346,1.900799,11
4,NordicSDC2025,This House Would implement a weekly 'blackout day',Round 5,"A 'blackout day' refers to a day where all social media sites, streaming services, TV, internet, radio, and similar technologies are inaccessible for personal, non-emergency use. For example, one would still be able to call hospitals and banks but would not be able to watch downloaded Netflix series",13.0,1.0,probably balanced,2025,would,implement a weekly 'blackout day',-0.172645,-1.956667,7


In [653]:
import plotly.graph_objects as go

fig = go.Figure()

marker = go.scatter.Marker(
    color=df_with_emb['centroid'],
    colorscale='rainbow',
    size=10,
    opacity=0.7
)

centroid_marker = go.scatter.Marker(
    symbol='x',
    size=20,
    opacity=0.5,
    color='gray'
)

fig.add_trace(go.Scatter(x=df_with_emb['pca_1'], y=df_with_emb['pca_2'], mode='markers', hovertext=df_with_emb['Motion'], marker=marker))
# fig.add_trace(go.Scatter(x=pca_centroids[:, 0], y=pca_centroids[:, 1], mode='markers', marker=centroid_marker))

fig.show()

In [654]:
pd.set_option("display.max_colwidth", None)  # after this you can print any column length
pd.set_option("display.max_rows", None)  # after this you can print any column length

df_with_emb.sort_values('centroid')[['Motion (cleaned)', 'centroid']]

Unnamed: 0,Motion (cleaned),centroid
73,"developing countries should privatise their state owned enterprises (such as airlines, utility companies like electricity, gas companies, etc)",0
256,humanities courses should be part of every undergraduate programme,0
182,allow prisoners to volunteer for drug trials in exchange for lighter sentences,0
145,make labour union membership compulsory in large industries (prepared),0
264,expand the permanent membership of the un security council,1
228,we should support military intervention in somalia,1
253,military intervention to deliver emergency aid in humanitarian crises,1
302,permanent members of united nations security council should not have veto power,1
363,bombing of iraq,1
52,internet gurus* selling their own products and services,1


### Team wins (balance)

In [10]:
prop_wins = df['Prop wins'].sum()
opp_wins = df['Opp wins'].sum()

print(prop_wins, opp_wins)

px.pie(
    values=[prop_wins, opp_wins],
    names=['Prop wins', 'Opp wins'],
    title='Total Prop vs Opp Wins',
).show()

290.0 165.0


In [11]:
prop_wins = df_this_year['Prop wins'].sum()
opp_wins = df_this_year['Opp wins'].sum()

print(prop_wins, opp_wins)

px.pie(
    values=[prop_wins, opp_wins],
    names=['Prop wins', 'Opp wins'],
    title='Total Prop vs Opp Wins',
).show()

196.0 85.0
