In [1]:
# Mount Google Drive at /content/drive so the shared-drive paths used in the
# following cells resolve. force_remount=True is passed explicitly
# (presumably to refresh an already-mounted drive — confirm against the Colab docs).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [3]:
# Load the statement table from the shared drive and preview it.
# NOTE(review): the `id` values look like arXiv identifiers (e.g. 2501.00724). If pandas
# infers them as floats, trailing zeros would be lost when the data is written back out —
# consider passing dtype={"id": str}; confirm against the raw file.
exploded_csv = "/content/drive/Shareddrives/Master_Thesis/exploded_df.csv"
df = pd.read_csv(exploded_csv)
print(f"df shape: {df.shape}")
df.head()

df shape: (182145, 6)


Unnamed: 0.1,Unnamed: 0,id,title,categories,type,content
0,0,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:main}\n\t\nConsider any Kac-Moody L...
1,1,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:my refined}\n\nLet $\fg$ be of fini...
2,2,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:toroidal}\n\nFor a polynomial $\ell...
3,3,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:simple}\n\n(\cite{HJ}) Up to isomor...
4,4,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:quantum to shuffle}\n\nWe have $\em...


In [None]:
# Sanity check: the `id` column (which still holds the paper identifier at this point)
# must be non-decreasing, i.e. the rows are ordered by paper.
df["id"].is_monotonic_increasing

True

In [None]:
# Rename both columns in one pass: the paper identifier moves to 'paper id', and the
# running row number stored in 'Unnamed: 0' becomes the new per-statement 'id'.
df.rename(columns={'id': 'paper id', 'Unnamed: 0': 'id'}, inplace=True)
df.head()

Unnamed: 0,id,paper id,title,categories,type,content
0,0,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:main}\n\t\nConsider any Kac-Moody L...
1,1,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:my refined}\n\nLet $\fg$ be of fini...
2,2,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:toroidal}\n\nFor a polynomial $\ell...
3,3,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:simple}\n\n(\cite{HJ}) Up to isomor...
4,4,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:quantum to shuffle}\n\nWe have $\em...


In [None]:
# Summary statistics for the length of statements, measured in characters.
# Rows with missing content get a NaN length and are left out of describe()'s count,
# which is why the count below is smaller than the number of rows in df.
lengths = df['content'].str.len()
print(lengths.describe())

count    182132.000000
mean        527.731986
std         623.660913
min           1.000000
25%         231.000000
50%         386.000000
75%         642.000000
max       56737.000000
Name: content, dtype: float64


In [None]:
# Repeat the summary, restricted to statements longer than 1000 characters
max_lengths = lengths.loc[lengths.gt(1000)]
print(max_lengths.describe())

count    18924.000000
mean      1664.985838
std       1362.912685
min       1001.000000
25%       1134.000000
50%       1341.000000
75%       1741.000000
max      56737.000000
Name: content, dtype: float64


In [None]:
# Drop the statements with more than MAX_CHARS characters.
# The limit is a character count (str.len), not a token count.
MAX_CHARS = 2000
content_len = df['content'].str.len()
# Number of statements that actually have content. Rows with missing content have a NaN
# length; NaN <= MAX_CHARS is False, so the filter below removes them as well, but they
# are not counted as "dropped statements" in the message.
n_statements = int(content_len.notna().sum())
df = df[content_len <= MAX_CHARS].reset_index(drop=True)
# Re-number the statements so that `id` matches the new 0…len-1 index
df['id'] = df.index
# Computed from the data instead of a hard-coded total, so the message stays correct
# if the input changes or the cell is re-run.
print(f"Dropped {n_statements - df.shape[0]} statements\n")
print(f"df shape: {df.shape}\n")
df.head()

Dropped 3299 statements

df shape: (178833, 6)



Unnamed: 0,id,paper id,title,categories,type,content
0,0,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:main}\n\t\nConsider any Kac-Moody L...
1,1,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:my refined}\n\nLet $\fg$ be of fini...
2,2,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:toroidal}\n\nFor a polynomial $\ell...
3,3,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:simple}\n\n(\cite{HJ}) Up to isomor...
4,4,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:quantum to shuffle}\n\nWe have $\em...


In [None]:
# Count how many of the remaining statements belong to each statement type
df['type'].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
lemmas,56241
theorems,43938
propositions,32765
definitions,28989
corollaries,16900


In [None]:
# Mean statement length (in characters) for each statement type
df['content'].str.len().groupby(df['type']).mean()

Unnamed: 0_level_0,content
type,Unnamed: 1_level_1
corollaries,397.180888
definitions,530.714305
lemmas,442.846731
propositions,461.58004
theorems,526.65399


In [None]:
# Draw a fixed 200-row sample of the filtered statements.
# The seed is set on NumPy's global RNG immediately before sampling; this presumably
# relies on DataFrame.sample falling back to that global state when no random_state is
# passed — keep these two lines together and in this order (TODO confirm).
np.random.seed(43)
df_200=df.sample(200)
# Compare the sizes of the full table and the sample, then preview the sample
print(f"df shape: {df.shape}")
print(f"df_200 shape: {df_200.shape}")
df_200.head()

df shape: (178833, 6)
df_200 shape: (200, 6)


Unnamed: 0,id,paper id,title,categories,type,content
65949,67127,2502.02991,the derrida-retaux model on a geometric galton...,math.pr,lemmas,"\label{lem:limits}\nGiven a solution $(u_{n},..."
59018,60083,2502.00525,a projected variable smoothing for weakly conv...,math.oc,lemmas,\label{lema_penalty}\nFor every $k\in\mathbb{N...
169274,172443,2503.21479,quantum umlaut information,quant-ph cs.it math-ph math.it math.mp,definitions,[(Petz--R\'{e}nyi $\alpha$-umlaut information)...
87166,88727,2502.10965,higher rank macdonald polynomials,math.co math.qa math.rt,lemmas,"The generators $T_j,X_i,\pi$ of $\sD_n(q_1)$ a..."
156935,159859,2503.16723,divergence-free drifts decrease concentration,math.ap,theorems,[Riesz rearrangement inequality]\n\label{thm:r...


In [None]:
# Save the 200-row sample into the Query_Creation data folder.
# NOTE(review): no index=False is passed here (unlike the chunk export further down),
# so the DataFrame index is presumably written as an extra unnamed first column —
# confirm that downstream readers expect it.
df_200.to_csv("/content/drive/Shareddrives/Master_Thesis/Query_Creation/Data/passages_200.csv")

In [None]:
# Statement-type distribution inside the 200-row sample
df_200['type'].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
lemmas,75
theorems,39
propositions,36
definitions,33
corollaries,17


In [None]:
# Character length of every statement in the 200-row sample
lengths200 = df_200['content'].str.len()

# Summary statistics of the sample's statement lengths
print(lengths200.describe())

count     200.000000
mean      496.375000
std       385.432192
min        44.000000
25%       229.750000
50%       382.500000
75%       622.750000
max      1999.000000
Name: content, dtype: float64


# Chunking the dataset

First of all, we only use 100,000 rows for our fine-tuning. The dataset is ordered by paper id, so we take the first 100,000 rows; this keeps everything tidy and means we use fewer papers, together with all of their statements.

In [None]:
# Keep the first 100,000 statements. A positional slice selects the same rows as the
# label slice 0…99999 because the index was reset to 0…len-1 after filtering.
df_100 = df.iloc[:100000]
print(f"df_100 shape: {df_100.shape}\n")
df_100.head()

df_100 shape: (100000, 6)



Unnamed: 0,id,paper id,title,categories,type,content
0,0,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:main}\n\t\nConsider any Kac-Moody L...
1,1,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:my refined}\n\nLet $\fg$ be of fini...
2,2,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:toroidal}\n\nFor a polynomial $\ell...
3,3,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:simple}\n\n(\cite{HJ}) Up to isomor...
4,4,2501.00724,category o for quantum loop algebras,math.rt math.qa,theorems,\label{thm:quantum to shuffle}\n\nWe have $\em...


In [None]:
# Inspect the last rows of the subset to see which paper the cut-off falls on
df_100.tail()

Unnamed: 0,id,paper id,title,categories,type,content
99995,99995,2502.16073,indicated list colouring game on graphs,math.co,lemmas,"\label{lem-color}\n Assume $(G,L)$ is an in..."
99996,99996,2502.16073,indicated list colouring game on graphs,math.co,lemmas,"\label{lem-color1}\n Assume $(G,L)$ is an i..."
99997,99997,2502.16073,indicated list colouring game on graphs,math.co,lemmas,"\label{lem-main1}\n If $(G,L)$ is infeasi..."
99998,99998,2502.16073,indicated list colouring game on graphs,math.co,lemmas,"\label{lem-main2}\n If $(G,L)$ is infeasi..."
99999,99999,2502.16073,indicated list colouring game on graphs,math.co,lemmas,"\label{lem-vs} \n If $(G_i, L_i)$ is infeas..."


Creating queries for the entire dataset takes around 35h. We therefore split it into 6 smaller chunks, which we will run separately.

In [None]:
# Write df_100 to the shared drive as n_splits roughly equal CSV chunks, so that query
# creation can be run on each chunk separately.
output_dir = "/content/drive/Shareddrives/Master_Thesis/Query_Creation/Data"
os.makedirs(output_dir, exist_ok=True)

n_splits = 6
# Split the row *positions* rather than the DataFrame itself: calling np.array_split
# directly on a DataFrame goes through the deprecated DataFrame.swapaxes and emits a
# FutureWarning on recent pandas. The chunk boundaries are unchanged (the first
# len(df_100) % n_splits chunks receive one extra row).
splits = [
    df_100.iloc[positions]
    for positions in np.array_split(np.arange(len(df_100)), n_splits)
]

for i, chunk in enumerate(splits, start=1):
    # reset index so each chunk runs 0…len-1
    chunk = chunk.reset_index(drop=True)
    # overwrite the id column so it matches the chunk-local index
    # (ids therefore restart at 0 in every chunk and are not unique across chunks)
    chunk['id'] = chunk.index

    fname = f"df_100_chunk_{i}.csv"
    path = os.path.join(output_dir, fname)
    chunk.to_csv(path, index=False)
    print(f"Chunk {i}: {len(chunk)} rows → saved to {path}")

  return bound(*args, **kwds)


Chunk 1: 16667 rows → saved to /content/drive/Shareddrives/Master_Thesis/Query_Creation/Data/df_100_chunk_1.csv
Chunk 2: 16667 rows → saved to /content/drive/Shareddrives/Master_Thesis/Query_Creation/Data/df_100_chunk_2.csv
Chunk 3: 16667 rows → saved to /content/drive/Shareddrives/Master_Thesis/Query_Creation/Data/df_100_chunk_3.csv
Chunk 4: 16667 rows → saved to /content/drive/Shareddrives/Master_Thesis/Query_Creation/Data/df_100_chunk_4.csv
Chunk 5: 16666 rows → saved to /content/drive/Shareddrives/Master_Thesis/Query_Creation/Data/df_100_chunk_5.csv
Chunk 6: 16666 rows → saved to /content/drive/Shareddrives/Master_Thesis/Query_Creation/Data/df_100_chunk_6.csv
