In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from google.colab import drive
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
df=pd.read_csv('/content/drive/Shareddrives/Master_Thesis/Data/final_dataset.csv')
print(f"Dataframe shape: {df.shape}")
df.head()

Dataframe shape: (88775, 7)


Unnamed: 0,id,paper id,title,categories,type,content,question
0,0,2501.00724,category o for quantum loop algebras,"['math.rt', 'math.qa']",theorems,\label{thm:main}\n\t\nConsider any Kac-Moody L...,What is the relationship between the modified ...
1,1,2501.00724,category o for quantum loop algebras,"['math.rt', 'math.qa']",theorems,\label{thm:toroidal}\n\nFor a polynomial $\ell...,What is the formula for the q-characteized val...
2,2,2501.00724,category o for quantum loop algebras,"['math.rt', 'math.qa']",theorems,\label{thm:simple}\n\n(\cite{HJ}) Up to isomor...,Is the simple representation generated by a si...
3,3,2501.00724,category o for quantum loop algebras,"['math.rt', 'math.qa']",theorems,\label{thm:quantum to shuffle}\n\nWe have $\em...,Is the isomorphism $\Upsilon^+$ between the sm...
4,4,2501.00724,category o for quantum loop algebras,"['math.rt', 'math.qa']",theorems,\label{thm:is rational} \n\nA simple module $\...,Is a simple module $\CA^\geq \curvearrowright ...


In [17]:
print(df.dtypes)

id              int64
paper id      float64
title          object
categories     object
type           object
content        object
question       object
dtype: object


## Paper statements distribution

In [8]:
# 1. Count statements per paper
paper_counts = df['paper id'].value_counts()

# 2. Find papers with fewer than 4 statements
low_count_papers = paper_counts[paper_counts < 4].index
vc = df['paper id'].value_counts()
# 3. Split the DataFrame
df_low_count = df[df['paper id'].isin(low_count_papers)].copy()
df_sufficient_count = df[~df['paper id'].isin(low_count_papers)].copy()

# Optional: inspect
print("Papers with <4 statements:", len(low_count_papers))
print("Rows in df_low_count:", len(df_low_count))
print("Rows in df_sufficient_count:", len(df_sufficient_count))

Papers with <4 statements: 505
Rows in df_low_count: 1041
Rows in df_sufficient_count: 87734


## Unseen papers separation

We save a few papers for a special test set, which we will use to evaluate the generalization for new papers

In [25]:
# 2) Identify papers in df_sufficient_count with exactly 14 statements
paper_counts = df_sufficient_count['paper id'].value_counts()
papers_14 = paper_counts[paper_counts == 14].index.tolist() # this saves a list with the ids of papers with exaclty 14 statements

# 3) Randomly sample a handful of them for your unseen-doc test
k = 15
if len(papers_14) < k:
    selected_papers_14 = papers_14
else:
    selected_papers_14 = np.random.choice(papers_14, size=k, replace=False).tolist()

# 4) Extract unseen-test slice of those papers
df_unseen_test = df_sufficient_count[df_sufficient_count['paper id'].isin(selected_papers_14)].copy()

# 5) Remove them from the pool
df_remain = df_sufficient_count[~df_sufficient_count['paper id'].isin(selected_papers_14)].copy()

# 6) Compute sizes to get a 90% overall train set
n_low     = len(df_low_count)         # all low-count papers go to train
n_unseen  = len(df_unseen_test)       # these go to test
n_remain  = len(df_remain)
n_total   = n_low + n_unseen + n_remain

# Desired total train size = 90% of n_total
train_target = 0.90 * n_total

# Remaining train needed from df_remain
train_remain_needed = train_target - n_low

# Fraction of df_remain to sample
train_frac = train_remain_needed / n_remain
train_frac = max(min(train_frac, 1.0), 0.0)  # clamp to [0,1]

# 7) Randomly split df_remain into train vs. test
df_remain_train = df_remain.sample(frac=train_frac, random_state=42)
df_test  = df_remain.drop(df_remain_train.index)

# 8) Final train / test sets
df_train = pd.concat([df_low_count, df_remain_train], ignore_index=True)

# 9) Sanity-check proportions
print(f"Total statements:        {n_total}")
print(f"  -> Train:              {len(df_train)} ({len(df_train)/n_total:.1%})")
print(f"  -> Regular Test:        {len(df_test)} ")
print(f"  -> Unseen Test:         {len(df_unseen_test)} ")
print(f"  -> Total Test:          {len(df_test )+ len(df_unseen_test)} ({(len(df_test)+ len(df_unseen_test))/n_total:.1%})")

# df_unseen_test holds your “few papers of size 14” test slice,
# df_train has EVERYTHING else reserved for training,
# and df_test also includes the random remainder from df_remain.

Total statements:        88775
  -> Train:              79897 (90.0%)
  -> Regular Test:        8668 
  -> Unseen Test:         210 
  -> Total Test:          8878 (10.0%)


In [26]:
df_unseen_test.to_csv('/content/drive/Shareddrives/Master_Thesis/Data/unseen_test.csv', index=False)
df_train.to_csv('/content/drive/Shareddrives/Master_Thesis/Data/train.csv', index=False)
df_test.to_csv('/content/drive/Shareddrives/Master_Thesis/Data/test.csv', index=False)