In [None]:
import ujson   
import csv

json_path = r'C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\raw\arxiv-metadata.json'
csv_path  = r'C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\processed\arxiv-metadata.csv'

fields = [
    'id',
    'title',
    'abstract',
    'categories',
    'submitter',
    'doi',
    'journal-ref',
    'update_date',
    'first_version_date',
    'last_version_date',
    'authors',
    'authors_parsed'
]

with open(json_path, 'r', encoding='utf-8') as jf, \
     open(csv_path,  'w', newline='', encoding='utf-8') as cf:

    writer = csv.DictWriter(cf, fieldnames=fields)
    writer.writeheader()

    for i, line in enumerate(jf, 1):
        try:
            obj = ujson.loads(line)

            # Extract version dates
            versions = obj.get('versions', [])
            first_v = versions[0]['created'] if versions else ''
            last_v  = versions[-1]['created'] if versions else ''

            # Robust authors_parsed flattening
            parsed = []
            for name in obj.get('authors_parsed', []):
                if not isinstance(name, list) or len(name) == 0:
                    continue
                last = name[0]
                first = name[1] if len(name) > 1 else ''
                parsed.append(f"{last}, {first}")
            auth_parsed_str = '; '.join(parsed)

            row = {
                'id':                obj.get('id', ''),
                'title':             obj.get('title', '').replace('\n', ' ').strip(),
                'abstract':          obj.get('abstract', '').replace('\n', ' ').strip(),
                'categories':        obj.get('categories', ''),
                'submitter':         obj.get('submitter', ''),
                'doi':               obj.get('doi', ''),
                'journal-ref':       obj.get('journal-ref', ''),
                'update_date':       obj.get('update_date', ''),
                'first_version_date': first_v,
                'last_version_date':  last_v,
                'authors':           obj.get('authors', ''),
                'authors_parsed':    auth_parsed_str
            }
            writer.writerow(row)

        except Exception as e:
            print(f"Skipping line {i}: {e}")
            continue

        if i % 500000 == 0:
            print(f"Processed {i:,} lines...")

print("Done! CSV saved to:", csv_path)


Processed 500,000 lines...
Processed 1,000,000 lines...
Processed 1,500,000 lines...
Processed 2,000,000 lines...
Processed 2,500,000 lines...
Done! CSV saved to: C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\processed\arxiv-metadata.csv


In [3]:
import pandas as pd

csv_path  = r'C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\processed\arxiv-metadata.csv'
df = pd.read_csv(csv_path, encoding='utf-8')

  df = pd.read_csv(csv_path, encoding='utf-8')


In [7]:
df.head()

Unnamed: 0,id,title,abstract,categories,submitter,doi,journal-ref,update_date,first_version_date,last_version_date,authors,authors_parsed
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturbati...,hep-ph,Pavel Nadolsky,10.1103/PhysRevD.76.013009,"Phys.Rev.D76:013009,2007",2008-11-26,"Mon, 2 Apr 2007 19:18:42 GMT","Tue, 24 Jul 2007 20:10:27 GMT","C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...","Balázs, C.; Berger, E. L.; Nadolsky, P. M.; Yu..."
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe...",math.CO cs.CG,Louis Theran,,,2008-12-13,"Sat, 31 Mar 2007 02:26:18 GMT","Sat, 13 Dec 2008 17:26:00 GMT",Ileana Streinu and Louis Theran,"Streinu, Ileana; Theran, Louis"
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is describe...,physics.gen-ph,Hongjun Pan,,,2008-01-13,"Sun, 1 Apr 2007 20:46:54 GMT","Sun, 13 Jan 2008 00:36:28 GMT",Hongjun Pan,"Pan, Hongjun"
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle n...,math.CO,David Callan,,,2007-05-23,"Sat, 31 Mar 2007 03:16:14 GMT","Sat, 31 Mar 2007 03:16:14 GMT",David Callan,"Callan, David"
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\Lam...,math.CA math.FA,Alberto Torchinsky,,"Illinois J. Math. 52 (2008) no.2, 681-689",2013-10-15,"Mon, 2 Apr 2007 18:09:58 GMT","Mon, 2 Apr 2007 18:09:58 GMT",Wael Abu-Shammala and Alberto Torchinsky,"Abu-Shammala, Wael; Torchinsky, Alberto"


In [9]:
df.describe()

Unnamed: 0,id,title,abstract,categories,submitter,doi,journal-ref,update_date,first_version_date,last_version_date,authors,authors_parsed
count,2771128,2771128,2771128,2771128,2755938,1247563,897886,2771128,2771128,2771128,2771128,2771128
unique,2771104,2766890,2769305,90821,574133,1245291,886365,5417,2736601,2753251,2178104,2054357
top,math-ph/0512019,Discussion of: A statistical analysis of multi...,This paper has been withdrawn.,astro-ph,EPTCS,10.1145/1122445.1122456,"Dans Design, Automation and Test in Europe - D...",2007-05-23,"Wed, 27 Jun 2012 19:59:59 GMT","Wed, 27 Jun 2012 19:59:59 GMT",CMS Collaboration,"CMS Collaboration,"
freq,4,12,91,86911,4012,18,128,129945,109,103,1361,1361


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2771128 entries, 0 to 2771127
Data columns (total 12 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   id                  object
 1   title               object
 2   abstract            object
 3   categories          object
 4   submitter           object
 5   doi                 object
 6   journal-ref         object
 7   update_date         object
 8   first_version_date  object
 9   last_version_date   object
 10  authors             object
 11  authors_parsed      object
dtypes: object(12)
memory usage: 253.7+ MB


In [12]:
top20 = (
    df['categories']
    .value_counts()
    .head(20)
    .reset_index()
    .rename(columns={'index': 'category', 'categories': 'count'})
)

print(top20)

                count  count
0            astro-ph  86911
1              hep-ph  83510
2            quant-ph  74999
3               cs.CV  73704
4              hep-th  60534
5   cond-mat.mtrl-sci  42212
6             math.AP  37608
7   cond-mat.mes-hall  36528
8         astro-ph.GA  33880
9             math.CO  32460
10              gr-qc  32166
11              cs.CL  32008
12        astro-ph.SR  28152
13    cond-mat.str-el  27314
14            math.PR  25253
15        astro-ph.HE  25036
16      cs.IT math.IT  23794
17            math.NT  23786
18              cs.LG  23719
19        astro-ph.CO  23130


In [8]:
df.shape

(2771128, 12)

In [None]:
null_counts = df.isnull().sum()

null_percentages = (null_counts / len(df)) * 100

null_summary = pd.DataFrame({
    'Null Count': null_counts,
    'Null Percentage': null_percentages
}).sort_values('Null Count', ascending=False)

print(null_summary)

                    Null Count  Null Percentage
journal-ref            1873242        67.598537
doi                    1523565        54.979958
submitter                15190         0.548152
id                           0         0.000000
categories                   0         0.000000
abstract                     0         0.000000
title                        0         0.000000
update_date                  0         0.000000
first_version_date           0         0.000000
last_version_date            0         0.000000
authors                      0         0.000000
authors_parsed               0         0.000000


In [4]:
df = df.drop(columns=['id', 'categories' ,'submitter', 'doi', 'journal-ref', 'update_date', 'first_version_date', 'last_version_date', 'authors', 'authors_parsed'])

In [5]:
df.head()

Unnamed: 0,title,abstract
0,Calculation of prompt diphoton production cros...,A fully differential calculation in perturbati...
1,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-pe..."
2,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is describe...
3,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle n...
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\Lam...


In [None]:
cleaned_csv_path = r'C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\processed\arxiv-metadata-cleaned.csv'
df.to_csv(cleaned_csv_path, index=False, encoding='utf-8')

In [4]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import pandas as pd

cleaned_csv_path = r'C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\processed\arxiv-metadata-cleaned.csv'

df = pd.read_csv(cleaned_csv_path, encoding='utf-8')

train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_df.to_csv(r'C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\splits\arxiv-train.csv', index=False, encoding='utf-8')
val_df.to_csv(r'C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\splits\arxiv-val.csv', index=False, encoding='utf-8')
test_df.to_csv(r'C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\splits\arxiv-test.csv', index=False, encoding='utf-8')

In [1]:
from datasets import load_dataset


data_files = {
    "train": r"C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\splits\arxiv-train.csv",
    "validation": r"C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\splits\arxiv-val.csv",
    "test": r"C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\splits\arxiv-test.csv"
}
raw_datasets = load_dataset("csv", data_files=data_files)


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from datasets import load_dataset
import multiprocessing


def tokenize_function(examples):
    from transformers import PegasusTokenizer
    tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
    
    model_inputs = tokenizer(
        examples["abstract"],
        max_length=512,
        padding="max_length",
        truncation=True
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["title"],
            max_length=64,
            padding="max_length",
            truncation=True
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
    num_proc=multiprocessing.cpu_count(),   
    load_from_cache_file=True               
)

tokenized_datasets.save_to_disk(r"C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\tokenized\arxiv-tokenized")


Map (num_proc=16): 100%|██████████| 2216902/2216902 [14:56<00:00, 2471.89 examples/s] 
Map (num_proc=16): 100%|██████████| 277113/277113 [01:00<00:00, 4617.63 examples/s]
Map (num_proc=16): 100%|██████████| 277113/277113 [00:58<00:00, 4731.28 examples/s]
Saving the dataset (14/14 shards): 100%|██████████| 2216902/2216902 [00:34<00:00, 64450.39 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 277113/277113 [00:04<00:00, 60785.51 examples/s]
Saving the dataset (2/2 shards): 100%|██████████| 277113/277113 [00:05<00:00, 54283.43 examples/s]


In [None]:
from datasets import load_from_disk
tokenized_datasets = load_from_disk(r"C:\Users\nikhi\OneDrive\Documents\GitHub\TitleForge\data\tokenized\arxiv-tokenized")

print(tokenized_datasets)
print(tokenized_datasets['train'].to_pandas().head())

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2216902
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 277113
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 277113
    })
})
                                           input_ids  \
0  [184, 5731, 109, 986, 113, 1273, 112, 9523, 14...   
1  [1825, 21924, 3792, 7002, 1625, 432, 27728, 25...   
2  [184, 692, 109, 15474, 37703, 575, 122, 45791,...   
3  [139, 2211, 1298, 640, 112, 5014, 7143, 108, 2...   
4  [139, 2560, 113, 109, 1972, 562, 117, 112, 120...   

                                      attention_mask  \
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...   
4  [1, 1, 1, 1, 1, 