In [None]:
!kaggle datasets download -d Cornell-University/arxiv
!unzip /content/arxiv.zip

Dataset URL: https://www.kaggle.com/datasets/Cornell-University/arxiv
License(s): CC0-1.0
Downloading arxiv.zip to /content
100% 1.32G/1.32G [00:17<00:00, 136MB/s]
100% 1.32G/1.32G [00:17<00:00, 79.7MB/s]
Archive:  /content/arxiv.zip
  inflating: arxiv-metadata-oai-snapshot.json  


In [None]:
import pandas as pd
import json

def read_json_in_batches(file_path, batch_size=10000):
    dataframes = []
    with open(file_path, 'r') as f:
        batch = []
        for line in f:
            batch.append(json.loads(line.strip()))

            if len(batch) == batch_size:
                dataframes.append(pd.DataFrame(batch))
                batch = []

        if batch:
            dataframes.append(pd.DataFrame(batch))

    result_df = pd.concat(dataframes, ignore_index=True)
    return result_df


file_path = '/content/arxiv-metadata-oai-snapshot.json'
df = read_json_in_batches(file_path, batch_size=10000)

df



# authors write papers
# paper has versions
# categories have papers

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,0704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,0704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,0704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,0704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,0704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2535063,supr-con/9608008,Ruslan Prozorov,"R. Prozorov, M. Konczykowski, B. Schmidt, Y. Y...",On the origin of the irreversibility line in t...,"19 pages, LaTex, 6 PostScript figures; Author'...",,10.1103/PhysRevB.54.15530,,supr-con cond-mat.supr-con,,We report on measurements of the angular dep...,"[{'version': 'v1', 'created': 'Mon, 26 Aug 199...",2009-10-30,"[[Prozorov, R., ], [Konczykowski, M., ], [Schm..."
2535064,supr-con/9609001,Durga P. Choudhury,"Durga P. Choudhury, Balam A. Willemsen, John S...",Nonlinear Response of HTSC Thin Film Microwave...,"4 pages, LaTeX type, Uses IEEE style files, 60...",,10.1109/77.620744,,supr-con cond-mat.supr-con,,The non-linear microwave surface impedance o...,"[{'version': 'v1', 'created': 'Sat, 31 Aug 199...",2016-11-18,"[[Choudhury, Durga P., , Physics Department, N..."
2535065,supr-con/9609002,Durga P. Choudhury,"Balam A. Willemsen, J. S. Derov and S.Sridhar ...",Critical State Flux Penetration and Linear Mic...,"20 pages, LaTeX type, Uses REVTeX style files,...",,10.1103/PhysRevB.56.11989,,supr-con cond-mat.supr-con,,The vortex contribution to the dc field (H) ...,"[{'version': 'v1', 'created': 'Tue, 3 Sep 1996...",2009-10-30,"[[Willemsen, Balam A., , Physics Department,\n..."
2535066,supr-con/9609003,Hasegawa Yasumasa,Yasumasa Hasegawa (Himeji Institute of Technol...,Density of States and NMR Relaxation Rate in A...,"7 pages, 4 PostScript Figures, LaTeX, to appea...",,10.1143/JPSJ.65.3131,,supr-con cond-mat.supr-con,,We show that the density of states in an ani...,"[{'version': 'v1', 'created': 'Wed, 18 Sep 199...",2009-10-30,"[[Hasegawa, Yasumasa, , Himeji Institute of Te..."


In [None]:
def generate_id(prefix, value):
    return f"{prefix}_{hash(value)}"

base_path = '/content/drive/MyDrive/arxiv-semantic-search/'
authors_file = base_path + 'authors.csv'
papers_file = base_path + 'papers.csv'
categories_file = base_path + 'categories.csv'
authorship_file = base_path + 'authorship.csv'
categorization_file = base_path + 'categorization.csv'

In [None]:
pd.DataFrame(columns=['author_id', 'name']).to_csv(authors_file, index=False)
pd.DataFrame(columns=['paper_id', 'title', 'abstract', 'doi']).to_csv(papers_file, index=False)
pd.DataFrame(columns=['category_id', 'name']).to_csv(categories_file, index=False)
pd.DataFrame(columns=['author_id', 'paper_id']).to_csv(authorship_file, index=False)
pd.DataFrame(columns=['paper_id', 'category_id']).to_csv(categorization_file, index=False)

In [8]:
def append_to_csv(data, file_path):
    pd.DataFrame(data).to_csv(file_path, mode='a', header=False, index=False)

chunksize = 1000

num_chunks = len(df) // chunksize + 1
for i in range(num_chunks):
    chunk = df[i * chunksize:(i + 1) * chunksize]

    authors_set = set()
    categories_set = set()
    authorship_set = set()
    categorization_set = set()

    papers_data = []
    authors_data = []
    authorship_data = []
    categories_data = []
    categorization_data = []

    for index, paper in chunk.iterrows():
        paper_id = generate_id('paper', paper['id'])

        papers_data.append({'paper_id': paper_id, 'title': paper['title'], 'abstract': paper['abstract'], 'doi': paper['doi']})

        for author in paper['authors_parsed']:
            author_name = " ".join(author)
            author_id = generate_id('author', author_name)

            if author_id not in authors_set:
                authors_data.append({'author_id': author_id, 'name': author_name})
                authors_set.add(author_id)

            if (author_id, paper_id) not in authorship_set:
                authorship_data.append({'author_id': author_id, 'paper_id': paper_id})
                authorship_set.add((author_id, paper_id))

        for category in paper['categories'].split():
            category_id = generate_id('category', category)

            if category_id not in categories_set:
                categories_data.append({'category_id': category_id, 'name': category})
                categories_set.add(category_id)

            if (paper_id, category_id) not in categorization_set:
                categorization_data.append({'paper_id': paper_id, 'category_id': category_id})
                categorization_set.add((paper_id, category_id))

    append_to_csv(papers_data, papers_file)
    append_to_csv(authors_data, authors_file)
    append_to_csv(authorship_data, authorship_file)
    append_to_csv(categories_data, categories_file)
    append_to_csv(categorization_data, categorization_file)

In [None]:
categories_data

[{'category_id': 'category_2125424055383828907', 'name': 'supr-con'},
 {'category_id': 'category_3350492521992799337', 'name': 'cond-mat.supr-con'},
 {'category_id': 'category_-6739780127139360658', 'name': 'mtrl-th'},
 {'category_id': 'category_-5450845467737262685', 'name': 'chao-dyn'},
 {'category_id': 'category_-2660188382381771893', 'name': 'nlin.CD'},
 {'category_id': 'category_-4463802297506937463', 'name': 'chem-ph'},
 {'category_id': 'category_-6218086052284457043', 'name': 'hep-th'},
 {'category_id': 'category_-1418411104583732977', 'name': 'hep-ex'},
 {'category_id': 'category_-646775090454500245', 'name': 'nucl-ex'},
 {'category_id': 'category_-6208117593686288048', 'name': 'gr-qc'},
 {'category_id': 'category_-3927698617548139781', 'name': 'hep-ph'},
 {'category_id': 'category_2167819096694581637', 'name': 'cond-mat.mtrl-sci'},
 {'category_id': 'category_3376668483642221774', 'name': 'nlin.PS'},
 {'category_id': 'category_-7891277034850252893', 'name': 'patt-sol'},
 {'cate