In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import pandas as pd
from pylatexenc.latex2text import LatexNodes2Text # to turn latex into string
import string
import re


file_path = './archive/arxiv-metadata-oai-snapshot.json'
SAMPLE_SIZE = 5000  # Change number to adjust sample size

# Load the first SAMPLE_SIZE papers into a dataframe.
# The file is parsed one line at a time (one JSON document per line), so only
# the sampled records are ever held in memory.
data = []
# JSON text is UTF-8; pass the encoding explicitly so the result does not
# depend on the platform's locale default.
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        if len(data) >= SAMPLE_SIZE:
            break
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        data.append(json.loads(line))

df = pd.DataFrame(data)

In [2]:
# Inspect the loaded frame (one row per paper record read above).
df

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,0704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,0704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,0704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,0704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,0704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0705.0993,Heather Knutson,"Heather A. Knutson, David Charbonneau, Lori E....",A map of the day-night contrast of the extraso...,"To appear in the May 10 2007 issue of Nature, ...","Nature 447:183-186,2007",10.1038/nature05782,,astro-ph,,"""Hot Jupiter"" extrasolar planets are expecte...","[{'version': 'v1', 'created': 'Mon, 7 May 2007...",2010-04-06,"[[Knutson, Heather A., ], [Charbonneau, David,..."
4996,0705.0994,Andreas Karch,"Dongsu Bak, Andreas Karch, and Laurence G. Yaffe",Debye screening in strongly coupled N=4 supers...,"21 pages, 5 figures, significantly expanded di...","JHEP 0708:049,2007",10.1088/1126-6708/2007/08/049,,hep-th,,"Using the AdS/CFT correspondence, we examine...","[{'version': 'v1', 'created': 'Mon, 7 May 2007...",2009-04-22,"[[Bak, Dongsu, ], [Karch, Andreas, ], [Yaffe, ..."
4997,0705.0995,Zhongyuan Zhou,"Zhongyuan Zhou, Shih-I Chu, Siyuan Han",Decoherence of a driven multilevel quantum sys...,,,,,quant-ph,,A general theory is presented for the treatm...,"[{'version': 'v1', 'created': 'Tue, 8 May 2007...",2007-05-23,"[[Zhou, Zhongyuan, ], [Chu, Shih-I, ], [Han, S..."
4998,0705.0996,Anzhong Wang,Yungui Gong and Anzhong Wang,Energy conditions and current acceleration of ...,"revtex4, five figures. Corrected some typos an...","Phys.Lett.B652:63-68,2007",10.1016/j.physletb.2007.06.065,,astro-ph gr-qc hep-ph hep-th,,The energy conditions provide a very promisi...,"[{'version': 'v1', 'created': 'Mon, 7 May 2007...",2009-06-23,"[[Gong, Yungui, ], [Wang, Anzhong, ]]"


To produce recommendations, I will measure the similarity between the user input and the abstracts. 
The abstracts are a high-quality source, so misspellings are assumed to be rare. However, their vocabulary is also highly specialized. 
## Pre-processing
The data needs some pre-processing: in particular, punctuation and line feeds are removed because they carry no semantic meaning. For the same reason, all text is converted to lower case.

In [4]:
# Work on a copy so that the loaded `df` itself is left unmodified.
df_cp = df.copy()

In [14]:
# Lower-case every abstract, store it back in the working copy and preview
# the first rows of the column.
abstract_lower = df_cp["abstract"].str.lower()
df_cp["abstract"] = abstract_lower
df_cp["abstract"].head()

0      a fully differential calculation in perturba...
1      we describe a new algorithm, the $(k,\ell)$-...
2      the evolution of earth-moon system is descri...
3      we show that a determinant of stirling cycle...
4      in this paper we show how to compute the $\l...
Name: abstract, dtype: object

The abstracts contain LaTeX markup (e.g. `$(k,\ell)$`), which could add noise and worsen the recommendations. 

In [89]:
# A single LaTeX-to-text converter instance, shared by every call that cleans
# an abstract.
converter = LatexNodes2Text()

# Start from a fresh copy of the loaded frame.
df_cp = df.copy()

def clean_abstract(abstract):
    """Turn a raw abstract into lower-case plain text without noise tokens.

    Steps, in order:
      1. convert LaTeX markup to plain text with the module-level ``converter``;
      2. lower-case the text and collapse all whitespace (including line feeds);
      3. strip punctuation from the beginning and end of every word;
      4. delete stand-alone numbers (integers and simple decimals);
      5. strip again and drop tokens that ended up empty, so that the removals
         above leave no double spaces, no leading/trailing spaces and no
         dangling punctuation (e.g. ``n=4`` -> ``n=`` -> ``n``).

    Parameters
    ----------
    abstract : str
        Raw abstract text. Any non-string value (e.g. ``None`` or ``NaN`` for
        a missing abstract) is treated as an empty abstract.

    Returns
    -------
    str
        The cleaned abstract with words separated by single spaces; ``""``
        when nothing is left or the input is not a string.
    """
    if not isinstance(abstract, str):
        return ""

    text = converter.latex_to_text(abstract).lower()
    # str.split() with no argument splits on any whitespace run, so line feeds
    # are removed here as well.
    text = " ".join(word.strip(string.punctuation) for word in text.split())
    # Delete isolated numbers as they have no meaning
    text = re.sub(r'\b\d+(\.\d+)?\b', '', text)
    # Second pass: the number removal can leave empty slots or words that now
    # end in punctuation; clean those up and normalise the spacing.
    words = (word.strip(string.punctuation) for word in text.split())
    return " ".join(word for word in words if word)

# Run the cleaner over every abstract and keep the result in its own column,
# leaving the raw `abstract` column untouched.
cleaned_abstracts = df_cp['abstract'].apply(clean_abstract)
df_cp['clean_abstract'] = cleaned_abstracts



In [90]:
# Spot-check one cleaned abstract (row label 200).
df_cp.loc[200, 'clean_abstract']

'associated to the classical weyl groups we introduce the notion of degenerate spin affine hecke algebras and affine hecke-clifford algebras for these algebras we establish the pbw properties formulate the intertwiners and describe the centers we further develop connections of these algebras with the usual degenerate i.e graded affine hecke algebras of lusztig by introducing a notion of degenerate covering affine hecke algebras'