In [1]:
import json
import pandas as pd
import re

In [6]:
# All Arxiv category codes
# Source: https://www.kaggle.com/code/artgor/arxiv-metadata-exploration

# https://arxiv.org/category_taxonomy
# https://info.arxiv.org/help/api/user-manual.html#subject_classifications


category_map = {
# These created errors when mapping categories to descriptions
'acc-phys': 'Accelerator Physics',
'adap-org': 'Not available',
'q-bio': 'Not available',
'cond-mat': 'Not available',
'chao-dyn': 'Not available',
'patt-sol': 'Not available',
'dg-ga': 'Not available',
'solv-int': 'Not available',
'bayes-an': 'Not available',
'comp-gas': 'Not available',
'alg-geom': 'Not available',
'funct-an': 'Not available',
'q-alg': 'Not available',
'ao-sci': 'Not available',
'atom-ph': 'Atomic Physics',
'chem-ph': 'Chemical Physics',
'plasm-ph': 'Plasma Physics',
'mtrl-th': 'Not available',
'cmp-lg': 'Not available',
'supr-con': 'Not available',
###

# Added
'econ.GN': 'General Economics', 
'econ.TH': 'Theoretical Economics', 
'eess.SY': 'Systems and Control', 
    
'astro-ph': 'Astrophysics',
'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
'astro-ph.EP': 'Earth and Planetary Astrophysics',
'astro-ph.GA': 'Astrophysics of Galaxies',
'astro-ph.HE': 'High Energy Astrophysical Phenomena',
'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
'astro-ph.SR': 'Solar and Stellar Astrophysics',
'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
'cond-mat.mtrl-sci': 'Materials Science',
'cond-mat.other': 'Other Condensed Matter',
'cond-mat.quant-gas': 'Quantum Gases',
'cond-mat.soft': 'Soft Condensed Matter',
'cond-mat.stat-mech': 'Statistical Mechanics',
'cond-mat.str-el': 'Strongly Correlated Electrons',
'cond-mat.supr-con': 'Superconductivity',
'cs.AI': 'Artificial Intelligence',
'cs.AR': 'Hardware Architecture',
'cs.CC': 'Computational Complexity',
'cs.CE': 'Computational Engineering, Finance, and Science',
'cs.CG': 'Computational Geometry',
'cs.CL': 'Computation and Language',
'cs.CR': 'Cryptography and Security',
'cs.CV': 'Computer Vision and Pattern Recognition',
'cs.CY': 'Computers and Society',
'cs.DB': 'Databases',
'cs.DC': 'Distributed, Parallel, and Cluster Computing',
'cs.DL': 'Digital Libraries',
'cs.DM': 'Discrete Mathematics',
'cs.DS': 'Data Structures and Algorithms',
'cs.ET': 'Emerging Technologies',
'cs.FL': 'Formal Languages and Automata Theory',
'cs.GL': 'General Literature',
'cs.GR': 'Graphics',
'cs.GT': 'Computer Science and Game Theory',
'cs.HC': 'Human-Computer Interaction',
'cs.IR': 'Information Retrieval',
'cs.IT': 'Information Theory',
'cs.LG': 'Machine Learning',
'cs.LO': 'Logic in Computer Science',
'cs.MA': 'Multiagent Systems',
'cs.MM': 'Multimedia',
'cs.MS': 'Mathematical Software',
'cs.NA': 'Numerical Analysis',
'cs.NE': 'Neural and Evolutionary Computing',
'cs.NI': 'Networking and Internet Architecture',
'cs.OH': 'Other Computer Science',
'cs.OS': 'Operating Systems',
'cs.PF': 'Performance',
'cs.PL': 'Programming Languages',
'cs.RO': 'Robotics',
'cs.SC': 'Symbolic Computation',
'cs.SD': 'Sound',
'cs.SE': 'Software Engineering',
'cs.SI': 'Social and Information Networks',
'cs.SY': 'Systems and Control',
'econ.EM': 'Econometrics',             
'eess.AS': 'Audio and Speech Processing',
'eess.IV': 'Image and Video Processing',
'eess.SP': 'Signal Processing',               
'gr-qc': 'General Relativity and Quantum Cosmology',
'hep-ex': 'High Energy Physics - Experiment',
'hep-lat': 'High Energy Physics - Lattice',
'hep-ph': 'High Energy Physics - Phenomenology',
'hep-th': 'High Energy Physics - Theory',
'math.AC': 'Commutative Algebra',
'math.AG': 'Algebraic Geometry',
'math.AP': 'Analysis of PDEs',
'math.AT': 'Algebraic Topology',
'math.CA': 'Classical Analysis and ODEs',
'math.CO': 'Combinatorics',
'math.CT': 'Category Theory',
'math.CV': 'Complex Variables',
'math.DG': 'Differential Geometry',
'math.DS': 'Dynamical Systems',
'math.FA': 'Functional Analysis',
'math.GM': 'General Mathematics',
'math.GN': 'General Topology',
'math.GR': 'Group Theory',
'math.GT': 'Geometric Topology',
'math.HO': 'History and Overview',
'math.IT': 'Information Theory',
'math.KT': 'K-Theory and Homology',
'math.LO': 'Logic',
'math.MG': 'Metric Geometry',
'math.MP': 'Mathematical Physics',
'math.NA': 'Numerical Analysis',
'math.NT': 'Number Theory',
'math.OA': 'Operator Algebras',
'math.OC': 'Optimization and Control',
'math.PR': 'Probability',
'math.QA': 'Quantum Algebra',
'math.RA': 'Rings and Algebras',
'math.RT': 'Representation Theory',
'math.SG': 'Symplectic Geometry',
'math.SP': 'Spectral Theory',
'math.ST': 'Statistics Theory',
'math-ph': 'Mathematical Physics',
'nlin.AO': 'Adaptation and Self-Organizing Systems',
'nlin.CD': 'Chaotic Dynamics',
'nlin.CG': 'Cellular Automata and Lattice Gases',
'nlin.PS': 'Pattern Formation and Solitons',
'nlin.SI': 'Exactly Solvable and Integrable Systems',
'nucl-ex': 'Nuclear Experiment',
'nucl-th': 'Nuclear Theory',
'physics.acc-ph': 'Accelerator Physics',
'physics.ao-ph': 'Atmospheric and Oceanic Physics',
'physics.app-ph': 'Applied Physics',
'physics.atm-clus': 'Atomic and Molecular Clusters',
'physics.atom-ph': 'Atomic Physics',
'physics.bio-ph': 'Biological Physics',
'physics.chem-ph': 'Chemical Physics',
'physics.class-ph': 'Classical Physics',
'physics.comp-ph': 'Computational Physics',
'physics.data-an': 'Data Analysis, Statistics and Probability',
'physics.ed-ph': 'Physics Education',
'physics.flu-dyn': 'Fluid Dynamics',
'physics.gen-ph': 'General Physics',
'physics.geo-ph': 'Geophysics',
'physics.hist-ph': 'History and Philosophy of Physics',
'physics.ins-det': 'Instrumentation and Detectors',
'physics.med-ph': 'Medical Physics',
'physics.optics': 'Optics',
'physics.plasm-ph': 'Plasma Physics',
'physics.pop-ph': 'Popular Physics',
'physics.soc-ph': 'Physics and Society',
'physics.space-ph': 'Space Physics',
'q-bio.BM': 'Biomolecules',
'q-bio.CB': 'Cell Behavior',
'q-bio.GN': 'Genomics',
'q-bio.MN': 'Molecular Networks',
'q-bio.NC': 'Neurons and Cognition',
'q-bio.OT': 'Other Quantitative Biology',
'q-bio.PE': 'Populations and Evolution',
'q-bio.QM': 'Quantitative Methods',
'q-bio.SC': 'Subcellular Processes',
'q-bio.TO': 'Tissues and Organs',
'q-fin.CP': 'Computational Finance',
'q-fin.EC': 'Economics',
'q-fin.GN': 'General Finance',
'q-fin.MF': 'Mathematical Finance',
'q-fin.PM': 'Portfolio Management',
'q-fin.PR': 'Pricing of Securities',
'q-fin.RM': 'Risk Management',
'q-fin.ST': 'Statistical Finance',
'q-fin.TR': 'Trading and Market Microstructure',
'quant-ph': 'Quantum Physics',
'stat.AP': 'Applications',
'stat.CO': 'Computation',
'stat.ME': 'Methodology',
'stat.ML': 'Machine Learning',
'stat.OT': 'Other Statistics',
'stat.TH': 'Statistics Theory'
}

In [7]:
cols = ['id', 'authors', 'journal-ref', 'comments', 'title', 'abstract', 'categories']
data = []
topics = ['cs.AI', 'cs.CV', 'cs.IR', 'cs.LG', 'cs.CL', 'cs.MS', 'cs.PL', 'cs.RO', 'cs.SE']
file_name = '/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json'

In [8]:
with open(file_name, encoding='utf-8') as f:
    for line in f:
        doc = json.loads(line)
        lst = [doc['id'], doc['authors'], doc['journal-ref'], doc['comments'],  doc['title'], doc['abstract'], doc['categories']]
        data.append(lst)

df = pd.DataFrame(data=data, columns=cols)

print(df.shape)

df.head()

(2459562, 7)


Unnamed: 0,id,authors,journal-ref,comments,title,abstract,categories
0,704.0001,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...","Phys.Rev.D76:013009,2007","37 pages, 15 figures; published version",Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph
1,704.0002,Ileana Streinu and Louis Theran,,To appear in Graphs and Combinatorics,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG
2,704.0003,Hongjun Pan,,"23 pages, 3 figures",The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph
3,704.0004,David Callan,,11 pages,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO
4,704.0005,Wael Abu-Shammala and Alberto Torchinsky,"Illinois J. Math. 52 (2008) no.2, 681-689",,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA


In [9]:
def get_cat_text(x):
    
    cat_text = ''
    
    # Put the codes into a list
    cat_list = x.split(' ')
    
    for i, item in enumerate(cat_list):
        
        cat_name = category_map[item]
        
        # If there was no description available
        # for the category code then don't include it in the text.
        if cat_name != 'Not available':
            
            if i == 0:
                cat_text = cat_name
            else:
                cat_text = cat_text + ', ' + cat_name
 
    # Remove leading and trailing spaces
    cat_text = cat_text.strip()
    
    return cat_text

In [11]:
filtered_data = df[df['categories'].isin(topics)]

In [12]:
print(filtered_data['journal-ref'].isnull().sum())
print(filtered_data['comments'].isnull().sum())

119170
51046


In [13]:
drop_columns = ['journal-ref', 'comments', 'categories']

In [14]:
filtered_data['category'] = filtered_data['categories'].apply(get_cat_text)
filtered_data = filtered_data.drop(drop_columns, axis=1)
filtered_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['category'] = filtered_data['categories'].apply(get_cat_text)


Unnamed: 0,id,authors,title,abstract,category
1266,704.1267,"Laurence Likforman-Sulem, Abderrazak Zahour, B...",Text Line Segmentation of Historical Documents...,There is a huge amount of historical documen...,Computer Vision and Pattern Recognition
1273,704.1274,David H. Wolpert and Dev G. Rajnarayan,Parametric Learning and Monte Carlo Optimization,This paper uncovers and explores the close r...,Machine Learning
1293,704.1294,"Ahmed Sidky, James Arthur, Shawn Bohner",A Disciplined Approach to Adopting Agile Pract...,Many organizations aspire to adopt agile pro...,Software Engineering
1372,704.1373,"Burgy Laurent (INRIA Futurs), Laurent R\'eveil...",A Language-Based Approach for Improving the Ro...,The secure and robust functioning of a netwo...,Programming Languages
1393,704.1394,"Tarik Hadzic, Rune Moller Jensen, Henrik Reif ...",Calculating Valid Domains for BDD-Based Intera...,In these notes we formally describe the func...,Artificial Intelligence


In [15]:
def clean_text(text):
    
    # Replace newline characters with a space
    clean_text = text.replace("\n", " ")
    
    # Remove leading and trailing spaces
    clean_text = clean_text.strip()
    
    clean_text = re.sub('<[^<]+?>', '', clean_text)
    
    return clean_text

In [16]:
filtered_data['title'] = filtered_data['title'].apply(clean_text)
filtered_data['abstract'] = filtered_data['abstract'].apply(clean_text)
filtered_data['prepared_text'] = filtered_data['title'] + ' \n ' + filtered_data['abstract']

In [17]:
filtered_data.to_csv('prepared_data.csv', index=False)

In [18]:
df.to_csv('full_data.csv', index=False)

# Preprocess

In [4]:
df = pd.read_csv('..\data\interim\prepared_data.csv')
df.head()

  df = pd.read_csv('..\data\interim\prepared_data.csv')


Unnamed: 0,id,authors,versions,title,abstract,category,prepared_text
0,704.1267,"Laurence Likforman-Sulem, Abderrazak Zahour, B...","[{'version': 'v1', 'created': 'Tue, 10 Apr 200...",Text Line Segmentation of Historical Documents...,There is a huge amount of historical documents...,Computer Vision and Pattern Recognition,Text Line Segmentation of Historical Documents...
1,704.1274,David H. Wolpert and Dev G. Rajnarayan,"[{'version': 'v1', 'created': 'Tue, 10 Apr 200...",Parametric Learning and Monte Carlo Optimization,This paper uncovers and explores the close rel...,Machine Learning,Parametric Learning and Monte Carlo Optimizati...
2,704.1294,"Ahmed Sidky, James Arthur, Shawn Bohner","[{'version': 'v1', 'created': 'Tue, 10 Apr 200...",A Disciplined Approach to Adopting Agile Pract...,Many organizations aspire to adopt agile proce...,Software Engineering,A Disciplined Approach to Adopting Agile Pract...
3,704.1373,"Burgy Laurent (INRIA Futurs), Laurent R\'eveil...","[{'version': 'v1', 'created': 'Wed, 11 Apr 200...",A Language-Based Approach for Improving the Ro...,The secure and robust functioning of a network...,Programming Languages,A Language-Based Approach for Improving the Ro...
4,704.1394,"Tarik Hadzic, Rune Moller Jensen, Henrik Reif ...","[{'version': 'v1', 'created': 'Wed, 11 Apr 200...",Calculating Valid Domains for BDD-Based Intera...,In these notes we formally describe the functi...,Artificial Intelligence,Calculating Valid Domains for BDD-Based Intera...


In [37]:
new_col_data = []
target_col =  'versions'
for id, line in df.iterrows():
    input_string = line[target_col]
    json_string = input_string.replace("'", '"')
    json_object = json.loads(json_string)[-1]['created']
    new_col_data.append(json_object)

df['version'] =  new_col_data
df.drop(columns=[target_col], axis=1, inplace=True)
df.head()


Unnamed: 0,id,authors,title,abstract,category,prepared_text,version
0,704.1267,"Laurence Likforman-Sulem, Abderrazak Zahour, B...",Text Line Segmentation of Historical Documents...,There is a huge amount of historical documents...,Computer Vision and Pattern Recognition,Text Line Segmentation of Historical Documents...,"Tue, 10 Apr 2007 16:26:42 GMT"
1,704.1274,David H. Wolpert and Dev G. Rajnarayan,Parametric Learning and Monte Carlo Optimization,This paper uncovers and explores the close rel...,Machine Learning,Parametric Learning and Monte Carlo Optimizati...,"Tue, 10 Apr 2007 17:01:07 GMT"
2,704.1294,"Ahmed Sidky, James Arthur, Shawn Bohner",A Disciplined Approach to Adopting Agile Pract...,Many organizations aspire to adopt agile proce...,Software Engineering,A Disciplined Approach to Adopting Agile Pract...,"Tue, 10 Apr 2007 19:11:51 GMT"
3,704.1373,"Burgy Laurent (INRIA Futurs), Laurent R\'eveil...",A Language-Based Approach for Improving the Ro...,The secure and robust functioning of a network...,Programming Languages,A Language-Based Approach for Improving the Ro...,"Wed, 11 Apr 2007 08:35:32 GMT"
4,704.1394,"Tarik Hadzic, Rune Moller Jensen, Henrik Reif ...",Calculating Valid Domains for BDD-Based Intera...,In these notes we formally describe the functi...,Artificial Intelligence,Calculating Valid Domains for BDD-Based Intera...,"Wed, 11 Apr 2007 10:59:56 GMT"


In [38]:
df.to_csv('..\data\interim\prepared_data.csv', index=False)