In [1]:
# Import all libraries
import scispacy
import spacy
import json
import pandas as pd
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Load the sciSpacy model
nlp = spacy.load("en_core_sci_sm")

In [3]:
# Load all articles from the specified category
articles = []
category = 'nucl-ex'
with open("arxiv-metadata-oai-snapshot.json", "r") as f:
    for l in f:
        d = json.loads(l)
        if category in d['categories'].split(' '):
            articles.append(d)
df = pd.DataFrame.from_records(articles)

In [4]:
# Clean the abstracts and prepare for Spacy pipe
df['clean_abstract'] = [x.replace('\n',' ').strip() for x in df['abstract']]
df['tuple_input'] = [(row['clean_abstract'], row['id']) for _,row in df.iterrows()]

In [5]:
# SciSpacy NER extraction
results = []
for doc, context in nlp.pipe(df['tuple_input'].to_list(), as_tuples=True):
    results.append({'id':context, 'entities':doc.ents})

In [6]:
stemmer = SnowballStemmer("english")

In [7]:
# Clean the results by checking if token is longer than 1 character and consists of alphabetic characters
# Stem the filtered tokens
exploded = []
for row in results:
    for el in row['entities']:
        if el.text.isalpha() and len(el.text) > 1:
            exploded.append({'id':row['id'], 'entity':" ".join([stemmer.stem(word) for word in el.text.split(' ')])})
results_df = pd.DataFrame.from_records(exploded)

In [10]:
results_df.head()

Unnamed: 0,id,entity
0,704.0075,sector
1,704.0075,heavi
2,704.0075,baryon
3,704.0075,year
4,704.0075,decay


In [16]:
# Get the count of entities
counts = results_df['entity'].value_counts().rename_axis('unique_values').reset_index(name='counts')
counts.head(30)

Unnamed: 0,unique_values,counts
0,measur,13382
1,data,9518
2,collis,8215
3,result,6153
4,experi,5909
5,product,5622
6,experiment,4962
7,energi,4373
8,calcul,4129
9,nuclei,4060


In [21]:
# Remove words that appear fewer than 5 times or more than 4500
remove_words = counts[(counts['counts'] < 5) | (counts['counts'] > 4500)]['unique_values'].to_list()
final_df = results_df.drop(results_df[results_df['entity'].isin(remove_words)].index)

In [25]:
# Check the results
final_df['entity'].value_counts()

energi       4373
calcul       4129
nuclei       4060
interact     3829
studi        3730
             ... 
crustal         5
scd             5
jedi            5
lbne            5
inventori       5
Name: entity, Length: 3695, dtype: int64

In [27]:
final_df.to_csv('nucl-ex-entities.csv', index=False)