# Notebook to create the predictive model for ArXiv papers

In [133]:
# First import the library
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
import nltk
from nltk.corpus import stopwords
import re
import numpy as np

In [134]:
# Seteamos la semiilla para que los resultados sean reproducibles
np.random.seed(42)
seed = 42

In [135]:
# Open de dataset
df = pd.read_csv('data/arxiv_finetune_sample.csv')
df.head(4)

Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,2011.08488,Stefano Zacchiroli,"Stefano Zacchiroli (UP, Inria, DGD-I)",Gender Differences in Public Code Contribution...,,"IEEE Software, Institute of Electrical and Ele...",,,cs.SE,http://arxiv.org/licenses/nonexclusive-distrib...,Gender imbalance in information technology i...,"[{'version': 'v1', 'created': 'Tue, 17 Nov 202...",2020-11-18,"[['Zacchiroli', 'Stefano', '', 'UP, Inria, DGD..."
1,1707.05778,Andr\'es Garc\'ia Medina,"Andr\'es Garc\'ia-Medina, Leonidas Sandoval Ju...",Correlations and Flow of Information between T...,"18 pages, 14 figures",,10.1016/j.physa.2018.02.154,,q-fin.ST physics.soc-ph,http://arxiv.org/licenses/nonexclusive-distrib...,We use Random Matrix Theory (RMT) and inform...,"[{'version': 'v1', 'created': 'Tue, 18 Jul 201...",2018-04-04,"[['García-Medina', 'Andrés', ''], ['Junior', '..."
2,705.2567,Tomas Ortin,Jorge Bellorin and Tomas Ortin,Characterization of all the supersymmetric sol...,Some references and two comments added,"JHEP0708:096,2007",10.1088/1126-6708/2007/08/096,IFT-UAM/CSIC-07-22,hep-th,,We find a complete characterization of all t...,"[{'version': 'v1', 'created': 'Thu, 17 May 200...",2008-11-26,"[['Bellorin', 'Jorge', ''], ['Ortin', 'Tomas',..."
3,812.1352,Kazem Azizi,"R. Khosravi, K. Azizi, N. Ghahramany",Semileptonic $D_{q}\to K_{1}\ell \nu$ and nonl...,"28 Pages, 20 Figures and 9 Tables","Phys.Rev.D79:036004,2009",10.1103/PhysRevD.79.036004,,hep-ph,http://arxiv.org/licenses/nonexclusive-distrib...,We analyze the semileptonic $D_{q}\to K_1 \e...,"[{'version': 'v1', 'created': 'Sun, 7 Dec 2008...",2009-11-06,"[['Khosravi', 'R.', ''], ['Azizi', 'K.', ''], ..."


In [136]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              200 non-null    object
 1   submitter       197 non-null    object
 2   authors         200 non-null    object
 3   title           200 non-null    object
 4   comments        156 non-null    object
 5   journal-ref     73 non-null     object
 6   doi             107 non-null    object
 7   report-no       20 non-null     object
 8   categories      200 non-null    object
 9   license         159 non-null    object
 10  abstract        200 non-null    object
 11  versions        200 non-null    object
 12  update_date     200 non-null    object
 13  authors_parsed  200 non-null    object
dtypes: object(14)
memory usage: 22.0+ KB


In [124]:
# Extract the abstracts and the categories
abstracts = df['abstract'].tolist()
categories_splitted = df['categories'].apply(lambda x: x.split()).tolist()

# If a element in the lists starts with 'cs.' we convert it to 1, otherwise 0
categories_one_hot = [[1 if cat.startswith('cs.') else 0 for cat in cat_list] for cat_list in categories_splitted]
# If a sublist contains a 1, we convert it to 1, otherwise 0
is_computer_science_paper = [1 if 1 in cat_list else 0 for cat_list in categories_one_hot]

In [125]:
print(abstracts[4])
print(is_computer_science_paper[4])

  We use covariant techniques to examine the implications of the dynamical
equivalence between geodesic motions and adiabatic hydrodynamic flows. Assuming
that the metrics of a geodesically and a non-geodesically moving fluid are
conformally related, we calculate and compare their mass densities. The density
difference is then expressed in terms of the fundamental physical quantities of
the fluid, such as its energy and isotropic pressure. Both the relativistic and
the non-relativistic case are examined and their differences identified. Our
analysis suggests that observational determinations of astrophysical masses
based on purely Keplerian motions could underestimate the available amount of
matter.

0


In [126]:
# Remove the stopwords and special characters
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

clean_abstracts = [clean_text(abstract) for abstract in abstracts]

print(clean_abstracts[4])    

use covariant techniques examine implications dynamical equivalence geodesic motions adiabatic hydrodynamic flows assuming metrics geodesically nongeodesically moving fluid conformally related calculate compare mass densities density difference expressed terms fundamental physical quantities fluid energy isotropic pressure relativistic nonrelativistic case examined differences identified analysis suggests observational determinations astrophysical masses based purely keplerian motions could underestimate available amount matter


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [127]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(abstracts)

In [128]:
# División de datos
X_train, X_test, y_train, y_test = train_test_split(X, is_computer_science_paper, test_size=0.2, random_state=seed)

In [129]:
clf = DecisionTreeClassifier(random_state=seed)
clf.fit(X_train, y_train)

In [130]:
y_pred = clf.predict(X_test)

# Calcular métricas
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.78
Precision: 0.29
Recall: 0.33
F1 Score: 0.31


In [131]:
joblib.dump(clf, 'model/pretrained_model.pkl')
joblib.dump(vectorizer, 'model/vectorizer.pkl')

['model/vectorizer.pkl']

In [132]:
# We can now use the model to predict if a new abstract is from a computer science paper
new_abstract = ['This paper presents a new model for classifying computer science papers']
new_abstract_clean = [clean_text(abstract) for abstract in new_abstract]
new_abstract_vectorized = vectorizer.transform(new_abstract_clean)
prediction = clf.predict(new_abstract_vectorized)
print(prediction)

[0]
