In [1]:
import joblib

# Load the two serialized artifacts: the model (`clf`) first, then the vectorizer.
clf, vectorizer = (
    joblib.load(artifact_path)
    for artifact_path in ('model/pretrained_model.pkl', 'model/vectorizer.pkl')
)

In [2]:
import nltk
from nltk.corpus import stopwords

# Build the set of English stop words that clean_text() filters out.
# The corpus is only downloaded when it is not already installed locally,
# so re-running the notebook does not contact the download server each time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import re

def clean_text(text, stopwords_set=None):
    """Lower-case a text, strip punctuation and drop stop words.

    Args:
        text: The raw text to clean. ``None`` and float NaN (what pandas
            yields for a missing CSV cell) are treated as empty text; any
            other non-string value is converted with ``str()``.
        stopwords_set: Optional collection of words to remove. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces ('' when nothing is left).
    """
    # Missing values have no .lower(); map them to empty text instead of raising.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if stopwords_set is None:
        stopwords_set = stop_words

    text = str(text).lower()
    # Remove every character that is neither a word character nor whitespace.
    # Note this also joins hyphenated words ("two-state" -> "twostate").
    text = re.sub(r'[^\w\s]', '', text)
    return ' '.join(word for word in text.split() if word not in stopwords_set)

In [4]:
# Load the arXiv test sample; abstracts are the inputs and categories the labels (no train/test split is performed in this cell)
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the sample CSV, then pull out the text column and the category column.
df = pd.read_csv('data/arxiv_test_sample.csv')
X, y = df['abstract'], df['categories']

In [5]:
# Each entry of y is a whitespace-separated string of category tags.
# A paper gets label 1 when at least one tag starts with 'cs.', otherwise 0.
y_test = [category_field.split() for category_field in y]
categories_one_hot = [
    [int(tag.startswith('cs.')) for tag in tags]
    for tags in y_test
]
is_computer_science_paper = [int(1 in flags) for flags in categories_one_hot]

In [6]:
# Run clean_text over every abstract, then display the cleaned Series.
X_test_clean = X.map(clean_text)
X_test_clean

0      rise mobile devices abundant sensory data loca...
1      generative foundation models susceptible impli...
2      simplest quantum composite body hydrogen atom ...
3      basic concept twostate vector formalism time s...
4      results longterm investigations variation coba...
                             ...                        
995    scalability single qubit operations multiqubit...
996    weak convergence stochastic evolutionary syste...
997    highlystrained bifeo3 exhibits tetragonallike ...
998    deep generative models demonstrated problemati...
999    order satisfy current fcnc cp violation bounds...
Name: abstract, Length: 1000, dtype: object

In [7]:
# Vectorize the cleaned text with the vectorizer loaded from disk
# (transform only -- nothing is fitted in this cell).
X_test_vect = vectorizer.transform(X_test_clean)

In [8]:
# Run the prediction and compute the accuracy
from sklearn.metrics import accuracy_score

# Predicted label for every vectorized abstract.
y_predicted = clf.predict(X_test_vect)

# Score the predictions against the 0/1 labels derived from the categories column.
accuracy = accuracy_score(y_true=is_computer_science_paper, y_pred=y_predicted)
print(f'Accuracy: {accuracy}')

Accuracy: 0.765


In [9]:
# Save the cleaned abstracts next to the model's predictions.
# Note: the 'is_computer_science_paper' column written here holds the
# predicted labels (y_predicted), not the ground-truth list of the same name.
df_pred = pd.DataFrame(
    data={
        'abstract': X_test_clean,
        'is_computer_science_paper': y_predicted,
    }
)
df_pred.to_csv(path_or_buf='data/test_data.csv', index=False)

In [10]:
# Reload the predictions file written by the previous cell and display it.
# This rebinds df_pred to the copy read back from disk.
df_pred = pd.read_csv('data/test_data.csv')
df_pred

Unnamed: 0,abstract,is_computer_science_paper
0,rise mobile devices abundant sensory data loca...,0
1,generative foundation models susceptible impli...,0
2,simplest quantum composite body hydrogen atom ...,0
3,basic concept twostate vector formalism time s...,0
4,results longterm investigations variation coba...,0
...,...,...
995,scalability single qubit operations multiqubit...,1
996,weak convergence stochastic evolutionary syste...,0
997,highlystrained bifeo3 exhibits tetragonallike ...,0
998,deep generative models demonstrated problemati...,1
