In [7]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

## Data preparation

In [2]:
# Exploring categories
# with open("/mnt/HDD/tmp/arxiv-metadata-oai-snapshot.json", "r") as dataset_file:
#     dataset = []
#     for line in tqdm(dataset_file):
#         dataset.append(json.loads(line)['categories'])

Selected categories:

stat.ML: Machine Learning
cs.LG: Machine Learning
cs.CL: Computation and Language
cs.CV: Computer Vision and Pattern Recognition
cs.AI: Artificial Intelligence
~~cond-mat.dis-nn: Disordered Systems and Neural Networks~~

In [3]:
# ml_categories = ["stat.ML", "cs.AI", "cs.CL", "cs.CV", "cs.LG"]
# print(f"all: {len([0 for cat_str in dataset if np.all([category in cat_str for category in ml_categories])])}")

In [4]:
# any: 253542
# all: 19
# {"stat.ML": 57543,
#  "cs.AI": 57379,
#  "cs.CL": 41573,
#  "cs.CV": 88800,
#  "cs.LG": 129563} = 374858 total category markers per 253542 articles => every article has about 1.5 relevant categories

In [5]:
# ml_categories = ["stat.ML", "cs.AI", "cs.CL", "cs.CV", "cs.LG"]
# with open("/mnt/HDD/tmp/arxiv-metadata-oai-snapshot.json", "r") as dataset_file:
#     with open("/mnt/HDD/tmp/arxiv-metadata-oai-snapshot-only-ml-categories.json", "w") as only_ml_file:
#         for line in tqdm(dataset_file):
#             cat_str = json.loads(line)['categories']
#             if np.any([category in cat_str for category in ml_categories]):
#                 only_ml_file.write(line)

In [8]:
# Read the ML-only subset of the arXiv metadata snapshot: one JSON object per line.
with open("/mnt/HDD/tmp/arxiv-metadata-oai-snapshot-only-ml-categories.json", "r") as dataset_file:
    # Parse lazily, line by line; tqdm reports how many lines have been consumed.
    parsed_lines = map(json.loads, tqdm(dataset_file))
    # The iterator is consumed here, while the file is still open.
    df = pd.DataFrame.from_records(parsed_lines)

253542it [00:10, 25162.64it/s]


In [6]:
# Show which metadata fields were loaded for each article.
df.columns

Index(['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed'],
      dtype='object')

In [0]:
# Fetch the `dependency_version_fix` branch of the easy-elasticsearch fork into ./easy-elasticsearch.
!git clone -b dependency_version_fix https://github.com/ArtemPt239/easy-elasticsearch

In [None]:
!pip install -e easy-elasticsearch

In [9]:
# NOTE: building the index with this class starts a Docker container running Elasticsearch on your system.
# Don't forget to remove the container once you have finished.
from benchmarking import BaseSearchEngine
import re
from easy_elasticsearch import ElasticSearchBM25
from nltk.corpus import stopwords


class BM25Search(BaseSearchEngine):
    """BM25 search engine built on ``ElasticSearchBM25`` from easy_elasticsearch.

    Documents and queries are both passed through :meth:`preprocess`, so they
    are normalised the same way before indexing and before retrieval.
    """

    # One-pass character table: '/' becomes a space, every character in the
    # third argument is deleted. Equivalent to the former sequence of two
    # regex substitutions plus an apostrophe removal, without the invalid
    # escape sequences the non-raw regex literal contained.
    _CHAR_TABLE = str.maketrans("/", " ", "():,;.’”“?%><'")

    def __init__(self):
        # Populated by index(); None means no index has been built yet.
        self.esbm25 = None
        # Stopword set, loaded lazily on the first preprocess() call.
        self._stopwords = None

    def _get_stopwords(self):
        """Return the NLTK English stopword set, loading it only once per instance."""
        cached = getattr(self, "_stopwords", None)
        if cached is None:
            # Loading the corpus is loop-invariant; doing it per document was wasted work.
            cached = frozenset(stopwords.words('english'))
            self._stopwords = cached
        return cached

    def preprocess(self, text: str) -> str:
        """Strip punctuation and stopwords from ``text``.

        Removes the characters ``( ) : , ; . ’ ” “ ? % > < '``, replaces ``/``
        with a space, splits on whitespace and drops every token found in the
        NLTK English stopword list. Matching is case-sensitive (tokens are not
        lower-cased before the lookup).

        Returns:
            The remaining tokens joined by single spaces.
        """
        cleaned = text.translate(self._CHAR_TABLE)
        stop = self._get_stopwords()
        return ' '.join(word for word in cleaned.split() if word not in stop)

    def index(self, dataset: pd.DataFrame, ids_column_name: str = 'id', abstract_column_name: str = 'abstract'):
        """Preprocess every document in ``dataset`` and build the BM25 index.

        Args:
            dataset: Frame holding one document per row.
            ids_column_name: Column whose values become the document ids.
            abstract_column_name: Column holding the text to index.
        """
        ids = dataset[ids_column_name]
        texts = dataset[abstract_column_name]
        # total= lets tqdm show a percentage/ETA instead of a bare counter.
        progress = tqdm(zip(ids, texts), total=len(dataset))
        pool = {_id: self.preprocess(text) for _id, text in progress}
        self.esbm25 = ElasticSearchBM25(pool)

    def retrieve(self, query: str, amount: int):
        """Return the top ``amount`` results for ``query`` from the BM25 index.

        The query is preprocessed the same way as the indexed documents. The
        return value is whatever ``ElasticSearchBM25.query`` returns.

        Raises:
            RuntimeError: If called before :meth:`index`.
        """
        if self.esbm25 is None:
            # Clearer than the AttributeError on None that would otherwise surface.
            raise RuntimeError("BM25Search.retrieve() called before index(); build the index first.")
        return self.esbm25.query(self.preprocess(query), topk=amount)

    # def retrieve_scores(self, query: str, amount: int):
    #     # return self.esbm25.score(self.preprocess(query), [_id for _id in self.retrieve(query, amount)])
    #     pass

In [10]:
%%time #this benchmark might be unrepresentative because colbert in background
searcher = BM25Search()
searcher.index(df)

253542it [01:18, 3231.91it/s]
2023-02-13 14:29:22 - No host running. Now start a new ES service via docker

Usage:  docker [OPTIONS] COMMAND

A self-sufficient runtime for containers

Options:
      --config string      Location of client config files (default
                           "/home/hououin/.docker")
  -c, --context string     Name of the context to use to connect to the
                           daemon (overrides DOCKER_HOST env var and
                           default context set with "docker context use")
  -D, --debug              Enable debug mode
  -H, --host list          Daemon socket(s) to connect to
  -l, --log-level string   Set the logging level
                           ("debug"|"info"|"warn"|"error"|"fatal")
                           (default "info")
      --tls                Use TLS; implied by --tlsverify
      --tlscacert string   Trust certs signed only by this CA (default
                           "/home/hououin/.docker/ca.pem")
      --tlscert stri

dfda5d8ecf22363ca5cbcedf6e9084f74fc73334b77bbd7368a9d284449789c6


2023-02-13 14:29:24 - Waiting for the ES service to be well started. Maximum time waiting: 100s
 27%|██▋       | 27/100 [00:27<01:14,  1.03s/it]
2023-02-13 14:29:51 - Successfully started a ES container with name "easy-elasticsearch-node1676284162"
2023-02-13 14:29:51 - Successfully built connection to ES service at http://localhost:9200
2023-02-13 14:29:51 - No index found and now do indexing
100%|██████████| 508/508 [01:11<00:00,  7.13it/s]
2023-02-13 14:31:10 - Indexing work done: 253542 documents indexed
2023-02-13 14:31:10 - All set up.


CPU times: user 1min 25s, sys: 4.49 s, total: 1min 30s
Wall time: 3min 7s


In [11]:
from benchmarking import benchmark

In [12]:
%%time
# Run the project benchmark for the BM25 searcher over df and time it.
# "results/bm25" is presumably the output location — confirm in benchmarking.benchmark.
benchmark(df, searcher, "results/bm25")

CPU times: user 33.5 s, sys: 77.8 ms, total: 33.6 s
Wall time: 38.3 s
