# Download Arxiv from Kaggle

In [2]:
!pip install -r requirements.txt

Collecting opendatasets
  Using cached opendatasets-0.1.22-py3-none-any.whl (15 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting redis
  Using cached redis-4.3.4-py3-none-any.whl (246 kB)
Collecting redis-om
  Using cached redis_om-0.1.0-py3-none-any.whl (76 kB)
Collecting kaggle
  Using cached kaggle-1.5.12-py3-none-any.whl
Collecting sentencepiece
  Using cached sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
Collecting transformers<5.0.0,>=4.6.0
  Using cached transformers-4.24.0-py3-none-any.whl (5.5 MB)
Collecting nltk
  Using cached nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting huggingface-hub>=0.4.0
  Using cached huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
Collecting deprecated>=1.2.3
  Using cached Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting async-timeout>=4.0.2
  Using cached async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting aioredis<3.0.0,>=2.0.0
  Usi

In [3]:
import opendatasets as od

od.download("https://www.kaggle.com/datasets/Cornell-University/arxiv")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:

  abdullahiahmed


Your Kaggle Key:

  ································


Downloading arxiv.zip to ./arxiv


100%|██████████| 1.09G/1.09G [00:13<00:00, 85.5MB/s]





# Clean The Data

In [4]:
import json
import pandas as pd
import os
import re
import string


DATA_PATH = "arxiv-metadata-oai-snapshot.json"
YEAR_CUTOFF = 2012
YEAR_PATTERN = r"(19|20[0-9]{2})"
ML_CATEGORY = "cs.LG"

In [5]:
def process(paper: dict):
    paper = json.loads(paper)
    if paper['journal-ref']:
        years = [int(year) for year in re.findall(YEAR_PATTERN, paper['journal-ref'])]
        years = [year for year in years if (year <= 2022 and year >= 1991)]
        year = min(years) if years else None
    else:
        year = None
    return {
        'id': paper['id'],
        'title': paper['title'],
        'year': year,
        'authors': paper['authors'],
        'categories': ','.join(paper['categories'].split(' ')),
        'abstract': paper['abstract']
    }

def papers():
    with open(DATA_PATH, 'r') as f:
        for paper in f:
            paper = process(paper)
            if paper['year']:
                if paper['year'] >= YEAR_CUTOFF and ML_CATEGORY in paper['categories']:
                    yield paper

In [7]:
df = pd.DataFrame(papers())
len(df)

11361

In [8]:
df.head(10)

Unnamed: 0,id,title,year,authors,categories,abstract
0,705.4485,Mixed membership stochastic blockmodels,2014,"Edoardo M Airoldi, David M Blei, Stephen E Fie...","stat.ME,cs.LG,math.ST,physics.soc-ph,stat.ML,s...",Observations consisting of measurements on r...
1,808.3231,Multi-Instance Multi-Label Learning,2012,"Zhi-Hua Zhou, Min-Ling Zhang, Sheng-Jun Huang,...","cs.LG,cs.AI","In this paper, we propose the MIML (Multi-In..."
2,811.4413,A Spectral Algorithm for Learning Hidden Marko...,2012,"Daniel Hsu, Sham M. Kakade, Tong Zhang","cs.LG,cs.AI",Hidden Markov Models (HMMs) are one of the m...
3,903.4817,An Exponential Lower Bound on the Complexity o...,2012,"Bernd G\""artner, Martin Jaggi and Cl\'ement Maria","cs.LG,cs.CG,cs.CV,math.OC,stat.ML",For a variety of regularized optimization pr...
4,909.5175,Bounding the Sensitivity of Polynomial Thresho...,2013,"Prahladh Harsha, Adam Klivans, Raghu Meka","cs.CC,cs.LG",We give the first non-trivial upper bounds o...
5,912.4884,An Invariance Principle for Polytopes,2012,"Prahladh Harsha, Adam Klivans and Raghu Meka","cs.CC,cs.CG,cs.DM,cs.LG,math.PR","Let X be randomly chosen from {-1,1}^n, and ..."
6,1004.4668,Evolutionary Inference for Function-valued Tra...,2012,Nick S. Jones and John Moriarty,"q-bio.QM,cs.LG,physics.data-an,stat.ML",Biological data objects often have both of t...
7,1005.4717,Smoothing proximal gradient method for general...,2012,"Xi Chen, Qihang Lin, Seyoung Kim, Jaime G. Car...","stat.ML,cs.LG,math.OC,stat.AP,stat.CO",We study the problem of estimating high-dime...
8,1005.5141,On Recursive Edit Distance Kernels with Applic...,2014,"Pierre-Fran\c{c}ois Marteau (IRISA), Sylvie Gi...","cs.LG,cs.IR",This paper proposes some extensions to the w...
9,1006.2513,On the Achievability of Cram\'er-Rao Bound In ...,2012,"Rad Niazadeh, Masoud Babaie-Zadeh and Christia...","cs.IT,cs.LG,math.IT","Recently, it has been proved in Babadi et al..."


In [9]:
# Avg length of the abstracts
df.abstract.apply(lambda a: len(a.split())).mean()

169.8317049555497

In [10]:
def clean_description(description: str):
    if not description:
        return ""
    # remove unicode characters
    description = description.encode('ascii', 'ignore').decode()

    # remove punctuation
    description = re.sub('[%s]' % re.escape(string.punctuation), ' ', description)

    # clean up the spacing
    description = re.sub('\s{2,}', " ", description)

    # remove urls
    #description = re.sub("https*\S+", " ", description)

    # remove newlines
    description = description.replace("\n", " ")

    # remove all numbers
    #description = re.sub('\w*\d+\w*', '', description)

    # split on capitalized words
    description = " ".join(re.split('(?=[A-Z])', description))

    # clean up the spacing again
    description = re.sub('\s{2,}', " ", description)

    # make all words lowercase
    description = description.lower()

    return description

# Generate Embeddings

In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

In [12]:
# Create embeddings from the title and abstract
emb = model.encode(df.apply(lambda r: clean_description(r['title'] + ' ' + r['abstract']), axis=1).tolist())

In [13]:
len(emb)

11361

In [14]:
# Add embeddings to df
df = df.reset_index().drop('index', axis=1)
df['vector'] = emb.tolist()

In [15]:
df['vector'].head()

0    [0.010859647765755653, 0.08288727700710297, -0...
1    [-0.018738871440291405, 0.009955745190382004, ...
2    [-0.007170626427978277, 0.05301477387547493, -...
3    [-0.021994853392243385, 0.019899088889360428, ...
4    [-0.08850026875734329, 0.03196108341217041, -0...
Name: vector, dtype: object

In [16]:
import pickle

# Export to file!
with open('arxiv_embeddings_10000.pkl', 'wb') as f:
    data = pickle.dumps(df)
    f.write(data)

# Load & Index Data in a redis

In [14]:
import typing as t
import asyncio
import numpy as np
import pickle
import redis.asyncio as redis
from redis.commands.search.query import Query
from redis.commands.search.field import TagField
from redis.commands.search.field import VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType


In [15]:
REDIS_URL = f"redis://:ICe9oFTnhXr3s8oVz8CB1ljNu7jHZQiH@redis-16644.c282.east-us-mz.azure.cloud.redislabs.com:16644/arvix-dataset"

In [16]:
def read_paper_df() -> t.List:
    with open("arxiv_embeddings_10000.pkl", "rb") as f:
        df = pickle.load(f)
    return df

In [17]:
async def load_vectors(n, redis_conn, *papers):
    semaphore = asyncio.Semaphore(n)
    async def load_paper(paper: dict):
        async with semaphore:
            # Prep the input dictionary for Redis storage
            key = "paper_vector:" + paper['id']
            paper['paper_id'] = paper.pop('id')
            paper['vector'] = np.array(paper['vector'], dtype=np.float32).tobytes()
            paper['categories'] = paper['categories'].replace(",", "|")
            # Store in Redis
            await redis_conn.hset(key, mapping=paper)
    # Gather results with concurrency
    await asyncio.gather(*[load_paper(p) for p in papers])

### Load and Index ArXiv Dataset

In [18]:
INDEX_NAME = "index"

In [72]:
async def create_hnsw(
    self,
    *fields,
    redis_conn: redis,
    number_of_vectors: int,
    prefix: str,
    distance_metric: str='COSINE'
):
    vector_field = VectorField(
        "vector",
        "HNSW", {
            "TYPE": "FLOAT32",
            "DIM": 768,
            "DISTANCE_METRIC": distance_metric,
            "INITIAL_CAP": number_of_vectors,
        }
    )
    await self._create(
        *fields,
        vector_field,
        redis_conn=redis_conn,
        prefix=prefix
    )

async def _create(
    self,
    *fields,
    redis_conn: redis,
    prefix: str
):
    # Create Index
    await redis_conn.ft(INDEX_NAME).create_index(
        fields = fields,
        definition= IndexDefinition(prefix=[prefix], index_type=IndexType.HASH)
    )

In [21]:
# Redis Connection
redis_conn = redis.from_url(REDIS_URL)

# Schema
categories_field = TagField("categories", separator = "|")
year_field = TagField("year", separator = "|")


# Load papers
if await redis_conn.dbsize() > 500:
    print("Papers already loaded")
else:
    papers = read_paper_df()
    papers = papers.to_dict("records")
    await load_vectors(200, redis_conn, *papers)
    
    vector_field = VectorField(
    "vector",
    "HNSW", {
        "TYPE": "FLOAT32",
        "DIM": 768,
        "DISTANCE_METRIC": "IP",
        "INITIAL_CAP": len(papers),
    }
)
    await redis_conn.ft(INDEX_NAME).create_index(
        fields = [year_field, categories_field, vector_field],
        definition= IndexDefinition(prefix=["paper_vector:"],
                                    index_type=IndexType.HASH)
    )



Papers already loaded


# Query The Top 5 papers

In [23]:
topK = 5

Search_query = 'Best feature store practices and techniques for machine learning models'
query_vector = model.encode(Search_query).astype(np.float32).tobytes()

query = Query(f'*=>[KNN {topK} @vector $vec_param AS vector_score]').sort_by("vector_score").paging(0, topK).return_fields("paper_id", "title", "vector_score").dialect(2)
query_param = {"vec_param": query_vector}

results = await redis_conn.ft(INDEX_NAME).search(query, query_params = query_param)

#print Similar paper found
for p in results.docs:
    print ('paper_id = ' + p.paper_id)
    print ('Score = ' + p.vector_score)
    print ('title = ' + p.title)


paper_id = 2108.05053
Score = 0.366067886353
title = Managing ML Pipelines: Feature Stores and the Coming Wave of Embedding
  Ecosystems
paper_id = 1701.07852
Score = 0.391455054283
title = An Empirical Analysis of Feature Engineering for Predictive Modeling
paper_id = 1906.10366
Score = 0.391716659069
title = Software Engineering Practices for Machine Learning
paper_id = 2107.13821
Score = 0.42319393158
title = Concept for a Technical Infrastructure for Management of Predictive
  Models in Industrial Applications
paper_id = 2103.14539
Score = 0.461458265781
title = FeatureEnVi: Visual Analytics for Feature Engineering Using Stepwise
  Selection and Semi-Automatic Extraction Approaches
