# Getting Started

Load papers and do some preprocessing

In [None]:
! pip install -r requirements.txt

In [None]:
import asyncio
import pickle
import pandas as pd
import numpy as np

from redis.asyncio import Redis
from utils.embeddings import Embeddings

In [None]:
# Connect to the redis instance running in your docker stack at redis:6379
redis_conn = await Redis(host='redis', port='6379')

In [None]:
# Load papers dataframe
def read_paper_df() -> pd.DataFrame:
    """Load the pickled papers dataframe from ``arxiv_papers_df.pkl``.

    The file name is relative, so it is resolved against the current
    working directory.

    Returns:
        The object stored in the pickle (expected to be a DataFrame).
    """
    with open("arxiv_papers_df.pkl", "rb") as pickle_file:
        return pickle.load(pickle_file)

def paper_key(paper_id: str) -> str:
    """Return the Redis key for a paper: the id prefixed with ``paper:``."""
    key = f'paper:{paper_id}'
    return key

# Function to concurrently load papers into Redis
async def gather_with_concurrency(n, redis_conn, *papers):
    """Write every paper to Redis as a hash, with at most ``n`` writes at once.

    Args:
        n: Maximum number of ``hset`` calls allowed in flight concurrently.
        redis_conn: Async Redis client; its ``hset`` coroutine is awaited.
        *papers: Paper records as dicts. Each needs an ``'id'`` (turned into
            the key by ``paper_key``) and a ``'vector'`` that can be converted
            to a float32 NumPy array.

    Returns:
        None. An exception raised by any write propagates to the caller.
    """
    semaphore = asyncio.Semaphore(n)

    async def load_paper(paper):
        async with semaphore:
            # Build a new mapping instead of overwriting paper['vector'] in
            # place: the caller's records keep their original vectors, so the
            # load can be re-run without trying to convert bytes a second time.
            record = {
                **paper,
                'vector': np.array(paper['vector'], dtype=np.float32).tobytes(),
            }
            await redis_conn.hset(paper_key(paper['id']), mapping=record)

    # gather with concurrency
    await asyncio.gather(*(load_paper(p) for p in papers))


In [None]:
# Investigate: load the papers dataframe and preview its first rows
df = read_paper_df()
df.head()

In [None]:
# Text to embed: title and abstract joined with a space, so the last word of
# the title does not run into the first word of the abstract.
# Column-wise concatenation replaces the slower row-by-row apply.
# NOTE(review): assumes both columns hold strings with no missing values — confirm
df['input'] = df['title'] + ' ' + df['abstract']
df.reset_index(drop=True, inplace=True)

In [None]:
# For this demo we will take a small sample: a random 10% of the papers
df = df.sample(frac=0.1)
# Number of papers kept
df.shape[0]

## Embedding Creation

To create embeddings (vector representations) of the papers, we combine each paper's title and abstract and pass the result through an open source `SentenceTransformer` model (after some light preprocessing).

Everything is wrapped in the `Embeddings` class (imported above) and the `gather_with_concurrency` function (defined above) to keep the cells below clean.

In [None]:
# Create Embeddings from the combined title/abstract text of each paper
embeddings = Embeddings()
input_texts = df['input'].tolist()
vectors = embeddings.make(input_texts, show_progress=True)

In [None]:
# Store each embedding next to its paper in a new 'vector' column.
# NOTE(review): assumes `vectors` has one entry per dataframe row, in the same
# order as df['input'] — confirm against Embeddings.make
df['vector'] = vectors.tolist()

In [None]:
# Turn the dataframe into a list with one dict per paper (column -> value)
papers = df.to_dict(orient='records')

In [None]:
# Load papers to Redis, allowing at most 50 writes in flight at a time
await gather_with_concurrency(50, redis_conn, *papers)

In [None]:
# Check how many items were stored
# (dbsize reports the number of keys in the current Redis database, so any
# keys that were already present are counted as well)
await redis_conn.dbsize()

In [None]:
# Check a paper
key = paper_key(df.sample(1)['id'].iloc[0])
await redis_conn.hgetall(key)

## RediSearch Index Creation

Now it's time to create the search index.



In [None]:
from redis.commands.search.field import TagField
from utils.search_index import SearchIndex

# Helper object used below to create the index.
# NOTE(review): 'papers' is presumably the index name — confirm in utils.search_index
search_index = SearchIndex('papers')

In [None]:
categories_field = TagField("categories")
year_field = TagField("year")

await search_index.create_flat(
    categories_field,
    year_field,
    redis_conn=redis_conn,
    number_of_vectors=len(papers),
    prefix="paper:",
    distance_metric="IP",
)
