# Getting Started

Load papers and do some preprocessing

In [None]:
! pip install -r requirements.txt

In [None]:
import asyncio
import pickle
import pandas as pd
import numpy as np

from redis.asyncio import Redis
from utils.embeddings import Embeddings

In [None]:
# Connect to the Redis instance running in your Docker stack at redis:6379.
# The async client is awaited directly at the top level of the notebook cell.
redis_conn = await Redis(host="redis", port="6379")

In [None]:
# Load the pickled papers dataframe from disk
def read_paper_df() -> pd.DataFrame:
    """Return the object unpickled from ``arxiv_papers_df.pkl``.

    The file is opened relative to the current working directory.

    NOTE: unpickling can execute arbitrary code, so only load a pickle file
    that comes from a source you trust.
    """
    with open("arxiv_papers_df.pkl", "rb") as pickle_file:
        return pickle.load(pickle_file)

def paper_key(paper_id: str) -> str:
    """Build the Redis key for a paper: its id prefixed with ``paper:``."""
    return "paper:{}".format(paper_id)

# Function to concurrently load papers into Redis
async def gather_with_concurrency(n, redis_conn, *papers):
    """Write every paper to Redis as a hash, with at most ``n`` writes in flight.

    Args:
        n: Maximum number of ``hset`` calls allowed to run concurrently.
        redis_conn: Async Redis client; only its awaitable ``hset`` is used.
        *papers: Paper dicts. Each needs an ``"id"`` (turned into the key by
            ``paper_key``) and a ``"vector"`` that NumPy can convert to a
            float32 array.

    The caller's dicts are not modified: the byte-encoded vector goes into a
    separate mapping, so the same records can be loaded again later without
    trying to re-encode a vector that has already been turned into bytes.
    """
    semaphore = asyncio.Semaphore(n)

    async def load_paper(paper):
        async with semaphore:
            # The hash stores the vector as raw float32 bytes. Overriding the
            # key in a copy keeps the field order and leaves `paper` intact.
            mapping = {
                **paper,
                "vector": np.array(paper["vector"], dtype=np.float32).tobytes(),
            }
            await redis_conn.hset(paper_key(paper["id"]), mapping=mapping)

    # Schedule all writes; the semaphore caps how many run at the same time
    await asyncio.gather(*(load_paper(p) for p in papers))


In [None]:
# Investigate: load the papers dataframe and preview its first rows
df = read_paper_df()
df.head()

In [None]:
# Text to embed: title and abstract joined with a space, so the last word of
# the title does not fuse with the first word of the abstract. Column-wise
# string concatenation avoids a Python-level call per row.
df["input"] = df["title"] + " " + df["abstract"]
# Replace the existing index with a fresh 0..n-1 range
df.reset_index(drop=True, inplace=True)

In [None]:
# For this demo we will take a small sample: a random 10% of the rows
df = df.sample(frac=0.1)
len(df)

## Embedding Creation

To create embeddings (vector representations) of the papers, we will combine the abstract and title fields of each paper and pass the result through an open-source `SentenceTransformer` model (after some light preprocessing).

Everything is wrapped into the `Embeddings` class (imported above) and the `gather_with_concurrency` function (defined above), both used in the cells below, to help make this cleaner.

In [None]:
# Create Embeddings from the combined title/abstract text of every sampled paper
# (the vectors are assumed to come back in row order, since they are assigned
# to the dataframe in the next cell)
embeddings = Embeddings()
vectors = embeddings.make(df.input.to_list(), show_progress=True)

In [None]:
# Store each embedding on its paper's row
# (assumes vectors.tolist() yields one entry per row — TODO confirm)
df["vector"] = vectors.tolist()

In [None]:
# Convert the dataframe to a list of dicts, one per row
papers = df.to_dict("records")

In [None]:
# Load papers to Redis, with at most 50 writes in flight at once
await gather_with_concurrency(50, redis_conn, *papers)

In [None]:
# Check how many keys the database holds now that the papers are loaded
await redis_conn.dbsize()

In [None]:
# Check a paper: fetch the stored hash for one randomly sampled paper id
key = paper_key(df.sample(1)["id"].iloc[0])
await redis_conn.hgetall(key)

## RediSearch Index Creation

Now it is time to create the search index over all of the documents we have stored in Redis.



In [None]:
from redis.commands.search.field import (
    TagField,
    VectorField,
    NumericField,
    TextField
)
from utils.search_index import SearchIndex

# Search index helper class, bound to the Redis connection opened earlier
# (the first argument is presumably the index name — confirm in utils.search_index)
search_index = SearchIndex("papers", redis_conn)

In [None]:
# Create a document schema that includes 3 indexed fields
# --> vector, categories, and year

# vector_field = VectorField(
#     "vector",
#     "HNSW", {
#         "TYPE": "FLOAT32",
#         "DIM": 768,
#         "DISTANCE_METRIC": "COSINE",
#         "INITIAL_CAP": len(papers),
#     }
# )

vector_field = VectorField(
    "vector",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": 768,
        "DISTANCE_METRIC": "IP",
        "INITIAL_CAP": len(papers),
        "BLOCK_SIZE": len(papers)
    }
)
categories_field = TagField("categories")
year_field = TagField("year")

# Create the index with the schema and over documents containing the prefix "paper:"
await search_index.create(
    categories_field,
    year_field,
    vector_field,
    prefix="paper:"
)


## Test queries!

Use the [`running_queries.ipynb`](running_queries.ipynb) notebook to test out queries.

## Insurance Claims Example

The above code works for the demo dataset of arXiv papers! Extending the example to another use case, we can see how this might work for insurance claims and policy data.

In [None]:
# First delete the other dataset from Redis...
# NOTE(review): whether delete() also removes the stored paper hashes or only
# the search index depends on the SearchIndex helper — confirm.
await search_index.delete()

In [None]:
# Claims Schema
# Here we assume some insurance claims might have a schema like this:

# Search index helper for the claims data, on the same Redis connection
claims_index = SearchIndex("claims", redis_conn)

# Sample dummy data
claims = [{
    "claims_id": "1235",
    "customer_id": "5341345",
    "timestamp_of_incident": 1665765963, # Epoch timestamp
    "timestamp_of_submission": 1666716363, # Epoch timestamp
    "claim_description": "This includes written text that describes the incident from the customer's POV",
    # Random stand-in for a text embedding. Its length has to equal the DIM (768)
    # declared for the "text_vector" field of the claims index; a vector of any
    # other length does not match the index definition.
    "text_vector": np.random.random(size=768),
    "age_of_customer": 33
}]

In [None]:
# To work with this data in RediSearch, we need to construct a schema and create an index

# Schema Definitions
customer = TagField("customer_id") # to be able to filter/sort by customer ID
timestamp_of_incident = NumericField("timestamp_of_incident")
timestamp_of_submission = NumericField("timestamp_of_submission")
claim_description = TextField("claim_description")
text_vector = VectorField(
    "text_vector",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": 768,
        "DISTANCE_METRIC": "IP",
        "INITIAL_CAP": len(claims),
        "BLOCK_SIZE": len(claims)
    }
)
age_of_customer = NumericField("age_of_customer")


# Create Index
await claims_index.create(
    customer,
    timestamp_of_incident,
    timestamp_of_submission,
    claim_description,
    age_of_customer,
    text_vector,
    prefix="claim:"
)


In [None]:
# Similar to above, we also need to load data to Redis

def claim_key(claim_id: str) -> str:
    """Build the Redis key for a claim: its id prefixed with ``claim:``."""
    return "claim:{}".format(claim_id)

for claim in claims:
    claim["text_vector"] = np.asarray(claim["text_vector"], dtype=np.float32).tobytes()
    await redis_conn.hset(claim_key(claim["claims_id"]), mapping=claim)


In [None]:
await redis_conn.hgetall(f"claim:{claim['claims_id']}")