# Evaluation

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredFileLoader

# Source document to ingest (a PDF here; other file types can be loaded too)
doc = "bob/bob_1.pdf"

# Loader/extractor that reads the document from disk
loader = UnstructuredFileLoader(doc, mode="single", strategy="fast")

# Splitter used to cut the extracted text into chunks (chunk_size=1000, no overlap)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Extract, load, and split in a single call
chunks = loader.load_and_split(text_splitter)

print(f"Done preprocessing. Created {len(chunks)} chunks of the original pdf {doc}")

In [3]:
import json

from pathlib import Path

# Write into data/ so the file lands at the exact path the loading cell below
# reads ("data/2008-mazda3-chunks.json"); previously it was written to the
# working directory and the reload step could not find it on a fresh run.
output_file = "data/2008-mazda3-chunks.json"
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Build the payload first, keeping only the text of each chunk, so a failure
# here cannot leave a truncated/empty JSON file behind.
json_chunks = [chunk.page_content for chunk in chunks]

with open(output_file, "w") as f:
    json.dump(json_chunks, f)

In [1]:
import json
# Reload the previously saved chunk texts from disk
chunks_path = "data/2008-mazda3-chunks.json"
with open(chunks_path) as chunk_file:
    chunks = json.load(chunk_file)

In [2]:
import os

# Set this BEFORE the Hugging Face stack is imported/instantiated so the
# setting is already in the environment when the tokenizer is first loaded.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from redisvl.utils.vectorize import HFTextVectorizer

hf = HFTextVectorizer("sentence-transformers/all-MiniLM-L6-v2")

# Embed each chunk content (pass a plain list copy of the chunk texts)
embeddings = hf.embed_many(list(chunks))

# Check to make sure we've created enough embeddings, 1 per document chunk
len(embeddings) == len(chunks)



True

In [12]:
from redis import Redis
from redisvl.index import SearchIndex

# Connection string and schema location used by this notebook
REDIS_URL = "redis://localhost:6379/0"
path_to_yaml = "schema/index_schema.yaml"

# Build the index definition from the YAML schema
index = SearchIndex.from_yaml(path_to_yaml)

# Open the Redis connection and attach it to the index
client = Redis.from_url(REDIS_URL)
index.set_client(client)

# Create the index, requesting overwrite and drop
index.create(overwrite=True, drop=True)

In [13]:
from redisvl.redis.utils import array_to_buffer

def _as_record(position, text):
    """Build the record stored for the chunk at `position` with content `text`."""
    return {
        "chunk_id": position,
        "content": text,
        # For HASH -- must convert embeddings to bytes
        "text_embedding": array_to_buffer(embeddings[position], dtype="float32"),
    }


# One record per chunk, keyed by its position in `chunks`
data = [_as_record(position, text) for position, text in enumerate(chunks)]

# RedisVL handles batching automatically
keys = index.load(data, id_field="chunk_id")

In [11]:
# Show the document count reported by the index
index_stats = index.info()
index_stats["num_docs"]

251