In [1]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import CrossEncoder
from langchain_community.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sentence_transformer_ef_L6 = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
# sentence_transformer_ef_L12 = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L12-v2")

In [3]:
# chroma_client = chromadb.PersistentClient(path="chroma_db") # uses sqlite3 with persistent storage
chroma_client = chromadb.Client() # emphermeral client
file_path = 'data.json'

In [4]:
collection = chroma_client.get_or_create_collection(# https://docs.trychroma.com/docs/embeddings/embedding-functions
    name="pp_data",
    metadata={
        "description": "A collection of blog posts from the PocketPandit blog",
        "source": "https://blog.pocketpandit.com",
        "hnsw:space": "cosine", # l2,ip,cosine # https://docs.trychroma.com/docs/collections/configure
        # "hnsw:construction_ef": 128, # default is 100
        # "hnsw:M": 32, # default is 16
        # "hnsw:search_ef": 128,
    },
    embedding_function=sentence_transformer_ef_L6,
    )

In [5]:
def metadata_func(record: dict, metadata: dict) -> dict:

    metadata["title"] = record.get("metadata").get("title").lower()
    metadata["tags"] = ",".join(str(e) for e in record.get("metadata").get("tags"))
    if "source" in metadata:
        source = metadata["source"].split("/")
        metadata["source"] = source[-1]

    return metadata

In [6]:
loader = JSONLoader(
    file_path=file_path,
    jq_schema='.[]',
    content_key="content",
    metadata_func=metadata_func
    )

textDocs = loader.load()
print(len(textDocs))
print(textDocs[0])

10
page_content='The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.' metadata={'source': 'data.json', 'seq_num': 1, 'title': 'eiffel tower', 'tags': 'landmark,Paris,France,Eiffel Tower'}


In [7]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256,
    chunk_overlap=50,
    length_function=len,
)
docs = text_splitter.split_documents(textDocs)
print(len(docs))
print(docs[0])

10
page_content='The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.' metadata={'source': 'data.json', 'seq_num': 1, 'title': 'eiffel tower', 'tags': 'landmark,Paris,France,Eiffel Tower'}


In [8]:
for i,doc in enumerate(docs):
    collection.upsert(
        documents=[doc.page_content],
        metadatas=[doc.metadata],
        ids=[f'{doc.metadata["seq_num"]}_{i}_{len(doc.page_content)}'],
    )

In [9]:
# query
query = "What is the Eiffel Tower, and where is it located?"
filter_title = "Eiffel Tower"
tags = "Paris, France"
results = collection.query(
    query_texts=query,
    n_results=5,
    # where={"$or": [{"title": filter_title}, {"tags": tags}]},
    # where_document={"$contains": filter_title},
)

for id,document,metadata,distance in zip(results["ids"][0],results["documents"][0],results["metadatas"][0],results["distances"][0]):
    print(f"ID: {id} -- distance: {distance}")
    print(f"content: {document}")
    print(f"seq_num: {metadata['seq_num']} - {metadata['source']}")
    print(f"tags: {metadata['tags']}")
    print(f"title: {metadata['title']}")
    print('-------------------')


ID: 1_0_74 -- distance: 0.20671212673187256
content: The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.
seq_num: 1 - data.json
tags: landmark,Paris,France,Eiffel Tower
title: eiffel tower
-------------------
ID: 2_1_96 -- distance: 0.7786733508110046
content: The Great Wall of China is a series of fortifications made of stone, brick, and other materials.
seq_num: 2 - data.json
tags: landmark,China,Great Wall,history
title: great wall of china
-------------------
ID: 6_5_73 -- distance: 0.7788379192352295
content: Mount Everest is the highest peak in the world, located in the Himalayas.
seq_num: 6 - data.json
tags: mountain,Everest,Himalayas,adventure
title: mount everest
-------------------
ID: 5_4_65 -- distance: 0.8031020760536194
content: The Mona Lisa is a famous portrait painting by Leonardo da Vinci.
seq_num: 5 - data.json
tags: art,painting,Mona Lisa,Leonardo da Vinci
title: mona lisa
-------------------
ID: 10_9_102 -- distance: 0.8532676696777344
conten

In [10]:
# re-ranker == cross-encoder (type of model) so its slow
# cross_encoder = CrossEncoder('cross-encoder/stsb-roberta-large')
# cross_encoder = CrossEncoder('cross-encoder/stsb-roberta-base')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

In [11]:
# recall — how many of the relevant documents are we retrieving # https://www.pinecone.io/learn/series/rag/rerankers/ # https://medium.com/@sahin.samia/what-is-reranking-in-retrieval-augmented-generation-rag-ee3dd93540ee
# Reranking involves reordering the results based on a different criterion than the initial similarity search. 
reranked_results = []
for i, result in enumerate(results['documents'][0]):
    cross_similarity = cross_encoder.predict([(query, result)])
    reranked_results.append({
        "document": result,
        "metadata": results['metadatas'][0][i],
        "original_similarity": 1 - results['distances'][0][i],
        "cross_similarity": cross_similarity
    })
reranked_results.sort(key=lambda x: x['cross_similarity'], reverse=True)


In [12]:
for doc in reranked_results:
    print(f"original similarity: {doc['original_similarity']}")
    print(f"cross similarity: {doc['cross_similarity']}")
    print(f"title: {doc['metadata']['title']}")
    print(f"content: {doc['document']}")
    print('-------------------')

original similarity: 0.7932878732681274
cross similarity: [0.9133991]
title: eiffel tower
content: The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.
-------------------
original similarity: 0.2211620807647705
cross similarity: [0.0001883]
title: mount everest
content: Mount Everest is the highest peak in the world, located in the Himalayas.
-------------------
original similarity: 0.14673233032226562
cross similarity: [0.00018097]
title: the colosseum
content: The Colosseum in Rome is an ancient amphitheater used for gladiatorial contests and public spectacles.
-------------------
original similarity: 0.22132664918899536
cross similarity: [0.00017619]
title: great wall of china
content: The Great Wall of China is a series of fortifications made of stone, brick, and other materials.
-------------------
original similarity: 0.19689792394638062
cross similarity: [0.00016135]
title: mona lisa
content: The Mona Lisa is a famous portrait painting by Leonardo da Vinci