In [29]:
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import CrossEncoder
from langchain_community.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [30]:
# Embedding function backed by the "all-MiniLM-L6-v2" sentence-transformers
# model; it is handed to the Chroma collection when the collection is created.
sentence_transformer_ef_L6 = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2",
)
# Alternative model:
# sentence_transformer_ef_L12 = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L12-v2")

In [31]:
# The re-ranker is a cross-encoder (a type of model), so it is slow.
# Other candidate models:
# cross_encoder = CrossEncoder('cross-encoder/stsb-roberta-large')
# cross_encoder = CrossEncoder('cross-encoder/stsb-roberta-base')
cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-6")

In [32]:
# JSON file that holds the documents to index.
file_path = "mixed_data.json"

# Ephemeral client. Swap in the persistent client below to keep data on disk:
# chroma_client = chromadb.PersistentClient(path="chroma_db") # uses sqlite3 with persistent storage
chroma_client = chromadb.Client()

In [33]:
# Start from a clean "pp_data" collection so that re-running this cell never
# keeps documents from an earlier run. delete_collection() raises when the
# collection does not exist (e.g. on a fresh kernel), so only delete it when it
# is actually present.
# list_collections() has returned either Collection objects or plain name
# strings across chromadb releases; getattr() normalises both to the name.
existing_collection_names = {
    getattr(existing, "name", existing)
    for existing in chroma_client.list_collections()
}
if "pp_data" in existing_collection_names:
    chroma_client.delete_collection(name="pp_data")

collection = chroma_client.get_or_create_collection(# https://docs.trychroma.com/docs/embeddings/embedding-functions
    name="pp_data",
    metadata={
        "description": "A collection of blog posts from the PocketPandit blog",
        "source": "https://blog.pocketpandit.com",
        "hnsw:space": "cosine", # l2,ip,cosine # https://docs.trychroma.com/docs/collections/configure
        # "hnsw:construction_ef": 128, # default is 100
        # "hnsw:M": 32, # default is 16
        # "hnsw:search_ef": 128,
    },
    embedding_function=sentence_transformer_ef_L6,
    )

In [34]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Add the record's title and tags to the loader-provided metadata.

    Args:
        record: One raw JSON element. Its optional "metadata" mapping supplies
            the "title" and "tags" values.
        metadata: Metadata dict built so far for this record; it is updated in
            place.

    Returns:
        The same ``metadata`` dict, with:
        - "title": the record's title lower-cased ("" when absent or None),
        - "tags": the record's tags joined with "," ("" when absent or None),
        - "source": reduced to the text after its last "/" when present.
    """
    # Tolerate records without a "metadata" mapping instead of failing on None.
    record_meta = record.get("metadata") or {}

    title = record_meta.get("title")
    metadata["title"] = "" if title is None else str(title).lower()

    tags = record_meta.get("tags")
    if tags is None:
        tags = []
    elif isinstance(tags, str):
        # A bare string would otherwise be joined character by character.
        tags = [tags]
    # Tags are flattened to a single string; presumably the vector store only
    # accepts scalar metadata values — TODO confirm.
    metadata["tags"] = ",".join(str(tag) for tag in tags)

    if "source" in metadata:
        # Keep only the last "/"-separated component (the file name).
        metadata["source"] = metadata["source"].split("/")[-1]

    return metadata

In [35]:
# Load the JSON file: the jq schema ".[]" iterates over the top-level elements,
# each element's "content" field becomes the page text, and metadata_func
# fills in the per-document metadata.
loader = JSONLoader(file_path=file_path, jq_schema=".[]", content_key="content", metadata_func=metadata_func)
textDocs = loader.load()

# Quick look at what was loaded: document count and the first document.
print(len(textDocs))
print(textDocs[0])

19
page_content='Python is a high-level programming language known for its simplicity and readability.' metadata={'source': 'mixed_data.json', 'seq_num': 1, 'title': 'python', 'tags': 'programming,Python,software,technology'}


In [36]:
# Split the loaded documents into chunks of at most 512 characters (measured
# with len) with a 50-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50, length_function=len)
docs = text_splitter.split_documents(textDocs)

# Quick look at the result: chunk count and the first chunk.
print(len(docs))
print(docs[0])

19
page_content='Python is a high-level programming language known for its simplicity and readability.' metadata={'source': 'mixed_data.json', 'seq_num': 1, 'title': 'python', 'tags': 'programming,Python,software,technology'}


In [37]:
# Write the chunks to the collection in batches rather than one upsert() call
# per chunk, so that documents are embedded and stored many at a time.
# Each id is "<seq_num>_<chunk index>_<chunk length>", so re-running this cell
# overwrites the same records instead of adding duplicates.
UPSERT_BATCH_SIZE = 100  # kept small; NOTE(review): confirm against the installed chromadb's max batch size
for batch_start in range(0, len(docs), UPSERT_BATCH_SIZE):
    batch = docs[batch_start:batch_start + UPSERT_BATCH_SIZE]
    collection.upsert(
        documents=[chunk.page_content for chunk in batch],
        metadatas=[chunk.metadata for chunk in batch],
        ids=[
            f'{chunk.metadata["seq_num"]}_{chunk_index}_{len(chunk.page_content)}'
            # start=batch_start keeps the index global across batches.
            for chunk_index, chunk in enumerate(batch, start=batch_start)
        ],
    )

In [42]:
# Example queries (enable exactly one):
# query = "What is programming language used in app development?"
# query = "In app development, which language is used for mobile app?"
query = "which is best mobile device?"

# query= "why space travel is diff from app development?"
# query= "what landmark is similar to planets?"

# Retrieve the 10 nearest chunks for the query text.
results = collection.query(
    query_texts=[query],
    n_results=10,
    # where={"$or": [{"title": filter_title}, {"tags": tags}]},
    # where_document={"$contains": filter_title},
)

# Every field in `results` holds one list per query text; index 0 selects the
# hits for the single query above.
# The loop variable is `doc_id` (not `id`) so the builtin id() is not shadowed
# for the rest of the kernel session.
for doc_id, document, metadata, distance in zip(
    results["ids"][0],
    results["documents"][0],
    results["metadatas"][0],
    results["distances"][0],
):
    print(f"ID: {doc_id} -- distance: {distance}")
    print(f"content: {document}")
    print(f"seq_num: {metadata['seq_num']} - {metadata['source']}")
    print(f"tags: {metadata['tags']}")
    print(f"title: {metadata['title']}")
    print('-------------------')


ID: 17_16_90 -- distance: 0.5378316640853882
content: The iPhone is a line of smartphones designed by Apple, known for its iOS operating system.
seq_num: 17 - mixed_data.json
tags: smartphones,iPhone,Apple,technology
title: iphone
-------------------
ID: 16_15_300 -- distance: 0.5423922538757324
content: Smartphones are mobile devices with advanced computing capabilities, such as the iPhone and Android. Swift is a powerful and intuitive programming language developed by Apple for iOS, macOS, watchOS, and tvOS. Kotlin is a modern programming language that is fully interoperable with Java and Android.
seq_num: 16 - mixed_data.json
tags: mobile phones,smartphones,technology,iPhone,Android
title: smartphones
-------------------
ID: 18_17_188 -- distance: 0.5953792929649353
content: Android is a mobile operating system developed by Google, used by many smartphone manufacturers. While building android application developers use Kotlin and Java as programming languages.
seq_num: 18 - mixed_da

In [39]:
# recall — how many of the relevant documents are we retrieving # https://www.pinecone.io/learn/series/rag/rerankers/ # https://medium.com/@sahin.samia/what-is-reranking-in-retrieval-augmented-generation-rag-ee3dd93540ee
# Reranking involves reordering the results based on a different criterion than the initial similarity search.
#
# This cell reads `query` and `results` from the query cell, so run it after
# that cell; otherwise the scores below refer to a different query.
retrieved_documents = results['documents'][0]
retrieved_metadatas = results['metadatas'][0]
retrieved_distances = results['distances'][0]

# Score every (query, document) pair in one predict() call instead of one call
# per document; skip the model entirely when nothing was retrieved.
query_document_pairs = [(query, document) for document in retrieved_documents]
cross_scores = cross_encoder.predict(query_document_pairs) if query_document_pairs else []

reranked_results = sorted(
    (
        {
            "document": document,
            "metadata": metadata,
            # The collection is configured with cosine distance, so
            # 1 - distance is the similarity.
            "original_similarity": 1 - distance,
            # Plain float rather than a 1-element array, so it sorts and
            # prints as a scalar.
            "cross_similarity": float(cross_score),
        }
        for document, metadata, distance, cross_score in zip(
            retrieved_documents, retrieved_metadatas, retrieved_distances, cross_scores
        )
    ),
    key=lambda item: item["cross_similarity"],
    reverse=True,
)

for reranked in reranked_results:
    print(f"original similarity: {reranked['original_similarity']}")
    print(f"cross similarity: {reranked['cross_similarity']}")
    print(f"title: {reranked['metadata']['title']}")
    print(f"content: {reranked['document']}")
    print('-------------------')


original similarity: 0.5897119045257568
cross similarity: [0.7230607]
title: java
content: Java is a programming language used for enterprise-level applications and Android apps.
-------------------
original similarity: 0.4852185845375061
cross similarity: [0.35305983]
title: c++
content: C++ is a powerful programming language used for system programming, game development, and performance-critical applications.
-------------------
original similarity: 0.44139719009399414
cross similarity: [0.07894752]
title: javascript
content: JavaScript is a popular programming language used for web development and creating interactive websites.
-------------------
original similarity: 0.46254658699035645
cross similarity: [0.07511047]
title: smartphones
content: Smartphones are mobile devices with advanced computing capabilities, such as the iPhone and Android. Swift is a powerful and intuitive programming language developed by Apple for iOS, macOS, watchOS, and tvOS. Kotlin is a modern programming 