In [1]:
from langchain_community.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter,CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.embeddings import HuggingFaceEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Configuration -----------------------------------------------------------
# Sentence-transformer checkpoint; the same name is reused wherever an
# embedding model is constructed in this notebook.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
sentence_model = SentenceTransformer(model_name)

# JSON file the documents are loaded from (relative to the working directory).
file_path = "data.json"

In [3]:
def metadata_func(record: dict, metadata: dict) -> dict:
    """Enrich loader metadata with fields taken from one JSON record.

    Args:
        record: A single JSON record. Its optional nested ``"metadata"``
            mapping may provide ``"title"`` and ``"tags"``.
        metadata: The metadata dict to extend. It is updated in place.

    Returns:
        The same ``metadata`` dict, with ``"title"`` and ``"tags"`` set
        (``None`` when the record does not provide them) and, if a
        ``"source"`` entry is present, reduced to the part after the last
        ``"/"``.
    """
    # A record without a "metadata" entry (or with a null one) must not crash
    # the whole load; fall back to an empty mapping so both fields become None.
    record_meta = record.get("metadata") or {}

    metadata["title"] = record_meta.get("title")
    metadata["tags"] = record_meta.get("tags")

    if "source" in metadata:
        # Keep only the final path component, e.g. "/a/b/data.json" -> "data.json".
        metadata["source"] = metadata["source"].split("/")[-1]

    return metadata


In [4]:
# The jq schema '.[]' iterates over the elements of the top-level JSON value;
# each element's "content" field becomes the page text, and metadata_func
# fills in the remaining metadata.
loader_options = {
    "file_path": file_path,
    "jq_schema": '.[]',
    "content_key": "content",
    "metadata_func": metadata_func,
}
loader = JSONLoader(**loader_options)

textDocs = loader.load()

# Sanity check: number of loaded documents, then the first one.
print(len(textDocs), textDocs[0], sep="\n")

10
page_content='The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.' metadata={'source': 'data.json', 'seq_num': 1, 'title': 'Eiffel Tower', 'tags': ['landmark', 'Paris', 'France', 'Eiffel Tower']}


In [5]:
# Alternative kept for reference (disabled): CharacterTextSplitter is not recursive - it splits the text on a single separator and packs the pieces into chunks.
# text_splitter = CharacterTextSplitter(chunk_size=256, chunk_overlap=50)
# docs = text_splitter.split_documents(textDocs)
# print(len(docs))
# print(docs[0])

In [6]:
# Chunk length is measured with len(), i.e. in characters: at most 256 per
# chunk, with 50 characters of overlap between neighbouring chunks.
splitter_options = {
    "chunk_size": 256,
    "chunk_overlap": 50,
    "length_function": len,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

texts = text_splitter.split_documents(textDocs)

# Sanity check: number of chunks produced, then the first chunk.
print(len(texts), texts[0], sep="\n")

10
page_content='The Eiffel Tower is a wrought-iron lattice tower located in Paris, France.' metadata={'source': 'data.json', 'seq_num': 1, 'title': 'Eiffel Tower', 'tags': ['landmark', 'Paris', 'France', 'Eiffel Tower']}


In [7]:
# Embed every chunk with the model named by `model_name` and index the
# resulting vectors in a DocArrayInMemorySearch store.
embeddings = HuggingFaceEmbeddings(model_name=model_name)
db = DocArrayInMemorySearch.from_documents(documents=texts, embedding=embeddings)

  embeddings = HuggingFaceEmbeddings(model_name=model_name)


In [8]:
query = "What was the Apollo 11 mission, and why is it significant?"

# NOTE: DocArrayInMemorySearch.similarity_search has no score cut-off. The
# `threshold=0.5` keyword that used to be passed here was silently ignored
# (all k hits came back regardless of relevance), so it has been dropped to
# avoid suggesting that the results are filtered. The call always returns the
# k nearest documents; any relevance filtering has to be applied to
# `response` separately.
response = db.similarity_search(query, k=5)

print('len:', len(response))
for res in response:
    print('content:', res.page_content)
    print('source:', res.metadata['source'])
    print('seq_num:', res.metadata['seq_num'])
    print('title:', res.metadata['title'])
    print('-------------------')

len: 5
content: The Apollo 11 mission was the first to land humans on the Moon in 1969.
source: data.json
seq_num: 7
title: Apollo 11 Mission
-------------------
content: The Mona Lisa is a famous portrait painting by Leonardo da Vinci.
source: data.json
seq_num: 5
title: Mona Lisa
-------------------
content: The Colosseum in Rome is an ancient amphitheater used for gladiatorial contests and public spectacles.
source: data.json
seq_num: 10
title: The Colosseum
-------------------
content: Python is a high-level programming language known for its simplicity and readability.
source: data.json
seq_num: 4
title: Python Programming Language
-------------------
content: Mount Everest is the highest peak in the world, located in the Himalayas.
source: data.json
seq_num: 6
title: Mount Everest
-------------------


In [9]:
# Keep only the retrieved documents whose title contains `query_title`
# (case-sensitive substring match).
query_title = "Challenges"

# metadata_func stores None as the title when a record has none, and
# `"..." in None` raises TypeError; treat a missing title as an empty string
# so such documents are simply filtered out.
docs_with_title = [
    doc for doc in response
    if query_title in (doc.metadata.get('title') or '')
]

for doc in docs_with_title:
    print('content:', doc.page_content)
    print('source:', doc.metadata['source'])
    print('seq_num:', doc.metadata['seq_num'])
    print('title:', doc.metadata['title'])
    print('-------------------')

In [10]:
# Re-score each retrieved document against the query with cosine similarity
# and print only those scoring above 0.3.
embeddings1 = sentence_model.encode(query)
query_row = embeddings1.reshape(1, -1)  # cosine_similarity expects 2-D input

for res in response:
    embeddings2 = sentence_model.encode(res.page_content)
    doc_row = embeddings2.reshape(1, -1)
    similarity = cosine_similarity(query_row, doc_row)[0, 0]
    if not similarity > 0.3:
        continue
    print(f"Similarity: {similarity} -- {res.page_content}")


Similarity: 0.530174970626831 -- The Apollo 11 mission was the first to land humans on the Moon in 1969.
