# **TEXT FILE**

#### Libraries

In [1]:
import os
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

#### Paths

In [78]:
# Input document and the name under which its FAISS index is stored.
file_name = "romeo_and_juliet.txt"
idx_name = "FAISS_metadata"

# All locations are resolved against the current working directory
# (a plain script could use os.path.dirname(os.path.abspath(__file__)) instead).
current_dir = os.getcwd()
db_dir = os.path.join(current_dir, "db")
file_path = os.path.join(current_dir, "uploads", file_name)

#### Text File Loader

In [79]:
# Validate the path *before* loading. With the check placed after
# loader.load(), a missing file makes the loader itself fail first, so the
# friendlier FileNotFoundError below could never be raised.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file {file_path} does not exist. Please check the path.")

# Read the whole text file as UTF-8 into a list of documents.
loader = TextLoader(file_path, encoding="utf-8")
data = loader.load()

#### Text Splitter and Chunks

In [None]:
# Chunk size 1000 with an overlap of 100, both measured with len().
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100, length_function=len)
chunks = text_splitter.split_documents(data)

# Replace each chunk's metadata with a dict holding only the source file name.
for chunk in chunks:
    chunk.metadata = {'source': file_name}

#### Embedding Model

In [81]:
# Embedding model identifier plus the options forwarded to it:
# device 'cpu' for the model, and normalize_embeddings disabled for encoding.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

#### FAISS DB: Save Locally and Load from Local

In [82]:
# Build the index from the chunks, then persist it under db_dir.
faissdb = FAISS.from_documents(chunks, embeddings)
faissdb.save_local(folder_path=db_dir, index_name=idx_name)

# Round-trip: reload the index that was just written.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data. Acceptable here because the files were produced by the
# save_local call above; do not point this at untrusted index files.
faissdb = FAISS.load_local(
    folder_path=db_dir,
    embeddings=embeddings,
    index_name=idx_name,
    allow_dangerous_deserialization=True,
)

#### Retriever (similarity search, MMR, similarity_score_threshold)

In [86]:
# Plain similarity search returning the single best match (k=1).
sim_retriever = faissdb.as_retriever(search_type='similarity', search_kwargs={'k': 1})

# Alternative retrievers, currently disabled:
#
# mmr_retriever = faissdb.as_retriever(
#     search_type='mmr',
#     search_kwargs={'k': 2, 'fetch_k': 20, 'lambda_mult': 0.5},
# )
#
# sst_retriever = faissdb.as_retriever(
#     search_type='similarity_score_threshold',
#     search_kwargs={'k': 2, 'score_threshold': 0.1},
# )

In [87]:
query = "How did Juliet Die?"
relevant_doc1 = sim_retriever.invoke(query)

# Print each retrieved document's text followed by its metadata.
for doc in relevant_doc1:
    print("\nRelevant Doc: \n", doc.page_content)
    print('\nSource: ', doc.metadata)

# Disabled: the same query against the alternative retrievers.
# relevant_doc2 = mmr_retriever.invoke(query)
# print(relevant_doc2)
#
# relevant_doc3 = sst_retriever.invoke(query)
# print(relevant_doc3)


Relevant Doc: 
 FRIAR LAWRENCE.
I will be brief, for my short date of breath
Is not so long as is a tedious tale.
Romeo, there dead, was husband to that Juliet,
And she, there dead, that Romeo’s faithful wife.
I married them; and their stol’n marriage day
Was Tybalt’s doomsday, whose untimely death
Banish’d the new-made bridegroom from this city;
For whom, and not for Tybalt, Juliet pin’d.
You, to remove that siege of grief from her,
Betroth’d, and would have married her perforce
To County Paris. Then comes she to me,
And with wild looks, bid me devise some means
To rid her from this second marriage,
Or in my cell there would she kill herself.
Then gave I her, so tutored by my art,
A sleeping potion, which so took effect
As I intended, for it wrought on her
The form of death. Meantime I writ to Romeo
That he should hither come as this dire night
To help to take her from her borrow’d grave,
Being the time the potion’s force should cease.
But he which bore my letter, Friar John,

Source:  {'source': 'romeo_and_juliet.txt'}

# **PDF**

In [89]:
from langchain_community.document_loaders import PyMuPDFLoader

#### Paths

In [90]:
# PDF input document and the name under which its FAISS index is stored.
# NOTE(review): db_dir and idx_name are the same as in the text-file section,
# so saving the PDF index replaces the text-file index on disk.
file_name = "NVResume.pdf"
idx_name = "FAISS_metadata"

# All locations are resolved against the current working directory
# (a plain script could use os.path.dirname(os.path.abspath(__file__)) instead).
current_dir = os.getcwd()
db_dir = os.path.join(current_dir, "db")
file_path = os.path.join(current_dir, "uploads", file_name)

In [93]:
# Fail early with a clear message if the PDF is missing, matching the
# error handling used for the text-file loader.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file {file_path} does not exist. Please check the path.")

# Load the PDF into a list of documents.
loader = PyMuPDFLoader(file_path)
data = loader.load()

In [None]:
# Chunk size 100 with an overlap of 10, both measured with len().
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10, length_function=len)
chunks = text_splitter.split_documents(data)

# Replace each chunk's metadata with a dict holding only the source file name.
for chunk in chunks:
    chunk.metadata = {'source': file_name}

In [110]:
# Embedding model identifier plus the options forwarded to it:
# device 'cpu' for the model, and normalize_embeddings disabled for encoding.
modelPath = "sentence-transformers/all-MiniLM-l6-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)

embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [None]:
# Build the index from the PDF chunks, then persist it under db_dir.
faissdb = FAISS.from_documents(chunks, embeddings)
faissdb.save_local(folder_path=db_dir, index_name=idx_name)

# Round-trip: reload the index that was just written.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# pickled data. Acceptable here because the files were produced by the
# save_local call above; do not point this at untrusted index files.
faissdb = FAISS.load_local(
    folder_path=db_dir,
    embeddings=embeddings,
    index_name=idx_name,
    allow_dangerous_deserialization=True,
)

In [112]:
# Plain similarity search returning the single best match (k=1).
sim_retriever = faissdb.as_retriever(search_type='similarity', search_kwargs={'k': 1})

# Alternative retrievers, currently disabled:
#
# mmr_retriever = faissdb.as_retriever(
#     search_type='mmr',
#     search_kwargs={'k': 2, 'fetch_k': 20, 'lambda_mult': 0.5},
# )
#
# sst_retriever = faissdb.as_retriever(
#     search_type='similarity_score_threshold',
#     search_kwargs={'k': 2, 'score_threshold': 0.1},
# )

In [115]:
query = "projects"
relevant_doc1 = sim_retriever.invoke(query)

# Print each retrieved document's text followed by its metadata.
for doc in relevant_doc1:
    print("\nRelevant Doc: \n", doc.page_content)
    print('\nSource: ', doc.metadata)

# Disabled: the same query against the alternative retrievers.
# relevant_doc2 = mmr_retriever.invoke(query)
# print(relevant_doc2)
#
# relevant_doc3 = sst_retriever.invoke(query)
# print(relevant_doc3)


Relevant Doc: 
 Nava Bharath National School
Annur, Coimbatore
Grade 12, CBSE
Aug 2022 - May 2023
PROJECTS

Source:  {'source': 'NVResume.pdf'}
