In [1]:
from langchain_community.document_loaders import TextLoader
text_loader = TextLoader('speech.txt')

In [2]:
text = text_loader.load()

In [3]:
text

[Document(metadata={'source': 'speech.txt'}, page_content='Data Scientist with 4+ years of experience in developing data-driven models, performing advanced analytics, and delivering actionable business insights. Skilled in Python, MySQL, PySpark, and AWS with expertise in statistical modeling, machine learning, RAG development and data visualization . Experienced with tools such as Airflow, Airbyte, Apache Iceberg, Metabase, and Splunk for building robust data pipelines and enabling end-to-end data science solutions.\n')]

In [4]:
from langchain_community.document_loaders import PyPDFLoader
pdf = PyPDFLoader('Offer Letter.pdf')

In [5]:
pdf = pdf.load()

In [6]:
pdf

[Document(metadata={'producer': 'LibreOffice 7.6', 'creator': 'Writer', 'creationdate': '2024-07-12T15:50:55+05:30', 'author': 'Hernandez, Mikaela', 'source': 'Offer Letter.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='12 July 2024\nNithin kumar K R\nNo 1 \nKothanur\nChikkaballapur\nKarnataka  562105\nDear Nithin kumar,\n \nOffer Letter\nThis has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  \nof Consultant.\n \nKindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change  \nyour joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of the revised  \nDate of Joining. In case of any changes to the date of joining please inform us before the expected date of onboard. Failure to  \ninform us and no-show on the expected date of joining will efface this employment offer.\n \nYou s

# Text Splitter 

In [7]:
from langchain_community.document_loaders import PyPDFLoader
pdf = PyPDFLoader('Offer Letter.pdf')
pdf = pdf.load()

In [8]:
for i in pdf:
    print(i)

page_content='12 July 2024
Nithin kumar K R
No 1 
Kothanur
Chikkaballapur
Karnataka  562105
Dear Nithin kumar,
 
Offer Letter
This has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  
of Consultant.
 
Kindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change  
your joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of the revised  
Date of Joining. In case of any changes to the date of joining please inform us before the expected date of onboard. Failure to  
inform us and no-show on the expected date of joining will efface this employment offer.
 
You shall be on probation for a period of six months starting from the date of your joining. Your performance during the probation  
will be appraised and, if found satisfactory, you will be confirmed in services. However, in the event of your perfor

# RecursiveCharacterTextSplitter

In [9]:
from langchain_text_splitters import  RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
text_splitter.split_documents(pdf)

[Document(metadata={'producer': 'LibreOffice 7.6', 'creator': 'Writer', 'creationdate': '2024-07-12T15:50:55+05:30', 'author': 'Hernandez, Mikaela', 'source': 'Offer Letter.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='12 July 2024\nNithin kumar K R\nNo 1 \nKothanur\nChikkaballapur\nKarnataka  562105\nDear Nithin kumar,\n \nOffer Letter\nThis has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  \nof Consultant.\n \nKindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change'),
 Document(metadata={'producer': 'LibreOffice 7.6', 'creator': 'Writer', 'creationdate': '2024-07-12T15:50:55+05:30', 'author': 'Hernandez, Mikaela', 'source': 'Offer Letter.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='your joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of th

# CharacterTextSplitter

In [10]:
from langchain_text_splitters import  CharacterTextSplitter
text_splitter = CharacterTextSplitter(separator= '\n\n' ,chunk_size=100, chunk_overlap=10)
for i in text_splitter.split_documents(pdf):
    print(i)
    print('-'*100)

page_content='12 July 2024
Nithin kumar K R
No 1 
Kothanur
Chikkaballapur
Karnataka  562105
Dear Nithin kumar,
 
Offer Letter
This has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  
of Consultant.
 
Kindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change  
your joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of the revised  
Date of Joining. In case of any changes to the date of joining please inform us before the expected date of onboard. Failure to  
inform us and no-show on the expected date of joining will efface this employment offer.
 
You shall be on probation for a period of six months starting from the date of your joining. Your performance during the probation  
will be appraised and, if found satisfactory, you will be confirmed in services. However, in the event of your perfor

#  HTMLHeaderTextSplitter

In [11]:
from langchain_text_splitters import  HTMLHeaderTextSplitter
string = """
<!DOCTYPE html>
<html>
<head>
    <title>My First Page</title>
</head>
<body>

    <h1>Hello World!</h1>
    <p>This is my first simple HTML page.</p

    <button>Click Me</button>

</body>
</html>
"""

header_split =[("h1","Header 1")]
html_string = HTMLHeaderTextSplitter(header_split)
docs = html_string.split_text(string)

for doc in docs:
    print("\nMETADATA:", doc.metadata)
    print("CONTENT:", doc.page_content)


METADATA: {}
CONTENT: Click Me

METADATA: {'Header 1': 'Hello World!'}
CONTENT: Hello World!

METADATA: {'Header 1': 'Hello World!'}
CONTENT: This is my first simple HTML page.


In [12]:
# 

In [13]:
import json
import requests
json_data = requests.get("https://api.smith.langchain.com/openapi.json").json()

In [14]:
from langchain_text_splitters import  RecursiveJsonSplitter
json_split = RecursiveJsonSplitter(max_chunk_size=300)
json_chucks = json_split.split_json(json_data)
for i in json_chucks[:5]:
    print(i,'\n')
    print('*'*100)

{'openapi': '3.1.0', 'info': {'title': 'LangSmith', 'description': 'The LangSmith API is used to programmatically create and manage LangSmith resources.\n\n## Host\nhttps://api.smith.langchain.com\n\n## Authentication\nTo authenticate with the LangSmith API, set the `X-Api-Key` header\nto a valid [LangSmith API key](https://docs.langchain.com/langsmith/create-account-api-key#create-an-api-key).\n\n'}} 

****************************************************************************************************
{'info': {'version': '0.1.0'}, 'paths': {'/api/v1/audit-logs': {'get': {'tags': ['audit-logs'], 'summary': 'Get Audit Logs'}}}} 

****************************************************************************************************
{'paths': {'/api/v1/audit-logs': {'get': {'description': "Retrieve audit log records for the authenticated user's organization in OCSF format.\n\nRequires both start_time and end_time parameters to filter logs within a date range.\nSupports cursor-based paginat

In [15]:
from langchain_text_splitters import  RecursiveJsonSplitter
json_split = RecursiveJsonSplitter(max_chunk_size=300)
json_chucks = json_split.create_documents(texts= [json_data])
for i in json_chucks[:5]:
    print(i,'\n')
    print('*'*100)

page_content='{"openapi": "3.1.0", "info": {"title": "LangSmith", "description": "The LangSmith API is used to programmatically create and manage LangSmith resources.\n\n## Host\nhttps://api.smith.langchain.com\n\n## Authentication\nTo authenticate with the LangSmith API, set the `X-Api-Key` header\nto a valid [LangSmith API key](https://docs.langchain.com/langsmith/create-account-api-key#create-an-api-key).\n\n"}}' 

****************************************************************************************************
page_content='{"info": {"version": "0.1.0"}, "paths": {"/api/v1/audit-logs": {"get": {"tags": ["audit-logs"], "summary": "Get Audit Logs"}}}}' 

****************************************************************************************************
page_content='{"paths": {"/api/v1/audit-logs": {"get": {"description": "Retrieve audit log records for the authenticated user's organization in OCSF format.\n\nRequires both start_time and end_time parameters to filter logs within 

# Ollama

In [29]:
from langchain_community.embeddings import OllamaEmbeddings

embedding = OllamaEmbeddings(model='gemma:2b')

r = embedding.embed_documents([
    'Are you sure',
    'Are you not sure'
])


In [38]:
print(type(embedding))   # should be <class 'OllamaEmbeddings'>
print(len(r))            # 2
print(len(r[0]))         # embedding dimension


<class 'langchain_community.embeddings.ollama.OllamaEmbeddings'>
2
2048


In [55]:
#r[0]

In [56]:
#r[1]

# 129. Huggingface Embeddings

In [41]:
#hf = "Replace this"

In [97]:
from langchain_huggingface import HuggingFaceEmbeddings

In [98]:
embeddings = HuggingFaceEmbeddings(model_name = "all-MiniLM-L6-v2")

In [49]:
text ="Good morning"

In [51]:
embeddings.embed_query(text)[0]

-0.03337462991476059

In [52]:
text ="Good mornings"
embeddings.embed_query(text)[0]

-0.048283886164426804

# 130. VectorStores-FAISS

In [58]:
from langchain_community.document_loaders import PyPDFLoader
pdf = PyPDFLoader('Offer Letter.pdf')
pdf = pdf.load()

In [63]:
from langchain_text_splitters import  RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = text_splitter.split_documents(pdf)

In [66]:
from langchain_community.vectorstores import FAISS


In [67]:
embedding =OllamaEmbeddings()
db=FAISS.from_documents(docs,embeddings)

In [73]:
Query = "When the Joining date for this applicate ??"

op = db.similarity_search(Query)
op[0].page_content

'12 July 2024\nNithin kumar K R\nNo 1 \nKothanur\nChikkaballapur\nKarnataka  562105\nDear Nithin kumar,\n \nOffer Letter\nThis has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  \nof Consultant.\n \nKindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change  \nyour joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of the revised  \nDate of Joining. In case of any changes to the date of joining please inform us before the expected date of onboard. Failure to  \ninform us and no-show on the expected date of joining will efface this employment offer.\n \nYou shall be on probation for a period of six months starting from the date of your joining. Your performance during the probation  \nwill be appraised and, if found satisfactory, you will be confirmed in services. However, in the event of your p

# Retriver

In [75]:
ret = db.as_retriever()

ops  = ret.invoke(Query)

In [82]:
Query = "When the Joining date for this applicate ??"

op = db.similarity_search_with_score(Query)
#op[0].page_content
op[0][0]
#op[0][1]

Document(id='aad0996e-ebc8-48c0-9fe0-86cc37025304', metadata={'producer': 'LibreOffice 7.6', 'creator': 'Writer', 'creationdate': '2024-07-12T15:50:55+05:30', 'author': 'Hernandez, Mikaela', 'source': 'Offer Letter.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='12 July 2024\nNithin kumar K R\nNo 1 \nKothanur\nChikkaballapur\nKarnataka  562105\nDear Nithin kumar,\n \nOffer Letter\nThis has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  \nof Consultant.\n \nKindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change  \nyour joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of the revised  \nDate of Joining. In case of any changes to the date of joining please inform us before the expected date of onboard. Failure to  \ninform us and no-show on the expected date of joining wi

In [83]:
# Saving and Loading 
db.save_local("Faiss_index")

In [89]:
new_db = FAISS.load_local("Faiss_index", embeddings , allow_dangerous_deserialization=True)

In [90]:
new_db.similarity_search_with_score(Query)


[(Document(id='aad0996e-ebc8-48c0-9fe0-86cc37025304', metadata={'producer': 'LibreOffice 7.6', 'creator': 'Writer', 'creationdate': '2024-07-12T15:50:55+05:30', 'author': 'Hernandez, Mikaela', 'source': 'Offer Letter.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='12 July 2024\nNithin kumar K R\nNo 1 \nKothanur\nChikkaballapur\nKarnataka  562105\nDear Nithin kumar,\n \nOffer Letter\nThis has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  \nof Consultant.\n \nKindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change  \nyour joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of the revised  \nDate of Joining. In case of any changes to the date of joining please inform us before the expected date of onboard. Failure to  \ninform us and no-show on the expected date of joining 

#  131. VectorStore And Retriever- ChromaDB

In [91]:
from langchain_chroma import Chroma

In [92]:
docs

[Document(metadata={'producer': 'LibreOffice 7.6', 'creator': 'Writer', 'creationdate': '2024-07-12T15:50:55+05:30', 'author': 'Hernandez, Mikaela', 'source': 'Offer Letter.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='12 July 2024\nNithin kumar K R\nNo 1 \nKothanur\nChikkaballapur\nKarnataka  562105\nDear Nithin kumar,\n \nOffer Letter\nThis has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  \nof Consultant.\n \nKindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change  \nyour joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of the revised  \nDate of Joining. In case of any changes to the date of joining please inform us before the expected date of onboard. Failure to  \ninform us and no-show on the expected date of joining will efface this employment offer.\n \nYou s

In [93]:
embedding =OllamaEmbeddings()
db=Chroma.from_documents(docs,embeddings)

In [94]:
Query = "When the Joining date for this applicate ??"

op = db.similarity_search_with_score(Query)
op[0][0]


Document(id='ccb5ab9e-4d6c-4fe4-8e7a-a4b69a7050e8', metadata={'total_pages': 3, 'page': 0, 'producer': 'LibreOffice 7.6', 'page_label': '1', 'creator': 'Writer', 'source': 'Offer Letter.pdf', 'creationdate': '2024-07-12T15:50:55+05:30', 'author': 'Hernandez, Mikaela'}, page_content='12 July 2024\nNithin kumar K R\nNo 1 \nKothanur\nChikkaballapur\nKarnataka  562105\nDear Nithin kumar,\n \nOffer Letter\nThis has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  \nof Consultant.\n \nKindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change  \nyour joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of the revised  \nDate of Joining. In case of any changes to the date of joining please inform us before the expected date of onboard. Failure to  \ninform us and no-show on the expected date of joining wi

In [99]:
vector  = Chroma.from_documents(documents=docs , embedding=embeddings, persist_directory="./chroma_db"z)

In [100]:
db2 =Chroma(persist_directory="./chroma_db" ,embedding_function=embeddings)
db2.similarity_search_with_score(Query)

[(Document(id='e20dbafd-be4d-4bf5-a887-5817157a0bc7', metadata={'producer': 'LibreOffice 7.6', 'author': 'Hernandez, Mikaela', 'page_label': '1', 'creator': 'Writer', 'total_pages': 3, 'source': 'Offer Letter.pdf', 'page': 0, 'creationdate': '2024-07-12T15:50:55+05:30'}, page_content='12 July 2024\nNithin kumar K R\nNo 1 \nKothanur\nChikkaballapur\nKarnataka  562105\nDear Nithin kumar,\n \nOffer Letter\nThis has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  \nof Consultant.\n \nKindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change  \nyour joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of the revised  \nDate of Joining. In case of any changes to the date of joining please inform us before the expected date of onboard. Failure to  \ninform us and no-show on the expected date of joining 

In [102]:
db_ = db2.as_retriever()
db_.invoke(Query)

[Document(id='e20dbafd-be4d-4bf5-a887-5817157a0bc7', metadata={'creationdate': '2024-07-12T15:50:55+05:30', 'total_pages': 3, 'source': 'Offer Letter.pdf', 'creator': 'Writer', 'producer': 'LibreOffice 7.6', 'page_label': '1', 'author': 'Hernandez, Mikaela', 'page': 0}, page_content='12 July 2024\nNithin kumar K R\nNo 1 \nKothanur\nChikkaballapur\nKarnataka  562105\nDear Nithin kumar,\n \nOffer Letter\nThis has reference to your application and subsequent interview you had with us. Further, we are pleased to offer you the post  \nof Consultant.\n \nKindly make a note that your tentative date of joining would be  29 July 2024  and the Company may be constrained to change  \nyour joining date due to the prevailing unprecedented situation. In that event, the Company shall duly inform you of the revised  \nDate of Joining. In case of any changes to the date of joining please inform us before the expected date of onboard. Failure to  \ninform us and no-show on the expected date of joining w