In [99]:
# import sys
# print(sys.executable)

In [100]:
# %pwd

In [5]:
import os

# Move to the parent directory so the relative "Dataset/" path used later in
# the notebook resolves. The guard makes the cell idempotent: once the working
# directory already contains "Dataset", re-running the cell no longer climbs
# one directory further each time.
if not os.path.isdir("Dataset"):
    os.chdir("../")

In [7]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
# Extract data from the PDF files
def load_pdf_file(data):
    """Load the PDF files matched by "*.pdf" under the ``data`` directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each file being parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, loader_cls=PyPDFLoader, glob="*.pdf")
    return pdf_loader.load()

In [9]:
# Load the PDFs from the Dataset/ directory (relative to the current working directory).
extract_pdf_data = load_pdf_file(data = "Dataset/")

In [10]:
# Sanity check: report how many documents the loader produced.
# Requires extract_pdf_data to have been loaded already.
if not extract_pdf_data:
    print("No data found in 'extract_pdf_data'.")
else:
    num_rows = len(extract_pdf_data)
    print(f"Number of rows (documents) created: {num_rows}")

Number of rows (documents) created: 1302


In [11]:
# Split the data into chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (e.g. the result of load_pdf_file).
        chunk_size: ``chunk_size`` forwarded to RecursiveCharacterTextSplitter.
            Defaults to 500, the value this notebook originally hard-coded.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.
            Defaults to 20, the value this notebook originally hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [12]:
# Split the loaded documents into chunks and print how many chunks were produced.
text_chunks = text_split(extract_pdf_data)
print(len(text_chunks))

6690


In [15]:
# text_chunks

In [13]:
# Peek at the text of one chunk (index 5) to eyeball the result of the split.
text_chunks[5].page_content

'renewal. In this revised and updated edition, Van Schendel offers\na fascinating and highly readable account of life in Bangladesh over\nthe last two millennia. Based on the latest academic research and\ncovering the numerous historical developments of the2010s, he\nprovides an eloquent introduction to a fascinating country and its\nresilient and inventive people. A perfect survey for travellers, expats,\nstudents and scholars alike.\nwillem van schendelserved as Professor of Modern Asian'

In [None]:
# from langchain.embeddings import HuggingFaceEmbeddings

# from sentence_transformers import SentenceTransformer
# from sentence_transformers.util import cos_sim

In [65]:
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
# Build the embedding model
def embedding_model():
    """Return a HuggingFaceEmbeddings instance for all-MiniLM-L6-v2.

    Returns:
        A ``HuggingFaceEmbeddings`` object configured with the
        "sentence-transformers/all-MiniLM-L6-v2" model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [None]:
# Instantiate the embedding model once; the same object is passed to the
# vector-store calls further down.
embeddings = embedding_model()

In [None]:
# sentences = ['That is a happy person', 'That is a very happy person']
# emb_output = embeddings.encode(sentences)
# print(cos_sim(emb_output[0], emb_output[1]))

tensor([[0.9429]])


In [None]:
# Query_result = embeddings.embed_query("Hello, world")
# print(len(Query_result))

384


In [86]:
from dotenv import load_dotenv
import os

load_dotenv()

# Read both API keys from the environment (populated by load_dotenv above or
# by the shell).
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")  # Pinecone API
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")  # OpenAI API

# Fail fast with a readable message when a key is absent. Previously a missing
# key was written back as None into os.environ, which raised an opaque
# TypeError ("str expected, not NoneType").
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if key_value is None
]
if _missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in your .env file or in the shell environment."
    )

# No write-back into os.environ is needed: both values were just read from it.

In [41]:
# NOTE(review): "/path/to/.env" looks like a placeholder path, and the recorded
# output of this cell is False — presumably nothing was loaded. Point this at
# the real .env file or remove the cell; load_dotenv() is already called in the
# cell above.
load_dotenv("/path/to/.env")

False

In [53]:
from pinecone import Pinecone, ServerlessSpec

# Create the Pinecone client with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [None]:
index_name = "histobot"

# Create the serverless index only when it does not exist yet, so this cell can
# be re-run without create_index failing on an already-existing index name.
# dimension=384 has to match the embedding vector length; the embed_query
# length check recorded earlier in this notebook printed 384.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [79]:
from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert it into the Pinecone index.
# The index is addressed through index_name (defined in the index-creation
# cell) rather than a repeated string literal, so it stays consistent with the
# from_existing_index call below.
# NOTE(review): re-running this cell sends all chunks again — confirm whether
# that creates duplicate vectors in the index before re-executing.
document_upsert = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)

In [None]:
# Open a vector store backed by the existing index named by index_name,
# using the same embedding object that was used when upserting the chunks.
docusearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [82]:
# Display the vector store object to confirm it was created.
docusearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x740641d780b0>

In [83]:
# Expose the vector store as a retriever using similarity search, requesting k=3 results per query.
retriever = docusearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [84]:
# Run a sample query through the retriever alone (no LLM involved yet).
retrieved_docs = retriever.invoke("when did the first SAARC summit held in Dhaka?")

In [85]:
# Display the retrieved documents with their metadata and page content.
retrieved_docs

[Document(id='9b2affda-029c-41b9-b59d-57928c3b5006', metadata={'author': 'HPPC', 'creationdate': '2025-02-21T22:55:53+06:00', 'creator': 'Writer', 'page': 305.0, 'page_label': '306', 'producer': 'LibreOffice 24.2', 'source': 'Dataset/ABriefHistoryofBangladedsh.pdf', 'total_pages': 393.0}, page_content='major conferences: Asian Relations Conference (New Delhi), Baguio  \nConference  (Philippines)  and  Colombo  Powers  Conference  (Sri  \nLanka), which were held between 1947 and 1954. Ex-president of  \nBangladesh, Ziaur Rahman was the one who made a formal proposal  \non May 2, 1980. The first SAARC summit was held in Dhaka on 8  \nDecember 1985, during the Presidency of H. M. Ershad, when the  \norganization  was  established.  Initially  the  number  of  SAARC'),
 Document(id='39dc7537-06cd-4a99-b2bc-0677f25564b8', metadata={'author': 'HPPC', 'creationdate': '2025-02-21T22:55:53+06:00', 'creator': 'Writer', 'page': 183.0, 'page_label': '184', 'producer': 'LibreOffice 24.2', 'source':

In [136]:
from langchain_openai import OpenAI
# LLM used to generate answers; temperature and max_tokens are set explicitly.
# NOTE(review): no API key is passed here — presumably it is picked up from the
# OPENAI_API_KEY environment variable; confirm.
llm = OpenAI(temperature=0.4, max_tokens=500)

In [137]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [138]:
# System prompt for the RAG chain. The adjacent string literals are joined by
# Python with no separator, so every sentence must carry its own trailing
# space; without it the model receives "task.Always answer ..." as one run.
# "{context}" is the placeholder filled by the prompt template.
System = (
    "You are a student assistant for question-answering tasks. "
    "Always answer according to the pieces of retrieved context. "
    "If you don't know the answer to the given question, politely reply that you don't know the answer. "
    "Use four sentences at most and keep the answer concise."
    "\n\n"
    "{context}"
)

In [139]:
# Chat prompt: the system message carries the instructions plus the {context}
# placeholder; the human message carries the user's question via {input}.
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", System),
        ("human", "{input}"),
    ]
)

In [140]:
# Build the question-answering chain from the LLM and the prompt, then put the
# retriever in front of it to form the full retrieval chain.
question_answering_chain = create_stuff_documents_chain(llm, prompt_template)
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [141]:
# Ask a question the indexed book covers and print only the generated answer.
response = rag_chain.invoke({"input" : "when did the first SAARC summit held in Dhaka?"})
print(response["answer"])


The first SAARC summit was held in Dhaka on 8 December 1985.


In [98]:
# Ask an out-of-scope question; the recorded output shows the chain declining
# to answer rather than guessing.
response = rag_chain.invoke({"input" : "What is data science?"})
print(response["answer"])



I am a student assistant and my knowledge is limited to the pieces of retrieved context. Unfortunately, I do not have enough information to answer your question about data science. However, I am happy to assist you with any other questions you may have.
