In [229]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [190]:
%pwd

'/Users/sakib/Documents/Project/Chatbot'

In [230]:
# Extract text from PDF files
def load_pdf_files(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )

    documents = loader.load()
    return documents

In [231]:
extracted_data = load_pdf_files("data")

In [232]:
extracted_data

[Document(metadata={'producer': 'Skia/PDF m144 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'Travel Assistant Book', 'source': 'data/Travel Assistant Book.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1'}, page_content='Toronto  —  Detailed  Overview  With  Tourist  Attractions   Toronto  is  the  largest  and  most  influential  city  in  Canada,  positioned  along  the  northwestern  \nshore\n \nof\n \nLake\n \nOntario\n \nand\n \nrecognized\n \nas\n \nthe\n \nnation’s\n \ncultural,\n \nfinancial,\n \nand\n \nentertainment\n \ncapital.\n \nCelebrated\n \nas\n \none\n \nof\n \nthe\n \nmost\n \nmulticultural\n \ncities\n \nin\n \nthe\n \nworld,\n \nToronto\n \nis\n \nhome\n \nto\n \nover\n \n200\n \nethnic\n \ncommunities,\n \ncreating\n \nan\n \nincredibly\n \ndiverse\n \natmosphere\n \nreflected\n \nin\n \nits\n \nneighborhoods,\n \nfood\n \nscene,\n \nfestivals,\n \nand\n \neveryday\n \nlife.\n \nThe\n \ncity’s\n \nmodern\n \nskyline—dominated\n \nby\n

In [233]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs:List[Document]) -> List[Document]:
    """
    Given a list of Document objects, return a new list of Document objects 
    containing only 'source' in metadata and the page_content.
    """


    minimal_docs: List[Document] = []

    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(Document(
                                     page_content = doc.page_content,
                                     metadata={"source":src}))
        
        return minimal_docs


In [234]:
minimal_docs = filter_to_minimal_docs(extracted_data)
minimal_docs

[Document(metadata={'source': 'data/Travel Assistant Book.pdf'}, page_content='Toronto  —  Detailed  Overview  With  Tourist  Attractions   Toronto  is  the  largest  and  most  influential  city  in  Canada,  positioned  along  the  northwestern  \nshore\n \nof\n \nLake\n \nOntario\n \nand\n \nrecognized\n \nas\n \nthe\n \nnation’s\n \ncultural,\n \nfinancial,\n \nand\n \nentertainment\n \ncapital.\n \nCelebrated\n \nas\n \none\n \nof\n \nthe\n \nmost\n \nmulticultural\n \ncities\n \nin\n \nthe\n \nworld,\n \nToronto\n \nis\n \nhome\n \nto\n \nover\n \n200\n \nethnic\n \ncommunities,\n \ncreating\n \nan\n \nincredibly\n \ndiverse\n \natmosphere\n \nreflected\n \nin\n \nits\n \nneighborhoods,\n \nfood\n \nscene,\n \nfestivals,\n \nand\n \neveryday\n \nlife.\n \nThe\n \ncity’s\n \nmodern\n \nskyline—dominated\n \nby\n \nthe\n \niconic\n \n553-meter\n \nCN\n \nTower—sits\n \nabove\n \na\n \nbustling\n \nurban\n \ncore\n \nfilled\n \nwith\n \ntheatres,\n \nmuseums,\n \nshopping\n \ndistri

####Chunking

In [235]:
# Split the documents into smaller chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = 500,
        chunk_overlap = 10,
    )

    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [236]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks:{len(texts_chunk)}")

Number of chunks:9


### Embedding

In [237]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Download and return the HuggingFace embeddings model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name
    )
    return embeddings

embedding = download_embeddings()

In [238]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [239]:
vector = embedding.embed_query("Hi")
print( "Vector length:", len(vector))

Vector length: 384


In [240]:
import os
#os.environ["OPENAI_API_KEY"] = 

###Pinecone dB

In [241]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [242]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [243]:
from pinecone import Pinecone 
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [244]:
from pinecone import ServerlessSpec 

index_name = "chatbot"

if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,  # Dimension of the embeddings
        metric= "cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


index = pc.Index(index_name)

In [245]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [246]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [247]:
## Add more data to existing pinecone index

docs_extra = Document(
    page_content= "Tourist Attraction: Niagara Falls (Ontario), Niagara Falls is one of Canada’s most iconic natural attractions, located on the border between Ontario, Canada, and New York, USA. Famous for its breathtaking beauty and massive water flow, the falls consist of three main sections: Horseshoe Falls (the largest and most impressive), American Falls, and Bridal Veil Falls. Visitors can experience the falls from multiple viewpoints along the Niagara Parkway, but the most immersive experience is through the Hornblower Boat Cruise, where boats take passengers close enough to feel the mist. The area surrounding the falls offers attractions including Journey Behind the Falls, Skylon Tower observation deck, and Clifton Hill, a lively entertainment district filled with restaurants, museums, arcades, and attractions. Niagara Falls is accessible from Toronto in about 1.5 hours by car or GO Train, making it one of the most convenient and popular day trips for tourists visiting Canada.",
    metadata ={"source":"Website"}
)

In [248]:
docsearch.add_documents(documents=[docs_extra])

['f8dcced8-600e-4841-a217-c86ffda99351']

In [249]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})


In [250]:
retrieved_docs = retriever.invoke("What are the best places to visit in Toronto??")
retrieved_docs

[Document(id='2ed0beaa-7a43-4a34-9339-04b0e8ebf07b', metadata={'source': 'data/Travel Assistant Book.pdf'}, page_content='its\n \nwalkable\n \ndowntown,\n \nwaterfront\n \npromenades,\n \nand\n \ninterconnected\n \nneighborhoods\n \nmake\n \nexploration\n \neasy\n \nand\n \nenjoyable.\n \nOverall,\n \nToronto\n \nstands\n \nout\n \nas\n \na\n \nvibrant,\n \ninclusive,\n \nand\n \nculturally\n \ndynamic\n \nmetropolis\n \nwhere\n \nglobal'),
 Document(id='8ac543f5-b1b3-4372-94f8-1e41e52503d0', metadata={'source': 'data/Travel Assistant Book.pdf'}, page_content='Toronto  —  Detailed  Overview  With  Tourist  Attractions   Toronto  is  the  largest  and  most  influential  city  in  Canada,  positioned  along  the  northwestern  \nshore\n \nof\n \nLake\n \nOntario\n \nand\n \nrecognized\n \nas\n \nthe\n \nnation’s\n \ncultural,\n \nfinancial,\n \nand\n \nentertainment\n \ncapital.\n \nCelebrated\n \nas\n \none\n \nof\n \nthe\n \nmost\n \nmulticultural\n \ncities\n \nin\n \nthe\n \nworld,\

In [212]:
from langchain_openai import ChatOpenAI

chatModel = ChatOpenAI(model="gpt-4o-mini", api_key=os.environ["OPENAI_API_KEY"])

In [213]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [132]:
system_prompt = (
    "You are a Travel Information Assistant that answers questions about Canada and its major cities "
    "(Toronto, Vancouver, Montreal, Calgary, and Ottawa). "
    "Use the retrieved context provided to answer the user's question accurately. "
    "If the answer is not in the context, respond with 'I don't know based on the provided information.' "
    "Keep the answer concise, clear, and limited to a maximum of three sentences. "
    "Do not add information that is not supported by the context. "
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


In [133]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [135]:
response = rag_chain.invoke({"input": "What are the main attractions in Toronto?"})
print(response["answer"])

I don't know based on the provided information.


In [251]:
from langchain_openai import ChatOpenAI
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
import os

# ---------------------------------------------------------
# 3) INITIALIZE OPENAI MODEL
# ---------------------------------------------------------
llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0
)

# ---------------------------------------------------------
# 4) SYSTEM PROMPT FOR RAG
# ---------------------------------------------------------
system_prompt = (
    "You are a Travel Information Assistant that answers questions about Canada "
    "and its major cities (Toronto, Vancouver, Montreal, Calgary, and Ottawa). "
    "Use ONLY the retrieved context to answer. "
    "If the context does not contain the answer, say: 'I don't know based on the provided information.'\n\n"
    "CONTEXT:\n{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

# ---------------------------------------------------------
# 5) RAG PIPELINE
# ---------------------------------------------------------
# retriever MUST already be defined earlier in your notebook
# example: retriever = vectorstore.as_retriever()

qa_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt
)

rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=qa_chain
)

# ---------------------------------------------------------
# 6) TEST THE RAG PIPELINE
# ---------------------------------------------------------
query = "Tell me about Niagara Falls"
response = rag_chain.invoke({"input": query})




In [252]:
query_result = response["answer"]
print(query_result)




Niagara Falls is one of Canada’s most iconic natural attractions, located on the border between Ontario, Canada, and New York, USA. It is famous for its breathtaking beauty and massive water flow, consisting of three main sections: Horseshoe Falls (the largest and most impressive), American Falls, and Bridal Veil Falls. Visitors can experience the falls from multiple viewpoints along the Niagara Parkway, but the most immersive experience is through the Hornblower Boat Cruise, where boats take passengers close enough to feel the mist. The area surrounding the falls offers attractions including Journey Behind the Falls, Skylon Tower observation deck, and Clifton Hill, a lively entertainment district filled with restaurants, museums, arcades, and attractions. Niagara Falls is accessible from Toronto in about 1.5 hours by car or GO Train, making it one of the most convenient and popular day trips for tourists visiting Canada.


In [253]:
query = "Tell me about tourist attractions in Toronto?"
response = rag_chain.invoke({"input": query})
query_result = response["answer"]
print(query_result)

Toronto offers a variety of tourist attractions, including:

1. **Royal Ontario Museum** - Canada’s largest museum featuring global history, dinosaurs, and Indigenous collections.
2. **Art Gallery of Ontario** - Houses more than 95,000 works and offers free admission on Wednesday evenings.
3. **Niagara Falls** - Located about 1.5 hours from Toronto, it is one of Canada’s most iconic natural attractions, known for its breathtaking beauty and massive water flow. Visitors can experience the falls through viewpoints along the Niagara Parkway or take the Hornblower Boat Cruise for an immersive experience.

Additionally, Toronto is known for its shopping districts, such as the Eaton Centre, and its multicultural atmosphere, being home to over 200 ethnic communities.


In [254]:
query = "Tell me about transportation in toronto?"
response = rag_chain.invoke({"input": query})
query_result = response["answer"]
print(query_result)

Toronto has reliable public transportation made up of subways, buses, and streetcars. The city's safety, cleanliness, and transportation options enhance its appeal for visitors. Additionally, Toronto features a walkable downtown, waterfront promenades, and interconnected neighborhoods, making exploration easy and enjoyable.
