In [11]:
import os
import dotenv
from pathlib import Path

from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders import (
    WebBaseLoader, 
    PyPDFLoader, 
    Docx2txtLoader,
)
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

dotenv.load_dotenv()

True

In [16]:
#os.environ["OPENAI_API_KEY"] = "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"


In [12]:
# Load docs

doc_paths = [
    "D:/RAG_Travel_Assistance/rag_travel_assistance/docs/GBERTPaper.pdf",
    "D:/RAG_Travel_Assistance/rag_travel_assistance/docs/GottbertPaper.pdf",
]

docs = [] 
for doc_file in doc_paths:
    file_path = Path(doc_file)
    print(file_path)

    try:
        if doc_file.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif doc_file.endswith(".docx"):
            loader = Docx2txtLoader(file_path)
        elif doc_file.endswith(".txt") or doc_file.name.endswith(".md"):
            loader = TextLoader(file_path)
        else:
            print(f"Document type {doc_file.type} not supported.")
            continue

        docs.extend(loader.load())

    except Exception as e:
        print(f"Error loading document {doc_file.name}: {e}")



url = "https://www.seatguru.com/airlines/Lufthansa/baggage.php"
try:
    print(f"\nLoading content from URL: {url}")
    loader = WebBaseLoader(url)
    url_docs = loader.load()
    docs.extend(url_docs)
    
    # Print the content from URL
    print("\nContent loaded from URL:")
    print("-" * 50)
    for i, doc in enumerate(url_docs):
        print(f"\nDocument {i+1}:")
        print(f"Page Content: {doc.page_content[:500]}...")  # Print first 500 characters
        print(f"Metadata: {doc.metadata}")
    print("-" * 50)
    print(f"\nTotal number of documents loaded from URL: {len(url_docs)}")

except Exception as e:
    print(f"Error loading document from {url}: {e}")

# Print summary
print(f"\nTotal number of all documents loaded: {len(docs)}") 

D:\RAG_Travel_Assistance\rag_travel_assistance\docs\GBERTPaper.pdf
D:\RAG_Travel_Assistance\rag_travel_assistance\docs\GottbertPaper.pdf

Loading content from URL: https://www.seatguru.com/airlines/Lufthansa/baggage.php

Content loaded from URL:
--------------------------------------------------

Document 1:
Page Content: 














 
Lufthansa: Baggage Fees and Policy - SeatGuru




























Seat Maps
Airlines
Cheap Flights

Comparison Charts



Short-haul Economy Class
Short-haul First/Business Class
Long-haul Economy Class
Premium Economy Class
Long-haul Business Class
Long-haul First Class



Rental Cars
Guru Tips




























                    Love travel? Sign up for our free newsletter and get the latest news, insights, and money-saving tips.
		    


			    By proceedin...
Metadata: {'source': 'https://www.seatguru.com/airlines/Lufthansa/baggage.php', 'title': 'Lufthansa: Baggage Fees and Policy - SeatGuru', 'description': ' Before your next 

In [13]:
print(docs)

[Document(metadata={'producer': 'pdfTeX-1.40.20', 'creator': 'TeX', 'creationdate': '2020-11-03T16:20:53+00:00', 'author': 'Branden Chan ; Stefan Schweter ; Timo Möller', 'moddate': '2020-11-03T16:20:53+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.20 (TeX Live 2019) kpathsea version 6.3.1', 'subject': 'COLING2020 2020', 'title': "German's Next Language Model", 'trapped': '/False', 'source': 'D:\\RAG_Travel_Assistance\\rag_travel_assistance\\docs\\GBERTPaper.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='Proceedings of the 28th International Conference on Computational Linguistics, pages 6788–6796\nBarcelona, Spain (Online), December 8-13, 2020\n6788\nGerman’s Next Language Model\nBranden Chan∗†, Stefan Schweter∗‡, Timo M¨oller†\n†deepset\n{branden.chan, timo.moeller}@deepset.ai\n‡Bayerische Staatsbibliothek M¨unchen\nDigital Library/Munich Digitization Center\nstefan.schweter@bsb-muenchen.de\nAbstract\nIn this work we present the experim

In [14]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=1000)
document_chunks = text_splitter.split_documents(docs)
print(document_chunks)

[Document(metadata={'producer': 'pdfTeX-1.40.20', 'creator': 'TeX', 'creationdate': '2020-11-03T16:20:53+00:00', 'author': 'Branden Chan ; Stefan Schweter ; Timo Möller', 'moddate': '2020-11-03T16:20:53+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.20 (TeX Live 2019) kpathsea version 6.3.1', 'subject': 'COLING2020 2020', 'title': "German's Next Language Model", 'trapped': '/False', 'source': 'D:\\RAG_Travel_Assistance\\rag_travel_assistance\\docs\\GBERTPaper.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='Proceedings of the 28th International Conference on Computational Linguistics, pages 6788–6796\nBarcelona, Spain (Online), December 8-13, 2020\n6788\nGerman’s Next Language Model\nBranden Chan∗†, Stefan Schweter∗‡, Timo M¨oller†\n†deepset\n{branden.chan, timo.moeller}@deepset.ai\n‡Bayerische Staatsbibliothek M¨unchen\nDigital Library/Munich Digitization Center\nstefan.schweter@bsb-muenchen.de\nAbstract\nIn this work we present the experim

In [17]:
vector_db = Chroma.from_documents(document_chunks, OpenAIEmbeddings())

In [None]:
# Retrieve

def _get_context_retriever_chain(vector_db, llm):
    retriever = vector_db.as_retriever()
    prompt = ChatPromptTemplate.from_messages([
        MessagesPlaceholder(variable_name="messages"),
        ("user", "{input}"),
        ("user", "Given the above conversation, generate a search query to look up in order to get inforamtion relevant to the conversation, focusing on the most recent messages."),
    ])
    retriever_chain = create_history_aware_retriever(llm, retriever, prompt)

    return retriever_chain