# Task 1: Load and Process Unstructured Word Document using LangChain

In [None]:

# Step 1: Import necessary libraries and download NLTK resources
import os

import nltk
from IPython.display import display, Markdown
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from langchain_community.document_loaders import UnstructuredWordDocumentLoader
from langchain_openai import AzureChatOpenAI
from langchain_openai import ChatOpenAI

# Download NLTK resources for text processing
print("Downloading NLTK resources...")
nltk_resources = ('punkt_tab', 'averaged_perceptron_tagger_eng')

# nltk.download() signals failure through its return value instead of raising,
# so collect the resources that did not download and only claim success when
# every one of them is available.
failed_resources = [name for name in nltk_resources if not nltk.download(name)]

if failed_resources:
    print(f"⚠️ Could not download NLTK resources: {', '.join(failed_resources)}")
else:
    print("✅ NLTK resources downloaded successfully")

# Step 2: Load the Word document using UnstructuredWordDocumentLoader
print("\n" + "="*60)
print("LOADING WORD DOCUMENT")
print("="*60)

# Load the Word document; `docs` holds whatever list the loader returns
loader = UnstructuredWordDocumentLoader("bali travel guide.docx")
docs = loader.load()

# Print confirmation of the number of sections loaded
print(f"✅ Document loaded with {len(docs)} sections.")

# Optional: preview the first section and report the size of everything loaded
if docs:
    print("📄 First 200 characters of content:")
    print(f"'{docs[0].page_content[:200]}...'")
    # Sum over every loaded section so the figure is a true total even when
    # the loader returns more than one section.
    total_length = sum(len(doc.page_content) for doc in docs)
    print(f"📊 Total content length: {total_length} characters")

# Step 3: Break the loaded document into chunks with RecursiveCharacterTextSplitter
print("\n" + "="*60, "SPLITTING DOCUMENT INTO CHUNKS", "="*60, sep="\n")

# Splitter configured for 800-character chunks with a 50-character overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=800)
chunks = text_splitter.split_documents(docs)

# Report how many chunks the splitter produced
print(f"✅ Number of chunks: {len(chunks)}")

# Optional: summarise the chunk sizes and preview the first chunk
if chunks:
    chunk_lengths = [len(chunk.page_content) for chunk in chunks]
    print(f"📊 Chunk size range: {min(chunk_lengths)} - {max(chunk_lengths)} characters")
    print("📄 First chunk preview:")
    print(f"'{chunks[0].page_content[:150]}...'")

# The last chunk is only worth previewing when it differs from the first one
if len(chunks) > 1:
    print("📄 Last chunk preview:")
    print(f"'{chunks[-1].page_content[:150]}...'")

Downloading NLTK resources...
✅ NLTK resources downloaded successfully

LOADING WORD DOCUMENT


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\yaseen_banu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\yaseen_banu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


✅ Document loaded with 1 sections.
📄 First 200 characters of content:
'BALI TRAVEL GUIDE

INTRODUCTION TO BALI

Bali, known as the "Island of the Gods," is a Indonesian province famous for its forested volcanic mountains, iconic rice paddies, beaches, and coral reefs. Th...'
📊 Total content length: 5934 characters

SPLITTING DOCUMENT INTO CHUNKS
✅ Number of chunks: 10
📊 Chunk size range: 325 - 787 characters
📄 First chunk preview:
'BALI TRAVEL GUIDE

INTRODUCTION TO BALI

Bali, known as the "Island of the Gods," is a Indonesian province famous for its forested volcanic mountains,...'
📄 Last chunk preview:
'CONCLUSION

Bali offers an incredible mix of culture, natural beauty, and adventure. Whether you're seeking spiritual enlightenment, exciting outdoor ...'


# Task 2: Embed Document Chunks using Hugging Face's Sentence Transformers and Store in FAISS

In [None]:

print("\n" + "="*60)
print("CREATING DOCUMENT EMBEDDINGS")
print("="*60)

# Step 1: Load Hugging Face's Sentence Transformers embedding model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L12-v2")

# Confirm the model is loaded successfully
print("✅ Embedding model loaded successfully.")

# Step 2: Embed the document chunks and store them in FAISS database
db_faiss = FAISS.from_documents(chunks, embedding_model)

# Confirm that the chunks have been successfully stored in FAISS
print("✅ Document chunks embedded and stored in FAISS vector database.")

# Optional: Display additional information about the vector database
print(f"📊 Number of vectors in FAISS database: {db_faiss.index.ntotal}")
# Read the dimension from the FAISS index that actually stores the vectors.
# Indexing into the embedding wrapper (client[1]) depends on the internal
# module order of the underlying model and on the wrapper exposing `client`.
print(f"📐 Vector dimension: {db_faiss.index.d}")

print("\n" + "="*60)
print("VECTOR DATABASE READY FOR RETRIEVAL")
print("="*60)


CREATING DOCUMENT EMBEDDINGS
✅ Embedding model loaded successfully.
✅ Document chunks embedded and stored in FAISS vector database.
📊 Number of vectors in FAISS database: 10
📐 Vector dimension: 384

VECTOR DATABASE READY FOR RETRIEVAL


# Task 3: Creating a Function to Retrieve Relevant Document Context

In [18]:
# Section banner for the retrieval-function cell
print("\n" + "="*60, "CREATING DOCUMENT RETRIEVAL FUNCTION", "="*60, sep="\n")

# Step 1: Define the function to retrieve relevant document chunks
def retrieve_docs(query: str, k: int = 4) -> list:
    """Return the chunks in the FAISS store that are most similar to `query`.

    Args:
        query: Search text to compare against the stored chunks.
        k: Number of chunks to request; must be at least 1. Defaults to 4,
            the same default LangChain uses for `similarity_search`.

    Returns:
        The list produced by `db_faiss.similarity_search` for this query.

    Raises:
        ValueError: If `k` is less than 1.
    """
    # Fail early with a readable message instead of passing a meaningless
    # result count down to the vector store.
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Perform similarity search on the module-level FAISS database
    return db_faiss.similarity_search(query, k=k)

print("✅ Document retrieval function created successfully.")

print("\n" + "="*60)
print("TESTING RETRIEVAL FUNCTION")
print("="*60)

# Step 2: Test the function with a sample query
query = 'where should i go in bali if i want to experience its nightlife?'
context = retrieve_docs(query, 5)

# Display the first retrieved chunk to verify correct retrieval
print(f"📄 Query: '{query}'")
print(f"📊 Number of chunks retrieved: {len(context)}")
if context:
    print("\n🔍 First retrieved chunk:")
    print(context[0])
else:
    # Indexing an empty result would raise IndexError; report it instead
    print("\n⚠️ No chunks were retrieved for this query.")

print("\n" + "="*60)
print("DISPLAYING ALL RETRIEVED CHUNKS")
print("="*60)

# Optional: Display all retrieved chunks with their content
for i, doc in enumerate(context, 1):
    print(f"\n📄 Chunk {i}:")
    print(f"Content: {doc.page_content}")
    print(f"Metadata: {doc.metadata}")
    print("-" * 40)


CREATING DOCUMENT RETRIEVAL FUNCTION
✅ Document retrieval function created successfully.

TESTING RETRIEVAL FUNCTION
📄 Query: 'where should i go in bali if i want to experience its nightlife?'
📊 Number of chunks retrieved: 5

🔍 First retrieved chunk:
page_content='PRACTICAL INFORMATION

GETTING AROUND

Scooters are the most popular way to get around Bali, available for rent everywhere. However, traffic can be chaotic and roads may be challenging for inexperienced riders. Taxis and ride-hailing apps like Grab are widely available. For longer distances, consider hiring a private driver.

ACCOMMODATION

Bali offers accommodation for every budget, from backpacker hostels to luxury resorts. Ubud offers boutique hotels and wellness retreats, while Seminyak features luxury beachfront resorts. Canggu has budget-friendly guesthouses and surf camps. Book in advance during peak season (July-August and December-January).

CURRENCY AND COSTS' metadata={'source': 'bali travel guide.docx'}

DISPLAYING ALL RETRIEVED CHUNKS
[remaining cell output truncated]

# Task 4: Generating Contextual Travel Responses using LangChain and OpenAI

In [19]:

print("\n" + "="*60)
print("SETTING UP OPENAI CLIENT AND GENERATING RESPONSES")
print("="*60)

# Step 1: Initialize the LangChain AzureChatOpenAI client
# Every connection setting is read from the environment. Check them all up
# front so a missing one is reported by name together with any others,
# rather than surfacing as a KeyError for only the first missing variable.
required_env_vars = (
    "AZURE_OPENAI_DEPLOYMENT_NAME",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_API_VERSION",
)
missing_env_vars = [name for name in required_env_vars if name not in os.environ]
if missing_env_vars:
    raise KeyError(
        "Missing Azure OpenAI environment variables: " + ", ".join(missing_env_vars)
    )

# Azure OpenAI setup (update deployment name if needed)
llm = AzureChatOpenAI(
    azure_deployment=os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"],
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    temperature=0.3
)

print("✅ ChatOpenAI client initialized successfully.")

# Step 2: Define the user query and the system prompt for the assistant
query = 'where should I go in Bali if I want to experience its nightlife?'
system_message = """
    You are a travel planner specializing in Bali, Indonesia.
    Your role is to assist with inquiries related to this destination, providing accurate and detailed information.
    Answer questions using the context provided, ensuring responses are tailored to the specific needs of the user.
"""
print("✅ Query and system message defined.")

# Step 3: Retrieve relevant context from the FAISS database
context = retrieve_docs(query, 10)
print(f"✅ Retrieved {len(context)} relevant document chunks for context.")

# Join the retrieved chunks into one string, separated by blank lines
context_text = "\n\n".join(doc.page_content for doc in context)

# Build the (role, text) pairs sent to the model, with the context placed
# ahead of the user's question in the human turn
human_message = f"Context from documents:\n{context_text}\n\nUser question: {query}"
messages = [("system", system_message), ("human", human_message)]
print("✅ Messages structured with retrieved context.")

print("\n" + "="*60, "GENERATING TRAVEL RESPONSE", "="*60, sep="\n")

# Step 4: Generate and display the travel response
# Send the structured messages to the model
response = llm.invoke(messages)

# Render the reply text as markdown in the notebook
answer_markdown = Markdown(response.content)
display(answer_markdown)

print("\n" + "="*60, "RESPONSE GENERATED SUCCESSFULLY", "="*60, sep="\n")


SETTING UP OPENAI CLIENT AND GENERATING RESPONSES
✅ ChatOpenAI client initialized successfully.
✅ Query and system message defined.
✅ Retrieved 10 relevant document chunks for context.
✅ Messages structured with retrieved context.

GENERATING TRAVEL RESPONSE


If you're looking to experience the nightlife in Bali, Seminyak and Canggu are two of the best areas to explore.

**Seminyak** is known for its upscale beach clubs, vibrant nightlife, and trendy bars. Popular spots include:

- **Ku De Ta**: A famous beach club that offers a lively atmosphere, great cocktails, and stunning sunset views.
- **Potato Head Beach Club**: Known for its infinity pool and beach access, it's a perfect place to relax during the day and enjoy the nightlife.
- **La Plancha**: A colorful beach bar that offers a more laid-back vibe with bean bags on the sand, making it a great spot for sunset drinks.

**Canggu** has a more relaxed and bohemian vibe, popular with surfers and digital nomads. Here, you can find:

- **Old Man's**: A beachfront bar that hosts live music and events, making it a great place to socialize.
- **The Lawn**: A stylish venue with a pool and beachfront views, perfect for enjoying cocktails and watching the sunset.
- **Pretty Poison**: A unique spot known for its skate bowl and vibrant atmosphere, offering a mix of live music and DJ sets.

Both areas offer a mix of bars, clubs, and beach clubs, ensuring you have plenty of options to enjoy Bali's nightlife.


RESPONSE GENERATED SUCCESSFULLY
