## Step 1
# Data Loading & Normalising Text

In [5]:
import os
import re
import pdfplumber
import fitz  # This is PyMuPDF
from langchain_community.document_loaders import PyPDFLoader

# 📁 Base path
base_data_path = r"C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\data"

# 📄 Input PDFs
pdf_paths = {
    "Travelite Brochure": os.path.join(base_data_path, "raw_docs", "45.pdf"),
    "Air India Security": os.path.join(base_data_path, "raw_docs", "security-regulations-dangerous-goods-restricted-items.pdf"),
    "Air India Express Fees": os.path.join(base_data_path, "raw_docs", "AIX-FeesandCharges-12-OCT-23.pdf"),
    "IndiGo ZED Policy": os.path.join(base_data_path, "raw_docs", "ZEDPolicy.pdf"),
    "Alliance Air Baggage": os.path.join(base_data_path, "raw_docs", "baggage-policy.pdf")
}

# 📁 Output folders
cleaned_output_dir = os.path.join(base_data_path, "cleaned_docs")
images_output_dir = os.path.join(base_data_path, "extracted_images")
os.makedirs(cleaned_output_dir, exist_ok=True)
os.makedirs(images_output_dir, exist_ok=True)

# 🧼 Clean text
def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^\w\s.,;:!?()/-]", "", text)
    return text

# 🖼️ Extract images from PDF
def extract_images(pdf_path, pdf_name):
    print(f"🖼️ Extracting images from: {pdf_name}")
    doc = fitz.open(pdf_path)
    pdf_image_dir = os.path.join(images_output_dir, pdf_name.replace(" ", "_"))
    os.makedirs(pdf_image_dir, exist_ok=True)
    image_count = 0

    for page_num in range(len(doc)):
        for img_index, img in enumerate(doc[page_num].get_images(full=True)):
            xref = img[0]
            base_img = doc.extract_image(xref)
            image_data = base_img["image"]
            ext = base_img["ext"]
            image_path = os.path.join(pdf_image_dir, f"page{page_num+1}_img{img_index+1}.{ext}")
            with open(image_path, "wb") as f:
                f.write(image_data)
            image_count += 1

    print(f"✅ Extracted {image_count} image(s) to: {pdf_image_dir}")
    return image_count

# 📄 Main extraction
def extract_and_save(name, pdf_path):
    try:
        print(f"\n📄 Processing: {name}")

        # Step 1: Extract text
        loader = PyPDFLoader(pdf_path)
        documents = loader.load_and_split()
        full_text = "\n\n".join([doc.page_content for doc in documents])
        print(f"✅ Extracted {len(documents)} pages of text")

        # Step 2: Extract tables
        table_text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for i, page in enumerate(pdf.pages):
                tables = page.extract_tables()
                for t_index, table in enumerate(tables):
                    table_text += f"\n\n[Table from Page {i+1} - Table {t_index+1}]\n"
                    for row in table:
                        table_text += "\t".join(cell or "" for cell in row) + "\n"
        if table_text.strip():
            print(f"📊 Extracted tables from pdfplumber")

        # Step 3: Extract images
        extract_images(pdf_path, name)

        # Step 4: Combine and clean
        combined_text = full_text + "\n\n" + table_text
        cleaned_text = clean_text(combined_text)

        # Step 5: Save cleaned text
        cleaned_file_path = os.path.join(cleaned_output_dir, f"{name.replace(' ', '_')}.txt")
        with open(cleaned_file_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)

        print(f"📁 Cleaned text saved to: {cleaned_file_path}")

    except Exception as e:
        print(f"❌ Error processing {name}: {e}")

# ▶️ Run
if __name__ == "__main__":
    for name, path in pdf_paths.items():
        extract_and_save(name, path)

    print("\n✅ Step 1A complete. All text, tables, and images extracted & saved.")



📄 Processing: Travelite Brochure
✅ Extracted 51 pages of text
📊 Extracted tables from pdfplumber
🖼️ Extracting images from: Travelite Brochure
✅ Extracted 206 image(s) to: C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\data\extracted_images\Travelite_Brochure
📁 Cleaned text saved to: C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\data\cleaned_docs\Travelite_Brochure.txt

📄 Processing: Air India Security
✅ Extracted 5 pages of text
📊 Extracted tables from pdfplumber
🖼️ Extracting images from: Air India Security
✅ Extracted 283 image(s) to: C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\data\extracted_images\Air_India_Security
📁 Cleaned text saved to: C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\data\cleaned_docs\Air_India_Security.txt

📄 Processing: Air India Express Fees
✅ Extracted 3 pages of text
📊 Extracted tables from pdfplumber
🖼️ Extractin

# Creating Embeddings

In [6]:
import os
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

# 📂 Paths
cleaned_docs_dir = r"C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\data\cleaned_docs"

# 🔧 Chunking settings
CHUNK_SIZE = 800
CHUNK_OVERLAP = 100

# 🔁 Chunker
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    add_start_index=True,
)

# 📄 Load + chunk each file
documents = []

for filename in os.listdir(cleaned_docs_dir):
    if filename.endswith(".txt"):
        path = os.path.join(cleaned_docs_dir, filename)
        print(f"📄 Loading: {filename}")
        loader = TextLoader(path, encoding="utf-8")
        raw_docs = loader.load()
        chunks = splitter.split_documents(raw_docs)
        for chunk in chunks:
            chunk.metadata["source"] = filename
        documents.extend(chunks)

print(f"\n📦 Total chunks: {len(documents)}")

# 🔍 Embedding model
hf = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': False}
)

# 🧠 Generate embeddings
texts = [doc.page_content for doc in documents]
print(f"🔁 Creating embeddings for {len(texts)} chunks...")
embeddings = hf.embed_documents(texts)

print(f"\n✅ Embeddings created for all chunks.")


📄 Loading: Air_India_Express_Fees.txt
📄 Loading: Air_India_Security.txt
📄 Loading: Alliance_Air_Baggage.txt
📄 Loading: IndiGo_ZED_Policy.txt
📄 Loading: Travelite_Brochure.txt

📦 Total chunks: 359


  hf = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


🔁 Creating embeddings for 359 chunks...

✅ Embeddings created for all chunks.


In [None]:
!pip install sentence-transformers

Collecting numpy>=1.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl (15.8 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.3.1
    Uninstalling numpy-2.3.1:
      Successfully uninstalled numpy-2.3.1
Successfully installed numpy-1.26.4


  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pyfume 0.3.4 requires numpy==1.24.4, but you have numpy 1.26.4 which is incompatible.
streamlit 1.30.0 requires packaging<24,>=16.8, but you have packaging 24.1 which is incompatible.


## ✅ validate_embeddings()

In [7]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from typing import List
from langchain.schema import Document

def validate_embeddings(embeddings: List[List[float]], documents: List[Document], show_preview=True) -> None:
    print("\n🔎 Validating embeddings...\n")

    # 1. Count check
    if len(embeddings) != len(documents):
        print(f"❌ Count mismatch: {len(embeddings)} embeddings vs {len(documents)} documents")
        return
    else:
        print(f"✅ Embedding count matches document count: {len(embeddings)}")

    # 2. Dimensionality check
    vector_lengths = [len(vec) for vec in embeddings]
    unique_dims = set(vector_lengths)
    print(f"📏 Unique embedding dimensions: {unique_dims}")
    if len(unique_dims) > 1:
        print("⚠️ Warning: inconsistent embedding dimensions")

    # 3. NaN check
    nan_count = sum(np.isnan(vec).any() for vec in embeddings)
    print(f"❌ Embeddings with NaN values: {nan_count}")

    # 4. All-zero vector check
    zero_count = sum(np.allclose(vec, 0) for vec in embeddings)
    print(f"⚠️  Embeddings that are all zeros: {zero_count}")

    # 5. Cosine similarity sample
    if len(embeddings) >= 2:
        sim = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
        print(f"🧠 Cosine similarity between chunk 0 and 1: {sim:.4f}")

    # 6. Optional preview
    if show_preview:
        print("\n📄 Sample Chunk Text:")
        print(documents[0].page_content[:300], "...\n")
        print("🔢 Sample Embedding Vector (first 5 dims):")
        print(embeddings[0][:5])


In [8]:
validate_embeddings(embeddings, documents)



🔎 Validating embeddings...

✅ Embedding count matches document count: 359
📏 Unique embedding dimensions: {768}
❌ Embeddings with NaN values: 0
⚠️  Embeddings that are all zeros: 0
🧠 Cosine similarity between chunk 0 and 1: 0.6709

📄 Sample Chunk Text:
1. all fees displayed are per pax per journey (unless specified) for one-way direct 
flights including taxes (if applicable)
2. prices may vary depending on your travel period, but will not be higher than 
those reflected in the table below.
3. guests shall be entitled to one piece of checked baggag ...

🔢 Sample Embedding Vector (first 5 dims):
[0.034289319068193436, -0.10171958059072495, -0.013270975090563297, 0.06298941373825073, 0.02094653807580471]


## Vector Store

In [9]:
from langchain.vectorstores import FAISS
import os

# ✅ Combine texts + embeddings into pairs
text_embedding_pairs = [(doc.page_content, emb) for doc, emb in zip(documents, embeddings)]

# ✅ Create FAISS index
vectorstore = FAISS.from_embeddings(
    text_embedding_pairs,  # List of (text, embedding_vector) tuples
    embedding=hf           # Your HuggingFaceEmbeddings instance
)

# ✅ Save FAISS index
faiss_index_path = r"C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\models\rag_vector_store\travel_docs_index"
os.makedirs(faiss_index_path, exist_ok=True)

vectorstore.save_local(faiss_index_path)

print(f"\n✅ FAISS vector store saved successfully at:\n{faiss_index_path}")



✅ FAISS vector store saved successfully at:
C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\models\rag_vector_store\travel_docs_index


In [10]:
from langchain.vectorstores import FAISS

# ✅ Path to your saved FAISS index
faiss_index_path = r"C:\Users\naray\OneDrive\Pictures\Desktop\01. My Learning\new\smart-travel-advisor\models\rag_vector_store\travel_docs_index"

# ✅ Load the FAISS index (use embeddings=)
vectorstore = FAISS.load_local(faiss_index_path, embeddings=hf,allow_dangerous_deserialization=True)

# ✅ Create retriever
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# 🔍 Test query
query ="what are Dangerous Goods & Restricted Items?"
results = retriever.get_relevant_documents(query)

# 📄 Show results
print(f"\n🔍 Top Results for: {query}\n")
for i, doc in enumerate(results, 1):
    print(f"\n📄 Result {i}:\n{doc.page_content[:100]}")
    print("📎 Metadata:", doc.metadata)



🔍 Top Results for: what are Dangerous Goods & Restricted Items?


📄 Result 1:
flammable items
such as aerosol (any except for personal care or toiletries in limited quantities), 
📎 Metadata: {}

📄 Result 2:
(including cooking fuels and any flammable liquid fuel), gasoline, gas torches, lighter
fluid, strik
📎 Metadata: {}

📄 Result 3:
cartridges, hand guns, fireworks, and pistol caps
 flammable liquids and solids such as lighter refi
📎 Metadata: {}

📄 Result 4:
security regulations
dangerous goods  restricted items
air india does not carry any kind of dangerou
📎 Metadata: {}

📄 Result 5:
security regulations
items which cannot be carried in hand baggage and even as checked -in
baggage
e
📎 Metadata: {}


  results = retriever.get_relevant_documents(query)


In [11]:
faiss_index_path

'C:\\Users\\naray\\OneDrive\\Pictures\\Desktop\\01. My Learning\\new\\smart-travel-advisor\\models\\rag_vector_store\\travel_docs_index'

In [12]:
# 🔍 Create retriever

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [13]:
from langchain.chat_models import ChatOpenAI
# 💬 Load LLM
llm = ChatOpenAI(
    model="gpt-3.5-turbo", 
    temperature=0.2,
    openai_api_key="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"  # Or use os.getenv()
)


  llm = ChatOpenAI(


In [14]:
# 🔗 Create RAG chain
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True
)

In [15]:
# ✅ Ask a question
query = "what are Dangerous Goods & Restricted Items?"
result = rag_chain(query)

  result = rag_chain(query)


In [16]:
result

{'query': 'what are Dangerous Goods & Restricted Items?',
 'result': 'Dangerous Goods and Restricted Items are items that are considered hazardous or potentially harmful if transported on an aircraft. These items include flammable materials like aerosols, fuels, and gasoline, oxidizing materials like bleaching powder, poisonous and infectious substances like insecticides, radioactive materials, corrosives like acids and alkalis, and other dangerous articles like offensive or irritating materials. These items are restricted or prohibited from being carried on board an aircraft for safety reasons.',
 'source_documents': [Document(id='387822d0-ee21-4a54-ba11-eb615bb40e5f', metadata={}, page_content='flammable items\nsuch as aerosol (any except for personal care or toiletries in limited quantities), fuels\n(including cooking fuels and any flammable liquid fuel), gasoline, gas torches, lighter\nfluid, strike anywhere matches, turpentine and paint thinner, realistic replicas of\nincendiaries

In [17]:
 # ✅ Ask a question
query = "Do domestic flights provide free meals?"
result = rag_chain(query)

In [18]:
result

{'query': 'Do domestic flights provide free meals?',
 'result': 'Based on the provided information, for flights with a duration of less than 75 minutes, only lite bites will be available instead of hot meals. So, it seems that domestic flights may not provide free meals, especially for shorter flights.',
 'source_documents': [Document(id='abaf3186-8a52-400d-abea-8a7798cb5aba', metadata={}, page_content='meals sandwich included included included\nxtra carry on bag 3 kg extra cabin bag - - included\nxpress ahead\npriority check-in -\nincluded includedpriority baggage -\npriority boarding -\nprice - domestic 700 1,000 1,700\nprice  international 700 1,200 2,750\nair india express\nfees  charges - domestic\nadvantage add-on packs\npre-book\ncabin baggage\nfee 7kg 3 kgs 5 kgs\n free 1050 1750\nfor flight duration less than 75 minutes, only lite bites will be available instead of hot meals.\nadd on type airport xtra pack airport \nxecutive pack\nprime\nseats extra leg room seats -\nall seats