### Data Loader

In [1]:
import warnings
warnings.filterwarnings("ignore")
from langchain_community.document_loaders import JSONLoader 

In [2]:
import json
from pathlib import Path
from pprint import pprint


file_path='./processed.json'
data = json.loads(Path(file_path).read_text())
pprint(data)

{'ticket_info': {'baggage_summary': {'total_cabin_baggage': '21kg',
                                     'total_check_in_baggage': '69kg'},
                 'class': 'Economy',
                 'embedded_text_descriptions': {'flight_details': 'Flight '
                                                                  'numbers '
                                                                  'ET846 and '
                                                                  'ET686, '
                                                                  'Economy '
                                                                  'class, from '
                                                                  'CPT to DEL '
                                                                  'on July 11, '
                                                                  '2024',
                                                'passenger_summary': 'Passengers: '
                                      

In [None]:
# Define the metadata extraction function
def metadata_func(record: dict, metadata: dict) -> dict:
    
    ticket_info = record.get("ticket_info", {})

    # Extract basic ticket details
    metadata["ticket_id"] = ticket_info.get("ticket_id")
    metadata["pnr"] = ticket_info.get("pnr")
    metadata["class"] = ticket_info.get("class")
    metadata["user_id"] = ticket_info.get("user_id")

    # Extract flight route details
    flight_route = ticket_info.get("flight_route", {})
    metadata["source_airport"] = flight_route.get("source_airport", {}).get("name")
    metadata["source_iata_code"] = flight_route.get("source_airport", {}).get("iata_code")
    metadata["destination_airport"] = flight_route.get("destination_airport", {}).get("name")
    metadata["destination_iata_code"] = flight_route.get("destination_airport", {}).get("iata_code")
    metadata["layover_duration"] = flight_route.get("layover_duration")

    # Extract flight schedule details
    flight_schedule = ticket_info.get("flight_schedule", {})
    metadata["departure_date"] = flight_schedule.get("departure", {}).get("date")
    metadata["departure_time"] = flight_schedule.get("departure", {}).get("time")
    metadata["arrival_date"] = flight_schedule.get("arrival", {}).get("date")
    metadata["arrival_time"] = flight_schedule.get("arrival", {}).get("time")

    # Extract passenger details summary
    passenger_details = ticket_info.get("passenger_details", {})
    metadata["total_passenger_count"] = passenger_details.get("total_passenger_count")
    
    # Extract embedded text descriptions if needed
    embedded_text_descriptions = ticket_info.get("embedded_text_descriptions", {})
    metadata["route_description"] = embedded_text_descriptions.get("route_description")
    metadata["flight_details"] = embedded_text_descriptions.get("flight_details")
    metadata["passenger_summary"] = embedded_text_descriptions.get("passenger_summary")

    return metadata


In [4]:
import json
from pathlib import Path

# Load the data
file_path = Path("processed.json")
json_data = json.loads(file_path.read_text())

In [5]:
json_data

{'ticket_info': {'ticket_id': 'ticket_227_3183',
  'namespace': 'flight_bookings',
  'pnr': 'HZAVJJ',
  'class': 'Economy',
  'user_id': 227,
  'flight_route': {'source_airport': {'name': 'Cape Town International Airport',
    'iata_code': 'CPT',
    'city': 'Cape Town',
    'country': 'South Africa'},
   'destination_airport': {'name': 'Indira Gandhi International Airport',
    'iata_code': 'DEL',
    'city': 'New Delhi',
    'country': 'India'},
   'intermediate_airports': [{'name': 'Addis Ababa Bole International Airport',
     'iata_code': 'ADD',
     'city': 'Addis Ababa',
     'country': 'Ethiopia'}],
   'layover_duration': '55 minutes'},
  'flight_schedule': {'departure': {'date': '2024-07-11',
    'time': '14:35:00 UTC',
    'full_datetime': '2024-07-11T14:35:00.000Z'},
   'arrival': {'date': '2024-07-12',
    'time': '08:10:00 UTC',
    'full_datetime': '2024-07-12T08:10:00.000Z'}},
  'flight_segments': [{'segment_number': 1,
    'flight_number': 'ET846',
    'departure': {'ai

### Recursive Character Text Splitter

In [6]:
from langchain_text_splitters import RecursiveJsonSplitter

splitter = RecursiveJsonSplitter(max_chunk_size=200)
splitter

<langchain_text_splitters.json.RecursiveJsonSplitter at 0x18c5c430b90>

### Chunks of Text

In [None]:
# Recursively split json data 
json_chunks = splitter.split_json(json_data=json_data, convert_lists=True)

for chunk in json_chunks[:3]:
    print(chunk)

{'ticket_info': {'ticket_id': 'ticket_227_3183', 'namespace': 'flight_bookings', 'pnr': 'HZAVJJ', 'class': 'Economy', 'user_id': 227}}
{'ticket_info': {'flight_route': {'source_airport': {'name': 'Cape Town International Airport', 'iata_code': 'CPT', 'city': 'Cape Town', 'country': 'South Africa'}}}}
{'ticket_info': {'flight_route': {'destination_airport': {'name': 'Indira Gandhi International Airport', 'iata_code': 'DEL', 'city': 'New Delhi', 'country': 'India'}}}}


In [8]:
len(json_chunks)

25

### Documents

In [None]:
# Create documents from the json data using the splitter
docs = splitter.create_documents(texts=[json_data],convert_lists=True)

for doc in docs[:3]:
    print(doc)

page_content='{"ticket_info": {"ticket_id": "ticket_227_3183", "namespace": "flight_bookings", "pnr": "HZAVJJ", "class": "Economy", "user_id": 227}}'
page_content='{"ticket_info": {"flight_route": {"source_airport": {"name": "Cape Town International Airport", "iata_code": "CPT", "city": "Cape Town", "country": "South Africa"}}}}'
page_content='{"ticket_info": {"flight_route": {"destination_airport": {"name": "Indira Gandhi International Airport", "iata_code": "DEL", "city": "New Delhi", "country": "India"}}}}'


### text 

In [None]:
# Create text documents from the json data using the splitter
texts = splitter.split_text(json_data=json_data, convert_lists=True)

In [11]:
print([len(text) for text in texts][:10])
print(texts[1])

[134, 166, 168, 185, 69, 144, 142, 92, 172, 105]
{"ticket_info": {"flight_route": {"source_airport": {"name": "Cape Town International Airport", "iata_code": "CPT", "city": "Cape Town", "country": "South Africa"}}}}


In [12]:
len(texts)

25

In [14]:
docs[0]

Document(metadata={}, page_content='{"ticket_info": {"ticket_id": "ticket_227_3183", "namespace": "flight_bookings", "pnr": "HZAVJJ", "class": "Economy", "user_id": 227}}')

In [15]:
from tqdm.autonotebook import tqdm, trange

In [None]:
import time
from tqdm.autonotebook import tqdm

for i in tqdm(range(100)):
    time.sleep(0.1)  

100%|██████████| 100/100 [00:10<00:00,  9.74it/s]


### Hugging Face Embeddings

In [17]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [18]:
embeddings = embeddings_model.embed_documents(
    [
        "Hi there!",
        "Oh, hello!",
        "What's your name?",
        "My friends call me World",
        "Hello World!"
    ]
)
len(embeddings), len(embeddings[0])

(5, 768)

In [None]:
# embeddings[0]

In [19]:
text = "This is a test sentence."
query_result = embeddings_model.embed_query(text)
# query_result

In [20]:
len(query_result)

768

### Pinecone Initialization

In [26]:
! pip install -U sentence-transformers pinecone 



In [27]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [None]:
### API Key
api_key = os.getenv("PINECONE_API_KEY")

In [30]:

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=api_key)

# Create Index
index_name = "flight-search"

if index_name not in pc.list_indexes():
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [31]:
# List Indexes
indexes = pc.list_indexes()
indexes

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 768,
              'host': 'flight-search-mrwtq0j.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'flight-search',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [32]:
from langchain_pinecone import PineconeVectorStore

namespace = "vector_store"

docsearch = PineconeVectorStore.from_documents(
    documents=docs,
    index_name=index_name,
    embedding=embeddings_model,
    namespace=namespace
)
import time
time.sleep(5)

# See how many vectors have been upserted
print("Index after upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")
time.sleep(2)

Index after upsert:
{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}




In [33]:
index = pc.Index(index_name)
namespace = "vector_store"

for ids in index.list(namespace=namespace):
    query = index.query(
        id=ids[0], 
        namespace=namespace, 
        top_k=1,
        include_values=True,
        include_metadata=True
    )

In [34]:
query = "Can you tell me about the layover time for my journey?"

query_embedding = embeddings_model.embed_query(query)
print(query_embedding)

[0.0018927131313830614, -0.03389356657862663, -0.0544341579079628, 0.017752807587385178, -0.05886246636509895, -0.03966275602579117, -0.03435841575264931, -0.05689047649502754, -0.033438194543123245, -0.005347381811589003, 0.044564224779605865, 0.003117747139185667, -0.0006466050981543958, -0.02765374444425106, 0.016898630186915398, 0.006737298332154751, 0.04302022606134415, 0.006998178083449602, -0.05639045313000679, -0.01769322156906128, -0.055933818221092224, -0.006081584375351667, -0.06205228343605995, -0.032459571957588196, 0.058954887092113495, -0.03596005588769913, -0.04611822962760925, -0.001831723377108574, -0.0037443924229592085, 0.008370908908545971, 0.03567522391676903, 0.010151678696274757, 0.02152826078236103, 0.057715412229299545, 1.25459428090835e-06, 0.023354509845376015, 0.02820037119090557, 0.057250700891017914, -0.018497252836823463, 0.0296862181276083, -0.02423950284719467, 0.04734170436859131, 0.0040662563405931, 0.024376999586820602, 0.002730448730289936, -0.0289

In [35]:
results = index.query(
    namespace="vector_store",
    vector=query_embedding,
    top_k=4,
    include_values=False,
    include_metadata=True
)

print(results)

{'matches': [{'id': '20e40c21-b969-422b-9f5c-70974fdb534a',
              'metadata': {'text': '{"ticket_info": {"flight_route": '
                                   '{"layover_duration": "55 minutes"}}}'},
              'score': 0.618535638,
              'values': []},
             {'id': '6546eaaa-5b86-4abd-bda5-d0c07a716c69',
              'metadata': {'text': '{"ticket_info": {"flight_segments": {"1": '
                                   '{"arrival": {"airport": "Indira Gandhi '
                                   'International Airport", "iata_code": '
                                   '"DEL", "date": "2024-07-12", "time": '
                                   '"08:10:00 UTC"}}}}}'},
              'score': 0.485283256,
              'values': []},
             {'id': '5710c950-6811-467c-8ad9-f0c3eeccf447',
              'metadata': {'text': '{"ticket_info": {"flight_segments": {"0": '
                                   '{"departure": {"airport": "Cape Town '
                      

### LLM 

In [36]:
from langchain_groq import ChatGroq
llm = ChatGroq(model = "llama3-8b-8192")

In [37]:
llm.invoke("Hello, how are you?")

AIMessage(content="I'm just a language model, so I don't have feelings or emotions like humans do. However, I'm functioning properly and ready to help you with any questions or tasks you may have! How can I assist you today?", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 47, 'prompt_tokens': 16, 'total_tokens': 63, 'completion_time': 0.039166667, 'prompt_time': 0.000563847, 'queue_time': 0.01468609, 'total_time': 0.039730514}, 'model_name': 'llama3-8b-8192', 'system_fingerprint': 'fp_179b0f92c9', 'finish_reason': 'stop', 'logprobs': None}, id='run-2d7f6ec7-b7ee-4665-9d74-31b8e6836921-0', usage_metadata={'input_tokens': 16, 'output_tokens': 47, 'total_tokens': 63})

### Retriever

In [38]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub

retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
retriever=docsearch.as_retriever()

combine_docs_chain = create_stuff_documents_chain(
    llm, retrieval_qa_chat_prompt
)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)


In [39]:
query1 = "Can you tell me about the layover time for my journey?"
answer1_with_knowledge = retrieval_chain.invoke({"input": query1})

print("Answer with knowledge:\n\n", answer1_with_knowledge['answer'])
print("\nContext used:\n\n", answer1_with_knowledge['context'])
print("\n")

Answer with knowledge:

 According to the information provided, the layover duration is 55 minutes.

Context used:

 [Document(id='20e40c21-b969-422b-9f5c-70974fdb534a', metadata={}, page_content='{"ticket_info": {"flight_route": {"layover_duration": "55 minutes"}}}'), Document(id='6546eaaa-5b86-4abd-bda5-d0c07a716c69', metadata={}, page_content='{"ticket_info": {"flight_segments": {"1": {"arrival": {"airport": "Indira Gandhi International Airport", "iata_code": "DEL", "date": "2024-07-12", "time": "08:10:00 UTC"}}}}}'), Document(id='5710c950-6811-467c-8ad9-f0c3eeccf447', metadata={}, page_content='{"ticket_info": {"flight_segments": {"0": {"departure": {"airport": "Cape Town International Airport", "iata_code": "CPT", "date": "2024-07-11", "time": "14:35:00 UTC"}}}}}'), Document(id='32cf0eb2-3f20-4f77-b077-186d31ef7f37', metadata={}, page_content='{"ticket_info": {"flight_segments": {"0": {"arrival": {"airport": "Addis Ababa Bole International Airport", "iata_code": "ADD", "date": "20

In [40]:
query2 = "Do I have any checked baggage allowance for my flights?"

answer2_with_knowledge = retrieval_chain.invoke({"input": query2})

print("Answer with knowledge:\n\n", answer2_with_knowledge['answer'])
print("\nContext used:\n\n", answer2_with_knowledge['context'])
print("\n")

Answer with knowledge:

 According to the provided information, your total checked baggage allowance is 69kg, as mentioned in the "baggage_summary" section of the ticket info.

Context used:

 [Document(id='8affbc30-8a50-463a-b134-b656a0308c5d', metadata={}, page_content='{"ticket_info": {"baggage_summary": {"total_cabin_baggage": "21kg", "total_check_in_baggage": "69kg"}}}'), Document(id='95a98df1-bb0a-41f8-8e24-d16e8391268b', metadata={}, page_content='{"ticket_info": {"embedded_text_descriptions": {"passenger_summary": "Passengers: Surendra Singh, Narinder Kaur, Samik Singh, traveling with 7kg cabin baggage and 23kg check-in baggage each."}}}'), Document(id='0378660e-5e1a-4806-89e1-0c687f393503', metadata={}, page_content='{"ticket_info": {"passenger_details": {"passengers": {"0": {"full_name": "Surendra Singh", "seat_number": "21A", "baggage": {"cabin": "7kg", "check_in": "23kg"}}}}}}'), Document(id='6c5ac1f4-65af-4ab6-9e6e-61682c566e71', metadata={}, page_content='{"ticket_info": 

In [41]:
query3 = "What’s my seat for the first flight?"

answer3_with_knowledge = retrieval_chain.invoke({"input": query3})

print("Answer with knowledge:\n\n", answer3_with_knowledge['answer'])
print("\nContext used:\n\n", answer3_with_knowledge['context'])
print("\n")

Answer with knowledge:

 Unfortunately, you didn't provide any information about the flight you're referring to. However, based on the context you provided, I can see three different sets of passenger details for different flights. The first set of passenger details doesn't mention a flight number, but the second set mentions flight numbers ET846 and ET686. Could you please specify which flight you're referring to?

Context used:

 [Document(id='cd38204c-a5e6-4cb9-8f3c-cc7050a7de39', metadata={}, page_content='{"ticket_info": {"embedded_text_descriptions": {"flight_details": "Flight numbers ET846 and ET686, Economy class, from CPT to DEL on July 11, 2024"}}}'), Document(id='6c5ac1f4-65af-4ab6-9e6e-61682c566e71', metadata={}, page_content='{"ticket_info": {"passenger_details": {"passengers": {"2": {"full_name": "Samik Singh", "seat_number": "21C", "baggage": {"cabin": "7kg", "check_in": "23kg"}}}}}}'), Document(id='8ad93470-9f28-437e-84a2-4ebdb3066046', metadata={}, page_content='{"tick

In [42]:
query4 = "Which airport am I arriving at in Delhi, and what’s the expected arrival time?"

answer4_with_knowledge = retrieval_chain.invoke({"input": query4})

print("Answer with knowledge:\n\n", answer4_with_knowledge['answer'])
print("\nContext used:\n\n", answer4_with_knowledge['context'])

Answer with knowledge:

 According to the context, you are arriving at Indira Gandhi International Airport (DEL) in Delhi, and the expected arrival time is 08:10:00 UTC on July 12, 2024.

Context used:

 [Document(id='6546eaaa-5b86-4abd-bda5-d0c07a716c69', metadata={}, page_content='{"ticket_info": {"flight_segments": {"1": {"arrival": {"airport": "Indira Gandhi International Airport", "iata_code": "DEL", "date": "2024-07-12", "time": "08:10:00 UTC"}}}}}'), Document(id='7964ca84-0417-4bef-94c5-68e0c1ec8a07', metadata={}, page_content='{"ticket_info": {"flight_route": {"destination_airport": {"name": "Indira Gandhi International Airport", "iata_code": "DEL", "city": "New Delhi", "country": "India"}}}}'), Document(id='32cf0eb2-3f20-4f77-b077-186d31ef7f37', metadata={}, page_content='{"ticket_info": {"flight_segments": {"0": {"arrival": {"airport": "Addis Ababa Bole International Airport", "iata_code": "ADD", "date": "2024-07-11", "time": "22:00:00 UTC"}}}}}'), Document(id='99b592ad-501f-

In [43]:
query5 = "What time do I arrive in Delhi?"

answer5_with_knowledge = retrieval_chain.invoke({"input": query5})

print("Answer with knowledge:\n\n", answer5_with_knowledge['answer'])
print("\nContext used:\n\n", answer5_with_knowledge['context'])

Answer with knowledge:

 According to the given context, you arrive in Delhi (Indira Gandhi International Airport, IATA code "DEL") at 08:10:00 UTC on July 12, 2024.

Context used:

 [Document(id='6546eaaa-5b86-4abd-bda5-d0c07a716c69', metadata={}, page_content='{"ticket_info": {"flight_segments": {"1": {"arrival": {"airport": "Indira Gandhi International Airport", "iata_code": "DEL", "date": "2024-07-12", "time": "08:10:00 UTC"}}}}}'), Document(id='7964ca84-0417-4bef-94c5-68e0c1ec8a07', metadata={}, page_content='{"ticket_info": {"flight_route": {"destination_airport": {"name": "Indira Gandhi International Airport", "iata_code": "DEL", "city": "New Delhi", "country": "India"}}}}'), Document(id='32cf0eb2-3f20-4f77-b077-186d31ef7f37', metadata={}, page_content='{"ticket_info": {"flight_segments": {"0": {"arrival": {"airport": "Addis Ababa Bole International Airport", "iata_code": "ADD", "date": "2024-07-11", "time": "22:00:00 UTC"}}}}}'), Document(id='3abbd8e8-8445-47b1-b382-20eb9c0300a

### Creating Prompt Template

In [44]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


question_answer_chain = create_stuff_documents_chain(llm, prompt)

question_answer_chain
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

response = rag_chain.invoke({"input": "What’s my seat for the first flight?"})
print(response["answer"])

I don't know which flight is the "first flight" as there are multiple flights mentioned (ET846 and ET686), but I can tell you that the passenger named Surendra Singh is sitting in seat 21A.


In [45]:
response = rag_chain.invoke({"input": "What time do I arrive in Delhi?"})
print(response["answer"])

According to the ticket information, you arrive in Indira Gandhi International Airport (DEL) at 08:10:00 UTC on July 12, 2024.


In [46]:
response = rag_chain.invoke({"input": "Can you tell me about the layover time for my journey?"})
print(response["answer"])

According to the ticket information, the layover duration for your flight is 55 minutes.


In [47]:
response = rag_chain.invoke({"input": "What time is my flight from Cape Town to Addis Ababa, and what’s the arrival time?"})
print(response["answer"])

Your flight from Cape Town International Airport (CPT) to Addis Ababa Bole International Airport (ADD) departs at 14:35:00 UTC on July 11, 2024. The arrival time is 22:00:00 UTC on July 11, 2024.


In [48]:
response = rag_chain.invoke({"input": "Do I have any checked baggage allowance for my flights?"})
print(response["answer"])

According to the ticket information, the total checked baggage allowance is 69kg.


In [49]:
response = rag_chain.invoke({"input": "What’s my seat for the first flight?"})
print(response["answer"])

According to the ticket information, the first flight's details are not mentioned, but since you're asking about your seat for the first flight, I'm assuming you're asking about one of the flights mentioned in the context. Unfortunately, I don't know which flight you're referring to, but for flights ET846 and ET686, I can see that there are passengers with seat numbers 21A, 21B, and 21C.


In [51]:
response = rag_chain.invoke({"input": "What’s my seat for the first flight and the passenger details?"})
print(response["answer"])

Based on the provided context, the passenger details for the first flight are:

Passenger 0: 
- Full name: Surendra Singh
- Seat number: 21A
- Cabin baggage allowance: 7kg
- Check-in baggage allowance: 23kg


### Prompt Template

In [50]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are a travel assistant specialized in providing concise and accurate answers about flight itineraries. "
    "Use the following context to answer questions about flight segments, layovers, seat details, baggage allowances, and airport information. "
    "Respond to each question based on the retrieved context. If the answer isn’t available in the data, respond with “I’m not sure” or “That information is not available.” "
    "Keep answers to a maximum of three sentences and ensure they are clear and direct."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, prompt)

rag_chain = create_retrieval_chain(retriever, question_answer_chain)

response = rag_chain.invoke({"input": "What’s my seat for the first flight?"})
print(response["answer"])

I'm not sure, as there's no information about the first flight in the provided context.


In [52]:
response = rag_chain.invoke({"input": "How many passengers are there and list all the passenger details?"})
print(response["answer"])

According to the context, there are 3 passengers in total. Here are the passenger details:

1. Surendra Singh
	* Seat Number: 21A
	* Cabin Baggage: 7kg
	* Check-in Baggage: 23kg
2. Narinder Kaur
	* (No information available)
3. Samik Singh
	* Seat Number: 21C
	* Cabin Baggage: 7kg
	* Check-in Baggage: 23kg
