#### Libraries

In [1]:
import re
import os
import pandas as pd
import random
import numpy as np
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from langchain import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2Tokenizer, GPT2LMHeadModel

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


#### Data Handling

In [None]:
hotel_df = load_data('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Cleaned\\new_hotel_data.csv')
ratings_df = load_data('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Cleaned\\clean_hotel_review_data.json', 'json')
restaurant_df = load_data('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Cleaned\\new_restaurant_db.restaurants_data.json', 'json')
attraction_df = load_data('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Cleaned\\Cleaned_attr.csv')
airbnb_df = load_data('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Cleaned\\Cleaned_Airbnb.csv')
busses_df = load_data('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Cleaned\\Cleaned_busses.csv')
cars_df = load_data('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Cleaned\\Cleaned_Cars.csv')
trains_df = load_data('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Cleaned\\Cleaned_trains.csv')

# Merge hotel/restaurant data with ratings
merged_df = hotel_df.merge(ratings_df, on='hotel_id', how='left')
average_ratings = merged_df.groupby('hotel_id')['rating'].mean().reset_index()
average_ratings.columns = ['hotel_id', 'average_rating']
hotel_df = hotel_df.merge(average_ratings, on='hotel_id', how='left')
hotel_df.rename(columns={'average_rating': 'Rating'}, inplace=True)
restaurant_ratings_df = load_data('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Cleaned\\new_restaurant_db.restaurants_reviews.json', 'json')
restaurant_merged_df = restaurant_df.merge(restaurant_ratings_df, on='restaurant_id', how='left')
average_restaurant_ratings = restaurant_merged_df.groupby('restaurant_id')['rating'].mean().reset_index()
average_restaurant_ratings.columns = ['restaurant_id', 'average_restaurant_ratings']
restaurant_df = restaurant_df.merge(average_restaurant_ratings, on='restaurant_id', how='left')
restaurant_df.rename(columns={'average_restaurant_ratings': 'Rating'}, inplace=True)
restaurant_df['Rating'] = restaurant_df['Rating'].fillna(3.5)

# Generate descriptions 
def preprocess_hotel_data(row):
    return f"{row['name']} located at {row['address']} in {row['city']}, costs Rs{row['price']} per night. Description: {row['about']}."
hotel_df['description'] = hotel_df.apply(preprocess_hotel_data, axis=1)

def preprocess_restaurant_data(row):
    return f"{row['name']} located at {row['address']} with phone {row['phone_number']} in {row['city']}."
restaurant_df['description'] = restaurant_df.apply(preprocess_restaurant_data, axis=1)

def preprocess_attraction_data(row):
    return f"{row['name']} located at {row['address']} in {row['city']}. Category: {row['category']}. Rating: {row['rating']}."
attraction_df['description'] = attraction_df.apply(preprocess_attraction_data, axis=1)

def preprocess_airbnb_data(row):
    return f"{row['name']} located at {row['address']}. Price per night is Rs{row['pricing_rate_amount']}."
airbnb_df['description'] = airbnb_df.apply(preprocess_airbnb_data, axis=1)

def preprocess_buses_data(row):
    return f"{row['name']} departing from {row['starting']} to {row['ending']} at {row['departure_time']}."
busses_df['description'] = busses_df.apply(preprocess_buses_data, axis=1)

def preprocess_car_data(row):
    return f"{row['brand_name']} {row['car_name']} for {row['price_per_day']} per day."
cars_df['description'] = cars_df.apply(preprocess_car_data, axis=1)

def preprocess_train_data(row):
    return f"{row['name']} departing from {row['starting']} to {row['ending']} at {row['departure_time']}."
trains_df['description'] = trains_df.apply(preprocess_train_data, axis=1)

# Enhanced Dictionary for amenities
amenities_data = {
    "hotel": hotel_df.set_index('name')['description'].to_dict(),
    "restaurant": restaurant_df.set_index('name')['description'].to_dict(),
    "attraction": attraction_df.set_index('name')['description'].to_dict(),
    "airbnb": airbnb_df.set_index('name')['description'].to_dict(),
    "bus": busses_df.set_index('name')['description'].to_dict(),
    "car": cars_df.set_index('car_name')['description'].to_dict(),
    "train": trains_df.set_index('name')['description'].to_dict()
}

amenities_data

{'hotel': {'Midway Residency': 'Midway Residency located at 3rd Floor App 301, 143-A Sector, Sector C Commercial Area Sector C Bahria Town, Lahore, Punjab 53200 in Lahore, costs Rs12648 per night. Description: Check-out time: 12:00.',
  'Hotel Royal Comfort': 'Hotel Royal Comfort located at F788+W9X, opposite Emporium Mall, Trade Centre Commercial Area Phase 2 Johar Town, Lahore, Punjab 54000 in Lahore, costs Rs5218 per night. Description: Perfect for business and leisure travelers..',
  'hostel view garden': 'hostel view garden located at 7 Shahrah Aiwan-e-Sanat-o-Tijarat, near china chowk, next to Askari bank, Jinnah Town, Lahore, Punjab 54000 in Lahore, costs Rs12648 per night. Description: InternetWi-FiPoolsNo poolsNo hot tub.',
  'WEDNESDAY HOMES': 'WEDNESDAY HOMES located at address not available in Lahore, costs Rs12648 per night. Description: Perfect for business and leisure travelers..',
  'Prime Motel': 'Prime Motel located at address not available in Lahore, costs Rs12648 pe

In [5]:
ratings_df = pd.read_csv('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Cleaned\\ratings.csv')

#### Fine Tuning Model through GPT-2 (DONT RUN)

In [None]:
# Load the GPT-2 tokenizer and model
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2')

# Set the padding token for the GPT-2 tokenizer
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token  # Set padding token to EOS token

class GPT2BERTDataset(Dataset):
    def __init__(self, descriptions, embeddings, tokenizer):
        self.descriptions = descriptions
        self.embeddings = embeddings
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, idx):
        # Convert text description to tokens
        description = self.descriptions[idx]
        embedding = self.embeddings[idx]
        
        # Tokenize description with padding
        tokenized_input = self.tokenizer.encode(description, return_tensors='pt', padding='max_length', max_length=100, truncation=True).squeeze(0)
        
        return tokenized_input, torch.tensor(embedding, dtype=torch.float32)

# Load embeddings from the specified files
hotel_embedding = np.load('hotel_embeddings.npy')
restaurant_embedding = np.load('restaurant_embeddings.npy')
attraction_embedding = np.load('attr_embeddings.npy')
airbnb_embedding = np.load('airbnb_embeddings.npy')
bus_embedding = np.load('buses_embeddings.npy')
car_embedding = np.load('cars_embeddings.npy')
train_embedding = np.load('trains_embeddings.npy')

# Create datasets
hotel_dataset = GPT2BERTDataset(hotel_df['description'].tolist(), hotel_embedding, gpt_tokenizer)
restaurant_dataset = GPT2BERTDataset(restaurant_df['description'].tolist(), restaurant_embedding, gpt_tokenizer)
attraction_dataset = GPT2BERTDataset(attraction_df['description'].tolist(), attraction_embedding, gpt_tokenizer)
airbnb_dataset = GPT2BERTDataset(airbnb_df['description'].tolist(), airbnb_embedding, gpt_tokenizer)
bus_dataset = GPT2BERTDataset(busses_df['description'].tolist(), bus_embedding, gpt_tokenizer)
car_dataset = GPT2BERTDataset(cars_df['description'].tolist(), car_embedding, gpt_tokenizer)
train_dataset = GPT2BERTDataset(trains_df['description'].tolist(), train_embedding, gpt_tokenizer)

# Concatenate all datasets
full_dataset = ConcatDataset([hotel_dataset, restaurant_dataset, attraction_dataset, airbnb_dataset, bus_dataset, car_dataset, train_dataset])

# Create DataLoader
dataloader = DataLoader(full_dataset, batch_size=16, shuffle=True)

# Example loop through the DataLoader
for batch in dataloader:
    tokenized_inputs, embeddings = batch
    # Process inputs and embeddings as needed
    break  # Remove this to iterate through the entire dataset

# Fine-tuning GPT-2 with BERT embeddings
optimizer = AdamW(gpt_model.parameters(), lr=5e-5,weight_decay=1e-2)

scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-6)

# Training loop
for epoch in range(5):  # Adjust number of epochs as necessary
    gpt_model.train()
    total_loss = 0
    for batch_idx, batch in enumerate(dataloader):
        inputs, bert_embeddings = batch
        
        # Move tensors to the appropriate device (GPU if available)
        inputs = inputs.to(gpt_model.device)
        bert_embeddings = bert_embeddings.to(gpt_model.device)

        # Forward pass through GPT-2
        outputs = gpt_model(inputs, labels=inputs)
        
        # Calculate loss and backpropagate
        loss = outputs.loss
        total_loss += loss.item()  # Accumulate total loss
        optimizer.zero_grad()
        
        loss.backward()       
        # Apply gradient clipping
        torch.nn.utils.clip_grad_norm_(gpt_model.parameters(), max_norm=1.0)       
        # Update parameters and scheduler
        optimizer.step()
        scheduler.step()

        # Print details of each batch
        print(f"Epoch {epoch + 1}, Batch {batch_idx + 1}, Loss: {loss.item()}, Inputs shape: {inputs.shape}, Bert Embeddings shape: {bert_embeddings.shape}")

    # Print the average loss after each epoch
    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1} completed. Average Loss: {average_loss}")

# Save the fine-tuned GPT-2 model and tokenizer
gpt_model.save_pretrained('fine_tuned_gpt2_1')
gpt_tokenizer.save_pretrained('fine_tuned_gpt2_1')

#### Embeddings + Model Setup

In [6]:
# Load embeddings from the specified files
hotel_embeddings = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Embeddings\\hotel_embeddings.npy')
restaurant_embeddings = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Embeddings\\restaurant_embeddings.npy')
attraction_embeddings = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Embeddings\\attr_embeddings.npy')
airbnb_embeddings = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Embeddings\\airbnb_embeddings.npy')
bus_embeddings = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Embeddings\\buses_embeddings.npy')
car_embeddings = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Embeddings\\cars_embeddings.npy')
train_embeddings = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Embeddings\\trains_embeddings.npy')

hotel_names = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Names\\hotel_names.npy', allow_pickle=True)
restaurant_names = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Names\\restaurant_names.npy', allow_pickle=True)
attraction_names = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Names\\attraction_names.npy', allow_pickle=True)
airbnb_names = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Names\\airbnb_names.npy', allow_pickle=True)
car_names = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Names\\car_names.npy', allow_pickle=True)
bus_names = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Names\\bus_names.npy', allow_pickle=True)
train_names = np.load('C:\\Users\\DELL\\OneDrive\\Documents\\GitHub\\Voyaige\\chatbot\\Names\\train_names.npy', allow_pickle=True)

In [7]:
# Model setup
model_name = "fine_tuned_gpt2_1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_length=100)
llm = HuggingFacePipeline(pipeline=pipe)

# Updated prompt to enhance coherence
prompt_template = PromptTemplate(input_variables=["query"], template="Respond to this travel-related question: {query}")
langchain_pipeline = LLMChain(prompt=prompt_template, llm=llm)

  warn_deprecated(


#### Functions for Chatbot

###### Finding top Similarities

In [8]:
# Finding the top similarities 
def find_top_similarities(embedding_matrix, names, query_embedding, top_n=1):
    similarities = cosine_similarity(query_embedding, embedding_matrix)
    top_indices = similarities.argsort()[0][-top_n:][::-1]
    return [names[idx] for idx in top_indices]

###### Validating Car Name

In [9]:
def validate_car_name(car_name):
    car_name_lower = car_name.lower()
    for name in car_names:
        if car_name_lower in name.lower():
            return name
    return None

###### Parsing the Users Prompts

In [10]:
def parse_user_input(user_input):
    # List of supported cities
    supported_cities = {
        "kashmir", "murree", "islamabad", "lahore", "karachi", "hunza", 
        "skardu", "chitral", "gilgit", "multan", "abbottabad", "quetta", 
        "naran", "batakundi", "peshawar", "faisalabad"
    }
    
    # Initialize values
    starting_city, destination_city, transport, days, budget = None, None, None, None, None

    # Detect city-to-city trip in user input
    match = re.search(r'plan a trip from (\w+) to (\w+)', user_input.lower())
    if match:
        starting_city, destination_city = match.groups()
        if starting_city not in supported_cities or destination_city not in supported_cities:
            return "One or both of the specified cities are not supported. Please choose from the supported cities."
    
    # If city-to-city trip is not specified, detect single city
    if not destination_city:
        destination_city = next((c for c in supported_cities if c in user_input.lower()), None)
        if destination_city is None:
            return "City information is not available. Please choose from supported cities."

    # Detect transport mode in user input
    if "bus" in user_input.lower():
        transport = "bus"
    elif "car" in user_input.lower():
        transport = "car"
    elif "train" in user_input.lower():
        transport = "train"

    # Detect number of days in user input
    days_match = re.search(r'(\d+)\s*(days|day)', user_input.lower())
    if days_match:
        days = int(days_match.group(1))

    # Detect budget in user input
    budget_match = re.search(r'budget of (\d+)', user_input.lower())
    if budget_match:
        budget = int(budget_match.group(1))

    return starting_city, destination_city, transport, days, budget

###### Making sure User Prompt is Relevant

In [11]:
# Validate user input
def validate_user_query(user_input):
    return any(word in user_input for word in ["plan", "travel", "trip", "itinerary", "tell me", "what", "where"])

###### In the case where Price is missing from dataset, fill it randomly

In [12]:
# Helper function to randomize cost based on type of item (hotel, restaurant, attraction)
def get_randomized_cost(info, category):
    # Check for price in the description and strip it out if exists
    if isinstance(info, dict) and 'description' in info:
        # Try to find a price in the description
        price_match = re.search(r'(\d+[\.,]?\d*)\s*(Rs|USD|PKR)', info['description'])
        if price_match:
            # Extract and return the price as a float
            return float(price_match.group(1).replace(',', ''))
    
    # If no price is found in the description, generate a reasonable default price
    if category == "hotel":
        return random.randint(3000, 15000)  # Reasonable range for hotels/Airbnbs
    elif category == "restaurant":
        return random.randint(500, 3500)  # Price range per person for restaurants
    elif category == "attraction":
        return random.randint(500, 2500)  # Price range for attractions
    elif category == "bus":
        return random.randint(500, 3000)  # Reasonable bus fare range
    elif category == "car":
        return random.randint(3000, 7000)  # Price range for car rental per day
    elif category == "train":
        return random.randint(1000, 5000)  # Price range for train tickets
    
    return 1000  # Default value if no category matches


###### Generate Itinerary

In [13]:
# Predefined minimum budgets per day
MIN_BUDGET_PER_DAY = 3000  # You can adjust this as per your requirements

# Generate an itinerary
def generate_itinerary(user_input, starting_city, destination_city, mode_of_transport, days, budget=None):
    # Validate number of days
    if days > 7:
        return "Sorry, the trip duration cannot exceed 7 days."
    
    min_budget = days * MIN_BUDGET_PER_DAY
    if budget is not None and budget < min_budget:
        return f"Sorry, the minimum budget for a {days}-day trip is Rs{min_budget}. Please increase your budget."

    # Generate user embedding (assuming 'model' and 'tokenizer' are pre-defined)
    user_embedding = model.transformer.wte(tokenizer.encode(user_input, return_tensors='pt')).mean(dim=1).detach().numpy()

    # Transport suggestion for day 1
    day_1_transport = None
    car_transport = None

    if mode_of_transport == "car":
        car_name = input("Please specify the car you'd like to use (e.g., Corolla, Civic, etc.): ").strip()
        validated_car_name = validate_car_name(car_name)
        
        while not validated_car_name:
            print("Sorry, the car name you entered is not available. Please enter a valid car name.")
            car_name = input("Please specify the car you'd like to use (e.g., Corolla, Civic, etc.): ").strip()
            validated_car_name = validate_car_name(car_name)
        
        day_1_transport = validated_car_name
        car_transport = validated_car_name

    elif mode_of_transport in ["bus", "train"]:
        day_1_transport = find_top_similarities(
            bus_embeddings if mode_of_transport == "bus" else train_embeddings,
            bus_names if mode_of_transport == "bus" else train_names,
            user_embedding,
            top_n=1
        )[0]

        print("On day 2 and beyond, only cars can be selected for transport.")
        available_cars = amenities_data.get('car', {})
        if not available_cars:
            return "Sorry, no cars are available."
        
        print("Please choose a car from the following options:")
        for idx, (car_name, car_info) in enumerate(available_cars.items(), start=1):
            print(f"{idx}. {car_name}: {car_info}")
        
        car_choice = input("Please select a car by number: ").strip()
        try:
            car_choice = int(car_choice)
            if 1 <= car_choice <= len(available_cars):
                car_transport = list(available_cars.keys())[car_choice - 1]
            else:
                return "Invalid choice. Please select a valid car number."
        except ValueError:
            return "Invalid input. Please enter a number corresponding to a car."

    # Get city-specific recommendations
    city_hotels = [name for name in hotel_names if destination_city.lower() in name.lower()]
    city_restaurants = [name for name in restaurant_names if destination_city.lower() in name.lower()]
    city_attractions = [name for name in attraction_names if destination_city.lower() in name.lower()]

    if not city_hotels or not city_restaurants or not city_attractions:
        return f"Sorry, we couldn't find recommendations for {destination_city}."

    print(f"\nGenerated Itinerary from {starting_city.title()} to {destination_city.title()} ({days} days):")
    
    total_cost = 0
    for day in range(1, days + 1):
        print("----------------------")
        print(f"\n\tDay {day}")
        
        # Transport logic
        if day == 1:
            print(f"Transport: {mode_of_transport.title()} - {day_1_transport}")
        else:
            print(f"Transport: Car - {car_transport}")
        
        accommodation = random.choice(city_hotels)
        accommodation_info = amenities_data['hotel'].get(accommodation, {})
        accommodation_cost = accommodation_info.get('cost', random.randint(3000, 15000)) if isinstance(accommodation_info, dict) else random.randint(3000, 15000)
        print(f"Accommodation: {accommodation} - Rs{accommodation_cost}")
        daily_cost = accommodation_cost

        restaurants = random.sample(city_restaurants, min(2, len(city_restaurants)))
        attractions = random.sample(city_attractions, min(2, len(city_attractions)))

        restaurant1 = restaurants[0]
        restaurant1_info = amenities_data['restaurant'].get(restaurant1, {})
        restaurant1_cost = restaurant1_info.get('cost', random.randint(500, 3500)) if isinstance(restaurant1_info, dict) else random.randint(500, 3500)
        print(f"Restaurant 1: {restaurant1} - Rs{restaurant1_cost}")
        daily_cost += restaurant1_cost

        attraction1 = attractions[0]
        print(f"Attraction 1: {attraction1}")
        
        if len(attractions) > 1:
            attraction2 = attractions[1]
            print(f"Attraction 2: {attraction2}")
        
        if len(restaurants) > 1:
            restaurant2 = restaurants[1]
            restaurant2_info = amenities_data['restaurant'].get(restaurant2, {})
            restaurant2_cost = restaurant2_info.get('cost', random.randint(500, 3500)) if isinstance(restaurant2_info, dict) else random.randint(500, 3500)
            print(f"Restaurant 2: {restaurant2} - Rs{restaurant2_cost}")
            daily_cost += restaurant2_cost
        
        print(f"Total cost for Day {day}: Rs{daily_cost}")
        print("----------------------")
        total_cost += daily_cost
    
    if budget and total_cost > budget:
        return f"Sorry, the trip cost exceeded your budget of Rs{budget}. Total cost is Rs{total_cost}."
    
    return f"Total Trip Cost: Rs{total_cost}"

###### Retrieving Information 

In [14]:
# Function to retrieve details without specifying types
def retrieve_details(user_input):
    request_phrases = [
        "tell me about", "can you give details on", "what is", "where is",
        "information on", "details about", "describe", "info on"
    ]
    
    # Check if any of the request phrases are in the user's input
    if not any(phrase in user_input.lower() for phrase in request_phrases):
        return "Please specify what you need details on (e.g., a hotel, restaurant, or attraction name)."
    
    # Search for the amenity directly in amenities_data
    for amenity_type, items in amenities_data.items():
        for name, details in items.items():
            if str(name).lower() in str(user_input).lower():
                return details

    # If no valid match found
    return "Could not find details for the specified item in our data."

###### Retrieving top or best from each category

In [15]:
def retrieve_top_items(user_input, num_recommendations=5):
    # Normalize the input (case insensitive)
    user_input = user_input.strip().lower()

    # Define the set of supported cities
    supported_cities = {
        "kashmir", "murree", "islamabad", "lahore", "karachi", "hunza", 
        "skardu", "chitral", "gilgit", "multan", "abbottabad", "quetta", 
        "naran", "batakundi", "peshawar", "faisalabad"
    }

    # Determine category based on the input
    if "hotel" in user_input or "hotels" in user_input:
        category = "hotel"
    elif "restaurant" in user_input or "restaurants" in user_input:
        category = "restaurant"
    elif "attraction" in user_input or "attractions" in user_input:
        category = "attraction"
    elif "airbnb" in user_input or "airbnbs" in user_input:
        category = "airbnb"
    else:
        return "Please specify what type of top information you need (e.g., 'hotels', 'restaurants', 'attractions', 'Airbnbs')."

    # Extract city if present in the user input
    city_name = None
    for city in supported_cities:
        if city in user_input:
            city_name = city
            break  # Stop once we find a match

    # Check if the category exists in the dataset
    if category not in amenities_data.keys():
        return f"Sorry, we do not have data for {category}s."

    # Access the relevant category data from amenities_data
    category_data = amenities_data[category]

    # Filter and find the relevant data
    filtered_data = {}

    # Loop through the category data
    for name, description in category_data.items():
        description_lower = description.lower()

        # Check if the city is part of the description (if city is specified)
        if city_name and city_name.lower() not in description_lower:
            continue  # Skip if the city name is not in the description

        # Add the data to the filtered results
        filtered_data[name] = description

    # If no matching data found
    if not filtered_data:
        return f"Sorry, we couldn't find {category} recommendations for {city_name.title() if city_name else 'your query'}."

    # Sort and take top items based on name (you could add ratings here if available)
    sorted_items = sorted(filtered_data.items(), key=lambda item: item[0].lower(), reverse=True)
    top_items = sorted_items[:num_recommendations]

    # Format the output
    city_part = f" in {city_name.title()}" if city_name else ""
    result = [f"The top {category.title()}s{city_part}:"]

    for i, (name, description) in enumerate(top_items, start=1):
        result.append(f"\t{i}. {name} - {description}")

    return "\n".join(result)

###### Function that allows us to compare two options

In [16]:
def compare_two_options(user_input):
    # Normalize the input (lowercase and strip spaces)
    user_input = user_input.strip().lower()

    # Extract category and options using regex
    match = re.search(r"compare\s+([a-z]+)\s+([\w\s]+)\s+and\s+([\w\s]+)", user_input)

    if not match:
        return "Please specify two options to compare (e.g., 'compare hotel Yasir Broast Hotel and Windmills Hotel')."

    # Extract category and options from the match groups
    category = match.group(1).lower()
    option1 = match.group(2).strip().lower()  # Normalize options
    option2 = match.group(3).strip().lower()  # Normalize options

    # Validate category
    if category not in amenities_data:
        return f"Sorry, we do not have data for {category}s."

    # Check if both options exist in the specified category (case-insensitive)
    category_data = amenities_data[category]
    option1 = next((key for key in category_data if key.lower() == option1), None)
    option2 = next((key for key in category_data if key.lower() == option2), None)

    if not option1 or not option2:
        return f"Sorry, we couldn't find both {category} options you specified."

    # Get details for both options
    option1_details = category_data[option1]
    option2_details = category_data[option2]

    # Format the comparison output
    result = f"Comparing {option1} and {option2}:\n"
    result += f"\n{option1.capitalize()}: {option1_details}\n"
    result += f"\n{option2.capitalize()}: {option2_details}\n"

    return result

###### Main Function i.e., Milo Chatbot

In [None]:
def milo():
    print("Hello, I'm Milo! How can I help you with your travel plans today?")
    
    while True:
        user_input = input("You: ").strip().lower()
        
        if user_input in ["exit", "quit", "goodbye", "bye"]:
            print("Goodbye! Have a safe trip!")
            break
        
        elif "top" in user_input or "best" in user_input:
            # Parse user input for category, city, and number of recommendations
            match = re.search(r"(top|best)\s+(\d+)?\s*(hotels|restaurants|attractions|airbnb)\s+in\s+([\w\s]+)", user_input)
            
            if not match:
                print("Please specify what type of top information you need (e.g., 'top 5 hotels in Lahore').")
                continue
            
            num_recommendations = int(match.group(2)) if match.group(2) else 5  # Default to 5 if not specified
            category = match.group(3).lower()  # Category (hotels, restaurants, etc.)
            city_name = match.group(4).strip().lower() if match.group(4) else None  # City name
            
            # Call retrieve_top_items with num_recommendations and city_name
            response = retrieve_top_items(user_input, num_recommendations)
            print(response)
        
        elif any(phrase in user_input for phrase in ["tell me about", "can you give details on", "what is", "where is", "information on", "details about", "describe", "info on"]):
            response = retrieve_details(user_input)
            print(response)
        
        elif "plan a trip" in user_input or "plan from" in user_input or "generate an itinerary" in user_input or "give possible trip for" in user_input or "generate itinerary" in user_input:
            print("I can help you plan a trip. Let's gather some details.")
            
            parsed_input = parse_user_input(user_input)
            if isinstance(parsed_input, str):
                print(parsed_input)  # Error message from parsing
                continue
            
            start, dest, transport, days, budget = parsed_input

            if not start:
                start = input("Please specify the starting city: ").strip().lower()
            if not dest:
                dest = input("Please specify the destination city: ").strip().lower()
            if not days:
                days = int(input("How many days will your trip be? ").strip())
            if not transport:
                transport = input("Please specify the mode of transport (e.g., car, bus, train): ").strip().lower()
            if not budget:
                budget_input = input("Do you have a budget? (yes/no): ").strip().lower()
                if budget_input == "yes":
                    budget = int(input("Please specify your budget: ").strip())
            
            print("Generating itinerary...")
            itinerary = generate_itinerary(user_input, start, dest, transport, days, budget)
            print(itinerary)
        
        #elif "update itinerary" in user_input:
        #    day = int(input("Which day would you like to update? "))
        #    accommodation = input("New accommodation (or leave blank): ")
        #    transport = input("New transport mode (or leave blank): ")
        #    restaurants = input("New restaurants (comma separated or leave blank): ").split(",")
        #    attractions = input("New attractions (comma separated or leave blank): ").split(",")
            
        #    updated_itinerary = update_itinerary(itinerary, day, new_transport=transport, new_accommodation=accommodation, new_restaurants=restaurants, new_attractions=attractions)
        #    print(updated_itinerary)

        elif "compare" in user_input:
            # Call the compare_two_options function
            response = compare_two_options(user_input)
            print(response)
        
        else:
            print("I'm here to help! You can ask about details on accommodations, restaurants, attractions, or plan/update your itinerary.")

milo()

Hello, I'm Milo! How can I help you with your travel plans today?
Tooba Masjid located at R3V3+8MF, Old Korangi Rd, Sabir SRE Karachi Cantonement, Karachi, Karachi City, Sindh 74000 in Karachi. Category: Mosque. Rating: 4.8.
I can help you plan a trip. Let's gather some details.
Generating itinerary...

Generated Itinerary from Islamabad to Lahore (3 days):
----------------------

	Day 1
Transport: Car - Voxy
Accommodation: One Bedroom Apartment Penta Square Phase 5 DHA Lahore - Rs7740
Restaurant 1: Wasabi Express/ Wok & Co Cantt Lahore - Rs1548
Attraction 1: Army Museum Lahore
Attraction 2: Mall of Lahore
Restaurant 2: Lahore Broast - Rs600
Total cost for Day 1: Rs9888
----------------------
----------------------

	Day 2
Transport: Car - Voxy
Accommodation: National Hotel Lahore - Rs9574
Restaurant 1: LAHORE INN Restaurant - Rs2486
Attraction 1: Lahore Museum 
Attraction 2: Eiffel Tower Bahria Town Lahore
Restaurant 2: Lahore - Rs885
Total cost for Day 2: Rs12945
--------------------