In [3]:
pip install pandas

Collecting pandas
  Using cached pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp310-cp310-macosx_11_0_arm64.whl (11.3 MB)
Downloading pytz-2025.1-py2.py3-none-any.whl (507 kB)
Using cached tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.1 tzdata-2025.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
import json
import re

def clean_text(text):
    """Normalize a scraped text field.

    Missing values (anything ``pd.isna`` reports as missing) become
    ``"Unknown"``. Any other value is converted to ``str``, the standalone
    word ``Show`` is removed, and whitespace is normalized.

    Args:
        text: Raw cell value (string, number, ``None`` or NaN).

    Returns:
        str: The cleaned text, or ``"Unknown"`` for missing input.
    """
    if pd.isna(text):
        return "Unknown"
    without_show = re.sub(r'\bShow\b', '', str(text))  # Remove 'Show' from city names
    # Removing the word leaves the space that surrounded it behind
    # ("Riyadh Show" -> "Riyadh "), so whitespace is collapsed *after* the
    # substitution rather than stripped before it.
    return " ".join(without_show.split())

def convert_csv_to_rag_json(csv_file, output_json):
    """Convert the hotels CSV into a JSON list of retrieval (RAG) records.

    Each CSV row becomes one dict holding the cleaned hotel fields plus a
    ``retrieval_text`` sentence summarising them. The list is written to
    ``output_json`` as UTF-8 JSON with 4-space indentation.

    Args:
        csv_file: Path of the CSV to read. When it is empty or does not exist,
            the dataset path this function used to hard-code is read instead
            (a warning is emitted), so existing calls keep working.
        output_json: Path of the JSON file to write.

    Raises:
        KeyError: If an expected column is missing from the CSV.
        ValueError: If a ``review_badge`` value cannot be converted to float.
    """
    import os
    import warnings

    # Previously ``csv_file`` was ignored and this path was always read.
    legacy_csv = '/Users/pranavpadmanabhan/Documents/Project/Ai-Travel-Planner-main/data/OriginalHotelsDataset.csv'
    source = csv_file
    if not source or not os.path.exists(source):
        warnings.warn(
            f"CSV file {csv_file!r} was not found; reading the legacy dataset path {legacy_csv!r} instead."
        )
        source = legacy_csv

    # Fill NaN values with defaults (non-inplace: returns a new frame)
    df = pd.read_csv(source).fillna({
        "name": "Unknown Hotel",
        "city": "Unknown City",
        "type_hotels": "Unknown Type",
        "review_badge": 0.0,
        "review_titel": "No review title",
        "review_number": "0",
        "price": "Price not available",
        "Duration": "Duration not specified",
        "persons": "Persons not specified",
        "stars": "No star rating",
        "taxes": "Taxes not specified",
        "Favorite_partner": "No favorite partner info"
    })

    records = []
    for idx, row in df.iterrows():
        # Clean each field once and reuse it for both the structured value
        # and the retrieval sentence.
        name = clean_text(row["name"])
        city = clean_text(row["city"])
        hotel_type = clean_text(row["type_hotels"])
        price = clean_text(row["price"])
        duration = clean_text(row["Duration"])
        persons = clean_text(row["persons"])
        taxes = clean_text(row["taxes"])

        # Keep only the digits; a value without any digit would make
        # int("") raise, so it counts as zero reviews instead.
        digits = re.sub("[^0-9]", "", str(row["review_number"]))
        review_count = int(digits) if digits else 0

        records.append({
            "id": f"hotel_{idx}",
            "name": name,
            "city": city,
            "type": hotel_type,
            "review_badge": float(row["review_badge"]),
            "review_score": clean_text(row["review_titel"]),
            "review_count": review_count,
            "price": price,
            "duration": duration,
            "persons": persons,
            "stars": clean_text(row["stars"]),
            "taxes": taxes,
            "favorite_partner": clean_text(row["Favorite_partner"]),
            "retrieval_text": f"Hotel {name} in {city} is a {hotel_type} with a rating of {row['review_badge']} ({row['review_number']} reviews). Price: {price} for {duration}, suitable for {persons}. Taxes: {taxes}."
        })

    # Save JSON output
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)

# Example usage: run the hotel conversion and write the records to
# "hotels_rag.json" (a relative path, resolved against the working directory).
# NOTE(review): two other cells in this notebook also write "hotels_rag.json",
# so the file holds the output of whichever cell ran last.
convert_csv_to_rag_json("hotels.csv", "hotels_rag.json")


In [5]:
import pandas as pd
import json

def csv_to_rag_json(csv_file, output_json):
    """Convert the hotels CSV into a JSON list of retrieval (RAG) records.

    Field values are copied from the CSV as-is (after filling missing cells
    with descriptive defaults) and a ``retrieval_text`` sentence is added to
    every record. The list is written to ``output_json`` as UTF-8 JSON with
    4-space indentation.

    Args:
        csv_file: Path of the CSV to read. When it is empty or does not exist,
            the dataset path this function used to hard-code is read instead
            (a warning is emitted), so existing calls keep working.
        output_json: Path of the JSON file to write.

    Raises:
        KeyError: If an expected column is missing from the CSV.
    """
    import os
    import warnings

    # Previously ``csv_file`` was ignored and this path was always read.
    legacy_csv = '/Users/pranavpadmanabhan/Documents/Project/Ai-Travel-Planner-main/data/OriginalHotelsDataset.csv'
    source = csv_file
    if not source or not os.path.exists(source):
        warnings.warn(
            f"CSV file {csv_file!r} was not found; reading the legacy dataset path {legacy_csv!r} instead."
        )
        source = legacy_csv

    # Load CSV file and fill NaN values with defaults
    df = pd.read_csv(source).fillna({
        "name": "Unknown Hotel",
        "city": "Unknown City",
        "type_hotels": "Unknown Type",
        "review_badge": "No rating",
        "review_titel": "No review title",
        "review_number": "0 reviews",
        "price": "Price not available",
        "Duration": "Duration not specified",
        "persons": "Persons not specified",
        "stars": "No star rating",
        "taxes": "Taxes not specified",
        "Favorite_partner": "No favorite partner info"
    })

    # Define the structured format for RAG
    records = []
    for idx, row in df.iterrows():
        # The sentence template appends the word "reviews"; values that already
        # contain it (such as the "0 reviews" default) must not get it twice.
        review_phrase = str(row["review_number"])
        if "review" not in review_phrase.lower():
            review_phrase = f"{review_phrase} reviews"

        records.append({
            "id": f"hotel_{idx}",
            "name": row["name"],
            "city": row["city"],
            "type": row["type_hotels"],
            "review_badge": row["review_badge"],
            "review_score": row["review_titel"],
            "review_count": row["review_number"],
            "price": row["price"],
            "duration": row["Duration"],
            "persons": row["persons"],
            "stars": row["stars"],
            "taxes": row["taxes"],
            "favorite_partner": row["Favorite_partner"],
            "retrieval_text": f"Hotel {row['name']} in {row['city']} is a {row['type_hotels']} with a rating of {row['review_badge']} ({review_phrase}). Price: {row['price']} for {row['Duration']}, suitable for {row['persons']}. Taxes: {row['taxes']}."
        })

    # Save JSON output
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)

# Example usage: run the uncleaned hotel conversion and write the records to
# "hotels_rag.json" (a relative path, resolved against the working directory).
# NOTE(review): two other cells in this notebook also write "hotels_rag.json",
# so the file holds the output of whichever cell ran last.
csv_to_rag_json("hotels.csv", "hotels_rag.json")


In [14]:
import pandas as pd
import json
import re

def clean_text(text):
    """Return a trimmed string, mapping missing values to ``"Unknown"``.

    A value counts as missing when ``pd.isna`` says so or when, after
    trimming surrounding whitespace, it is one of the placeholders ``"-"``
    or ``"NaN"``.

    Args:
        text: Raw cell value.

    Returns:
        str: The trimmed text, or ``"Unknown"`` for missing input.
    """
    if pd.isna(text):
        return "Unknown"
    stripped = str(text).strip()
    # Compare after stripping so padded placeholders such as " - " are also
    # treated as missing instead of being returned as "-".
    if stripped in ("-", "NaN"):
        return "Unknown"
    return stripped

def parse_rating(rating):
    """Extract the first number in ``rating`` as a float.

    Args:
        rating: Raw rating value; it is converted with ``str`` before parsing.

    Returns:
        float: The first integer or decimal number found, or ``0.0`` when the
        value contains no digits.
    """
    # The group must be non-capturing: with a capturing group ``re.findall``
    # returns only the group's text (".5" for "4.5", "" for "4"), which made
    # the old code return 0.5 or raise ValueError.
    match = re.search(r"\d+(?:\.\d+)?", str(rating))
    return float(match.group()) if match else 0.0

def parse_review_count(review_count):
    """Convert a scraped review count into an integer.

    Handles plain digits with surrounding noise (``"(250)"``, ``"1,234"``),
    the thousands notation ``"<number>T"`` (``"1.1T"`` -> 1100) and values
    that are already numeric.

    Args:
        review_count: Raw review-count value.

    Returns:
        int: The parsed count, or ``0`` when no number can be extracted.
    """
    import math
    import numbers

    # Already numeric: do not go through str(), where 120.0 would become
    # "120.0" and its digits "1200".
    if isinstance(review_count, numbers.Real):
        return int(review_count) if math.isfinite(review_count) else 0

    text = str(review_count)

    # Thousands notation. The decimal separator has to be parsed together with
    # the number: stripping it first turned "1.1T" into 11 * 1000.
    thousands = re.search(r"(\d+(?:[.,]\d+)?)\s*T", text)
    if thousands:
        value = float(thousands.group(1).replace(",", "."))
        return int(round(value * 1000))  # Convert '1.1T' → 1100

    digits = re.sub("[^0-9]", "", text)  # Remove unwanted characters
    return int(digits) if digits else 0  # Ensure integer

def convert_csv_to_rag_json(csv_file, output_json):
    """Convert the entertainment (theaters) CSV into RAG-ready JSON records.

    Each row becomes a dict with the cleaned name, numeric rating, numeric
    review count, genre, location and best comment, plus a ``retrieval_text``
    summary. The list is written to ``output_json`` as UTF-8 JSON with
    4-space indentation.

    Args:
        csv_file: Path of the CSV to read. When it is empty or does not exist,
            the dataset path this function used to hard-code is read instead
            (a warning is emitted), so existing calls keep working.
        output_json: Path of the JSON file to write.

    Raises:
        KeyError: If an expected column is missing from the CSV.
    """
    import os
    import warnings

    # Previously ``csv_file`` was ignored and this path was always read.
    legacy_csv = '/Users/pranavpadmanabhan/Documents/Project/Saudi-Dataset-main/Scraped Data(Saudi)/Kaggle/Entertainment_KSA.csv'
    source = csv_file
    if not source or not os.path.exists(source):
        warnings.warn(
            f"CSV file {csv_file!r} was not found; reading the legacy dataset path {legacy_csv!r} instead."
        )
        source = legacy_csv

    # Fill NaN values with defaults (non-inplace: returns a new frame)
    df = pd.read_csv(source).fillna({
        "name": "Unknown Theater",
        "rating": "0.0",
        "review_count": "0",
        "genre": "Unknown Genre",
        "location": "Unknown Location",
        "best_comment": "No comments available"
    })

    records = []
    for idx, row in df.iterrows():
        # Parse every field once; the same values feed the structured fields
        # and the retrieval sentence.
        name = clean_text(row["name"])
        rating = parse_rating(row["rating"])  # Extract numeric rating
        review_count = parse_review_count(row["review_count"])  # Extract clean review count
        genre = clean_text(row["genre"])
        location = clean_text(row["location"])
        best_comment = clean_text(row["best_comment"])

        records.append({
            "id": f"theater_{idx}",
            "name": name,
            "rating": rating,
            "review_count": review_count,
            "genre": genre,
            "location": location,
            "best_comment": best_comment,
            "retrieval_text": (
                f"{name} is a {genre} located in {location}. "
                f"It has a rating of {rating} based on {review_count} reviews. "
                f"Best comment: \"{best_comment}\"."
            )
        })

    # Save JSON output
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)

# Example usage: run the theater conversion and write the records to
# "theaters_rag.json" (a relative path, resolved against the working directory).
convert_csv_to_rag_json("theaters.csv", "theaters_rag.json")


In [17]:
import os
import pandas as pd
import json

def clean_text(text):
    """Return ``text`` as a whitespace-trimmed string, or "Unknown" when it is missing."""
    return "Unknown" if pd.isna(text) else str(text).strip()

def convert_hotels_csv_to_json(csv_file, output_json):
    """Read a hotels CSV and convert it into structured JSON records.

    Each row becomes a dict with the hotel name, distance from the airport,
    rating, review count, room type and prices, plus a ``retrieval_text``
    summary. The list is written to ``output_json`` as UTF-8 JSON with
    4-space indentation.

    Args:
        csv_file: Path of the CSV file to read.
        output_json: Path of the JSON file to write.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If an expected column is missing from the CSV.
    """
    # Ensure the file exists
    if not os.path.exists(csv_file):
        raise FileNotFoundError(f"Error: The file '{csv_file}' was not found. Check the path and try again.")

    # Read the file that was just validated (a hard-coded path used to be read
    # here, which made the ``csv_file`` argument and the check above pointless).
    unknown_distance = "Unknown Distance"
    df = pd.read_csv(csv_file).fillna({
        "name": "Unknown Hotel",
        "distance_from_airport": unknown_distance,
        "rating": 0.0,
        "reviews": 0,
        "room_type": "Unknown Room Type",
        "current_price": 0.0,
        "default_price": 0.0
    })

    # Convert data to structured JSON format
    records = []
    for idx, row in df.iterrows():
        name = clean_text(row["name"])
        room_type = clean_text(row["room_type"])
        current_price = int(row["current_price"])
        default_price = int(row["default_price"])

        # Only attach the unit to a real distance; the fill-in default would
        # otherwise be rendered as "Unknown Distance km".
        distance = row["distance_from_airport"]
        if isinstance(distance, str) and distance == unknown_distance:
            distance_label = unknown_distance
            location_sentence = f"{name} is located at an unknown distance from the airport. "
        else:
            distance_label = f"{distance} km"
            location_sentence = f"{name} is located {distance} km from the airport. "

        records.append({
            "id": f"hotel_{idx}",
            "name": name,
            "distance_from_airport": distance_label,
            "rating": round(float(row["rating"]), 1),
            "reviews": int(row["reviews"]),
            "room_type": room_type,
            "current_price": f"SAR {current_price}",
            "default_price": f"SAR {default_price}",
            "retrieval_text": (
                location_sentence
                + f"It has a rating of {row['rating']} based on {row['reviews']} reviews. "
                + f"The available room type is '{room_type}', priced at SAR {current_price} "
                + f"(original price: SAR {default_price})."
            )
        })

    # Save JSON output
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)

# Example usage: convert the airport-distance hotel dataset and write the
# records to "hotels_rag.json" in the working directory.
# NOTE(review): the combine cell later reads
# "Distance from airport_hotels_rag.json"; no cell in this notebook writes a
# file with that name - presumably this output was renamed by hand; confirm.
csv_file = "/Users/pranavpadmanabhan/Documents/Project/Saudi-Dataset-main/Scraped Data(Saudi)/Kaggle/hotel_distance_from_airport.csv"
convert_hotels_csv_to_json(csv_file, "hotels_rag.json")


In [19]:
pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.


In [20]:
import os
import pandas as pd
import json

def clean_text(text):
    """Trim surrounding whitespace from a value; missing values yield "Unknown"."""
    is_missing = pd.isna(text)
    if not is_missing:
        as_string = str(text)
        return as_string.strip()
    return "Unknown"

def _parse_cuisines(raw):
    """Turn the stored ``cuisines`` cell into a list of cuisine names."""
    import ast

    if not isinstance(raw, str):
        return []
    try:
        # SECURITY: this used to be ``eval(raw)``, which executes arbitrary
        # code found in the data file. ``ast.literal_eval`` only accepts
        # Python literals such as "['Italian', 'Pizza']".
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Not a Python literal (e.g. "Italian, Pizza"): best effort, treat the
        # cell as comma-separated text instead of aborting the whole export.
        parsed = raw.split(",")
    if not isinstance(parsed, (list, tuple, set)):
        parsed = [parsed]
    names = (str(item).strip() for item in parsed)
    return [name for name in names if name]


def convert_restaurants_csv_to_json(csv_file, output_json):
    """Read a restaurants file and convert it into structured JSON records.

    Despite the parameter name, Excel workbooks (``.xlsx``/``.xlsm``/``.xls``)
    are read with ``pd.read_excel``; any other extension is read as CSV.
    Each row becomes a dict with id, name, review count, rating, location and
    cuisines, plus a ``retrieval_text`` summary. The list is written to
    ``output_json`` as UTF-8 JSON with 4-space indentation.

    Args:
        csv_file: Path of the data file to read.
        output_json: Path of the JSON file to write.

    Raises:
        FileNotFoundError: If ``csv_file`` does not exist.
        KeyError: If an expected column is missing from the file.
    """
    # Ensure the file exists
    if not os.path.exists(csv_file):
        raise FileNotFoundError(f"Error: The file '{csv_file}' was not found. Check the path and try again.")

    # Read the file that was just validated (a hard-coded workbook path used
    # to be read here, which made the ``csv_file`` argument pointless).
    extension = os.path.splitext(csv_file)[1].lower()
    if extension in (".xlsx", ".xlsm", ".xls"):
        raw_df = pd.read_excel(csv_file)
    else:
        raw_df = pd.read_csv(csv_file)

    # Fill NaN values with defaults
    df = raw_df.fillna({
        "restaurant_id": "Unknown ID",
        "name": "Unknown Restaurant",
        "reviews": 0,
        "rating": 0.0,
        "latitude": 0.0,
        "longitude": 0.0,
        "cuisines": "[]"
    })

    # Convert data to structured JSON format
    records = []
    for _, row in df.iterrows():
        name = clean_text(row["name"])
        cuisines = _parse_cuisines(row["cuisines"])
        records.append({
            "id": f"restaurant_{row['restaurant_id']}",
            "name": name,
            "reviews": int(row["reviews"]),
            "rating": round(float(row["rating"]), 1),
            "location": {
                "latitude": float(row["latitude"]),
                "longitude": float(row["longitude"])
            },
            "cuisines": cuisines,
            "retrieval_text": (
                f"{name} has a rating of {row['rating']} based on {row['reviews']} reviews. "
                f"It offers {', '.join(cuisines)} cuisine. "
                f"The restaurant is located at latitude {row['latitude']} and longitude {row['longitude']}."
            )
        })

    # Save JSON output
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(records, f, indent=4, ensure_ascii=False)

# Example usage: convert the Riyadh restaurants dataset and write the records
# to "restaurants_rag.json" in the working directory.
# Note: despite its name, `csv_file` points at an Excel workbook (.xlsx).
csv_file = "/Users/pranavpadmanabhan/Documents/Project/Saudi-Dataset-main/Scraped Data(Saudi)/Kaggle/Riyadh_Resturants.xlsx"
convert_restaurants_csv_to_json(csv_file, "restaurants_rag.json")


# Combine the converted datasets into a single JSON file


In [23]:
import json

def combine_data(hotels_file, restaurants_file, entertainment_file, output_file):
    """Merge three JSON files into one JSON document keyed by category.

    The parsed content of each input file is stored under the keys
    ``"hotels"``, ``"restaurants"`` and ``"entertainment"`` (in that order).
    The result is written to ``output_file`` as UTF-8 JSON with 4-space
    indentation, and a confirmation line is printed.

    Args:
        hotels_file: Path of the JSON file stored under ``"hotels"``.
        restaurants_file: Path of the JSON file stored under ``"restaurants"``.
        entertainment_file: Path of the JSON file stored under ``"entertainment"``.
        output_file: Path of the combined JSON file to write.
    """
    # Section name -> input path, in the order the sections appear in the output.
    sections = (
        ("hotels", hotels_file),
        ("restaurants", restaurants_file),
        ("entertainment", entertainment_file),
    )

    # Every input is loaded before the output is opened for writing.
    combined_data = {}
    for section, path in sections:
        with open(path, "r", encoding="utf-8") as source:
            combined_data[section] = json.load(source)

    with open(output_file, "w", encoding="utf-8") as target:
        json.dump(combined_data, target, indent=4, ensure_ascii=False)

    print(f"Combined data saved to {output_file}")

# Example usage: keyword arguments make sure each file lands under its matching
# key. The positional call used here before passed the restaurants file as
# `hotels_file` and the hotels file as `restaurants_file`, so the "hotels" and
# "restaurants" sections of the combined output were swapped.
combine_data(
    hotels_file="Distance from airport_hotels_rag.json",
    restaurants_file="restaurants_rag.json",
    entertainment_file="theaters_rag.json",
    output_file="combined_rag_data.json",
)


Combined data saved to combined_rag_data.json
