In [93]:
import pandas as pd
import random
from faker import Faker

In [94]:
# Load the three input tables with identical decoding options; undecodable
# bytes are replaced rather than raising.
user_df, history_df, flight_df = (
    pd.read_csv(csv_path, encoding="utf-8", encoding_errors="replace")
    for csv_path in (
        "adjusted_datasets/users.csv",
        "adjusted_datasets/histories.csv",
        "adjusted_datasets/adjusted_flights.csv",
    )
)


In [95]:
import pandas as pd
from collections import Counter
import uuid
import ast  # To safely convert stringified lists to actual lists

# Number of favourites kept per category.
TOP_N = 3

# Every id is "PRE" followed by this many hex characters.
PREFERENCE_ID_HEX_LEN = 6


def _parse_id_list(raw_value):
    """Return the list stored in one ``hotels`` / ``restaurants`` cell.

    Strings are parsed with ``ast.literal_eval``; actual lists are passed
    through unchanged. Anything else (NaN, a string that is not a valid
    literal, a literal that is not a list) yields an empty list, so a single
    bad row is skipped instead of aborting the whole computation.
    """
    if isinstance(raw_value, str):
        try:
            raw_value = ast.literal_eval(raw_value)
        except (ValueError, SyntaxError, TypeError):
            return []
    return raw_value if isinstance(raw_value, list) else []


def _most_common(values, n=TOP_N):
    """Return up to ``n`` of the most frequent items in ``values``, most frequent first."""
    return [value for value, _ in Counter(values).most_common(n)]


def _new_preference_ids(count):
    """Return ``count`` distinct ids of the form ``"PRE" + 6 hex characters``.

    A 6-character hex suffix has only 16**6 possible values, so independent
    draws can collide; candidates are redrawn until every id is distinct.

    Raises:
        ValueError: if ``count`` exceeds the number of possible ids.
    """
    if count > 16 ** PREFERENCE_ID_HEX_LEN:
        raise ValueError("more preference ids requested than the id format can represent")
    seen = set()
    ids = []
    while len(ids) < count:
        candidate = "PRE" + uuid.uuid4().hex[:PREFERENCE_ID_HEX_LEN]
        if candidate not in seen:
            seen.add(candidate)
            ids.append(candidate)
    return ids


# flight_id -> Airline lookup, built once outside the loop. Rows without a
# flight_id are dropped and the first row wins for a repeated flight_id, so the
# lookup index is unique (required by Series.map).
airline_by_flight = (
    flight_df.dropna(subset=["flight_id"])
    .drop_duplicates(subset="flight_id")
    .set_index("flight_id")["Airline"]
)

# Dictionary to store user preferences, keyed by user_id
user_preferences = {}

# One pass over the history; sort=False keeps users in order of first
# appearance. Rows with a missing user_id are left out by groupby.
for user_id, user_history in history_df.groupby("user_id", sort=False):
    # 🔹 Hotels / restaurants: flatten the per-row lists before counting
    all_hotels = [
        hotel
        for cell in user_history["hotels"]
        for hotel in _parse_id_list(cell)
    ]
    all_restaurants = [
        restaurant
        for cell in user_history["restaurants"]
        for restaurant in _parse_id_list(cell)
    ]

    # 🔹 Airlines: look up the airline of every history row so that a flight
    # taken several times counts several times; unknown flights are dropped.
    trip_airlines = user_history["flight_id"].map(airline_by_flight).dropna()

    # 🔹 Store user preferences (missing cities are not counted)
    user_preferences[user_id] = {
        "user_id": user_id,
        "top_cities": _most_common(user_history["city"].dropna()),
        "top_hotels": _most_common(all_hotels),
        "top_restaurants": _most_common(all_restaurants),
        "top_airlines": _most_common(trip_airlines),
    }

# Convert dictionary to DataFrame; explicit columns keep the schema stable
# even when there is no history at all.
preferences_df = pd.DataFrame(
    list(user_preferences.values()),
    columns=["user_id", "top_cities", "top_hotels", "top_restaurants", "top_airlines"],
)

# Add unique preference_id as the first column
preferences_df.insert(0, "preference_id", _new_preference_ids(len(preferences_df)))

# Display the result
print(preferences_df.to_markdown())


|    | preference_id   | user_id      | top_cities                         | top_hotels                              | top_restaurants                               | top_airlines                                                        |
|---:|:----------------|:-------------|:-----------------------------------|:----------------------------------------|:----------------------------------------------|:--------------------------------------------------------------------|
|  0 | PRE6e28cc       | USER1c438113 | ['Istanbul', 'Dubai', 'New York']  | ['HOT46ddb2', 'HOTc22832', 'HOT81dfcb'] | ['RESTO8b1c60', 'RESTO82af76', 'RESTOd65a0c'] | ['US-Bangla Airlines', 'Air India', 'IndiGo']                       |
|  1 | PREdaa68f       | USER53194250 | ['Dubai', 'New York', 'London']    | ['HOT280fd9', 'HOT179af9', 'HOT13e0df'] | ['RESTObe3105', 'RESTO301769', 'RESTO588994'] | ['Singapore Airlines', 'Lufthansa', 'Air Astra']                    |
|  2 | PRE33ece5       | USER8a741dde | ['Dubai', 'I

In [99]:
import os

# Define the folder path
folder_path = "adjusted_datasets"

# Ensure the folder exists
os.makedirs(folder_path, exist_ok=True)

# Define the full file path
file_path = os.path.join(folder_path, "preferences.csv")

# Save the preferences table as a CSV file. The original wrote `df`, a name no
# cell in this notebook defines, so it only worked with leftover kernel state.
preferences_df.to_csv(file_path, index=False, encoding='utf-8')

print(f"CSV file saved successfully at: {file_path}")

CSV file saved successfully at: adjusted_datasets\preferences.csv
