In [143]:
import pandas as pd
import random
from faker import Faker

In [144]:
# Load every input table from the adjusted_datasets folder. All files are read
# as UTF-8; bytes that cannot be decoded are replaced instead of raising.
_csv_opts = {"encoding": "utf-8", "encoding_errors": "replace"}

user_df = pd.read_csv("adjusted_datasets/users.csv", **_csv_opts)
history_df = pd.read_csv("adjusted_datasets/histories.csv", **_csv_opts)
flight_df = pd.read_csv("adjusted_datasets/adjusted_flights.csv", **_csv_opts)
hotel_df = pd.read_csv("adjusted_datasets/adjusted_hotels.csv", **_csv_opts)
restaurant_df = pd.read_csv("adjusted_datasets/adjusted_restaurants.csv", **_csv_opts)


In [145]:
import pandas as pd
from collections import Counter
import uuid
import ast  # To safely convert stringified lists to actual lists
import random

# Build one preference record per user from their travel history and collect
# the records into `preferences_df`.
#
# Reads:    history_df, flight_df, hotel_df, restaurant_df
# Produces: all_possible_cities, user_preferences, preferences_df

# Tunable constants for this cell.
RANDOM_SEED = 42            # fixed seed -> bucket lists and ids are reproducible
TOP_N = 3                   # how many entries each "top ..." list keeps
FLIGHT_PRICE_STEP = 100     # width (EUR) of the flight price buckets
HOTEL_PRICE_STEP = 50       # width (EUR) of the hotel price buckets
MAX_BUCKET_LIST_SIZE = 10   # upper bound on bucket-list destinations per user
ID_HEX_DIGITS = 6           # number of hex digits after the "PRE" prefix


def _parse_id_lists(column):
    """Flatten a column of (possibly stringified) lists into one flat list.

    String cells are parsed with ``ast.literal_eval``. Cells that cannot be
    parsed, or that do not evaluate to a list, are skipped so that a single
    malformed row does not abort the whole run.
    """
    flattened = []
    for cell in column:
        if isinstance(cell, str):
            try:
                cell = ast.literal_eval(cell)
            except (ValueError, TypeError, SyntaxError):
                continue
        if isinstance(cell, list):
            flattened.extend(cell)
    return flattened


def _top_n(values, n=TOP_N):
    """Return the ``n`` most frequent items of ``values``, most frequent first."""
    return [item for item, _ in Counter(values).most_common(n)]


def _price_bucket(prices, step):
    """Return the ``"low-high€"`` bucket of width ``step`` holding the mean price.

    ``prices`` is a list of numbers; returns ``None`` when it is empty.
    """
    if not prices:
        return None
    low = int((sum(prices) / len(prices)) // step) * step
    return f"{low}-{low + step - 1}€"


def _unique_preference_ids(count, rng):
    """Return ``count`` distinct ids shaped like ``"PRE"`` + hex digits.

    Candidates are drawn from ``rng`` and re-drawn on collision, because a
    short random suffix alone does not guarantee uniqueness.

    Raises:
        ValueError: if ``count`` exceeds the number of distinct ids available.
    """
    capacity = 16 ** ID_HEX_DIGITS
    if count > capacity:
        raise ValueError(f"cannot generate {count} unique ids from {capacity} possible values")
    seen = set()
    ids = []
    while len(ids) < count:
        candidate = f"PRE{rng.getrandbits(4 * ID_HEX_DIGITS):0{ID_HEX_DIGITS}x}"
        if candidate not in seen:
            seen.add(candidate)
            ids.append(candidate)
    return ids


# Private generator so the results do not depend on the global `random` state.
_rng = random.Random(RANDOM_SEED)

# Dictionary to store user preferences, keyed by user_id
user_preferences = {}

# Every city known to any dataset (candidate pool for bucket lists).
# Missing values are dropped so NaN can never be suggested as a destination.
all_possible_cities = (
    set(flight_df["City Arr"].dropna())
    | set(hotel_df["City"].dropna())
    | set(restaurant_df["City"].dropna())
)

# groupby splits the history once instead of re-filtering it for every user.
# sort=False keeps users in order of first appearance; rows whose user_id is
# missing are skipped (groupby drops NaN keys by default).
for user_id, user_history in history_df.groupby("user_id", sort=False):
    # Most visited cities
    top_cities = _top_n(user_history["city"].dropna())

    # Most booked hotels and restaurants (ids stored as lists per history row)
    all_hotels = _parse_id_lists(user_history["hotels"])
    all_restaurants = _parse_id_lists(user_history["restaurants"])
    top_hotels = _top_n(all_hotels)
    top_restaurants = _top_n(all_restaurants)

    # Most used airlines: counted over the flight_df rows matching the user's
    # flight ids, so each matching flight row counts once.
    all_flights = user_history["flight_id"].tolist()
    flight_subset = flight_df[flight_df["flight_id"].isin(all_flights)]
    top_airlines = _top_n(flight_subset["Airline"].dropna())

    # Majority visa requirement across the user's trips (None if unknown)
    visa_votes = _top_n(user_history["visa_required"].dropna(), n=1)
    travel_visa_preference = visa_votes[0] if visa_votes else None

    # Preferred flight price range (bucket around the mean fare)
    preferred_flight_price_range = _price_bucket(
        flight_subset["Total Fare (EUR)"].dropna().tolist(), FLIGHT_PRICE_STEP
    )

    # Preferred hotel price range (bucket around the mean price)
    user_hotels = hotel_df[hotel_df["hotel_id"].isin(all_hotels)]
    preferred_hotel_price_range = _price_bucket(
        user_hotels["price"].dropna().tolist(), HOTEL_PRICE_STEP
    )

    # Preferred cuisines, counted over the restaurant_df rows the user visited
    user_restaurants = restaurant_df[restaurant_df["restaurant_id"].isin(all_restaurants)]
    top_cuisines = _top_n(user_restaurants["Cuisines"].dropna())

    # Bucket list: a random handful of cities the user has not visited yet.
    # The candidates are sorted first because set iteration order is not
    # stable between runs, which would defeat the fixed seed.
    visited_cities = set(user_history["city"].dropna())
    possible_cities = sorted(all_possible_cities - visited_cities, key=str)
    if possible_cities:
        num_cities = _rng.randint(1, min(MAX_BUCKET_LIST_SIZE, len(possible_cities)))
        bucket_list_destinations = _rng.sample(possible_cities, num_cities)
    else:
        bucket_list_destinations = []

    # Store user preferences
    user_preferences[user_id] = {
        "user_id": user_id,
        "top_cities": top_cities,
        "top_hotels": top_hotels,
        "top_restaurants": top_restaurants,
        "top_airlines": top_airlines,
        "visa_preference": travel_visa_preference,
        "preferred_flight_price_range": preferred_flight_price_range,
        "preferred_hotel_price_range": preferred_hotel_price_range,
        "preferred_cuisines": top_cuisines,
        "bucket_list_destinations": bucket_list_destinations,
    }

# Convert dictionary to DataFrame (one row per user)
preferences_df = pd.DataFrame.from_dict(user_preferences, orient="index").reset_index(drop=True)

# Add a preference_id that is guaranteed to be unique within this table
preferences_df["preference_id"] = _unique_preference_ids(len(preferences_df), _rng)

# Reorder columns so preference_id comes first
preferences_df = preferences_df[["preference_id"] + [col for col in preferences_df.columns if col != "preference_id"]]

# Display the result
print(preferences_df.to_markdown())


|     | preference_id   | user_id      | top_cities                         | top_hotels                              | top_restaurants                               | top_airlines                                                          | visa_preference   | preferred_flight_price_range   | preferred_hotel_price_range   | preferred_cuisines                                                                       | bucket_list_destinations                                                                                                        |
|----:|:----------------|:-------------|:-----------------------------------|:----------------------------------------|:----------------------------------------------|:----------------------------------------------------------------------|:------------------|:-------------------------------|:------------------------------|:-----------------------------------------------------------------------------------------|:--------------------------------------

In [146]:
# import pandas as pd
# from collections import Counter
# import uuid
# import ast  # To safely convert stringified lists to actual lists

# # Dictionary to store user preferences
# user_preferences = {}

# # Iterate through each user's history
# for user_id in history_df["user_id"].unique():
#     user_history = history_df[history_df["user_id"] == user_id]

#     # 🔹 Get top 3 cities visited
#     city_counts = Counter(user_history["city"])
#     top_cities = [city for city, _ in city_counts.most_common(3)]

#     # 🔹 Get top 3 hotels
#     all_hotels = []
#     for hotels_list in user_history["hotels"]:
#         if isinstance(hotels_list, str):  # Convert stringified lists to actual lists
#             hotels_list = ast.literal_eval(hotels_list)  
#         if isinstance(hotels_list, list):  
#             all_hotels.extend(hotels_list)

#     hotel_counts = Counter(all_hotels)
#     top_hotels = [hotel for hotel, _ in hotel_counts.most_common(3)]

#     # 🔹 Get top 3 restaurants
#     all_restaurants = []
#     for restaurants_list in user_history["restaurants"]:
#         if isinstance(restaurants_list, str):  
#             restaurants_list = ast.literal_eval(restaurants_list)  
#         if isinstance(restaurants_list, list):  
#             all_restaurants.extend(restaurants_list)

#     restaurant_counts = Counter(all_restaurants)
#     top_restaurants = [restaurant for restaurant, _ in restaurant_counts.most_common(3)]

#     # 🔹 Get top 3 airlines
#     all_flights = user_history["flight_id"].tolist()
#     flight_subset = flight_df[flight_df["flight_id"].isin(all_flights)]  # Filter flights
#     airline_counts = Counter(flight_subset["Airline"])
#     top_airlines = [airline for airline, _ in airline_counts.most_common(3)]

#     # 🔹 Determine majority visa requirement
#     visa_counts = Counter(user_history["visa_required"])
#     travel_visa_preference = visa_counts.most_common(1)[0][0] if visa_counts else "Unknown"

#     # 🔹 Store user preferences
#     user_preferences[user_id] = {
#         "user_id": user_id,
#         "top_cities": top_cities if top_cities else [],
#         "top_hotels": top_hotels if top_hotels else [],
#         "top_restaurants": top_restaurants if top_restaurants else [],
#         "top_airlines": top_airlines if top_airlines else [],
#         "visa_preference": travel_visa_preference 
#     }

# # Convert dictionary to DataFrame
# preferences_df = pd.DataFrame.from_dict(user_preferences, orient="index").reset_index(drop=True)

# # Add unique preference_id
# preferences_df["preference_id"] = preferences_df.apply(lambda _: "PRE" + uuid.uuid4().hex[:6], axis=1)

# # Reorder columns
# preferences_df = preferences_df[["preference_id"] + [col for col in preferences_df.columns if col != "preference_id"]]

# # Display the result
# print(preferences_df.to_markdown())


In [147]:
import os

# Destination: <folder_path>/preferences.csv
folder_path = "adjusted_datasets"
file_path = os.path.join(folder_path, "preferences.csv")

# Create the output folder if it is missing; an existing folder is left alone
os.makedirs(folder_path, exist_ok=True)

# Write the preferences table as UTF-8 CSV without the row index
preferences_df.to_csv(file_path, encoding='utf-8', index=False)

print(f"CSV file saved successfully at: {file_path}")

CSV file saved successfully at: adjusted_datasets\preferences.csv
