In [11]:
import pandas as pd
import random
from faker import Faker

In [12]:
user_df = pd.read_csv("adjusted_datasets/users.csv", encoding="utf-8", encoding_errors="replace") 
history_df = pd.read_csv("adjusted_datasets/histories.csv", encoding="utf-8", encoding_errors="replace") 

In [13]:
import pandas as pd
import random

# Sample user_df and history_df for illustration
# Assuming `user_df` and `history_df` are already available

# Initialize an empty list to store the preference data
preference_data = []

# Loop through each user in user_df
for user_id in user_df["user_id"]:
    # Filter history_df for the given user_id
    user_history = history_df[history_df["user_id"] == user_id]
    
    # Get the most frequent city, country, flight departures, and arrivals
    preferred_city = user_history["city"].mode()[0] if not user_history["city"].isna().all() else "N/A"
    preferred_country = user_history["country"].mode()[0] if not user_history["country"].isna().all() else "N/A"
    
    frequent_flight_dep = user_history["flight_dep"].mode()[0] if not user_history["flight_dep"].isna().all() else "N/A"
    frequent_flight_arr = user_history["flight_arr"].mode()[0] if not user_history["flight_arr"].isna().all() else "N/A"
    
    # Calculate average length of stay
    avg_length_of_stay = user_history["length_of_stay"].mean() if not user_history["length_of_stay"].isna().all() else 0
    
    # Get the most frequent restaurants and hotels (top 3)
    frequent_restaurants = pd.Series([item for sublist in user_history["restaurants"].dropna() for item in sublist]).mode().head(3).tolist()
    frequent_hotels = pd.Series([item for sublist in user_history["hotels"].dropna() for item in sublist]).mode().head(3).tolist()

    # Add a new preference record for this user
    preference_data.append({
        "user_id": user_id,
        "preferred_city": preferred_city,
        "preferred_country": preferred_country,
        "frequent_flight_dep": frequent_flight_dep,
        "frequent_flight_arr": frequent_flight_arr,
        "avg_length_of_stay": avg_length_of_stay,
        "frequent_restaurants": frequent_restaurants,
        "frequent_hotels": frequent_hotels
    })

# Create the preference dataset as a DataFrame
df = pd.DataFrame(preference_data)

print(df)


                                 user_id preferred_city preferred_country  \
0   USERccb2497588b9434895882cb039bfa99d           Doha             Qatar   
1   USER6a7821ca652142bba31920546eaf3e23         Yangon     Burma/Myanmar   
2   USER2917b5dc7f184e77a811296c6c4b2164          Dakar           Senegal   
3   USER35d8f06804604d618e7ad1f3a886273a       Arequipa              Peru   
4   USER0f9a51f2e47c416bb99f0b644778a9e6         Geneva       Switzerland   
..                                   ...            ...               ...   
95  USER261e41a6470044138cfa8fec0588f904    Los Angeles     United States   
96  USER92275520360749d5b6ab169819f708a4        Calgary            Canada   
97  USER10b43a8df5214567bd95b7a6db919b9d       Belgrade            Serbia   
98  USERa57c118fd7db4efb96406c8efd4afe23      Rotterdam       Netherlands   
99  USER30c6e5ca39284f0493695f869b8f00b4          Split           Croatia   

   frequent_flight_dep frequent_flight_arr  avg_length_of_stay  \
0        

In [14]:
import uuid

df["preference_id"] = df.apply(lambda _: "PRE" + uuid.uuid4().hex[:6], axis=1)

df = df[["preference_id"] + [col for col in df.columns if col != "preference_id"]]

# Display the result
print(df.to_markdown())

|    | preference_id   | user_id                              | preferred_city   | preferred_country    | frequent_flight_dep   | frequent_flight_arr   |   avg_length_of_stay | frequent_restaurants   | frequent_hotels   |
|---:|:----------------|:-------------------------------------|:-----------------|:---------------------|:----------------------|:----------------------|---------------------:|:-----------------------|:------------------|
|  0 | PREebd9d5       | USERccb2497588b9434895882cb039bfa99d | Doha             | Qatar                | N/A                   | FLY7af30c             |                    9 | ["'"]                  | ['[', ']']        |
|  1 | PREa8d167       | USER6a7821ca652142bba31920546eaf3e23 | Yangon           | Burma/Myanmar        | N/A                   | N/A                   |                   11 | ['[', ']']             | ['[', ']']        |
|  2 | PRE5d66a7       | USER2917b5dc7f184e77a811296c6c4b2164 | Dakar            | Senegal              | N/A   

In [15]:
import os

# Define the folder path
folder_path = "adjusted_datasets"

# Ensure the folder exists
os.makedirs(folder_path, exist_ok=True)

# Define the full file path
file_path = os.path.join(folder_path, "preferences.csv")

# Save the DataFrame as a CSV file
df.to_csv(file_path, index=False, encoding='utf-8')

print(f"CSV file saved successfully at: {file_path}")

CSV file saved successfully at: adjusted_datasets\preferences.csv
