In [2]:
import pandas as pd
from tqdm import tqdm

# Load your datasets
destination_df = pd.read_csv('cleaned_destination_data.csv')
user_df = pd.read_csv('cleaned_user_data.csv')

# Drop unnecessary columns ('Name', 'Email') from user dataset
user_df_cleaned = user_df.drop(columns=['Name', 'Email'])

# Handle missing values in destination dataset
destination_df['rating'].fillna(destination_df['rating'].mean(), inplace=True)
destination_df['user_ratings_total'].fillna(destination_df['user_ratings_total'].mean(), inplace=True)

# Create a combined score based on rating and total reviews (weighted score)
destination_df['combined_score'] = (0.7 * destination_df['rating']) + (0.3 * (destination_df['user_ratings_total'] / destination_df['user_ratings_total'].max()))

# Convert the 'Bucket list destinations mapped' to actual lists
user_df_cleaned['Bucket list destinations mapped'] = user_df_cleaned['Bucket list destinations mapped'].apply(lambda x: eval(x))

In [5]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
import numpy as np
from tqdm import tqdm

# Assuming destination_df and user_df_cleaned are preprocessed as before

# 1. Build the User-Destination interaction matrix (Collaborative Filtering)
# We will use the "Bucket list destinations mapped" to generate a user-item matrix

# Convert 'Bucket list destinations mapped' to lists (if they are strings)
def safe_eval(x):
    if isinstance(x, str):
        return eval(x)  # Convert string to list
    return x  # Already a list

# Apply the safe_eval function
user_df_cleaned['Bucket list destinations mapped'] = user_df_cleaned['Bucket list destinations mapped'].apply(safe_eval)

# Create an empty user-destination matrix (users x destinations)
num_users = user_df_cleaned.shape[0]
num_destinations = destination_df.shape[0]
user_destination_matrix = np.zeros((num_users, num_destinations))

# Populate the matrix where each user has marked a destination in their bucket list
for idx, bucket_list in tqdm(user_df_cleaned['Bucket list destinations mapped'].items(), desc="Building User-Destination Matrix"):
    for destination_id in bucket_list:
        if destination_id < num_destinations:  # Ensure within bounds
            user_destination_matrix[idx, destination_id] = 1  # Mark presence

# 2. Apply SVD (Singular Value Decomposition) for collaborative filtering
svd = TruncatedSVD(n_components=50, random_state=42)
user_latent_matrix = svd.fit_transform(user_destination_matrix)
destination_latent_matrix = svd.components_.T

# 3. Generate recommendations based on similar users
# Compute the dot product between user latent features and destination latent features
user_recommendation_scores = np.dot(user_latent_matrix, destination_latent_matrix.T)

# For each user, sort destinations by recommendation score and select top 5
top_n = 5
user_recommendations = {}
for user_idx in range(num_users):
    top_destinations = np.argsort(-user_recommendation_scores[user_idx])[:top_n]
    user_recommendations[user_idx] = top_destinations

    
# Now user_recommendations contains the top 5 destinations for each user based on collaborative filtering

Building User-Destination Matrix: 10000it [00:00, 668446.94it/s]


In [6]:
user_recommendations

{0: array([23, 78, 95, 29, 62]),
 1: array([134,  17,  13,  15, 124]),
 2: array([  1,  45,  33, 100,   0]),
 3: array([ 45,  47,  44,  59, 124]),
 4: array([ 8, 10, 45, 47, 53]),
 5: array([  8,  10, 109,  24,  22]),
 6: array([45, 76, 59, 65, 64]),
 7: array([21, 30, 44, 62, 63]),
 8: array([10,  8, 87, 86, 88]),
 9: array([ 45,  47, 125,  95, 391]),
 10: array([65, 64, 42, 41, 38]),
 11: array([ 12,  13,  15,  14, 127]),
 12: array([76, 42, 41, 40, 38]),
 13: array([  8,  27,  29, 125, 132]),
 14: array([ 61,  11, 112, 113, 120]),
 15: array([62, 63, 29, 27, 93]),
 16: array([ 18,  16,  19, 127,  15]),
 17: array([76, 10, 12, 13, 15]),
 18: array([ 77,  21,  92, 128, 103]),
 19: array([  1,  11,  12,   4, 127]),
 20: array([  9,  23, 112, 113, 120]),
 21: array([ 23, 109,  80,  95, 101]),
 22: array([109,  97,  96,  21,  80]),
 23: array([ 24,  22,  30,  56, 131]),
 24: array([17, 10, 22, 16, 79]),
 25: array([80, 28, 16, 87, 86]),
 26: array([51, 47, 21, 75, 74]),
 27: array([112, 

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Use TF-IDF to encode the destination reviews (you can replace this with embeddings for better semantic understanding)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
destination_tfidf = tfidf_vectorizer.fit_transform(destination_df['cleaned_reviews'])

# 2. Use the userâ€™s preferred activities to find similar destinations
user_preference_vectors = tfidf_vectorizer.transform(user_df_cleaned['Preferred Activities'])

# 3. Compute the cosine similarity between user preferences and destination reviews
user_destination_similarity = cosine_similarity(user_preference_vectors, destination_tfidf)

# 4. For each user, recommend destinations based on similarity
user_content_based_recommendations = {}
for user_idx in range(num_users):
    top_destinations = np.argsort(-user_destination_similarity[user_idx])[:top_n]
    user_content_based_recommendations[user_idx] = top_destinations

In [8]:
user_content_based_recommendations

{0: array([149, 276, 284, 165, 307]),
 1: array([122, 123, 124,  20,  63]),
 2: array([ 46,  44, 255, 130, 191]),
 3: array([122, 123, 124,  73, 340]),
 4: array([  8, 283,  78, 351, 257]),
 5: array([ 92,  55,  58,  56, 344]),
 6: array([ 73, 340, 358, 363, 258]),
 7: array([ 63,  62, 123, 124, 312]),
 8: array([ 88, 319, 179, 214,  85]),
 9: array([140,  70,  45,  72, 370]),
 10: array([ 39, 106, 171, 236,  42]),
 11: array([205, 290, 322,  69,  68]),
 12: array([289, 128,  39, 171, 130]),
 13: array([ 80, 140,  70,  45, 178]),
 14: array([103, 331, 134,  69,  60]),
 15: array([133,  80,  63, 132, 139]),
 16: array([128, 289, 243,  80,  20]),
 17: array([ 20, 335, 334,  14, 269]),
 18: array([125,  92,  77, 106, 275]),
 19: array([205, 290,  69,  68, 246]),
 20: array([ 39, 171, 103, 236, 106]),
 21: array([133,  55, 132,  58, 139]),
 22: array([55, 58, 56, 92, 80]),
 23: array([342, 344, 187, 183, 148]),
 24: array([ 18,  15,  14, 128,  34]),
 25: array([ 88,  87, 179,  85, 319]),
 

In [9]:
# Combine both collaborative filtering and content-based filtering recommendations
hybrid_recommendations = {}
alpha = 0.5  # Weight between collaborative filtering and content-based filtering
for user_idx in range(num_users):
    combined_scores = alpha * user_recommendation_scores[user_idx] + (1 - alpha) * user_destination_similarity[user_idx]
    top_destinations = np.argsort(-combined_scores)[:top_n]
    hybrid_recommendations[user_idx] = top_destinations

In [10]:
hybrid_recommendations

{0: array([ 23,  78,  95, 149, 276]),
 1: array([124, 134, 123,  17,  13]),
 2: array([  1,  45,  33, 100,  44]),
 3: array([123, 124,  59,  45,  47]),
 4: array([ 8, 10, 45, 47, 53]),
 5: array([ 22,   8,  10, 109,  24]),
 6: array([59, 45, 76, 65, 64]),
 7: array([30, 21, 44, 63, 62]),
 8: array([ 8, 10, 88, 87, 86]),
 9: array([ 45,  47, 140,  70,  72]),
 10: array([42, 41, 40, 38, 43]),
 11: array([ 12,  13,  15, 205,  14]),
 12: array([ 76,  42,  41, 129, 289]),
 13: array([  8,  80, 140,  45,  70]),
 14: array([ 61,  11, 103, 331,  69]),
 15: array([ 63,  62, 133, 132,  80]),
 16: array([ 18,  16,  19, 128, 289]),
 17: array([10, 76, 12, 13, 15]),
 18: array([ 77,  92,  21, 125, 106]),
 19: array([ 11,   1,  12,   4, 205]),
 20: array([  9,  23,  39, 103,  95]),
 21: array([ 23,  80, 109, 133, 132]),
 22: array([ 80,  96,  97, 109,  21]),
 23: array([22, 24, 30, 56, 54]),
 24: array([10, 17, 22, 16, 79]),
 25: array([80, 28, 16, 88, 87]),
 26: array([51, 47, 21, 74, 75]),
 27: ar

In [12]:
import pandas as pd

# Assuming 'user_recommendations' contains the top 5 destination indices for each user
# and 'destination_df' contains the cleaned destination names

# Prepare a list to store the recommendation data
recommendation_data = []

# Loop through each user and their top 5 recommendations
for user_idx, recommended_destinations in user_recommendations.items():
    # Get the destination names for the recommended destination IDs
    recommended_names = destination_df.loc[recommended_destinations, 'cleaned_name'].values
    
    # Append the recommendations to the list
    recommendation_data.append({
        'User ID': user_df_cleaned.iloc[user_idx]['User ID'],
        'Rec_1': recommended_names[0] if len(recommended_names) > 0 else None,
        'Rec_2': recommended_names[1] if len(recommended_names) > 1 else None,
        'Rec_3': recommended_names[2] if len(recommended_names) > 2 else None,
        'Rec_4': recommended_names[3] if len(recommended_names) > 3 else None,
        'Rec_5': recommended_names[4] if len(recommended_names) > 4 else None
    })

# Convert the list of recommendations to a DataFrame
recommendations_df = pd.DataFrame(recommendation_data)

# Save the DataFrame to a CSV file
recommendations_df.to_csv('user_recommendations.csv', index=False)

print("Recommendations exported to 'user_recommendations.csv'.")


Recommendations exported to 'user_recommendations.csv'.


In [13]:
import pandas as pd

# Assuming 'user_recommendations' contains the top 5 destination indices for each user
# and 'destination_df' contains the cleaned destination names

# Prepare a list to store the recommendation data
recommendation_data = []

# Loop through each user and their top 5 recommendations
for user_idx, recommended_destinations in user_content_based_recommendations.items():
    # Get the destination names for the recommended destination IDs
    recommended_names = destination_df.loc[recommended_destinations, 'cleaned_name'].values
    
    # Append the recommendations to the list
    recommendation_data.append({
        'User ID': user_df_cleaned.iloc[user_idx]['User ID'],
        'Rec_1': recommended_names[0] if len(recommended_names) > 0 else None,
        'Rec_2': recommended_names[1] if len(recommended_names) > 1 else None,
        'Rec_3': recommended_names[2] if len(recommended_names) > 2 else None,
        'Rec_4': recommended_names[3] if len(recommended_names) > 3 else None,
        'Rec_5': recommended_names[4] if len(recommended_names) > 4 else None
    })

# Convert the list of recommendations to a DataFrame
recommendations_df = pd.DataFrame(recommendation_data)

# Save the DataFrame to a CSV file
recommendations_df.to_csv('user_content_based_recommendations.csv', index=False)

print("Recommendations exported to 'user_content_based_recommendations.csv'.")

Recommendations exported to 'user_content_based_recommendations.csv'.


In [14]:
import pandas as pd

# Assuming 'user_recommendations' contains the top 5 destination indices for each user
# and 'destination_df' contains the cleaned destination names

# Prepare a list to store the recommendation data
recommendation_data = []

# Loop through each user and their top 5 recommendations
for user_idx, recommended_destinations in hybrid_recommendations.items():
    # Get the destination names for the recommended destination IDs
    recommended_names = destination_df.loc[recommended_destinations, 'cleaned_name'].values
    
    # Append the recommendations to the list
    recommendation_data.append({
        'User ID': user_df_cleaned.iloc[user_idx]['User ID'],
        'Rec_1': recommended_names[0] if len(recommended_names) > 0 else None,
        'Rec_2': recommended_names[1] if len(recommended_names) > 1 else None,
        'Rec_3': recommended_names[2] if len(recommended_names) > 2 else None,
        'Rec_4': recommended_names[3] if len(recommended_names) > 3 else None,
        'Rec_5': recommended_names[4] if len(recommended_names) > 4 else None
    })

# Convert the list of recommendations to a DataFrame
recommendations_df = pd.DataFrame(recommendation_data)

# Save the DataFrame to a CSV file
recommendations_df.to_csv('hybrid_recommendations.csv', index=False)

print("Recommendations exported to 'hybrid_recommendations.csv'.")

Recommendations exported to 'hybrid_recommendations.csv'.


In [None]:
# import openai
# import numpy as np
# from tqdm import tqdm

# # Set up your OpenAI API key
# openai.api_key = 'your-openai-api-key'

# # Function to get embeddings for a list of texts (batching can improve efficiency)
# def get_openai_embeddings(text_list):
#     embeddings = []
#     for text in tqdm(text_list, desc="Fetching OpenAI Embeddings"):
#         response = openai.Embedding.create(input=text, model="text-embedding-ada-002")
#         embeddings.append(response['data'][0]['embedding'])
#     return np.array(embeddings)

# # Get embeddings for destination reviews (cleaned reviews) and user preferences
# destination_reviews = destination_df['cleaned_reviews'].tolist()
# user_preferences = user_df_cleaned['Preferred Activities'].tolist()

# destination_embeddings = get_openai_embeddings(destination_reviews)
# user_embeddings = get_openai_embeddings(user_preferences)

In [15]:
user_embeddings = np.load("user_embeddings.npy")

In [16]:
destination_embeddings = np.load('review_embeddings.npy')

In [17]:
# Combine and normalize ratings and number of reviews
destination_df['combined_rating'] = (0.7 * destination_df['rating'] + 
                                     0.3 * destination_df['user_ratings_total'] / destination_df['user_ratings_total'].max())

# Normalize the combined score
destination_df['combined_rating_normalized'] = destination_df['combined_rating'] / destination_df['combined_rating'].max()

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between user preferences and destination reviews
user_destination_similarity = cosine_similarity(user_embeddings, destination_embeddings)

# For each user, recommend the top 5 destinations based on the similarity scores
top_n = 5
user_content_based_recommendations = {}
for user_idx in range(user_embeddings.shape[0]):
    top_destinations = np.argsort(-user_destination_similarity[user_idx])[:top_n]
    user_content_based_recommendations[user_idx] = top_destinations


In [19]:
# Weights for combining different models
alpha = 0.5  # Weight for collaborative filtering
beta = 0.3   # Weight for content-based filtering
gamma = 0.2  # Weight for rating/review boost

# Assuming user_recommendation_scores is from collaborative filtering
final_recommendations = {}
for user_idx in range(user_embeddings.shape[0]):
    # Combined score
    combined_score = (alpha * user_recommendation_scores[user_idx] + 
                      beta * user_destination_similarity[user_idx] + 
                      gamma * destination_df['combined_rating_normalized'].values)
    
    # Sort by the combined score to get the top 5 destinations
    top_destinations = np.argsort(-combined_score)[:top_n]
    final_recommendations[user_idx] = top_destinations


In [21]:
# Prepare the final recommendations DataFrame
recommendation_data = []
for user_idx, recommended_destinations in final_recommendations.items():
    recommended_names = destination_df.loc[recommended_destinations, 'name'].values
    
    recommendation_data.append({
        'User ID': user_df_cleaned.iloc[user_idx]['User ID'],
        'Rec_1': recommended_names[0] if len(recommended_names) > 0 else None,
        'Rec_2': recommended_names[1] if len(recommended_names) > 1 else None,
        'Rec_3': recommended_names[2] if len(recommended_names) > 2 else None,
        'Rec_4': recommended_names[3] if len(recommended_names) > 3 else None,
        'Rec_5': recommended_names[4] if len(recommended_names) > 4 else None
    })

recommendations_df = pd.DataFrame(recommendation_data)
recommendations_df.to_csv('hybrid_user_recommendations.csv', index=False)

print("Hybrid recommendations exported to 'hybrid_user_recommendations.csv'.")


Hybrid recommendations exported to 'hybrid_user_recommendations.csv'.


In [26]:
print(recommendations_df.head(1))

   User ID         Rec_1     Rec_2        Rec_3                   Rec_4  \
0        1  Anuradhapura  Haputale  Polonnaruwa  Yapahuwa Rock Fortress   

            Rec_5  
0  Surathali Ella  


In [28]:
recommendations_df = pd.read_csv("hybrid_user_recommendations.csv")

In [29]:
# Create a mapping of user_id to their true preferences
user_true_preferences = destination_df.set_index('User ID')['Bucket list items Sri Lanka'].to_dict()

# Helper function to evaluate relevance (Precision@k)
def evaluate_precision_at_k(recommendations_df, user_true_preferences, k=5):
    precision_scores = []

    for index, row in recommendations_df.iterrows():
        user_id = row['User ID']
        recs = [row[f'Rec_{i}'] for i in range(1, k+1)]  # Top-k recommendations
        
        # Get the true preferences for the user
        true_prefs = user_true_preferences.get(user_id, [])
        true_prefs_set = set(true_prefs) if isinstance(true_prefs, list) else {true_prefs}
        
        # Calculate Precision@k (binary relevance)
        relevant_recs = [1 if rec in true_prefs_set else 0 for rec in recs]
        precision = sum(relevant_recs) / k
        precision_scores.append(precision)

    avg_precision_at_k = sum(precision_scores) / len(precision_scores)
    print(f'Average Precision@{k}: {avg_precision_at_k}')
    return avg_precision_at_k

KeyError: "None of ['User ID'] are in the columns"