In [119]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

In [120]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [121]:
places = pd.read_csv("/content/drive/MyDrive/Colab Projects/TechTriathlon Recommender/data/visitors.csv")

In [122]:
visitors_original = pd.read_excel("/content/drive/MyDrive/Colab Projects/TechTriathlon Recommender/data/Visitors Preference Dataset.xlsx")
visitors_original.head(10)

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"['cycling', 'historical monuments', 'village h...","['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell..."
1,2,Emily Perry,emily.perry@example.com,"['butterfly watching', 'hot springs', 'wildlif...","['Madunagala Hot Water Spring', 'Wilpattu Nati..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"['sea cruises', 'themed parks', 'craft worksho...","['Mirissa Beach', 'Negombo Lagoon', 'Batadomba..."
3,4,Angelica Wilson,angelica.wilson@example.com,"['fishing', 'hot springs', 'sailing']","['Maha Oya Hot Water Springs', 'Colombo Port C..."
4,5,Laurie Powers,laurie.powers@example.com,"['history tours', 'sailing', 'literary tours']","['Negombo Lagoon', 'Colombo Port City', 'Galle..."
5,6,Michelle Anderson,michelle.anderson@example.com,"['public art installations', 'temple pilgrimag...","['Colombo', 'Sigiriya', 'Mihintale', 'Galle Du..."
6,7,Louis Ramsey,louis.ramsey@example.com,"['fishing', 'golfing', 'historical monuments']","['Hikkaduwa', 'Kalpitiya', 'Polonnaruwa', 'Neg..."
7,8,Dominique Hammond,dominique.hammond@example.com,"['sailing', 'hot air ballooning', 'spiritual r...","['Trincomalee Harbour', 'Kandalama', ""Sri Pada..."
8,9,Tara Reilly,tara.reilly@example.com,"['cultural experiences', 'botanical gardens', ...","['Seethawaka Wet Zone Botanical Gardens', 'Sig..."
9,10,Stacy Anderson MD,stacy.md@example.com,"['boat safaris', 'sailing', 'caving']","['Batatotalena (Batadombalena) Cave', 'Colombo..."


In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class DestinationSearch():
  """
  Used to find the closest match to destination in bucket list column
  to the destinations in the destination dataset
  """
  def __init__(self, destination_list):
    self.tfidf_vectorizer = TfidfVectorizer()
    self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(destination_list)

  def find_similar(self, search_term):
    """
    Args:
      search_term (string) : The term to find similar phrase for

    Returns:
      most_similar_document_index (int) : The index of the most similar item from the list
      similarity (float) : The cosime similarity between the most similar and search_term
    """
    search_vector = self.tfidf_vectorizer.transform([search_term])
    # Compute cosine similarity between the search vector and the corpus TF-IDF matrix
    cosine_similarities = cosine_similarity(search_vector, self.tfidf_matrix)
    most_similar_document_index = np.argmax(cosine_similarities)
    similarity = np.max(cosine_similarities)

    # print("Similarity: ", similarity)
    # print("Search string: ", search_term)
    # print("Result: ", name)

    return most_similar_document_index, similarity

In [124]:
class CollaborativeFiltering():
  def __init__(self, dataset, debug=False):
    """
    Args:
      dataset (pd.Dataframe) : Contains columns 'Preferred Activities' and
                              'Bucket list destinations Sri Lanka'
    """

    self.debug = debug # Set to False by default, used only to debug the class
    self.activity_list = self.get_activity_list(dataset["Preferred Activities"])
    self.destination_list = self.get_destination_list(dataset["Bucket list destinations Sri Lanka"])


    self.user_count = len(dataset)
    self.activity_count = len(self.activity_list)
    self.destination_count = len(self.destination_list)


    self.activity_index = {activity: idx for idx, activity in enumerate(self.activity_list)}
    self.user_activity_matrix = self.get_activity_matrix(dataset["Preferred Activities"])

    self.destination_index = {place: idx for idx, place in enumerate(self.destination_list)}
    self.user_destination_matrix = self.get_destination_matrix(dataset["Bucket list destinations Sri Lanka"])




  def get_activity_list(self, activity_df):
    """
    Returns the list of activities
    """
    activity_set = set()
    for activities in activity_df:
      for activity in activities.strip('[').strip(']').split("', '"):
        activity_set.add(activity.strip("'"))

    if self.debug:
      print("Printing from get_activity_list()")
      print(activity_set)
      print("Number of activities is ", len(activity_set))
    activity_list = sorted(activity_set)

    return activity_list


  def get_destination_list(self, bucket_list_df):
    """
    Returns the set of destinations appearing in bucket lists
    """
    bucket_list_set = set()

    for destinations in bucket_list_df:
      for destination in eval(destinations):
        bucket_list_set.add(destination)

    bucket_list = sorted(bucket_list_set)
    if self.debug:
      print("Printing from get_destination_list()")
      print("Number of unique places in the bucket list: ", len(bucket_list_set))
      print(bucket_list_set)
      print(bucket_list)

    return bucket_list


  def get_destination_matrix(self, bucket_list_df):
    user_place_matrix = np.zeros((self.user_count, self.destination_count))

    # Fill the matrix
    for user_id, places in enumerate(bucket_list_df):
        for destination in eval(places): # Places are given in the form ['place1', 'place2']
            if destination in self.destination_index:
                place_id = self.destination_index[destination]
                user_place_matrix[user_id, place_id] = 1

    # Create DataFrame for easier manipulation
    df_user_place_matrix = pd.DataFrame(user_place_matrix)

    return df_user_place_matrix



  def get_activity_matrix(self, activity_df):

    user_activity_matrix = np.zeros((self.user_count, self.activity_count))
    # Fill the matrix
    for user_id, activities in enumerate(visitors_original["Preferred Activities"].iloc[:9990]):
        for activity in eval(activities): # Places are given in the form ['place1', 'place2']
            if activity in self.activity_index:
                activity_id = self.activity_index[activity]
                user_activity_matrix[user_id, activity_id] = 1

    # Create DataFrame for easier manipulation
    df_user_activity_matrix = pd.DataFrame(user_activity_matrix)

    return df_user_activity_matrix



  def activity_based_collaborative_filtering(self, new_user_activities: list,
                                           top_n=5,
                                           min_similarity=0.2):
      # Convert new user purchases to a vector
      new_user_vector = np.zeros(self.user_activity_matrix.shape[1])
      for activity in new_user_activities:
          if activity in self.activity_index:
              new_user_vector[self.activity_index[activity]] = 1

      # Calculate similarities between the new user and all existing users
      user_similarity = cosine_similarity([new_user_vector], self.user_activity_matrix)

      # Compute weighted scores
      weighted_scores = np.dot(user_similarity, self.user_destination_matrix)
      sum_of_weights = np.sum(user_similarity)

      if sum_of_weights == 0:
          return "No recommendations available."

      # Calculate average score for each activity
      recommendations = weighted_scores.flatten() / sum_of_weights


      # Get indices of top N items
      # recommended_places_indices = np.argsort(recommendations)[::-1][:top_n]
      # Filter recommendations to keep only those with a
      # score higher than minimum similarity (default is set to 0.2)
      filtered_indices = np.where(recommendations > min_similarity)[0]

      # Sort the filtered indices based on the recommendation scores in descending order
      sorted_filtered_indices = filtered_indices[np.argsort(recommendations[filtered_indices])[::-1]]

      # Select the top_n items from the sorted filtered indices
      if top_n <= len(filtered_indices):
        top_indices = sorted_filtered_indices
      else:
        top_indices = sorted_filtered_indices[:top_n]
      recommended_places = [self.destination_list[index] for index in top_indices]

      return recommended_places



  def bucket_list_based_collaborative_filtering(self, new_user_places: list, top_n=5):
    ###################
    ### NOTE
    ### Simply looking at the recommendations we can see that this is not as effective
    ###################


    # Convert new user purchases to a vector
    new_user_vector = np.zeros(self.user_destination_matrix.shape[1])
    for item in new_user_places:
        if item in self.destination_index:
            new_user_vector[self.destination_index[item]] = 1

    # Calculate similarities between the new user and all existing users
    user_similarity = cosine_similarity([new_user_vector], self.user_destination_matrix)

    # Compute weighted scores
    weighted_scores = np.dot(user_similarity, self.user_destination_matrix)
    sum_of_weights = np.sum(user_similarity)

    if sum_of_weights == 0:
        return "No recommendations available."

    # Calculate average score for each item
    recommendations = weighted_scores.flatten() / sum_of_weights

    # Get indices of top N items
    recommended_places_indices = np.argsort(recommendations)[::-1][:top_n]
    if self.debug:
      print(recommended_places_indices, len(places))
    recommneded_places = [self.destination_list[index] for index in recommended_places_indices]

    return recommneded_places

In [125]:
!pip install sentence-transformers



In [126]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-mpnet-base-v2")

activities = CFModel.get_activity_list(visitors_original["Preferred Activities"])


embeddings = dict()

for activity in activities:
  embeddings[activity] = model.encode(activity)



In [127]:
import pickle

# with open('/content/drive/MyDrive/Colab Projects/TechTriathlon Recommender/data/saved_dictionary.pkl', 'wb') as f:
#     pickle.dump(embeddings, f)

# with open('/content/drive/MyDrive/Colab Projects/TechTriathlon Recommender/data/saved_dictionary.pkl', 'rb') as f:
#     loaded_dict = pickle.load(f)

In [131]:
import pickle
import re

review_embeddings = []


try:
  with open('/content/drive/MyDrive/Colab Projects/TechTriathlon Recommender/data/review_embeddings.pkl', 'rb') as f:
      review_embeddings = pickle.load(f)


except:
  pattern = r'\'\s*,\s*\'|\"\s*,\s*\"'

  for review_set in places["latest_reviews"].unique():
    reviews = ""
    for review in re.split(pattern, review_set):
      print(re.sub("Ã¢Â€Â™", "'", review.strip("[").strip("]").strip('"').strip("'")))
      reviews += " " + re.sub("Ã¢Â€Â™", "'", review.strip("[").strip("]").strip('"').strip("'"))
    review_embeddings.append(model.encode(reviews))

In [132]:
print(len(review_embeddings))

341


In [None]:
# with open('/content/drive/MyDrive/Colab Projects/TechTriathlon Recommender/data/review_embeddings.pkl', 'wb') as f:
#     pickle.dump(review_embeddings, f)

# with open('/content/drive/MyDrive/Colab Projects/TechTriathlon Recommender/data/review_embeddings.pkl', 'rb') as f:
#     loaded_dict = pickle.load(f)

In [133]:
# with open('/content/drive/MyDrive/Colab Projects/TechTriathlon Recommender/data/review_embeddings.pkl', 'rb') as f:
#       review_embeddings = pickle.load(f)

In [134]:
review_embeddings_map = {}
for index, place in enumerate(places["name"].unique()):
  review_embeddings_map[place] = review_embeddings[index]

In [135]:
def cosine_similarity_custom(vec1, vec2):
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

In [136]:
CFModel = CollaborativeFiltering(visitors_original)


# CHANGE THIS
userid = 9991


new_user = visitors_original.iloc[userid]
new_user_activities = eval(new_user["Preferred Activities"]) # New user's purchases
new_user_bucket_list = eval(new_user["Bucket list destinations Sri Lanka"])



places_activity_collaborative = CFModel.activity_based_collaborative_filtering(new_user_activities)
print("User ID: ", userid)
print("Activity List: ", new_user_activities)
print("Recommended places for the new user based on activity:", set(places_activity_collaborative) - set(new_user_bucket_list))
print("All places recommended: ", places_activity_collaborative)
print("Bucket Listed Places: ", new_user_bucket_list)



recommended_places = CFModel.bucket_list_based_collaborative_filtering(new_user_bucket_list)
print("Recommended items for the new user based on bucket_list:", set(recommended_places) - set(new_user_bucket_list))
print(recommended_places)

User ID:  9991
Activity List:  ['botanical gardens', 'elephant rides', 'cultural festivals']
Recommended places for the new user based on activity: {'Kandy', 'Nallur Kandaswamy Devasthanam'}
All places recommended:  ['Kandy', 'Nallur Kandaswamy Devasthanam']
Bucket Listed Places:  ['Pinnawala', 'Hakgala Botanical Garden', 'Udawalawe', 'Seethawaka Wet Zone Botanical Gardens', 'Dry Zone Botanic Gardens, Hambantota']
Recommended items for the new user based on bucket_list: set()
['Seethawaka Wet Zone Botanical Gardens', 'Hakgala Botanical Garden', 'Udawalawe', 'Pinnawala', 'Dry Zone Botanic Gardens, Hambantota']


In [137]:
# Metric for evaluation will be Precision@5 considering the bucket list items

# CHANGE THIS
userid = 9994


new_user = visitors_original.iloc[userid]
new_user_activities = eval(new_user["Preferred Activities"]) # New user's purchases
new_user_bucket_list = eval(new_user["Bucket list destinations Sri Lanka"])


places_activity_collaborative = CFModel.activity_based_collaborative_filtering(new_user_activities, top_n=5)
# places_bucket_list_collaborative = CFModel.bucket_list_based_collaborative_filtering(new_user_bucket_list)

In [138]:
def similar_score(bucket_list, recommendations):
  """
  Based on proximity of embeddings in bucket list with recommendations

  """
  similarity_aggregate = 0
  count = 0
  all_places = places["name"].unique()
  Searcher = DestinationSearch(all_places)
  for recommendation in recommendations:
    for place in bucket_list:
      if recommendation not in all_places:
        index, similarity = Searcher.find_similar(recommendation)
        recommendation = all_places[index]

      if place not in all_places:
        index, similarity = Searcher.find_similar(recommendation)
        place = all_places[index]

      similarity_aggregate += cosine_similarity_custom(review_embeddings_map[recommendation], review_embeddings_map[place])
      count += 1

  return similarity_aggregate / count   #Average similarity score, higher the better


similar_score(new_user_bucket_list, places_activity_collaborative)

0.8908505886793137

In [139]:
### Precision@5
def get_precision(bucket_list, recommendations):
  precision = 1 - 0.2*len(set(bucket_list).difference(set(recommendations)))
  return round(precision,1)

get_precision(new_user_bucket_list, places_activity_collaborative)

0.2