# Final Model

## Package installs

In [2]:
!pip install sentence_transformers, pandas, numpy, sklearn

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.1.0


## Package Imports

In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from sklearn.cluster import DBSCAN
from itertools import combinations
import ast
from typing import List, Dict, Set

  from tqdm.autonotebook import tqdm, trange


## Data Imports

### Places

In [8]:
places_final = pd.read_csv('final_places_dataset.csv')
places_final.columns = ['name', 'lat', 'lng', 'address', 'rating', 'total_ratings', 'acts', 'reviews']
places_final['acts'] = places_final['acts'].astype(str).apply(lambda x: [item.strip() for item in x.strip('[]').split(',')])
places_final = places_final.drop('reviews', axis=1)
places_final.head()

Unnamed: 0,name,lat,lng,address,rating,total_ratings,acts
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,"[surfing, beachfront dining, kayaking, snorkel..."
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,"[beach visits, beachfront dining, surfing, sno..."
2,Weligama Beach surf and stay,5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,"[surfing, scuba diving, snorkeling, beach visi..."
3,Ahangama,5.973975,80.36216,"Ahangama, Sri Lanka",2.9,,"[surfing, beach visits, snorkeling, paddleboar..."
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,"[surfing, snorkeling, turtle watching, beach v..."


### Visitors

In [9]:
users_final = pd.read_csv('final_visitors_dataset.csv')
users_final.columns = ['uid', 'name', 'email', 'acts', 'bucket']
users_final['acts'] = users_final['acts'].apply(ast.literal_eval)
users_final['bucket'] = users_final['bucket'].apply(ast.literal_eval)
users_final.head()

Unnamed: 0,uid,name,email,acts,bucket
0,1,Jennifer Quinn,jennifer.quinn@example.com,"[cycling, historical monuments, village homest...","[Polonnaruwa, Hatale Mini World’s End, Anuradh..."
1,2,Emily Perry,emily.perry@example.com,"[butterfly watching, hot springs, wildlife vie...","[Madunagala Hot Water Spring, Wilpattu Nationa..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"[sea cruises, themed parks, craft workshops]","[Mirissa Beach, Negombo Lagoon, Batadombalena,..."
3,4,Angelica Wilson,angelica.wilson@example.com,"[fishing, hot springs, sailing]","[Maha Oya Hot Water Springs, Port City Colombo..."
4,5,Laurie Powers,laurie.powers@example.com,"[history tours, sailing, literary tours]","[Negombo Lagoon, Port City Colombo, Galle Dutc..."


## Model

### Clean initialization

In [4]:
users = None
places = None

### Data format

*   users ( uid, name, email, acts, bucket )
*   places ( name, lat, lng, address, ratings, total_ratings, acts )


### Helper functions

In [43]:
# Preprocess the data
def preprocess_text(text):
    return ' '.join(str(text).lower().split())

# Cluster places
def cluster_places(df, eps=10, min_samples=2):
    coords = df[['lat', 'lng']].values
    kms_per_radian = 6371.0088
    epsilon = eps / kms_per_radian
    db = DBSCAN(eps=epsilon, min_samples=min_samples, metric='haversine').fit(np.radians(coords))
    df['cluster'] = db.labels_
    return df


# Normalize ratings
def normalize_ratings(df):
    min_rating = df['rating'].min()
    max_rating = df['rating'].max()
    df['normalized_rating'] = (df['rating'] - min_rating) / (max_rating - min_rating)
    return df


# Calculate similarity score
def calculate_similarity(user_activities, place_embeddings, model):
    user_embedding = model.encode([' '.join(user_activities)])
    similarities = util.pytorch_cos_sim(user_embedding, place_embeddings)[0].numpy()
    return similarities

# Optimize recommendations with balanced activity coverage and cluster diversity
def optimize_recommendations(candidate_places, user_activities, places, users, top_n=5):
    def objective_function(subset):
        total_score = sum(place['score'] for place in subset)
        cluster_diversity = len(set(place['cluster'] for place in subset))
        # Calculate the total number of unique activities in the subset
        unique_activities = set()  # Use a set to track unique activities
        for place in subset:
            unique_activities.update(place['acts'])  # Add activities to the set
        # Number of unique activities in the subset
        activity_diversity = len(unique_activities)
        return total_score * cluster_diversity * activity_diversity

    best_subset = []
    best_score = float('-inf')

    for combo in combinations(candidate_places, top_n):
        if len(set(place['cluster'] for place in combo)) == len(combo):  # Ensure all places are from different clusters
            score = objective_function(combo)
            if score > best_score:
                best_score = score
                best_subset = combo
    return list(best_subset)

# Recommend places with balanced activity coverage and cluster diversity
def recommend_places(user_activities: List[str], place_embeddings, model, places, users, num_similar=50, top_n=5):
    similarities = calculate_similarity(user_activities, place_embeddings, model)
    similar_indices = similarities.argsort()[::-1][:num_similar]
    similar_places = places.iloc[similar_indices].copy()
    similar_places['similarity'] = similarities[similar_indices]

    # Calculate score using similarity and normalized rating
    similar_places['score'] = similar_places['similarity'] * 0.5 + similar_places['normalized_rating'] * 0.5

    candidate_places = similar_places.sort_values('score', ascending=False).head(num_similar).to_dict('records')

    optimized_recommendations = optimize_recommendations(candidate_places, user_activities, places, users, top_n)

    return optimized_recommendations


### Model intializing function

In [42]:
def predrive(places=places_final.copy(), users=users_final.copy()):
    # Load the SentenceTransformer model
    model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

    places['temp_acts'] = places['acts'].apply(lambda x: ' '.join(x).lower())
    places['text_for_embedding'] = places['name'] + ' ' + ' '.join(places['temp_acts'])
    places.drop('temp_acts', axis=1)

    # Generate embeddings for places
    place_embeddings = model.encode(places['text_for_embedding'].tolist())

    places = cluster_places(places)
    places = normalize_ratings(places)
    return places, users, model, place_embeddings

### Model Initialization

In [41]:
places, users, model, place_embeddings = predrive()



### Driver function

In [40]:
def driver(user_id, places, users, model, place_embeddings):
    user = users[users['uid'] == user_id].iloc[0]
    user_id = user['uid']
    name = user['name']
    preferred_activities: List[str] = user['acts']
    max_activities_per_place = places['acts'].apply(len).max()
    max_activities = max_activities_per_place * 5   # since 5 places

    print(f"Recommendations for {name} (ID: {user_id}):")
    print(f"Preferred activities: {', '.join(preferred_activities)}")

    recommendations = recommend_places(preferred_activities, place_embeddings, model, places, users, num_similar=50, top_n=5)

    print("Top 5 recommended places:")
    covered_activities = set()
    total_place_activities = set()
    total_similarity_score = 0.0
    total_rating = 0
    user_activities_set = set(map(lambda x: x.lower().strip(), preferred_activities))

    for place in recommendations:
        place_activities_set = set(place['acts']) # <- mine
        total_place_activities.update(place_activities_set)
        total_similarity_score += place['similarity']
        rating = place['rating']
        total_rating += rating

        print(f" {place['name']} (Lat: {place['lat']:.4f}, Long: {place['lng']:.4f})")
        print(f"  Cluster: {place['cluster']}")
        print(f"  Similarity score: {place['similarity']:.2f}")
        print(f"  Rating: {place['rating']:.2f}")
        print(f"  Activities: {place['acts']}")
        print()  # Add an empty line between places for better readability

    diversity_score = len(total_place_activities)/len(user_activities_set)
    print(f"Diversity score: {diversity_score:.2f}")

    avg_similarity = total_similarity_score / len(recommendations)
    print(f"Average similarity score: {avg_similarity:.2f}")

    avg_rating = total_rating/len(recommendations)
    print(f"Average rating: {avg_rating:.2f}")
    scaled_rating = avg_rating / 5

    print("\n" + "="*50 + "\n")


### Final Recommend Function

In [44]:
def recommend(user_id, places=places, users=users, model=model, place_embeddings=place_embeddings):
  driver(user_id, places, users, model, place_embeddings)

### Recommendation in Action
Please enter a valid user_id (in **int** form) to the input below

In [45]:
recommend(int(input("User ID: ")))

User ID: 5
Recommendations for Laurie Powers (ID: 5):
Preferred activities: history tours, sailing, literary tours
Top 5 recommended places:
 Chariot Path (Lat: 7.0844, Long: 80.7239)
  Cluster: 9
  Similarity score: 0.49
  Rating: 5.00
  Activities: ['hiking', 'wildlife viewing', 'bird watching', 'photography', 'landscape photography', 'historic walks', 'mountain biking']

 Tour With Dash (Lat: 6.0431, Long: 80.2183)
  Cluster: 0
  Similarity score: 0.49
  Rating: 4.80
  Activities: ['city tours', 'historical monuments', 'historic sites', 'historic walks', 'cultural experiences', 'temple pilgrimages', 'village homestays', 'tea tasting']

 Lionel Wendt Art Centre (Lat: 6.9069, Long: 79.8606)
  Cluster: 18
  Similarity score: 0.49
  Rating: 4.70
  Activities: ['arts and culture', 'art classes', 'literary tours', 'museum visits', 'theater']

 Parappudva Island Temple (Lat: 6.1116, Long: 80.1383)
  Cluster: 1
  Similarity score: 0.47
  Rating: 4.50
  Activities: ['spiritual retreats', 'te