In [1]:
# !pip install --upgrade pip
# !pip install torch torch-geometric networkx pandas scikit-learn

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import networkx as nx
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.image as mpimg
import random

In [3]:
# Load the user table and the destination (places) table from CSV files,
# both given as paths relative to the working directory
user_data = pd.read_csv('users.csv')
destination_data = pd.read_csv('places_final_dataset.csv')

In [4]:
import re

def clean_text(text):
    """Strip encoding artifacts and punctuation from a string or a list of strings.

    For a string: the known mis-encoded sequences and non-breaking spaces are
    each replaced by a single space, every character that is not an ASCII
    letter, a digit or whitespace is removed, and leading/trailing whitespace
    is trimmed.

    For a list: string items are cleaned as above; items of any other type
    (including nested lists) are kept unchanged.

    Any other input type is returned as-is.
    """
    if isinstance(text, list):
        # Only direct string members are cleaned; everything else passes through
        return [clean_text(item) if isinstance(item, str) else item for item in text]
    if not isinstance(text, str):
        return text

    # Sequences to blank out, applied in this order
    artifacts = (
        "Ã¢Â€Â™",
        "Ã¢Â€Âœ",   # quote artifact
        "Ã¢Â€Â�",   # quote artifact
        "Ã¢Â€Â",     # dash artifact
        "\u00A0",    # non-breaking space
    )
    for artifact in artifacts:
        text = text.replace(artifact, " ")

    # Keep only alphanumerics and whitespace, then trim both ends
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return alnum_only.strip()

In [5]:
# Derive a cleaned copy of each text column with clean_text.
# The raw columns are left untouched; results go into new 'cleaned_*' columns.
for raw_column, cleaned_column in (
    ('name', 'cleaned_name'),
    ('formatted_address', 'cleaned_address'),
    ('latest_reviews', 'cleaned_reviews'),
):
    destination_data[cleaned_column] = destination_data[raw_column].apply(clean_text)

In [6]:
# Verify the cleaned data: show each raw text column next to its cleaned
# counterpart for the first rows
destination_data[['name', 'cleaned_name', 'formatted_address', 'cleaned_address', 'latest_reviews', 'cleaned_reviews']].head()

Unnamed: 0,name,cleaned_name,formatted_address,cleaned_address,latest_reviews,cleaned_reviews
0,Arugam Bay Beach,Arugam Bay Beach,"Arugam Bay Beach, Sri Lanka",Arugam Bay Beach Sri Lanka,['Arugam Bay Beach is a surfer's paradise! I s...,Arugam Bay Beach is a surfers paradise I spent...
1,Mirissa Beach,Mirissa Beach,"Mirissa, Sri Lanka",Mirissa Sri Lanka,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...,Mirissa Beach is truly a gem on Sri Lanka s so...
2,Weligama Beach (surf and stay),Weligama Beach surf and stay,"Weligama, Sri Lanka",Weligama Sri Lanka,['Weligama Beach is a fantastic spot for both ...,Weligama Beach is a fantastic spot for both be...
3,Ahangama,Ahangama,"Ahangama, Sri Lanka",Ahangama Sri Lanka,['Ahangama was a bit disappointing for me as a...,Ahangama was a bit disappointing for me as a s...
4,Hikkaduwa Beach,Hikkaduwa Beach,"Hikkaduwa Beach, Sri Lanka",Hikkaduwa Beach Sri Lanka,['Hikkaduwa Beach is a delightful escape for s...,Hikkaduwa Beach is a delightful escape for sol...


In [7]:
# Save the cleaned destination dataset to a new CSV (row index not written).
# NOTE(review): In [29] later writes destination_data_cleaned to this same
# path, so this file is replaced when the notebook runs to the end.
destination_data.to_csv('cleaned_destination_data.csv', index=False)

In [8]:
import ast  # Safer than eval for evaluating lists

# Function to safely evaluate string lists and ignore already parsed lists
def safe_eval(val):
    """Parse a string that holds a Python literal; pass everything else through.

    Args:
        val: Any value. Only ``str`` values are handed to ``ast.literal_eval``.

    Returns:
        The parsed literal when ``val`` is a string that parses successfully.
        Otherwise ``val`` itself: non-string inputs (already-parsed lists,
        NaN, None, ...) and strings that cannot be parsed are returned
        unchanged.
    """
    if not isinstance(val, str):
        return val  # Return the value as-is if it's not a string
    try:
        return ast.literal_eval(val)  # Safer alternative to eval
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed input
        # (e.g. "{[1]: 2}" raises TypeError for the unhashable key). Fall back
        # to the original string so one bad cell cannot abort a whole .apply().
        return val

In [9]:
# (rows, columns) of the destination table; the column count includes the
# three cleaned_* columns added in In [5]
destination_data.shape

(411, 10)

In [10]:
# (rows, columns) of the user table before any column is parsed
user_data.shape

(10000, 5)

In [11]:
# Parse the 'Preferred Activities' strings with safe_eval, replacing the
# column in place; values safe_eval cannot parse are kept unchanged
user_data['Preferred Activities'] = user_data['Preferred Activities'].apply(safe_eval)

In [12]:
# Shape check after parsing: the previous cell replaced an existing column,
# so the dimensions are expected to be the same as before
user_data.shape

(10000, 5)

In [13]:
# Parse the 'Bucket list destinations Sri Lanka' strings with safe_eval,
# replacing the column in place; unparsable values are kept unchanged
user_data['Bucket list destinations Sri Lanka'] = user_data['Bucket list destinations Sri Lanka'].apply(safe_eval)

In [14]:
# Inspect the two parsed columns for the first rows
user_data[['Preferred Activities', 'Bucket list destinations Sri Lanka']].head()

Unnamed: 0,Preferred Activities,Bucket list destinations Sri Lanka
0,"[cycling, historical monuments, village homest...","[Polonnaruwa, Hatton, Anuradhapura, Ella, Hapu..."
1,"[butterfly watching, hot springs, wildlife vie...","[Madunagala Hot Water Spring, Wilpattu Nationa..."
2,"[sea cruises, themed parks, craft workshops]","[Mirissa Beach, Negombo Lagoon, Batadombalena ..."
3,"[fishing, hot springs, sailing]","[Maha Oya Hot Water Springs, Colombo Port City..."
4,"[history tours, sailing, literary tours]","[Negombo Lagoon, Colombo Port City, Galle Dutc..."


In [15]:
# Parse the 'latest_reviews' strings with safe_eval, replacing the column in place.
# NOTE(review): safe_eval returns the original string when parsing fails, so
# this column can end up holding a mix of parsed values and raw strings.
# 'cleaned_reviews' was already derived from the raw strings in In [5] and is
# not affected by this cell.
destination_data['latest_reviews'] = destination_data['latest_reviews'].apply(safe_eval)

In [16]:
# Shape check after parsing 'latest_reviews': an existing column was
# replaced, so the dimensions are expected to be unchanged
destination_data.shape

(411, 10)

In [17]:
# Inspect the first 'latest_reviews' entries after parsing; entries that
# safe_eval could not parse are still plain strings
destination_data['latest_reviews'].head()

0    ['Arugam Bay Beach is a surfer's paradise! I s...
1    [Mirissa Beach is truly a gem on Sri LankaÃ¢Â€...
2    [Weligama Beach is a fantastic spot for both b...
3    [Ahangama was a bit disappointing for me as a ...
4    ['Hikkaduwa Beach is a delightful escape for s...
Name: latest_reviews, dtype: object

In [18]:
# Find duplicates based on the 'name' column.
# keep=False flags every row whose name occurs more than once (not just the
# repeats), so all copies can be inspected together.
# NOTE(review): `duplicates` is not displayed or used in the cells visible here.
duplicates = destination_data[destination_data.duplicated(subset=['name'], keep=False)]

In [19]:
# Remove duplicates by exact 'name' match, keeping the first occurrence.
# The result is a new frame; destination_data itself is not modified.
destination_data_cleaned = destination_data.drop_duplicates(subset=['name'], keep='first')

In [20]:
# Reset the index after removing duplicates so rows are numbered 0..n-1 again;
# drop=True discards the old index instead of keeping it as a column.
# The positions assigned here are what In [23] later stores as destination indices.
destination_data_cleaned = destination_data_cleaned.reset_index(drop=True)
destination_data_cleaned

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews,cleaned_name,cleaned_address,cleaned_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...,Arugam Bay Beach,Arugam Bay Beach Sri Lanka,Arugam Bay Beach is a surfers paradise I spent...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,[Mirissa Beach is truly a gem on Sri LankaÃ¢Â€...,Mirissa Beach,Mirissa Sri Lanka,Mirissa Beach is truly a gem on Sri Lanka s so...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,[Weligama Beach is a fantastic spot for both b...,Weligama Beach surf and stay,Weligama Sri Lanka,Weligama Beach is a fantastic spot for both be...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,[Ahangama was a bit disappointing for me as a ...,Ahangama,Ahangama Sri Lanka,Ahangama was a bit disappointing for me as a s...
4,Hikkaduwa Beach,6.137727,80.099060,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...,Hikkaduwa Beach,Hikkaduwa Beach Sri Lanka,Hikkaduwa Beach is a delightful escape for sol...
...,...,...,...,...,...,...,...,...,...,...
393,Nilaveli Beach,8.700307,81.192050,"Nilaveli Beach, Sri Lanka",4.5,1247.0,[Nilaveli Beach is a beautiful escape with stu...,Nilaveli Beach,Nilaveli Beach Sri Lanka,Nilaveli Beach is a beautiful escape with stun...
394,Uppuveli Beach,8.607956,81.220013,"Trincomalee, Sri Lanka",4.3,399.0,[Uppuveli Beach is a stunning escape! The soft...,Uppuveli Beach,Trincomalee Sri Lanka,Uppuveli Beach is a stunning escape The soft s...
395,Koggala Beach,5.992272,80.310691,"Koggala Beach, Sri Lanka",4.3,353.0,[Koggala Beach is a hidden gem! The soft sand ...,Koggala Beach,Koggala Beach Sri Lanka,Koggala Beach is a hidden gem The soft sand an...
396,Marakolliya Beach,6.042222,80.823073,"Kapuhenwala Road, Sri Lanka",4.3,180.0,['Marakolliya Beach is a hidden gem! The waves...,Marakolliya Beach,Kapuhenwala Road Sri Lanka,Marakolliya Beach is a hidden gem The waves we...


In [21]:
# Function to build a normalized lookup key for a destination name
def clean_name(name):
    """Return a case- and word-order-insensitive key for a destination name.

    Characters other than ASCII letters, digits and whitespace are removed,
    the text is trimmed and lower-cased, and its whitespace-separated words
    are sorted and re-joined with single spaces. For example,
    "Mirissa Beach" becomes "beach mirissa".

    `name` must be a string.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', name)
    words = alnum_only.strip().lower().split()
    words.sort()
    return ' '.join(words)

In [22]:
# Clean the destination names in the destination dataset.
# NOTE: this overwrites the 'cleaned_name' column created with clean_text in
# In [5]; from here on it holds the clean_name key (lower-cased, words sorted).
destination_data_cleaned['cleaned_name'] = destination_data_cleaned['name'].apply(clean_name)

In [23]:
# Create a mapping of cleaned destination name to its row position in
# destination_data_cleaned.
# NOTE: if two different names normalize to the same clean_name key, the
# later row's position overwrites the earlier one.
dest_name_to_index = {clean_name(name): idx for idx, name in enumerate(destination_data_cleaned['name'])}

In [24]:
# Function to map user's bucket list destinations to indices
def map_destinations_to_indices(destinations):
    """Translate a user's bucket-list destinations into destination indices.

    Args:
        destinations: Either a list of destination names, or a single string
            of comma-separated names. A missing value (None or a float such
            as NaN, which is how pandas represents an empty CSV cell) is
            treated as an empty bucket list.

    Returns:
        A list with the ``dest_name_to_index`` value of every destination
        whose ``clean_name`` key is present in that mapping, in input order.
        Destinations that are not found are silently skipped.
    """
    if destinations is None or isinstance(destinations, float):
        # Missing cell: nothing to map (iterating a float would raise TypeError)
        return []

    if isinstance(destinations, str):
        # Split the destinations (assuming comma-separated list)
        dest_list = [dest.strip() for dest in destinations.split(',')]
    else:
        dest_list = destinations

    indices = []
    for dest in dest_list:
        key = clean_name(dest)  # normalize once per destination
        if key in dest_name_to_index:
            indices.append(dest_name_to_index[key])
    return indices

In [25]:
# Apply the mapping function to the 'Bucket list destinations Sri Lanka' column.
# The result is stored in a new column; names with no match in
# dest_name_to_index are dropped, so a mapped list can be shorter than its source.
user_data['Bucket list destinations mapped'] = user_data['Bucket list destinations Sri Lanka'].apply(map_destinations_to_indices)

In [26]:
# Display-only view of the bucket-list columns: the result of drop() is not
# assigned, so user_data itself still contains all of its columns afterwards
user_data.drop(['Name', 'Email', 'Preferred Activities', 'User ID'], axis=1)

Unnamed: 0,Bucket list destinations Sri Lanka,Bucket list destinations mapped
0,"[Polonnaruwa, Hatton, Anuradhapura, Ella, Hapu...","[95, 23, 78]"
1,"[Madunagala Hot Water Spring, Wilpattu Nationa...","[124, 13, 14, 134, 17]"
2,"[Mirissa Beach, Negombo Lagoon, Batadombalena ...","[1, 45, 33]"
3,"[Maha Oya Hot Water Springs, Colombo Port City...","[123, 47, 45, 44, 59]"
4,"[Negombo Lagoon, Colombo Port City, Galle Dutc...","[45, 47, 8, 10, 53]"
...,...,...
9995,"[Ahungalla, Bolgoda Lake, Unawatuna Beach, Col...","[75, 73, 6, 47]"
9996,"[Kalpitiya, Hikkaduwa Coral Sanctuary, Trincom...","[59, 61, 60, 57]"
9997,"[Hikkaduwa Coral Sanctuary, Ella, Pigeon Islan...","[7, 45, 47]"
9998,"[Ella, Hatton, Negambo, Colombo Port City, Lei...","[47, 89]"


In [27]:
# Inspect the deduplicated destination table; 'cleaned_name' now shows the
# normalized clean_name keys written in In [22]
destination_data_cleaned

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews,cleaned_name,cleaned_address,cleaned_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...,arugam bay beach,Arugam Bay Beach Sri Lanka,Arugam Bay Beach is a surfers paradise I spent...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,[Mirissa Beach is truly a gem on Sri LankaÃ¢Â€...,beach mirissa,Mirissa Sri Lanka,Mirissa Beach is truly a gem on Sri Lanka s so...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,[Weligama Beach is a fantastic spot for both b...,and beach stay surf weligama,Weligama Sri Lanka,Weligama Beach is a fantastic spot for both be...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,[Ahangama was a bit disappointing for me as a ...,ahangama,Ahangama Sri Lanka,Ahangama was a bit disappointing for me as a s...
4,Hikkaduwa Beach,6.137727,80.099060,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...,beach hikkaduwa,Hikkaduwa Beach Sri Lanka,Hikkaduwa Beach is a delightful escape for sol...
...,...,...,...,...,...,...,...,...,...,...
393,Nilaveli Beach,8.700307,81.192050,"Nilaveli Beach, Sri Lanka",4.5,1247.0,[Nilaveli Beach is a beautiful escape with stu...,beach nilaveli,Nilaveli Beach Sri Lanka,Nilaveli Beach is a beautiful escape with stun...
394,Uppuveli Beach,8.607956,81.220013,"Trincomalee, Sri Lanka",4.3,399.0,[Uppuveli Beach is a stunning escape! The soft...,beach uppuveli,Trincomalee Sri Lanka,Uppuveli Beach is a stunning escape The soft s...
395,Koggala Beach,5.992272,80.310691,"Koggala Beach, Sri Lanka",4.3,353.0,[Koggala Beach is a hidden gem! The soft sand ...,beach koggala,Koggala Beach Sri Lanka,Koggala Beach is a hidden gem The soft sand an...
396,Marakolliya Beach,6.042222,80.823073,"Kapuhenwala Road, Sri Lanka",4.3,180.0,['Marakolliya Beach is a hidden gem! The waves...,beach marakolliya,Kapuhenwala Road Sri Lanka,Marakolliya Beach is a hidden gem The waves we...


In [28]:
# Write the user table, including the new 'Bucket list destinations mapped'
# column, to CSV (row index not written).
# NOTE(review): the drop() in In [26] was not assigned, so 'Name' and 'Email'
# are still part of user_data here — confirm they are meant to be exported.
user_data.to_csv('cleaned_user_data.csv', index=False)

In [29]:
# Write the deduplicated destination table to CSV (row index not written).
# This uses the same path as the save in In [7] and therefore replaces that file.
# Row order matches the positions stored in dest_name_to_index.
destination_data_cleaned.to_csv('cleaned_destination_data.csv', index=False)