In [1]:
# !pip install --upgrade pip
# !pip install torch torch-geometric networkx pandas scikit-learn

In [3]:
import pandas as pd
import networkx as nx
import torch
from torch_geometric.data import Data 
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load user data
user_data = pd.read_csv('users.csv')
destination_data = pd.read_csv('places_final_dataset.csv')

In [5]:
user_data

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"['cycling', 'historical monuments', 'village h...","['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell..."
1,2,Emily Perry,emily.perry@example.com,"['butterfly watching', 'hot springs', 'wildlif...","['Madunagala Hot Water Spring', 'Wilpattu Nati..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"['sea cruises', 'themed parks', 'craft worksho...","['Mirissa Beach', 'Negombo Lagoon', 'Batadomba..."
3,4,Angelica Wilson,angelica.wilson@example.com,"['fishing', 'hot springs', 'sailing']","['Maha Oya Hot Water Springs', 'Colombo Port C..."
4,5,Laurie Powers,laurie.powers@example.com,"['history tours', 'sailing', 'literary tours']","['Negombo Lagoon', 'Colombo Port City', 'Galle..."
...,...,...,...,...,...
9995,9996,Jonathan Hernandez,jonathan.hernandez@example.com,"['paddleboarding', 'river cruises', 'kayaking']","['Ahungalla', 'Bolgoda Lake', 'Unawatuna Beach..."
9996,9997,Cody Gallegos,cody.gallegos@example.com,"['theater', 'scuba diving', 'yoga retreats']","['Kalpitiya', 'Hikkaduwa Coral Sanctuary', 'Tr..."
9997,9998,Amy House,amy.house@example.com,"['sea cruises', 'zip-lining', 'outdoor adventu...","['Hikkaduwa Coral Sanctuary', 'Ella', 'Pigeon ..."
9998,9999,Leslie Aguilar,leslie.aguilar@example.com,"['cycling', 'amusement parks', 'paddleboarding']","['Ella', 'Hatton', 'Negambo', 'Colombo Port Ci..."


In [6]:
destination_data

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,6.137727,80.099060,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...
...,...,...,...,...,...,...,...
406,Uppuveli Beach,8.607956,81.220013,"Trincomalee, Sri Lanka",4.3,399.0,['Uppuveli Beach is a stunning escape! The sof...
407,Koggala Beach,5.992272,80.310691,"Koggala Beach, Sri Lanka",4.3,353.0,['Koggala Beach is a hidden gem! The soft sand...
408,Marakolliya Beach,6.042222,80.823073,"Kapuhenwala Road, Sri Lanka",4.3,180.0,['Marakolliya Beach is a hidden gem! The waves...
409,Pasikuda Beach,7.929994,81.561185,"Pasikuda Beach, Sri Lanka",4.4,1142.0,['Pasikuda Beach is a hidden gem! The pristine...


In [7]:
# Function to clean text by removing unwanted characters
def clean_text(text):
    if isinstance(text, str):
        # Replace typical encoding artifacts
        text = text.replace("Ã¢Â€Â™", "'")  # Example replacement for a common encoding issue
        text = text.replace("Ã¢Â€Âœ", '"').replace("Ã¢Â€Â�", '"')  # Handle quotes
        text = text.replace("Ã¢Â€Â", "-")  # Handle dashes
        text = text.replace("\u00A0", " ")  # Replace non-breaking space with regular space
        # Remove any remaining non-ASCII characters
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        return text.strip()  # Remove leading/trailing whitespace
    elif isinstance(text, list):
        return [clean_text(t) for t in text]  # Apply recursively for lists
    return text

In [8]:
import re

# Apply the cleaning function to the relevant columns
destination_data['name'] = destination_data['name'].apply(clean_text)
destination_data['formatted_address'] = destination_data['formatted_address'].apply(clean_text)
destination_data['latest_reviews'] = destination_data['latest_reviews'].apply(clean_text)

In [9]:
# Verify the cleaned data
destination_data[['name', 'formatted_address', 'latest_reviews']].head()

Unnamed: 0,name,formatted_address,latest_reviews
0,Arugam Bay Beach,"Arugam Bay Beach, Sri Lanka",['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,"Mirissa, Sri Lanka",['Mirissa Beach is truly a gem on Sri Lanka's ...
2,Weligama Beach (surf and stay),"Weligama, Sri Lanka",['Weligama Beach is a fantastic spot for both ...
3,Ahangama,"Ahangama, Sri Lanka",['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,"Hikkaduwa Beach, Sri Lanka",['Hikkaduwa Beach is a delightful escape for s...


In [10]:
# Save the cleaned destination dataset to a new CSV
destination_data.to_csv('cleaned_destination_data.csv', index=False)

In [11]:
import ast  # Safer than eval for evaluating lists

# Function to safely evaluate string lists and ignore already parsed lists
def safe_eval(val):
    if isinstance(val, str):
        try:
            return ast.literal_eval(val)  # Safer alternative to eval
        except (ValueError, SyntaxError):
            return val  # If parsing fails, return the original value
    return val  # Return the value as-is if it's not a string

In [12]:
user_data['Preferred Activities'] = user_data['Preferred Activities'].apply(safe_eval)

In [13]:
user_data['Bucket list destinations Sri Lanka'] = user_data['Bucket list destinations Sri Lanka'].apply(safe_eval)

In [15]:
print(user_data[['Preferred Activities', 'Bucket list destinations Sri Lanka']].head())

                                Preferred Activities  \
0  [cycling, historical monuments, village homest...   
1  [butterfly watching, hot springs, wildlife vie...   
2       [sea cruises, themed parks, craft workshops]   
3                    [fishing, hot springs, sailing]   
4           [history tours, sailing, literary tours]   

                  Bucket list destinations Sri Lanka  
0  [Polonnaruwa, Hatton, Anuradhapura, Ella, Hapu...  
1  [Madunagala Hot Water Spring, Wilpattu Nationa...  
2  [Mirissa Beach, Negombo Lagoon, Batadombalena ...  
3  [Maha Oya Hot Water Springs, Colombo Port City...  
4  [Negombo Lagoon, Colombo Port City, Galle Dutc...  


In [16]:
# Create a mapping for destination names to index values
destination_mapping = {name: idx for idx, name in enumerate(destination_data['name'])}

In [18]:
# Map the user's bucket list destinations to indices from the destination data
user_data['Bucket list destinations mapped'] = user_data['Bucket list destinations Sri Lanka'].apply(
    lambda dests: [destination_mapping.get(dest, -1) for dest in dests if dest in destination_mapping]
)

In [19]:
# Remove users without valid destination mappings (i.e., destinations not in the destination dataset)
user_data = user_data[user_data['Bucket list destinations mapped'].apply(lambda x: len(x) > 0)]

In [20]:
# Check the updated user data
print(user_data[['User ID', 'Bucket list destinations mapped']].head())

   User ID Bucket list destinations mapped
0        1                    [95, 23, 78]
1        2               [399, 13, 14, 17]
2        3                     [1, 45, 33]
3        4               [123, 45, 44, 59]
4        5                 [45, 8, 10, 53]


In [21]:
# Explode the user's mapped destinations so that each row corresponds to one destination per user
user_destinations = user_data[['User ID', 'Bucket list destinations mapped']].explode('Bucket list destinations mapped')

In [22]:
# Merge this with the destination data to get full details for each user's bucket list destinations
merged_data = pd.merge(user_destinations, destination_data, left_on='Bucket list destinations mapped', right_index=True, how='inner')

In [23]:
# Drop the now-unnecessary "Bucket list destinations mapped" column
merged_data = merged_data.drop(columns=['Bucket list destinations mapped'])

In [24]:
# Check the merged data
print(merged_data.head())

     User ID         name       lat        lng       formatted_address  \
0          1  Polonnaruwa  7.940338  81.018798  Polonnaruwa, Sri Lanka   
6          7  Polonnaruwa  7.940338  81.018798  Polonnaruwa, Sri Lanka   
21        22  Polonnaruwa  7.940338  81.018798  Polonnaruwa, Sri Lanka   
168      169  Polonnaruwa  7.940338  81.018798  Polonnaruwa, Sri Lanka   
178      179  Polonnaruwa  7.940338  81.018798  Polonnaruwa, Sri Lanka   

     rating  user_ratings_total  \
0       NaN                 NaN   
6       NaN                 NaN   
21      NaN                 NaN   
168     NaN                 NaN   
178     NaN                 NaN   

                                        latest_reviews  
0    ['Polonnaruwa felt like a missed opportunity. ...  
6    ['Polonnaruwa felt like a missed opportunity. ...  
21   ['Polonnaruwa felt like a missed opportunity. ...  
168  ['Polonnaruwa felt like a missed opportunity. ...  
178  ['Polonnaruwa felt like a missed opportunity. ...  


In [25]:
# Create user-to-destination edges
user_to_dest_edges = []

In [26]:
# Loop through the merged data to create edges between users and destinations
for _, row in merged_data.iterrows():
    user_id = row['User ID']
    dest_id = destination_mapping[row['name']]  # Get the mapped destination index from the cleaned data
    user_to_dest_edges.append((user_id, len(user_data) + dest_id))  # Offset destination node index by user count

In [27]:
# Convert to edge list format
import torch
edge_index = torch.tensor(user_to_dest_edges, dtype=torch.long).t().contiguous()

In [28]:
# Now we have user-to-destination edges in a tensor form
print(edge_index)

tensor([[    1,     7,    22,  ...,  9668,  9736,  9970],
        [10069, 10069, 10069,  ..., 10380, 10380, 10380]])


In [29]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode the preferred activities for each user
mlb = MultiLabelBinarizer()
user_features = mlb.fit_transform(user_data['Preferred Activities'])

In [30]:
# Convert user features to torch tensor
user_features = torch.tensor(user_features, dtype=torch.float)
print(user_features.shape)  # Check shape of user feature matrix

torch.Size([9974, 68])


In [31]:
# Add latitude and longitude to the destination features
destination_features = destination_data[['rating', 'user_ratings_total', 'lat', 'lng']].values

In [32]:
# Normalize the features (optional but recommended for GNNs)
from sklearn.preprocessing import StandardScaler

# Normalize the features (important for GNNs)
scaler = StandardScaler()
destination_features = scaler.fit_transform(destination_features)

In [33]:
# Convert to torch tensor
destination_features = torch.tensor(destination_features, dtype=torch.float)

print(destination_features.shape)  # Check new destination feature matrix shape

torch.Size([411, 4])


In [34]:
# Let's assume user_features has 10 dimensions, and destination_features has 4.
# We'll pad destination features to have 10 dimensions.

padding = torch.zeros(destination_features.shape[0], 6)  # Create padding to match user feature dimension (6 additional features)
destination_features_padded = torch.cat([destination_features, padding], dim=1)

print(destination_features_padded.shape)  # Now both user_features and destination_features should have the same dimensions

torch.Size([411, 10])


In [35]:
print(f'User Features Shape: {user_features.shape}')  # e.g., torch.Size([num_users, 68])
print(f'Destination Features Shape: {destination_features_padded.shape}')  # e.g., torch.Size([num_destinations, 10])

User Features Shape: torch.Size([9974, 68])
Destination Features Shape: torch.Size([411, 10])


In [36]:
from sklearn.decomposition import PCA

# Apply PCA to reduce user features to 10 dimensions
pca = PCA(n_components=10)
user_features_reduced = pca.fit_transform(user_features)

# Convert back to torch tensor
user_features_reduced = torch.tensor(user_features_reduced, dtype=torch.float)
print(user_features_reduced.shape)  # This should now be [num_users, 10]

torch.Size([9974, 10])


In [37]:
# After aligning the dimensions, combine user and destination features
x = torch.cat([user_features_reduced, destination_features_padded], dim=0)  # Or use destination_features_final

print(x.shape)  # This should work without errors and give a shape of [num_users + num_destinations, 68] or [num_users + num_destinations, 10]

torch.Size([10385, 10])


In [38]:
# !pip install torch_geometric



In [39]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # First convolutional layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)

        # Second convolutional layer
        x = self.conv2(x, edge_index)

        return x

In [40]:
# Set up model, optimizer, and data
model = GCN(in_channels=x.size(1), hidden_channels=32, out_channels=16)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

data = Data(x=x, edge_index=edge_index)

In [41]:
# Training loop
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = F.mse_loss(out, out)  # Simple unsupervised loss (e.g., reconstruction)
    loss.backward()
    optimizer.step()
    return loss.item()

In [42]:
# Training for a few epochs
for epoch in range(200):
    loss = train()
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {loss:.4f}')

Epoch 0, Loss: nan
Epoch 10, Loss: nan
Epoch 20, Loss: nan
Epoch 30, Loss: nan
Epoch 40, Loss: nan
Epoch 50, Loss: nan
Epoch 60, Loss: nan
Epoch 70, Loss: nan
Epoch 80, Loss: nan
Epoch 90, Loss: nan
Epoch 100, Loss: nan
Epoch 110, Loss: nan
Epoch 120, Loss: nan
Epoch 130, Loss: nan
Epoch 140, Loss: nan
Epoch 150, Loss: nan
Epoch 160, Loss: nan
Epoch 170, Loss: nan
Epoch 180, Loss: nan
Epoch 190, Loss: nan


In [43]:
# Get the node embeddings from the GNN
model.eval()
with torch.no_grad():
    node_embeddings = model(data)

In [44]:
# User embeddings (first len(user_data) rows) and destination embeddings (remaining rows)
user_embeddings = node_embeddings[:len(user_data)]
destination_embeddings = node_embeddings[len(user_data):]

In [55]:
# Recommend destinations to a specific user (e.g., User 0)
user_idx = 3
user_embedding = user_embeddings[user_idx]

In [56]:
# Compute similarity (e.g., cosine similarity) between user and all destination embeddings
cos_sim = torch.nn.CosineSimilarity(dim=1)
similarities = cos_sim(user_embedding.unsqueeze(0), destination_embeddings)

In [57]:
# Get top 5 most similar destinations
top_destinations = similarities.topk(5)

In [58]:
# Convert the indices to a list of integers
top_destinations_indices = top_destinations.indices.tolist()  # Convert tensor to list of integers

print(f"Top recommended destinations for User {user_idx}:")
for i in top_destinations_indices:
    print(destination_data.iloc[i]['name'])  # Now indexing will work correctly

Top recommended destinations for User 3:
Weligama Beach (surf and stay)
Hikkaduwa Beach
Arugam Bay Beach
Mirissa Beach
Ahangama


In [59]:
import torch
from sklearn.model_selection import train_test_split

# Assuming `edge_index` contains user-to-destination edges
# Split edges into training and test sets (e.g., 80% train, 20% test)
edges = edge_index.t().tolist()
train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)

In [60]:
# Convert back to tensor
train_edge_index = torch.tensor(train_edges, dtype=torch.long).t().contiguous()
test_edge_index = torch.tensor(test_edges, dtype=torch.long).t().contiguous()

In [61]:
# Assuming `data` is your graph data object
data.edge_index = train_edge_index  # Use the training edges for training

# Continue training your GNN model as before
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.mse_loss(out, out)  # Example unsupervised loss
    loss.backward()
    optimizer.step()


In [62]:
model.eval()
with torch.no_grad():
    node_embeddings = model(data)


In [63]:
# Get user and destination embeddings
user_embeddings = node_embeddings[:len(user_data)]
destination_embeddings = node_embeddings[len(user_data):]

In [64]:
# Example: Compute similarity scores for test edges
cos_sim = torch.nn.CosineSimilarity(dim=1)

In [67]:
def compute_score(edge):
    user_idx, dest_idx = edge
    user_embedding = user_embeddings[user_idx]
    dest_embedding = destination_embeddings[dest_idx - len(user_data)]
    return cos_sim(user_embedding.unsqueeze(0), dest_embedding.unsqueeze(0)).item()

In [71]:
print(f"User embeddings shape: {user_embeddings.shape}")  # Should be (num_users, embedding_dim)
print(f"Destination embeddings shape: {destination_embeddings.shape}")  # Should be (num_destinations, embedding_dim)


User embeddings shape: torch.Size([9974, 16])
Destination embeddings shape: torch.Size([411, 16])
