In [1]:
# !pip install --upgrade pip
# !pip install torch torch-geometric networkx pandas scikit-learn

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import networkx as nx
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.image as mpimg
import random

In [3]:
user_data = pd.read_csv('cleaned_user_data.csv')
destination_data = pd.read_csv('cleaned_destination_data.csv')

In [4]:
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder

# Encode user IDs and destination names into node indices
user_encoder = LabelEncoder()
dest_encoder = LabelEncoder()

In [5]:
user_indices = user_encoder.fit_transform(user_data['User ID'])
dest_indices = dest_encoder.fit_transform(destination_data['name'])

In [6]:
# Create an edge list: each row is a (user_index, destination_index) pair
edge_list = []

In [7]:
for idx, row in user_data.iterrows():
    # Ensure 'Bucket list destinations mapped' contains integers
    bucket_list = row['Bucket list destinations mapped']
    
    if isinstance(bucket_list, str):
        bucket_list = eval(bucket_list)  # Convert string representation of list back to list
    
    for dest_idx in bucket_list:
        if pd.notna(dest_idx):
            try:
                # Ensure dest_idx is an integer
                dest_idx = int(dest_idx)
                edge_list.append([user_indices[idx], len(user_indices) + dest_idx])
            except ValueError:
                print(f"Skipping invalid destination index: {dest_idx}")

In [8]:
import torch

# Convert edge list to tensor
edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()

In [9]:
# Prepare node features (optional, could be user or destination embeddings)
num_users = len(user_indices)
num_destinations = len(dest_indices)
num_nodes = num_users + num_destinations

In [10]:
# Create node features (e.g., use some feature vectors for users and destinations)
x = torch.eye(num_nodes)  # Identity matrix as a placeholder for features

In [11]:
# Create a PyG data object
data = Data(x=x, edge_index=edge_index)

In [12]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCNRecommendation(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCNRecommendation, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        # First graph convolution layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        
        # Second graph convolution layer
        x = self.conv2(x, edge_index)
        
        return x

In [13]:
from torch.optim import Adam

# Initialize model, optimizer, and loss function
model = GCNRecommendation(in_channels=data.num_node_features, hidden_channels=64, out_channels=32)
optimizer = Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

In [14]:
import random
from torch.optim import Adam

# Number of negative samples per positive sample
num_neg_samples = 1  # Can be tuned

# Get the total number of nodes (users + destinations)
num_total_nodes = data.num_nodes

# Training loop
model.train()
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=5e-4)  # Reduced learning rate and added weight decay

for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    
    # Get user and destination node embeddings
    user_embeddings = out[edge_index[0]]  # User nodes
    dest_embeddings = out[edge_index[1]]  # Destination nodes
    
    # Compute interaction scores for positive edges
    pos_interaction_scores = (user_embeddings * dest_embeddings).sum(dim=1)
    pos_labels = torch.ones(edge_index.size(1))  # Positive labels
    
    # Generate negative samples (random user-destination pairs that do not exist in the graph)
    neg_edge_list = []
    for _ in range(num_neg_samples * edge_index.size(1)):
        user_idx = random.randint(0, len(user_indices) - 1)  # Random user
        dest_idx = random.randint(len(user_indices), num_total_nodes - 1)  # Random destination
        neg_edge_list.append([user_idx, dest_idx])
    
    neg_edge_index = torch.tensor(neg_edge_list, dtype=torch.long).t().contiguous()
    
    # Get embeddings for negative samples
    neg_user_embeddings = out[neg_edge_index[0]]
    neg_dest_embeddings = out[neg_edge_index[1]]
    
    # Compute interaction scores for negative edges
    neg_interaction_scores = (neg_user_embeddings * neg_dest_embeddings).sum(dim=1)
    neg_labels = torch.zeros(neg_edge_index.size(1))  # Negative labels
    
    # Concatenate positive and negative scores and labels
    all_interaction_scores = torch.cat([pos_interaction_scores, neg_interaction_scores])
    all_labels = torch.cat([pos_labels, neg_labels])
    
    # Compute the loss using BCEWithLogitsLoss
    loss = criterion(all_interaction_scores, all_labels)
    
    # Backpropagate and optimize
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch}, Loss: {loss.item()}')

Epoch 0, Loss: 0.6881556510925293
Epoch 1, Loss: 0.6851226091384888
Epoch 2, Loss: 0.6807478666305542
Epoch 3, Loss: 0.67481929063797
Epoch 4, Loss: 0.6669038534164429
Epoch 5, Loss: 0.6572286486625671
Epoch 6, Loss: 0.6455220580101013
Epoch 7, Loss: 0.6318177580833435
Epoch 8, Loss: 0.6168252229690552
Epoch 9, Loss: 0.6011227369308472
Epoch 10, Loss: 0.5862175226211548
Epoch 11, Loss: 0.5747014880180359
Epoch 12, Loss: 0.56910640001297
Epoch 13, Loss: 0.56834477186203
Epoch 14, Loss: 0.5710943341255188
Epoch 15, Loss: 0.5772814154624939
Epoch 16, Loss: 0.5778958797454834
Epoch 17, Loss: 0.5776435732841492
Epoch 18, Loss: 0.5742995738983154
Epoch 19, Loss: 0.5702221989631653
Epoch 20, Loss: 0.5655210614204407
Epoch 21, Loss: 0.5649187564849854
Epoch 22, Loss: 0.5627686977386475
Epoch 23, Loss: 0.5634481906890869
Epoch 24, Loss: 0.5631458163261414
Epoch 25, Loss: 0.5640755891799927
Epoch 26, Loss: 0.565493106842041
Epoch 27, Loss: 0.5667837858200073
Epoch 28, Loss: 0.5666300654411316
Ep

In [15]:
# Get embeddings for all users and destinations after training
model.eval()
with torch.no_grad():
    embeddings = model(data)

user_embeddings = embeddings[:num_users]  # First part of the embedding matrix for users
dest_embeddings = embeddings[num_users:]  # Second part for destinations

# Compute similarity scores (dot product or cosine similarity) between users and destinations
recommendation_scores = torch.matmul(user_embeddings, dest_embeddings.t())

# Get top-k recommendations for each user
k = 5
top_k_recommendations = recommendation_scores.topk(k, dim=1).indices
print("Top k recommended destinations for each user:", top_k_recommendations)

Top k recommended destinations for each user: tensor([[10, 92, 61, 11, 76],
        [10, 92, 61, 11, 76],
        [10, 92, 61,  1, 76],
        ...,
        [10, 92, 61, 11, 76],
        [10, 92, 61, 11, 76],
        [10, 92, 61, 11, 76]])


In [16]:
import pandas as pd

# Assuming top_k_recommendations is the tensor of top-k recommended destination indices
# Also assuming 'dest_encoder' was already used to encode the destination names earlier

# Convert tensor to a list of indices
top_k_recommendations = top_k_recommendations.tolist()

# Convert the destination indices back to the destination names
recommendation_names = []
for user_recommendations in top_k_recommendations:
    # Convert each list of destination indices to names
    decoded_dest_names = dest_encoder.inverse_transform(user_recommendations)
    recommendation_names.append(decoded_dest_names)

# Convert to DataFrame for better readability (Optional)
recommended_df = pd.DataFrame(recommendation_names, columns=[f'Top {i+1} Destination' for i in range(len(recommendation_names[0]))])

# Add user identifier (Optional if you want to associate with user IDs)
recommended_df.insert(0, 'User ID', user_data['User ID'])

In [17]:
# Display the DataFrame of recommendations
print(recommended_df)

      User ID Top 1 Destination     Top 2 Destination Top 3 Destination  \
0           1       Ambalangoda  Elephant Point Beach   Council Chamber   
1           2       Ambalangoda  Elephant Point Beach   Council Chamber   
2           3       Ambalangoda  Elephant Point Beach   Council Chamber   
3           4       Ambalangoda  Elephant Point Beach   Council Chamber   
4           5       Ambalangoda  Elephant Point Beach   Council Chamber   
...       ...               ...                   ...               ...   
9995     9996   Aanda Ella Fall  Elephant Point Beach   Council Chamber   
9996     9997       Ambalangoda  Elephant Point Beach   Council Chamber   
9997     9998       Ambalangoda  Elephant Point Beach   Council Chamber   
9998     9999       Ambalangoda  Elephant Point Beach   Council Chamber   
9999    10000       Ambalangoda  Elephant Point Beach   Council Chamber   

                    Top 4 Destination  \
0     Ambuluwawa Biodiversity Complex   
1     Ambuluwawa 

In [18]:
recommended_df.to_csv("recommendations.csv", index=False)