# GNN Node Embeddings


Firstly, we import the necessary libraries to the notebook

In [1]:
import pandas as pd
import numpy as np
import torch
pd.set_option('display.max_columns', None)
from torch_geometric.nn import SAGEConv #, Sequential
from torch.nn import ReLU, Sequential, Linear
import torch.nn.functional as F
from torch_geometric.data import HeteroData
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

First lets set the seed.

In [2]:
np.random.seed(1337) 

Then we read the csv file to the notebook and look at the top 3 columns.

In [3]:
# Loading the data - Adapt the file location according to your folder structure
df = pd.read_csv("")
df.head(3)

FileNotFoundError: [Errno 2] No such file or directory: ''

I will create a copy of the original dataframe so I can remap the embeddings to the original items

In [4]:
# Create a copy of the original DataFrame
df_original = df.copy()

To create nodes for the location. I decided to create a location variable which is consistent of "il", "ilce": and "mahalle". Since we had a lot of Nan values in these column and dealing with such uncertain values are combersome and we having mand data points, I decided to remove the data points with nans in this column.

In [5]:
df = df.dropna(subset=['il', 'ilce', 'mahalle'])
df['location'] = df['il'] + '-' + df['ilce'] + '-' + df['mahalle']


Then I set up the node mappings here. I want to create a unique node for each item. Since we have duplicates in the tabular data. I will deal with them here.

In [6]:
location_mapping = {loc: i for i, loc in enumerate(df['location'].unique())}
df['location_id'] = df['location'].map(location_mapping)

client_id_mapping = {id: i for i, id in enumerate(df['client_id'].unique())}
df['client_id'] = df['client_id'].map(client_id_mapping)

product_id_mapping = {id: i for i, id in enumerate(df['product_id'].unique())}
df['product_id'] = df['product_id'].map(product_id_mapping)

In [7]:
nodes_client_one_hot = pd.get_dummies(df[["client_id", "event_type", "transaction_type", "device_category"]], columns=["device_category", "event_type", "transaction_type"])
nodes_client_one_hot = nodes_client_one_hot.groupby("client_id").mean().reset_index()

nodes_product_one_hot = pd.get_dummies(df[["product_id", "product_list_position", "kategori_1", "kategori_2", "kategori_3"]], columns=["product_list_position", "kategori_1", "kategori_2", "kategori_3"])
nodes_product_one_hot = nodes_product_one_hot.groupby("product_id").mean().reset_index()

nodes_location = pd.DataFrame({'location_id': list(location_mapping.values())})
nodes_location['dummy'] = 1  # Since location has no feature and we need to use its id as the data index,
# I create a dummy variable here.

I decide to create a simple graph where we have the following graph structure:

            location --> product <--> client

In [8]:
client_product_edges = torch.tensor([df['client_id'].values, df['product_id'].values], dtype=torch.long)
product_client_edges = torch.tensor([df['product_id'].values, df['client_id'].values], dtype=torch.long)
product_location_edges = torch.tensor([df['product_id'].values, df['location_id'].values], dtype=torch.long)
location_product_edges = torch.tensor([df['location_id'].values, df['product_id'].values], dtype=torch.long)

  client_product_edges = torch.tensor([df['client_id'].values, df['product_id'].values], dtype=torch.long)


The graph needs a specific data structure called HeteroData. This is needed because our aim is to create embedding vectors with different types of nodes. So we store our edges and nodes in this special data type.

In [9]:
data = HeteroData()
data['client'].x = torch.tensor(nodes_client_one_hot.values, dtype=torch.float)
data['product'].x = torch.tensor(nodes_product_one_hot.values, dtype=torch.float)
data['location'].x = torch.tensor(nodes_location.values, dtype=torch.float)
data['client', 'interacts', 'product'].edge_index = client_product_edges
data['product', 'interacted_by', 'client'].edge_index = product_client_edges
data['product', 'be_found_in', 'location'].edge_index = product_location_edges


We then define the Graphical Neural network model. We used a really simple graph model where we have one SAGEConv layer and one Linear layer where we project the dimensions of the embeddings to be 2 dimensional

In [10]:
class CustomHeteroGNN(torch.nn.Module):
    def __init__(self, hidden_channels, uniform_out_channels):
        super(CustomHeteroGNN, self).__init__()
        # Defining the Layers
        self.conv1_client_product = SAGEConv((-1, -1), hidden_channels)
        self.conv1_product_client = SAGEConv((-1, -1), hidden_channels)
        self.conv1_product_location = SAGEConv((-1, -1), hidden_channels)

        # PRoject Layers to be 2 dimensional
        self.client_out = Linear(hidden_channels, uniform_out_channels)
        self.product_out = Linear(hidden_channels, uniform_out_channels)
        self.location_out = Linear(hidden_channels, uniform_out_channels)

    def forward(self, x_dict, edge_index_dict):
        # Client => Product
        x_client_to_product = F.relu(self.conv1_client_product((x_dict['client'], x_dict['product']), edge_index_dict[('client', 'interacts', 'product')]))
        # Product => Client 
        x_product_to_client = F.relu(self.conv1_product_client((x_dict['product'], x_dict['client']), edge_index_dict[('product', 'interacted_by', 'client')]))
        # Product => Location
        x_product_to_location = F.relu(self.conv1_product_location((x_dict['product'], x_dict['location']), edge_index_dict[('product', 'be_found_in', 'location')]))

        # Fill our output dictionary so we can access them outside of the function
        x_dict['client'] = self.client_out(x_client_to_product)
        x_dict['product'] = self.product_out(x_product_to_client)
        x_dict['location'] = self.location_out(x_product_to_location)

        return x_dict



def train(model, data, optimizer, epochs=120):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        
        z_dict = model(data.x_dict, data.edge_index_dict)
        
        # Simple mse loss that takes the difference between the prediction and 0 tensor and averages it
        
        loss = sum(F.mse_loss(z, torch.zeros_like(z)) for z in z_dict.values()) / len(z_dict.values())
        
        loss.backward()
        optimizer.step()
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch}: Loss {loss.item()}')
    print(f'Epoch {epoch}: Loss {loss.item()}')



Now lets train our model and observe the loss function

In [11]:
model = CustomHeteroGNN(hidden_channels=64, uniform_out_channels=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

train(model, data, optimizer)

Epoch 0: Loss 862893.0
Epoch 10: Loss 189673.640625
Epoch 20: Loss 26005.349609375
Epoch 30: Loss 5563.419921875
Epoch 40: Loss 6763.08447265625
Epoch 50: Loss 2459.5380859375
Epoch 60: Loss 817.9625854492188
Epoch 70: Loss 306.91796875
Epoch 80: Loss 219.52471923828125
Epoch 90: Loss 172.0084228515625
Epoch 100: Loss 128.01121520996094
Epoch 110: Loss 99.48551177978516
Epoch 119: Loss 86.44808197021484


We see that the loss in decreaseing. That being said this is a fairly simple loss function and if given time a better loss function should be used.

Now lets look at our embeddings

In [12]:
model.eval()  

with torch.no_grad():  # Disables gradient computation so we dont change traıned model
    z_dict = model(data.x_dict, data.edge_index_dict)

client_embeddings = z_dict['client']  
product_embeddings = z_dict['product']  
location_embeddings = z_dict['location']  

print("Client embeddings shape:", client_embeddings.shape)
print("Product embeddings shape:", product_embeddings.shape)
print("Location embeddings shape:", location_embeddings.shape)

Client embeddings shape: torch.Size([50778, 2])
Product embeddings shape: torch.Size([56193, 2])
Location embeddings shape: torch.Size([10292, 2])


We get 2 dimensional embeddings for each unique item type. It is possible to get a higher dimensional embeddings but will not further invertigate this for the simplicity of the case.

We can evaluate our embeddings on a local level and a global level.

To evaluate our embeddings on the local level, we can simply pick two embeddings and look at their original form and compare it to their cosine similarity. A higher cosine similarity means our embeddings are closer in the 2 dimensional space and the products should reflect that.

In [13]:
def predict_link(embedding1, embedding2):
    cos_sim = torch.nn.functional.cosine_similarity(embedding1, embedding2, dim=0)
    return cos_sim.item()

# Ids for example 
product_one_id = 10
product_two_id = 150

product1_embedding = product_embeddings[product_one_id]
product2_embedding = product_embeddings[product_two_id]
similarity_score = predict_link(product1_embedding, product2_embedding)
print(f'Link Prediction Score: {similarity_score}')


Link Prediction Score: -0.5370484590530396


In [14]:
index_to_product_id = {index: product_id for product_id, index in product_id_mapping.items()}

product1_index = product_one_id  
product2_index = product_two_id  


original_product1_id = index_to_product_id[product1_index]
original_product2_id = index_to_product_id[product2_index]

print(f"Second id is {original_product1_id}")
print(f"Second id is {original_product2_id}")

product1_info = df_original[df_original['product_id'] == original_product1_id]
product2_info = df_original[df_original['product_id'] == original_product2_id]



Second id is 11396543
Second id is 13205323


In [15]:
product1_info[["product_id", "product_list_position", "kategori_1", "kategori_2", "kategori_3"]].head(2)


Unnamed: 0,product_id,product_list_position,kategori_1,kategori_2,kategori_3
1781,11396543,12,kiralik,konut,
27022,11396543,26,kiralik,konut,daire


In [16]:
product2_info[["product_id", "product_list_position", "kategori_1", "kategori_2", "kategori_3"]].head(2)

Unnamed: 0,product_id,product_list_position,kategori_1,kategori_2,kategori_3
2158,13205323,2,satilik,konut,daire
145760,13205323,1,satilik,konut,daire


We can also look at the performance of our embeddings on a global level with KMeans algorithm. K-means algorithm is also an unsupervized learning algorithm that tries to create natural clusters. Then we can use these clusters to see if their is huge inter cluster difference or not by using the silhouette score which is calculated as follow:

                                        (b - a) / max(a, b)
                                        
This will take some time!!

In [17]:
for i in range(2,7):
    kmeans = KMeans(n_clusters=i, random_state=42).fit(client_embeddings)
    labels = kmeans.labels_

    silhouette_avg = silhouette_score(client_embeddings, labels)
    print(f'Silhouette Score for {i} clusters: {silhouette_avg}')



Silhouette Score: 0.4074034094810486
Silhouette Score: 0.40675559639930725
Silhouette Score: 0.3888115882873535
Silhouette Score: 0.4548274278640747
Silhouette Score: 0.4172956347465515


We see that overall best clustering in our range is with 4 groups of customers. Since the silhouette score is between 1 and -1 where 1 is a perfect clustering. Our clustering seem to be on the Ok side.