In [3]:
import pandas as pd
#import torch as nn
import numpy as np
import pdb

In [4]:
# Load the review table; later cells read its 'User_id', 'Business_id'
# and 'Rating' columns.
# NOTE(review): unpickling can execute arbitrary code, so only load this
# file if it comes from a trusted source.
df = pd.read_pickle('dataset/dataframe_restaurant_10_50.pkl')

Create the <anchor, positive, negative> triplet dataset

In [95]:
from itertools import combinations


def _triplets_for_user(user_df, anchors_per_item):
    """Build triplets from one random draw of a single user's reviews.

    Returns a list of ``[anchor, positive, negative]`` index labels. The list
    is empty when the user has fewer than ``anchors_per_item`` distinct
    businesses, or when the drawn ratings cannot form a triplet.
    """
    # Position (within user_df) of the first review of each distinct business.
    _, first_positions = np.unique(user_df['Business_id'], return_index=True)
    if len(first_positions) < anchors_per_item:
        return []

    chosen = np.random.choice(first_positions, size=anchors_per_item,
                              replace=False)
    ratings = user_df['Rating'].iloc[chosen]
    labels = ratings.index.to_numpy()   # index labels of the drawn rows
    values = ratings.to_numpy()

    triplets = []
    for anchor_label, anchor_rating in zip(labels, values):
        # Ratings are assumed to take only two values (4 and 5), so plain
        # equality / inequality separates positives from negatives.
        same_rating = values == anchor_rating
        positives = labels[same_rating & (labels != anchor_label)]
        negatives = labels[~same_rating]

        n_pairs = min(len(positives), len(negatives))
        if n_pairs == 0:
            # Kept from the original logic: stop at the first anchor that
            # yields nothing.
            # NOTE(review): later anchors in this draw could still form
            # triplets; confirm the early exit is intended.
            break
        for i in range(n_pairs):
            triplets.append([anchor_label, positives[i], negatives[i]])
    return triplets


def create_triplets(df, anchors_per_item,enforce_max_length=True, max_length=10000):
    """Sample <anchor, positive, negative> triplets of row labels from ``df``.

    For each visited user, ``anchors_per_item`` distinct businesses are drawn
    at random. Every drawn review acts as an anchor; reviews with the same
    rating are positives and reviews with a different rating are negatives.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'User_id', 'Business_id' and 'Rating' columns.
    anchors_per_item : int
        Number of distinct businesses drawn per user visit. Users with fewer
        distinct businesses are skipped.
    enforce_max_length : bool, default True
        If True, return at most ``max_length`` triplets, visiting users
        repeatedly (in freshly shuffled order) until the cap is reached.
        If False, visit every user exactly once and return all triplets found.
    max_length : int, default 10000
        Cap used when ``enforce_max_length`` is True.

    Returns
    -------
    pandas.DataFrame
        Columns 'anchor', 'positive', 'negative' holding index labels of
        ``df``. May be shorter than ``max_length`` (or empty) when the data
        cannot produce more triplets.

    Notes
    -----
    Randomness comes from the global ``np.random`` state, so call
    ``np.random.seed`` beforehand for reproducible output.
    """
    limit = max_length if enforce_max_length else np.inf
    capped = np.isfinite(limit)

    # Group row positions by user once instead of filtering df on every draw.
    positions_by_user = df.groupby('User_id', sort=False).indices
    users = list(positions_by_user)

    triplets_list = []
    while users and len(triplets_list) < limit:
        size_before = len(triplets_list)
        for user_pos in np.random.permutation(len(users)):
            user_df = df.iloc[positions_by_user[users[user_pos]]]
            triplets_list.extend(_triplets_for_user(user_df, anchors_per_item))
            if len(triplets_list) >= limit:
                break
        # Uncapped mode is a single pass over the users. In capped mode a pass
        # that adds nothing means further passes are unlikely to help, so stop
        # rather than loop forever.
        if not capped or len(triplets_list) == size_before:
            break

    if capped:
        # A user's triplets are appended in bulk, so trim any overshoot.
        triplets_list = triplets_list[:int(np.ceil(limit))]

    triplets_df = pd.DataFrame(triplets_list,columns=['anchor','positive','negative'])
    return triplets_df
        


In [96]:
# Target of 1000 triplets, drawing 10 distinct businesses per sampled user.
triplets= create_triplets(df, 10, max_length=1000)

Create Contrastive Pairs

In [50]:
""" Contrastive structure can be obtainined from the triplet structure """
# Keep only the (positive, negative) label columns of each triplet.
# NOTE(review): each such pair has differing ratings, and neither similar
# pairs nor a label column are kept; confirm this matches the contrastive
# objective.
contrastive_df = triplets[['positive', 'negative']].copy() 

Creating custom dataset and dataloader

In [97]:
from torch.utils.data import Dataset, DataLoader
class TripletDataset(Dataset):
    """Map-style dataset that resolves triplet labels to rows of ``df``.

    ``triplets_df`` holds index labels of ``df`` in its 'anchor', 'positive'
    and 'negative' columns. Each sample is the three referenced rows.
    """

    def __init__(self, triplets_df, df):
        self.triplets_df = triplets_df
        self.df = df

    def __len__(self):
        return len(self.triplets_df)

    def __getitem__(self, idx):
        """Return ``(anchor_row, positive_row, negative_row)`` for triplet ``idx``.

        ``idx`` is a position in ``triplets_df``. The labels stored there are
        looked up in ``df`` by label (``.loc``), and each row is returned as a
        plain list of its column values.
        """
        roles = ('anchor', 'positive', 'negative')
        # Resolve all three labels first, then fetch the rows they point to.
        labels = [self.triplets_df[role].iloc[idx] for role in roles]
        rows = [self.df.loc[label].to_list() for label in labels]
        return tuple(rows)
    
# Restrict the lookup frame to the three columns returned for every sample.
triplet_dataset = TripletDataset(triplets, df[['User_id', 'Business_id', 'Rating']])

In [101]:
import torch

def collate_fn(batch):
    """Collate ``(anchor, positive, negative)`` samples into one tensor.

    ``batch`` is a sequence of 3-tuples, each element being one row given as
    a list of values. The result has shape ``(batch_size, 3, n_values)``:
    index 0 / 1 / 2 along dim 1 selects the anchor / positive / negative rows.
    """
    # Transpose the batch: one group of rows per triplet role.
    anchors, positives, negatives = zip(*batch)

    # One (batch_size, n_values) tensor per role, stacked along a new dim 1.
    role_tensors = [torch.tensor(rows) for rows in (anchors, positives, negatives)]
    return torch.stack(role_tensors, dim=1)


In [102]:
# Batches of 10 triplets, each collated into a single tensor by collate_fn.
# NOTE(review): shuffle=False keeps the triplets in generation order;
# confirm this is intended for training.
triplet_dataloader = DataLoader(triplet_dataset, batch_size=10,collate_fn=collate_fn, shuffle=False)


In [104]:
# Get one batch.
# NOTE: despite its name, `anchor` holds the whole collated batch, with the
# anchor, positive and negative rows stacked along dim 1.
anchor = next(iter(triplet_dataloader))

In [113]:
# Index 2 along dim 1 selects the negative row of every triplet in the batch.
anchor[:,2]

tensor([[1.0514e+06, 1.3393e+04, 5.0000e+00],
        [1.0514e+06, 1.8300e+04, 5.0000e+00],
        [1.0514e+06, 4.1222e+04, 5.0000e+00],
        [1.0514e+06, 2.2145e+04, 5.0000e+00],
        [1.0514e+06, 1.3393e+04, 5.0000e+00],
        [1.0514e+06, 1.8300e+04, 5.0000e+00],
        [1.0514e+06, 4.1222e+04, 5.0000e+00],
        [1.0514e+06, 2.2145e+04, 5.0000e+00],
        [1.0514e+06, 4.3438e+04, 4.0000e+00],
        [1.0514e+06, 1.5691e+04, 4.0000e+00]])

Build Siamese Network

In [12]:
import torch 
import torch.nn as nn
import torch.optim as optim

In [117]:
class SiameseNetwork(nn.Module):
    """Fully connected embedding network used for every branch of the triplet.

    Maps an input of width ``input_size`` through hidden layers of width
    16, 32 and 64 (each followed by ReLU) to a 16-dimensional output.
    """

    def __init__(self, input_size):
        super().__init__()

        # Layer widths from input to output; consecutive pairs become Linear layers.
        widths = [input_size, 16, 32, 64, 16]
        layers = []
        for in_width, out_width in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(in_width, out_width))
            layers.append(nn.ReLU())
        # Drop the trailing ReLU: the final projection has no activation.
        self.network = nn.Sequential(*layers[:-1])

    def forward(self,input1):
        """Return the embedding of ``input1``."""
        return self.network(input1)



Train the Siamese Network

Triplet Loss

In [114]:
def triplet_loss(anchor_embed, positive_embed, negative_embed, margin=1.0):
    """Mean hinge triplet loss over a batch of embeddings.

    For each sample this computes
    ``relu(d(anchor, positive) - d(anchor, negative) + margin)`` where ``d`` is
    ``torch.nn.functional.pairwise_distance`` with its default arguments, and
    returns the mean over the batch as a scalar tensor.
    """
    pairwise = torch.nn.functional.pairwise_distance
    # Positive gap = the positive is farther from the anchor than the negative.
    gap = pairwise(anchor_embed, positive_embed) - pairwise(anchor_embed, negative_embed)
    return torch.nn.functional.relu(gap + margin).mean()

In [118]:
# Hyper-parameters for the run.
epochs = 100
learning_rate = 0.001
batch_size = 10  # NOTE(review): not referenced below; triplet_dataloader carries its own batch size

# Three input values per row: User_id, Business_id, Rating.
model = SiameseNetwork(3)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(epochs):
    for batch in triplet_dataloader:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # dim 1 of the collated batch indexes the triplet role.
        anchor, positive, negative = batch.unbind(dim=1)

        # The same network (shared weights) embeds all three roles.
        anchor_output, positive_output, negative_output = map(
            model, (anchor, positive, negative))

        # Triplet loss, then one optimisation step.
        loss = triplet_loss(anchor_output, positive_output, negative_output)
        loss.backward()
        optimizer.step()


In [121]:
# Switch the model to evaluation mode before computing embeddings.
model.eval()
# NOTE(review): rows 115:175 come from the same df the triplets were sampled
# from, so they are not guaranteed to be held-out data.
test= df[['User_id', 'Business_id', 'Rating']].iloc[115:175]


In [125]:
# Convert the test rows to a float32 tensor, one row per sample.
test_tensor = torch.tensor(test.values.astype(np.float32))

In [127]:
# Embed every test row in one batched forward pass. no_grad() skips autograd
# bookkeeping that inference does not need, and avoiding a loop variable
# named `input` keeps the builtin of that name usable in later cells.
with torch.no_grad():
    # list(...) keeps the original structure: one 1-D numpy array per test row.
    embeddings = list(model(test_tensor).numpy())

    

In [129]:
# Embedding of the first test row (the network's last layer has 16 outputs).
# NOTE(review): values in the thousands suggest the raw ID columns dominate
# the input; consider scaling or embedding the IDs.
embeddings[0]

array([ 2277.915  , -3992.1597 ,  1805.6791 , -3170.4724 ,  1288.1118 ,
       -1771.0773 , -1151.3721 ,  -167.62589,  2773.5547 ,  1685.4729 ,
        3655.6655 , -1804.9977 ,  2099.8184 ,   -73.06734, -2357.4268 ,
       -4431.4277 ], dtype=float32)