In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import pickle

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import random

In [11]:
# Dataset: RetailRocket, https://www.kaggle.com/datasets/retailrocket/ecommerce-dataset
# Cleaning of events done in separate notebook!
with open('events_df_cleaned.df', 'rb') as f:
    events = pickle.load(f)

item_properties1 = pd.read_csv('retail_rocket/item_properties_part1.csv')
item_properties2 = pd.read_csv('retail_rocket/item_properties_part2.csv')
item_properties = pd.concat([item_properties1, item_properties2], ignore_index=True)

del item_properties1, item_properties2

category_tree = pd.read_csv('retail_rocket/category_tree.csv')

In [12]:
events.shape, item_properties.shape, category_tree.shape

((20282, 13), (20275902, 4), (1669, 2))

In [13]:
events.head(3)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,date_time,day_of_week,hour,total_events_user,total_events_item,unique_items_interacted,unique_users_interacted,interaction_value
0,1433222276276,599528,transaction,356475,4000.0,2015-06-02 05:17:56.276,Tuesday,5,1,2,1,2,0.8
1,1433193500981,121688,transaction,15335,11117.0,2015-06-01 21:18:20.981,Monday,21,11,1,11,1,0.8
2,1433193915008,552148,transaction,81345,5444.0,2015-06-01 21:25:15.008,Monday,21,1,1,1,1,0.8


In [14]:
item_properties.head(3)

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566


In [15]:
category_tree.head(3)

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0


# Feature Engineering

In [16]:
# Create user-item interaction matrix
interaction_matrix = events.pivot_table(index='visitorid', columns='itemid', values='event', aggfunc='count').fillna(0)

In [17]:
sum(interaction_matrix.values)

array([1., 1., 2., ..., 2., 1., 1.])

In [18]:
# Encode event types into numerical values
event_type_mapping = {'view': 1, 'addtocart': 5, 'transaction': 10} # Give proportial weights to interactivity
events['event'] = events['event'].map(event_type_mapping)

In [19]:
events.head(3)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,date_time,day_of_week,hour,total_events_user,total_events_item,unique_items_interacted,unique_users_interacted,interaction_value
0,1433222276276,599528,10,356475,4000.0,2015-06-02 05:17:56.276,Tuesday,5,1,2,1,2,0.8
1,1433193500981,121688,10,15335,11117.0,2015-06-01 21:18:20.981,Monday,21,11,1,11,1,0.8
2,1433193915008,552148,10,81345,5444.0,2015-06-01 21:25:15.008,Monday,21,1,1,1,1,0.8


In [20]:
user_history = events.groupby('visitorid')['itemid'].agg(list).reset_index()

In [21]:
user_history

Unnamed: 0,visitorid,itemid
0,172,"[465522, 10034]"
1,186,[49029]
2,264,"[459835, 161949]"
3,419,[19278]
4,539,[94371]
...,...,...
11224,1406787,[336832]
11225,1406981,[436004]
11226,1407070,[215596]
11227,1407110,[360922]


In [22]:
# Merge item properties to get the latest property values
item_properties = item_properties[-10_000:]
item_properties = item_properties.sort_values(by=['itemid', 'timestamp']).drop_duplicates(subset=['itemid', 'property'], keep='last')

In [23]:
item_properties.head(3)

Unnamed: 0,timestamp,itemid,property,value
20269289,1436065200000,42,790,n199080.000
20266721,1439694000000,59,470,769062
20270454,1431226800000,62,categoryid,342


In [25]:
print('Events data shape:', events.shape)
print('Item properties shape:', item_properties.shape)
print('Category tree shape:', category_tree.shape)

Events data shape: (20282, 13)
Item properties shape: (10000, 4)
Category tree shape: (1669, 2)


# Item-Based Collaborative Filtering

In [26]:
class InteractionDataset(Dataset):
    def __init__(self, interactions):
        self.interactions = interactions
        self.user_ids = {id: idx for idx, id in enumerate(interactions['visitorid'].unique())}
        self.item_ids = {id: idx for idx, id in enumerate(interactions['itemid'].unique())}
        
        self.interactions['visitorid'] = self.interactions['visitorid'].map(self.user_ids)
        self.interactions['itemid'] = self.interactions['itemid'].map(self.item_ids)

    def __len__(self):
        return len(self.interactions)

    def __getitem__(self, idx):
        user = self.interactions.iloc[idx, 1]
        item = self.interactions.iloc[idx, 3]
        rating = self.interactions.iloc[idx, 2]
        return torch.tensor(user, dtype=torch.long), torch.tensor(item, dtype=torch.long), torch.tensor(rating, dtype=torch.float)

In [27]:
# Split data into training and test sets
train_data, test_data = train_test_split(events, test_size=0.2, random_state=42)

train_dataset = InteractionDataset(train_data)
test_dataset = InteractionDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [28]:
class RecommenderNet(nn.Module):
    def __init__(self, num_users, num_items, embedding_size=50):
        super(RecommenderNet, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.item_embedding = nn.Embedding(num_items, embedding_size)
        self.fc1 = nn.Linear(embedding_size*2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, user, item):
        user_emb = self.user_embedding(user)
        item_emb = self.item_embedding(item)
        x = torch.cat([user_emb, item_emb], dim=1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x.squeeze()

In [29]:
num_users = events['visitorid'].nunique()
num_items = events['itemid'].nunique()

num_users, num_items

(11229, 11485)

In [32]:
model = RecommenderNet(num_users, num_items)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

def train(model, train_loader, criterion, optimizer, epochs=4):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for user, item, rating in train_loader:
            optimizer.zero_grad()
            outputs = model(user, item)
            loss = criterion(outputs, rating)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}')
train(model, train_loader, criterion, optimizer)

Epoch 1/4, Loss: 2.9802440515184028
Epoch 2/4, Loss: 0.46566412867758217
Epoch 3/4, Loss: 0.2876269191972853
Epoch 4/4, Loss: 0.19792331397298754


Learning loss and epochs were chosed manually to ensure test loss is not bigger than the train loss (a sign for overfitting).

In [33]:
def evaluate(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for user, item, rating in test_loader:
            outputs = model(user, item)
            loss = criterion(outputs, rating)
            total_loss += loss.item()
    print(f'Test Loss: {total_loss/len(test_loader)}')
evaluate(model, test_loader, criterion)

Test Loss: 0.20225184550508857


# Content-Based Filtering 

In [34]:
item_properties = item_properties[-10_000:]

# Merge item properties into a single string per item
item_properties['properties'] = item_properties.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [35]:
item_properties.head(3)

Unnamed: 0,timestamp,itemid,property,value,properties
20269289,1436065200000,42,790,n199080.000,1436065200000 42 790 n199080.000
20266721,1439694000000,59,470,769062,1439694000000 59 470 769062
20270454,1431226800000,62,categoryid,342,1431226800000 62 categoryid 342


Note: Because of the nature of the dataset, we don't get any readable properties, instead they are just abstract numbers. In a real use-case they would probably contain a lot of useful info that different items can be related to and similarties can be found. For now we just row with the random number jumbo.

In [36]:
# Use TF-IDF to vectorize the item properties
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(item_properties['properties'])

In [37]:
tfidf_matrix

<10000x18059 sparse matrix of type '<class 'numpy.float64'>'
	with 54102 stored elements in Compressed Sparse Row format>

In [38]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [39]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.10235842,
        0.1455046 ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.10235842, ..., 0.        , 1.        ,
        0.08864493],
       [0.        , 0.        , 0.1455046 , ..., 0.        , 0.08864493,
        1.        ]])

In [40]:
# Create a mapping from item IDs to indices
item_id_to_idx = {item_id: idx for idx, item_id in enumerate(item_properties['itemid'].unique())}
idx_to_item_id = {idx: item_id for item_id, idx in item_id_to_idx.items()}

In [41]:
# Function to get item recommendations based on similarity scores
def get_content_based_recommendations(item_id, cosine_sim=cosine_sim):
    if item_id not in item_id_to_idx:
        return "Item ID not found in item properties."
    
    idx = item_id_to_idx[item_id]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Get top 10 similar items, excluding itself
    item_indices = [i[0] for i in sim_scores]
    recommended_items = [idx_to_item_id[i] for i in item_indices if i in idx_to_item_id]
    return recommended_items

In [45]:
print(get_content_based_recommendations(59))

[118711, 446072, 454666, 461690, 310280, 159398, 414522, 29260, 85377, 283017]


# Hybrid

In [46]:
# Function to predict interactivity
def predict_interactivity(model, user_id, item_id, user_ids, item_ids):
    if user_id not in user_ids or item_id not in item_ids:
        return "User or item not found in training data."
    
    user_idx = torch.tensor([user_ids[user_id]], dtype=torch.long)
    item_idx = torch.tensor([item_ids[item_id]], dtype=torch.long)
    
    model.eval()
    with torch.no_grad():
        prediction = model(user_idx, item_idx).item()
    
    return prediction

In [47]:
def recommend_future_items(user_id, user_history, user_ids, item_ids, category_tree, num_recommendations=10):
    user_items = user_history[user_history.visitorid == user_id].itemid.to_list()[0]
    
    # Use collaborative filtering for users with history
    collaborative_recommendations = []
    if user_items:
        for item_id in item_ids:
            if item_id not in user_items:
                interactivity = predict_interactivity(model, user_id, item_id, user_ids, item_ids)
                collaborative_recommendations.append((item_id, interactivity))
        collaborative_recommendations = sorted(collaborative_recommendations, key=lambda x: x[1], reverse=True)
        collaborative_recommendations = [item[0] for item in collaborative_recommendations[:num_recommendations]]

    # Get content-based recommendations
    content_based_recommendations = []
    if user_items:
        # Pick a random item from the user's history to base content recommendations on
        random_item_id = random.choice(user_items)
        content_based_recommendations = get_content_based_recommendations(random_item_id)
    else:
        # Use a random item id if the user has no history
        random_item_id = random.choice(list(item_ids.keys()))
        content_based_recommendations = get_content_based_recommendations(random_item_id)
    
    # Get items from the same parent category
    parent_category_recommendations = []
    if user_items:
        for item_id in user_items:
            parent_category = category_tree[category_tree['categoryid'] == item_id]['parentid'].values
            if parent_category.size > 0:
                parent_category_id = parent_category[0]
                sibling_items = category_tree[category_tree['parentid'] == parent_category_id]['categoryid'].values
                sibling_items = [item for item in sibling_items if item not in user_items and item in item_ids]
                parent_category_recommendations.extend(sibling_items)
    
    # Prioritize collaborative recommendations, sprinkle in content-based and parent category recommendations
    combined_recommendations = collaborative_recommendations
    combined_recommendations.extend([item for item in content_based_recommendations if item not in combined_recommendations])
    combined_recommendations.extend([item for item in parent_category_recommendations if item not in combined_recommendations])
    
    # Return the top N recommendations
    return combined_recommendations[:num_recommendations]

In [48]:
user_id = list(train_dataset.user_ids.keys())[0]
item_id = list(train_dataset.item_ids.keys())[0]
print(f'Predicted interactivity for user {user_id} and item {item_id}: {predict_interactivity(model, user_id, item_id, train_dataset.user_ids, train_dataset.item_ids)}')

Predicted interactivity for user 235392 and item 317296: 9.829662322998047


In [49]:
print(f'Recommended items for user {user_id}: {recommend_future_items(user_id, user_history, train_dataset.user_ids, train_dataset.item_ids, category_tree)}')

Recommended items for user 235392: [378505, 355511, 84595, 451207, 30702, 367490, 343104, 377327, 132854, 362339]


# Handling new users and items

In [50]:
def add_new_user(user_id, user_ids):
    if user_id not in user_ids:
        user_ids[user_id] = len(user_ids)

new_user_id = max(train_dataset.user_ids.keys()) + 1
add_new_user(new_user_id, train_dataset.user_ids)
print(f'New user added with ID {new_user_id}')

New user added with ID 1407399


In [51]:
def add_new_item(item_id, item_ids, item_id_to_idx, idx_to_item_id, cosine_sim):
    if item_id not in item_ids:
        new_idx = len(item_ids)
        item_ids[item_id] = new_idx
        item_id_to_idx[item_id] = new_idx
        idx_to_item_id[new_idx] = item_id
        
        # Expand the cosine_sim matrix to accommodate the new item
        new_row = np.zeros((1, cosine_sim.shape[1]))
        new_col = np.zeros((cosine_sim.shape[0] + 1, 1))
        cosine_sim = np.vstack((cosine_sim, new_row))
        cosine_sim = np.hstack((cosine_sim, new_col))

        # Optionally update the cosine_sim matrix with new similarities

    return cosine_sim

new_item_id = max(train_dataset.item_ids.keys()) + 1
cosine_sim = add_new_item(new_item_id, train_dataset.item_ids, item_id_to_idx, idx_to_item_id, cosine_sim)
print(f'New item added with ID {new_item_id}')

New item added with ID 466862


In [52]:
def save_model(model, path='recommender_model.pth'):
    torch.save(model.state_dict(), path)

In [53]:
def save_cosine_sim(cosine_sim, path='cosine_sim.pkl'):
    with open(path, 'wb') as f:
        pickle.dump(cosine_sim, f)

In [54]:
save_model(model, 'recommender_model.pth')
save_cosine_sim(cosine_sim, 'cosine_sim.pkl')