# Getting Started

In this notebook, we will write the code for popular recommender system algorithms from scratch. We will first generate a toy dataset with user, product, and interaction data. Then we will use the dataset to train the four models discussed in the recommender system class.

1.   Content Filtering
2.   Collaborative Filtering
3.   Matrix Factorization
4.   Two-Tower Model



## Load Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

## Generate User and Product Data

In [2]:
# Product catalogue: one entry per product, with its comma-separated category tags
product_data = dict(
    product_id=[1, 2, 3, 4, 5],
    product=[
        'Kindle',
        'Macbook',
        'Brown Leather Shoes',
        'Samsung 75-Inch QLED TV',
        'LG 60-Inch QLED TV',
    ],
    genres=[
        'Electronics, Tablet',
        'Electronics, Computer, Apple',
        'Clothing',
        'Electronics, Television, QLED',
        'Electronics, Television, QLED',
    ],
)

# Purchase log: position i of both lists forms one (user_id, product_id) interaction
user_data = dict(
    user_id=[1, 1, 1, 2, 2, 3, 3, 4, 4, 5],
    product_id=[1, 2, 3, 2, 4, 1, 2, 3, 4, 5],
)

# Tabular views of the two dictionaries above
df_product, df_user = pd.DataFrame(product_data), pd.DataFrame(user_data)

# Models

## Content Filtering

In [5]:
df_product

Unnamed: 0,product_id,product,genres
0,1,Kindle,"Electronics, Tablet"
1,2,Macbook,"Electronics, Computer, Apple"
2,3,Brown Leather Shoes,Clothing
3,4,Samsung 75-Inch QLED TV,"Electronics, Television, QLED"
4,5,LG 60-Inch QLED TV,"Electronics, Television, QLED"


In [3]:
# Bag-of-words encoding of each product's tag string: one row per product
# (in df_product row order), one column per distinct token.
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_product.loc[:, 'genres'])

In [4]:
# Dense, labelled view of the token counts so the product features can be inspected
product_features = pd.DataFrame(
    count_matrix.toarray(),
    columns=count.get_feature_names_out(),
)
product_features

Unnamed: 0,apple,clothing,computer,electronics,qled,tablet,television
0,0,0,0,1,0,1,0
1,1,0,1,1,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,1,1,0,1
4,0,0,0,1,1,0,1


In [None]:
# Products bought by user 1. `user_id` holds integers, so the comparison must use
# the integer 1: comparing against the string '1' never matches and always
# produces an empty list.
user_products = df_user[df_user.user_id == 1]['product_id'].tolist()
# Product IDs start at 1 while rows of the feature matrix start at 0
product_idx = np.array(user_products) - 1

In [None]:
# Create a user profile by applying the mean aggregation on the features of the products
# that the user has purchased before.
def create_user_profile(user_id):
    """Return the mean feature vector of the products `user_id` has purchased.

    Parameters
    ----------
    user_id : value compared against ``df_user['user_id']``.

    Returns
    -------
    numpy.ndarray of shape ``(1, n_features)``: the column-wise mean of the
    ``count_matrix`` rows belonging to the user's purchased products.

    Raises
    ------
    ValueError
        If the user has no purchases in ``df_user``, or a purchased product ID
        is missing from ``df_product``.
    """
    user_products = df_user.loc[df_user['user_id'] == user_id, 'product_id'].tolist()
    if not user_products:
        raise ValueError(f"user_id {user_id!r} has no purchases to build a profile from")

    # count_matrix was fitted on df_product row by row, so translate each product ID
    # into its row position in df_product rather than assuming IDs are exactly 1..N.
    products_index = pd.Index(df_product['product_id']).get_indexer(user_products)
    if (products_index < 0).any():
        raise ValueError(f"user_id {user_id!r} purchased a product_id that is not in df_product")

    # Aggregate to derive the user characteristics
    user_profile = count_matrix[products_index].mean(axis=0)
    # The sparse mean may come back as a matrix type; normalise to a plain 2-D array
    return np.asarray(user_profile).reshape(1, -1)

# Function that takes in user_id as input and outputs most similar products
def recommend_products(user_id, top_n=5):
    """Rank catalogue products by cosine similarity to the user's profile.

    Parameters
    ----------
    user_id : passed to ``create_user_profile``.
    top_n : int, default 5
        Maximum number of products to return (the previous hard-coded value).

    Returns
    -------
    pandas.Series of product names taken from ``df_product['product']``, most
    similar first. Products the user already purchased are not filtered out.

    Note: a later cell defines another function with this same name.
    """
    user_profile = create_user_profile(user_id)
    # One similarity score per row of count_matrix, i.e. per product
    scores = cosine_similarity(user_profile, count_matrix)[0]
    # A stable sort on the negated scores gives descending order and keeps tied
    # products in catalogue order.
    ranked = np.argsort(-scores, kind='stable')[:max(top_n, 0)]
    return df_product['product'].iloc[ranked]

# Recommend products for one example user and print the ranked product names
userID = 1
print(recommend_products(userID))

1                    Macbook
0                     Kindle
3    Samsung 75-Inch QLED TV
4         LG 60-Inch QLED TV
2        Brown Leather Shoes
Name: product, dtype: object


## Collaborative Filtering

In [None]:
# Binary user x product matrix: rows are users, columns are products,
# 1 where the user bought the product and 0 everywhere else.
user_product_matrix = pd.pivot_table(
    df_user,
    index='user_id',
    columns='product_id',
    aggfunc=lambda x: 1,
    fill_value=0,
)

# Cosine similarity between every pair of user rows
user_similarities = cosine_similarity(user_product_matrix)

# Label both axes with user_id so similarities can be looked up by user
user_similarities_df = pd.DataFrame(
    user_similarities,
    index=user_product_matrix.index,
    columns=user_product_matrix.index,
)

def recommend_products(user_id, num_recommendations):
    """Recommend products bought by the users most similar to `user_id`.

    Other users are visited from most to least similar (per
    ``user_similarities_df``). Each contributes the products it bought that the
    target user has not, and collection stops once enough products are found.

    Parameters
    ----------
    user_id : label present in ``user_similarities_df`` / ``user_product_matrix``.
    num_recommendations : int
        Maximum number of products to return.

    Returns
    -------
    pandas.Series of product names with a fresh 0..k-1 index, ordered by the
    rank of the neighbour that contributed each product. Fewer than
    `num_recommendations` names are returned when the neighbours do not have
    enough unseen products.

    Note: this redefines the content-based function of the same name from an
    earlier cell.
    """
    # Remove the target user by label rather than by position, then rank the rest.
    # The stable sort keeps the ordering of tied users deterministic.
    neighbour_scores = user_similarities_df[user_id].drop(labels=user_id)
    similar_users = neighbour_scores.sort_values(ascending=False, kind='stable').index

    # Products the target user already owns are never recommended
    own_row = user_product_matrix.loc[user_id]
    user_products = set(own_row[own_row > 0].index)

    # Only products that exist in the catalogue can be turned into names
    known_products = set(df_product['product_id'])

    # A list (not a set) preserves rank order, so truncation always drops
    # the contributions of the least similar users first.
    recommended_product_id = []
    for user in similar_users:
        if len(recommended_product_id) >= num_recommendations:
            break
        neighbour_row = user_product_matrix.loc[user]
        for product_id in neighbour_row[neighbour_row > 0].index:
            if (product_id in known_products
                    and product_id not in user_products
                    and product_id not in recommended_product_id):
                recommended_product_id.append(product_id)

    # The last neighbour visited may overshoot the requested count
    recommended_product_id = recommended_product_id[:num_recommendations]

    # Retrieve the product names in ranked order
    names_by_id = df_product.set_index('product_id')['product']
    return names_by_id.reindex(recommended_product_id).reset_index(drop=True)

# Ask for up to 5 recommendations for user 1; fewer are returned when the
# similar users have fewer products that user 1 has not already bought.
print(recommend_products(1, 5))

0    Samsung 75-Inch QLED TV
1         LG 60-Inch QLED TV
Name: product, dtype: object


## Matrix Factorization

In [None]:
user_product_matrix

product_id,1,2,3,4,5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,1,1,0,0
2,0,1,0,1,0
3,1,1,0,0,0
4,0,0,1,1,0
5,0,0,0,0,1


In [None]:
from scipy.sparse.linalg import svds

# Raw user x product interaction values as a NumPy array
matrix = user_product_matrix.values

# Centre every user row on its own mean before factorising
user_product_mean = matrix.mean(axis=1)
matrix_demeaned = matrix - user_product_mean[:, np.newaxis]

# Truncated SVD keeping k = 2 singular values / latent factors
U, sigma, Vt = svds(matrix_demeaned, k=2)

# svds returns the singular values as a 1-D array; expand them to a diagonal matrix
sigma = np.diag(sigma)

# Rebuild the low-rank approximation and add the per-user means back
all_user_predicted_purchases = U @ sigma @ Vt + user_product_mean[:, np.newaxis]
np.round(all_user_predicted_purchases, 3)

array([[ 1.026,  1.13 ,  0.514,  0.267,  0.062],
       [ 0.26 ,  0.568,  0.569,  0.773, -0.171],
       [ 0.959,  0.965,  0.233, -0.136, -0.02 ],
       [-0.109,  0.206,  0.684,  1.136,  0.083],
       [ 0.15 , -0.171,  0.083,  0.   ,  0.938]])

In [None]:
user_id = 1
def get_recommendations(id):
  """Return names of products user `id` has not bought, best predicted score first.

  `id` is converted to a row position with `id - 1`. Products count as unseen
  where the user's row of `matrix` equals 0, and they are ordered by the matching
  entries of `all_user_predicted_purchases`, descending.
  """
  row = id - 1
  not_purchased = (matrix[row] == 0)
  candidate_names = df_product.loc[not_purchased, 'product'].to_numpy()
  candidate_scores = all_user_predicted_purchases[row][not_purchased]
  # argsort is ascending; reverse it to put the highest score first
  best_first = np.argsort(candidate_scores)[::-1]
  return candidate_names[best_first]

get_recommendations(1)

array(['Samsung 75-Inch QLED TV', 'LG 60-Inch QLED TV'], dtype=object)

## Two-Tower Model

In [None]:
# Interaction log re-declared for the two-tower section (same values as at the top
# of the notebook): position i of both lists is one (user_id, product_id) purchase.
user_data = dict(
    user_id=[1, 1, 1, 2, 2, 3, 3, 4, 4, 5],
    product_id=[1, 2, 3, 2, 4, 1, 2, 3, 4, 5],
)

In [None]:
# First two-tower sketch: each tower is an embedding lookup for one integer ID, and
# the two embeddings are combined with a dot product.
# NOTE(review): a later cell rebuilds `model` with smaller hyperparameters, so this
# version appears to be illustrative only — confirm before relying on it.
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Input, Flatten, Dense, Dot

# Define hyperparameters
EMBEDDING_SIZE = 50  # length of each user / item embedding vector
NUM_USERS = 1000     # number of rows in the user embedding table
NUM_ITEMS = 1000     # number of rows in the item embedding table

# User Tower: a single integer ID in, a flat EMBEDDING_SIZE vector out
user_input = Input(shape=(1,), name='UserInput')
user_embedding = Embedding(NUM_USERS, EMBEDDING_SIZE, name='UserEmbedding')(user_input)
user_vector = Flatten(name='FlattenUsers')(user_embedding)

# Item Tower: same structure as the user tower, with its own embedding table
item_input = Input(shape=(1,), name='ItemInput')
item_embedding = Embedding(NUM_ITEMS, EMBEDDING_SIZE, name='ItemEmbedding')(item_input)
item_vector = Flatten(name='FlattenItems')(item_embedding)

# Dot product of the two tower outputs: one affinity score per (user, item) pair
dot_product = Dot(axes=1, name='DotProduct')([user_vector, item_vector])

# Output Layer: a one-unit sigmoid layer turns the score into a value in (0, 1)
output = Dense(1, activation='sigmoid', name='Output')(dot_product)

# Create model taking the user ID and the item ID as its two inputs
model = Model(inputs=[user_input, item_input], outputs=output)

# Compile the model for binary classification (interaction vs. no interaction)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 UserInput (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 ItemInput (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 UserEmbedding (Embedding)      (None, 1, 50)        50000       ['UserInput[0][0]']              
                                                                                                  
 ItemEmbedding (Embedding)      (None, 1, 50)        50000       ['ItemInput[0][0]']              
                                                                                              

In [None]:
# Working copy of the interaction dict; the sampling step below is applied to it
negative_sample = user_data.copy()

# Number of random (user, product) pairs drawn by negative_sampling
SAMPLE_SIZE = 10
# Negative sampling
def negative_sampling(sample, sample_size=None):
  """Return a copy of `sample` extended with randomly paired users and products.

  Users and products are drawn independently (with replacement) from the IDs
  already present in `sample`, so a drawn pair may happen to be a real
  interaction; labelling is left to the caller.

  Parameters
  ----------
  sample : dict with 'user_id' and 'product_id' sequences of equal length.
  sample_size : int, optional
      Number of random pairs to append. Defaults to the module-level SAMPLE_SIZE.

  Returns
  -------
  A new dict whose 'user_id' and 'product_id' entries are NumPy arrays holding
  the original values followed by the sampled ones. `sample` itself is left
  unmodified, so passing the original interaction dict directly is safe.
  """
  if sample_size is None:
    sample_size = SAMPLE_SIZE

  # Keep the draw order (users first, then products) so seeded runs are unchanged
  negative_user_ids = np.random.choice(sample['user_id'], sample_size)
  negative_item_ids = np.random.choice(sample['product_id'], sample_size)

  augmented = dict(sample)
  augmented['user_id'] = np.concatenate([sample['user_id'], negative_user_ids])
  augmented['product_id'] = np.concatenate([sample['product_id'], negative_item_ids])
  return augmented

# Generate labels - 1 if the pair exists in the original data. 0 otherwise.
def generate_labels(sample):
  """Label each (user_id, product_id) pair of `sample` as observed (1) or not (0).

  Parameters
  ----------
  sample : dict with 'user_id' and 'product_id' sequences of equal length.

  Returns
  -------
  numpy.ndarray with one entry per pair of `sample`, in order: 1 when the pair
  occurs in the module-level `user_data`, otherwise 0.
  """
  # A set gives constant-time membership checks for the observed pairs
  original_pairs = set(zip(user_data['user_id'], user_data['product_id']))
  # Read the pairs from the argument, not from a global, so the labels always
  # line up with the data that was actually passed in.
  candidate_pairs = zip(sample['user_id'], sample['product_id'])
  return np.array([1 if pair in original_pairs else 0 for pair in candidate_pairs])

# Extend the interactions with random pairs, then label every pair
# (1 = pair occurs in user_data, 0 = it does not)
negative_sample = negative_sampling(negative_sample)
labels = generate_labels(negative_sample)

In [None]:
# Two-tower model sized for the toy dataset. Each tower is an embedding lookup for
# one integer ID; the towers are combined with a dot product and squashed by a
# one-unit sigmoid layer.
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Input, Flatten, Dense

# Define hyperparameters
EMBEDDING_SIZE = 4
# IDs are used directly as embedding row numbers and start at 1, so each table
# needs (highest ID + 1) rows.
NUM_USERS = 6  # highest user_id + 1
NUM_ITEMS = 6  # highest product_id + 1

# User Tower: a single integer ID in, a flat EMBEDDING_SIZE vector out
user_input = Input(shape=(1,), name='UserInput')
user_embedding = Embedding(NUM_USERS, EMBEDDING_SIZE, name='UserEmbedding')(user_input)
user_vector = Flatten(name='FlattenUsers')(user_embedding)

# Item Tower: same structure as the user tower, with its own embedding table
item_input = Input(shape=(1,), name='ItemInput')
item_embedding = Embedding(NUM_ITEMS, EMBEDDING_SIZE, name='ItemEmbedding')(item_input)
item_vector = Flatten(name='FlattenItems')(item_embedding)

# Dot product of the two tower outputs: one affinity score per (user, item) pair
interaction = tf.keras.layers.Dot(axes=1)([user_vector, item_vector])

# Output Layer: a one-unit sigmoid layer turns the score into a value in (0, 1)
output = Dense(1, activation='sigmoid', name='Output')(interaction)

# Create model taking the user ID and the item ID as its two inputs
model = Model(inputs=[user_input, item_input], outputs=output)

# Compile the model for binary classification (interaction vs. no interaction)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Get the model summary (a stray trailing character here used to make the
# whole cell a syntax error)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 UserInput (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 ItemInput (InputLayer)         [(None, 1)]          0           []                               
                                                                                                  
 UserEmbedding (Embedding)      (None, 1, 4)         24          ['UserInput[0][0]']              
                                                                                                  
 ItemEmbedding (Embedding)      (None, 1, 4)         24          ['ItemInput[0][0]']              
                                                                                            

In [None]:
# Fit on the labelled (user, product) pairs for 10 epochs
model.fit(
    x=[negative_sample['user_id'], negative_sample['product_id']],
    y=labels,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0cf8fb6b00>

In [None]:
# IDs to score. The two arrays are paired position by position,
# so this scores (1, 1), (2, 2), ... (5, 5).
user_ids = np.arange(1, 6)
item_ids = np.arange(1, 6)

# One model output per (user, item) pair
predictions = model.predict([user_ids, item_ids])

# Show the raw model outputs
print(predictions)

[[0.5025484 ]
 [0.5028143 ]
 [0.5022978 ]
 [0.50278115]
 [0.50317115]]


In [None]:
def get_embedding_model(original_model, embedding_layer_name):
    """Build a sub-model that outputs the named embedding layer of `original_model`.

    Returns a `(sub_model, input_index)` tuple. `input_index` is 0 when the layer
    name contains 'User' and 1 otherwise; it says which of the model's two inputs
    feeds the requested embedding.
    """
    # The sub-model keeps the original inputs and stops at the embedding layer
    model_input = original_model.input
    embedding_output = original_model.get_layer(embedding_layer_name).output
    sub_model = Model(inputs=model_input, outputs=embedding_output)

    if 'User' in embedding_layer_name:
        return sub_model, 0
    return sub_model, 1

def get_embeddings(embedding_model, input_index, ids):
    """Run `ids` through `embedding_model` and return one embedding row per ID.

    The model takes two inputs but only one matters here, so the other is filled
    with zeros of the same shape. `input_index == 0` puts `ids` in the first
    input; any other value puts it in the second.
    """
    placeholder = np.zeros_like(ids)
    if input_index == 0:
        model_inputs = [ids, placeholder]
    else:
        model_inputs = [placeholder, ids]

    # Drop the length-1 axis that the Embedding layer adds to its output
    raw_output = embedding_model.predict(model_inputs)
    return np.squeeze(raw_output, axis=1)

# One extraction sub-model per tower, plus the input slot each one reads from
user_embedding_model, user_input_index = get_embedding_model(model, 'UserEmbedding')
item_embedding_model, item_input_index = get_embedding_model(model, 'ItemEmbedding')

# Embeddings for users 1..5
user_ids = np.arange(1, 6)
user_embeddings = get_embeddings(user_embedding_model, user_input_index, user_ids)
print("User Embeddings:", user_embeddings, sep="\n")

# Embeddings for items 1..5
item_ids = np.arange(1, 6)
item_embeddings = get_embeddings(item_embedding_model, item_input_index, item_ids)
print("Item Embeddings:", item_embeddings, sep="\n")

User Embeddings:
[[ 0.03057821 -0.00083538 -0.03813531  0.02806881]
 [-0.03876324 -0.01676876  0.01186867  0.0328059 ]
 [-0.01270732  0.05802165 -0.01408031 -0.02351359]
 [ 0.00845449  0.0068738   0.02862895  0.02341361]
 [ 0.02500677  0.02297346  0.04870379  0.0599381 ]]
Item Embeddings:
[[ 0.00794118  0.02097261  0.00263787 -0.0164912 ]
 [ 0.05480177 -0.03529551 -0.00170366 -0.01798011]
 [-0.03258595  0.03495893  0.01534543  0.03675174]
 [-0.02847774  0.04735751 -0.02875307 -0.05036929]
 [-0.0170122  -0.02932907 -0.02240189 -0.03961917]]


In [None]:
userID = 1
def nearest_neighbor_recommendation(id):
  """Rank all products for user `id` by embedding dot product, highest first.

  Row `id - 1` of `user_embeddings` is used as the query vector. The result is
  the 'product' column of `df_product` reordered by descending score.
  """
  query = user_embeddings[id - 1]
  # Row-wise dot product between the query and every item embedding
  scores = np.sum(item_embeddings * query, axis=1)
  ranking = np.argsort(scores)[::-1]
  return df_product['product'].iloc[ranking]

nearest_neighbor_recommendation(1)

1                    Macbook
0                     Kindle
2        Brown Leather Shoes
4         LG 60-Inch QLED TV
3    Samsung 75-Inch QLED TV
Name: product, dtype: object