In [22]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
import json
# Load a pre-trained Sentence Transformer model
# (the shared `sent_model` instance is what batch_encode calls to embed text).
sent_model = SentenceTransformer('all-MiniLM-L6-v2')
# Read the preprocessed ratings table. index_col=[0] takes the row labels from
# the CSV's first column, so the index is whatever the file holds and is not
# guaranteed to be a clean 0..N-1 range.
df = pd.read_csv('../data/preprocessed.csv',index_col=[0])

Content Based Model
*   First, I generate an embedding vector for the concatenation of itemName, brand and category.
*   The embeddings are computed with the pretrained all-MiniLM-L6-v2 sentence transformer.
*   From the embedded vectors I build a profile vector for each user (the rating-weighted sum of the embeddings of the items they rated, L2-normalized) and for each item (its embedding scaled by its average rating, L2-normalized).
*   Then, I calculate the cosine similarity matrix between the user profiles and the item profiles (a U × I matrix).
*   Note: after generating the embeddings I could have applied PCA to reduce redundancy, but I did not have the time.

In [23]:
# Function to concatenate itemName, brand and category into one text
def concatenate_textual_features(row):
    """Build the text that gets embedded for one row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'itemName', 'brand' and 'category'.

    Returns
    -------
    str
        "itemName : <itemName> brand : <brand> category : <category>".
    """
    # Every field goes through str() on its own, so a missing (NaN) or numeric
    # itemName is rendered as text instead of raising TypeError when it is
    # concatenated to the "itemName : " prefix.
    return (
        "itemName : " + str(row['itemName'])
        + " brand : " + str(row['brand'])
        + " category : " + str(row['category'])
    )

# Encode a batch of texts with the shared sentence-transformer model
def batch_encode(texts, batch_size=32):
    """Return whatever `sent_model.encode` produces for `texts`.

    texts      -- the texts to embed (the notebook passes a list of strings).
    batch_size -- forwarded to the model as its batch size (default 32).

    The model's progress bar is always enabled.
    """
    encode_options = {'batch_size': batch_size, 'show_progress_bar': True}
    return sent_model.encode(texts, **encode_options)

# Build the profile vector of a single user
def user_profile_calculate(user_input_rating,user_item_matrix):
    """Rating-weighted, L2-normalised sum of one user's item vectors.

    user_input_rating -- pandas Series holding the user's ratings, one per row
                         of `user_item_matrix` (only its `.values` are used,
                         so no index alignment takes place).
    user_item_matrix  -- pandas DataFrame with one item vector per row.

    Returns a one-row DataFrame containing the normalised profile; when the
    weighted sum has zero length it is returned without rescaling.
    """
    # Shape the ratings as a column so each one scales its whole item row.
    rating_column = user_input_rating.values.reshape(-1, 1)
    weighted_items = rating_column * user_item_matrix

    # Collapse the rows: one total per vector component.
    profile_sum = weighted_items.sum(axis=0)

    length = np.linalg.norm(profile_sum)
    # A zero-length vector cannot be rescaled, so it is kept unchanged.
    if length != 0:
        profile_sum = profile_sum / length
    return pd.DataFrame(profile_sum.values.reshape(1, -1))

# Build the profile vector of a single item
def item_profile_calculate(item_avg_rating,item_vector):
    """Scale an item vector by its average rating and L2-normalise it.

    item_avg_rating -- scalar average rating of the item.
    item_vector     -- pandas Series holding the item's vector.

    Returns a one-row DataFrame with the normalised vector; a zero-length
    vector is returned without rescaling.

    NOTE(review): for any positive average rating the scaling cancels out in
    the normalisation, so (up to floating-point rounding) the result equals
    the normalised `item_vector` alone.
    """
    scaled_vector = item_vector * item_avg_rating
    magnitude = np.linalg.norm(scaled_vector)
    # Only rescale when there is a non-zero length to divide by.
    item_profile = scaled_vector if magnitude == 0 else scaled_vector / magnitude
    return pd.DataFrame(item_profile.values.reshape(1, -1))

#Calculate the user profile for each user#
def generate_users_profiles(embeddings_df):
    """Build one normalised profile row per distinct user.

    Parameters
    ----------
    embeddings_df : pandas.DataFrame
        One row per rating with the columns 'user_id', 'item_id' and 'rating';
        every remaining column is treated as a component of the item vector.

    Returns
    -------
    pandas.DataFrame
        One row per user, in order of first appearance in `embeddings_df`:
        the profile columns returned by user_profile_calculate followed by a
        trailing 'user_id' column.

    Notes
    -----
    Rows whose 'user_id' is missing (NaN) are skipped, because groupby drops
    NaN keys by default.
    """
    user_ids = []
    profile_rows = []
    # One pass over the frame: groupby hands over each user's rows directly
    # instead of re-filtering the whole frame twice per user. sort=False keeps
    # the first-appearance order that .unique() would give.
    for user_id, user_rows in embeddings_df.groupby('user_id', sort=False):
        user_input_rating = user_rows['rating']
        user_item_matrix = user_rows.drop(columns=['user_id','item_id','rating'])
        profile_rows.append(user_profile_calculate(user_input_rating,user_item_matrix))
        user_ids.append(user_id)

    # Concatenate once at the end; growing a DataFrame inside the loop copies
    # all accumulated rows on every iteration (quadratic cost).
    if profile_rows:
        users_profiles = pd.concat(profile_rows, ignore_index=True)
    else:
        users_profiles = pd.DataFrame()
    users_profiles['user_id'] = user_ids
    return users_profiles

#Calculate the item profile for each item#
def generate_items_profiles(embeddings_df):
    """Build one normalised profile row per distinct item.

    Parameters
    ----------
    embeddings_df : pandas.DataFrame
        One row per rating with the columns 'user_id', 'item_id' and 'rating';
        every remaining column is treated as a component of the item vector.

    Returns
    -------
    pandas.DataFrame
        One row per item, in order of first appearance in `embeddings_df`:
        the profile columns returned by item_profile_calculate followed by a
        trailing 'item_id' column.

    Notes
    -----
    Rows whose 'item_id' is missing (NaN) are skipped, because groupby drops
    NaN keys by default.
    """
    item_ids = []
    profile_rows = []
    # One pass over the frame: groupby hands over each item's rows directly
    # instead of re-filtering the whole frame twice per item. sort=False keeps
    # the first-appearance order that .unique() would give.
    for item_id, item_rows in embeddings_df.groupby('item_id', sort=False):
        item_avg_rating = item_rows['rating'].mean()
        # Only the first row's vector is used for the item.
        item_vector = item_rows.drop(columns=['user_id','item_id','rating']).iloc[0]
        profile_rows.append(item_profile_calculate(item_avg_rating,item_vector))
        item_ids.append(item_id)

    # Concatenate once at the end; growing a DataFrame inside the loop copies
    # all accumulated rows on every iteration (quadratic cost).
    if profile_rows:
        item_profiles = pd.concat(profile_rows, ignore_index=True)
    else:
        item_profiles = pd.DataFrame()
    item_profiles['item_id'] = item_ids
    return item_profiles

The whole process described above is carried out in the following code.

In [24]:
# Concatenate the features
df['itemData'] = df.apply(concatenate_textual_features, axis=1)

# Encode every row's text; the model receives batch_size texts at a time
batch_size = 100  
embeddings = batch_encode(df['itemData'].tolist(), batch_size=batch_size)
# pd.concat(axis=1) aligns on row labels. df's index comes from the CSV, so the
# embeddings frame is given the same index explicitly; with its default
# 0..N-1 index, any gap or reordering in df.index would pair ratings with the
# wrong vectors and introduce NaN rows.
embeddings_df = pd.concat(
    [df[['user_id','item_id','rating']], pd.DataFrame(embeddings.tolist(), index=df.index)],
    axis=1,
)

#Generate users and items profiles
users_profiles = generate_users_profiles(embeddings_df)
items_profiles = generate_items_profiles(embeddings_df)

# Extract the features (the last column holds user_id / item_id and is excluded)
user_features = users_profiles.iloc[:, :-1].values
item_features = items_profiles.iloc[:, :-1].values

# Calculate cosine similarity between user vectors and item vectors
# (one row per user, one column per item)
similarity_matrix = cosine_similarity(user_features, item_features)

Batches: 100%|██████████| 1111/1111 [09:53<00:00,  1.87it/s]


Loading the user and item ID mappings

In [38]:
import json
# JSON text is UTF-8 by specification; naming the encoding keeps non-ASCII
# names readable regardless of the platform's default locale encoding.
# ids_items_dict is looked up by str(item_id) to obtain item names.
with open('../outputs/IDs/ids_items_dict.json', 'r', encoding='utf-8') as file:
    ids_items_dict = json.load(file)
# usernames_ids_dict is looked up by user name to obtain the user id.
with open('../outputs/IDs/usernames_ids_dict.json', 'r', encoding='utf-8') as file:
    usernames_ids_dict = json.load(file)

The following functions are used to recommend the top_k items for a specific user, based on content, with a given similarity metric.

In [80]:
#Return user_profile for given user_id
def get_user_profile_by_user_id(users_profiles,user_id):
    """Look up the profile vector stored for `user_id`.

    users_profiles -- DataFrame of profile columns plus a 'user_id' column.
    user_id        -- identifier to search for in the 'user_id' column.

    Returns the matching row(s) without the 'user_id' column. When no row
    matches, returns a one-row DataFrame holding the column-wise mean of all
    stored profiles instead.
    """
    matching_rows = users_profiles[users_profiles['user_id'] == user_id]
    user_vector = matching_rows.drop(columns=['user_id'])
    if len(user_vector) > 0:
        return user_vector
    # Unknown user: fall back to the average of every stored profile.
    mean_row = users_profiles.mean().to_frame().T
    return mean_row.drop(columns=['user_id'])

#Return item names for the given item ids
def get_items_names(ids_items_dict, items_list):
    """Translate item ids into a one-column DataFrame of item names.

    ids_items_dict -- mapping keyed by the string form of each item id.
    items_list     -- iterable of item ids; output rows keep this order.

    Raises KeyError when an id has no entry in `ids_items_dict`.
    """
    names = []
    for item_id in items_list:
        # Keys are strings, so the id is converted before the lookup.
        names.append(ids_items_dict[str(item_id)])
    return pd.DataFrame(names, columns=['itemName'])

# Top items to recommend for specific user
def get_top_k_items_for_specific_user(top_k,user_profile,items_profiles,dis_metric):
    """Rank all items against one user profile and return the best `top_k`.

    Parameters
    ----------
    top_k : int
        Number of items to return.
    user_profile : array-like with a single row
        The user's profile vector; only the first row of the score matrix is
        used.
    items_profiles : pandas.DataFrame
        Item profile columns followed by a trailing 'item_id' column (the last
        column is excluded from the vectors).
    dis_metric : str
        'Cosine' (higher score is better) or 'Euclidean' (lower is better).

    Returns
    -------
    tuple
        (item ids, metric scores, estimated ratings), best match first.

    Raises
    ------
    ValueError
        If `dis_metric` is neither 'Cosine' nor 'Euclidean'.
    """
    # Reject unknown metrics up front; previously they fell through both
    # branches and failed later on an unassigned local variable.
    if dis_metric not in ('Cosine', 'Euclidean'):
        raise ValueError(
            "dis_metric must be 'Cosine' or 'Euclidean', got %r" % (dis_metric,)
        )

    item_vectors = items_profiles.iloc[:, :-1].values
    if dis_metric == 'Cosine':
        items_scores = cosine_similarity(user_profile, item_vectors)[0]
        # Looking for the higher cosine similarity: descending order
        ranking = np.argsort(items_scores)[::-1]
        to_rating = cosine_similarity_to_rating
    else:
        items_scores = euclidean_distances(user_profile, item_vectors)[0]
        # Looking for the lower euclidean distance: ascending order
        ranking = np.argsort(items_scores)
        to_rating = euclidean_to_rating

    # A single argsort supplies both the positions and, through indexing, the
    # matching scores, so the two can never drift apart.
    top_items_indices = ranking[:top_k]
    top_items_scores = items_scores[top_items_indices]
    ratings = [to_rating(score) for score in top_items_scores]

    top_items_ids = items_profiles.iloc[top_items_indices]['item_id'].values
    return top_items_ids,top_items_scores,ratings

# Map a cosine similarity score onto the rating scale
def cosine_similarity_to_rating(cosine_similarity): 
    """Linearly rescale a cosine similarity to a rating.

    A score of -1 maps to 1, 0 maps to 3 and +1 maps to 5.
    """
    # Shift and halve: [-1, 1] becomes [0, 1].
    unit_interval = (cosine_similarity + 1) / 2
    # Stretch to a span of 4 and start the scale at 1.
    return 1 + (unit_interval * 4)

# Map a euclidean distance onto the rating scale
def euclidean_to_rating(euclidean_dist):
    """Turn a distance into a rating that shrinks as the distance grows.

    A distance of 0 maps to 5; larger distances approach 1.
    """
    # 1 / (d + 1) equals 1 at distance 0 and decays towards 0.
    closeness = 1 / (euclidean_dist + 1)
    # Stretch to a span of 4 and start the scale at 1.
    return 1 + (closeness * 4)

#Recommend k items for a given user#
def model_recommend(user_name,k,dis_metric):
    """Recommend `k` items for `user_name` with the content-based profiles.

    Parameters
    ----------
    user_name : str
        Name looked up in the global `usernames_ids_dict`.
    k : int
        Number of items to recommend.
    dis_metric : str
        Metric name forwarded to get_top_k_items_for_specific_user
        ('Cosine' or 'Euclidean').

    Returns
    -------
    pandas.DataFrame
        Columns 'itemName', 'Metric_Score' and 'Est_Rating', best match first.

    Reads the globals `usernames_ids_dict`, `users_profiles`, `items_profiles`
    and `ids_items_dict`.
    """
    # No default id: an unknown name yields None, which matches no stored row,
    # so get_user_profile_by_user_id falls back to the mean profile. The old
    # default of 0 would silently reuse the profile of whichever user holds
    # id 0, if such a user exists.
    user_id = usernames_ids_dict.get(user_name)
    user_profile = get_user_profile_by_user_id(users_profiles,user_id)
    top_k_items_ids,top_items_scores,ratings = get_top_k_items_for_specific_user(k,user_profile,items_profiles,dis_metric)
    recommendation_df = get_items_names(ids_items_dict,top_k_items_ids)
    recommendation_df['Metric_Score'] = top_items_scores
    recommendation_df['Est_Rating'] = ratings
    return recommendation_df

As with the user that was tested with SVD, I will check which items are recommended to 'kristina' by the content-based (CB) model using cosine similarity.

In [81]:
# Reload the preprocessed table from disk
df = pd.read_csv('../data/preprocessed.csv', index_col=[0])
userName = 'kristina'
# Show the rows that belong to this userName
df.loc[df['userName'] == userName]

Unnamed: 0,userName,itemName,brand,category,price,rating,vote,user_id,item_id
6875,kristina,Pet ID Tags 8 Lines Engraving Available Size S...,Providence Engraving,Pet Supplies,$2.99,5.0,0,15735,6297
7322,kristina,Chuckit Max Glow Ball,Chuckit,Pet Supplies,$5.95,5.0,0,15735,1785
54520,kristina,Blue Buffalo Wilderness High Protein Grain Fre...,Blue Buffalo,Pet Supplies,$33.99,5.0,0,15735,1139


According to the recommendations below:
*   Most of the items are related to Pet Supplies.
*   The embedding vectors seem to work well at retrieving items similar to the user's general profile.
*   This method is more conservative: items that are not related to the user's list will not be recommended.


In [82]:
# Top 10 content-based recommendations for the selected user, ranked by cosine similarity
model_recommend(user_name=userName, k=10, dis_metric='Cosine')

Unnamed: 0,itemName,Metric_Score,Est_Rating
0,Chuckit Max Glow Ball,0.834548,4.669096
1,Blue Kitty YUMS Cat Treats,0.82094,4.64188
2,Blue Buffalo Wilderness Cat Treats,0.816406,4.632812
3,Pet ID Tags 8 Lines Engraving Available Size S...,0.814725,4.629449
4,Blueberry Pet 7 Patterns Statement Collection ...,0.811267,4.622533
5,Da Bird,0.810505,4.621011
6,Blue Buffalo Wilderness High Protein Grain Fre...,0.810193,4.620385
7,Kitty Convict Cat ID Collars,0.808518,4.617036
8,Blue Buffalo Blue Wilderness Kitten Chicken Re...,0.807545,4.615089
9,Blue Buffalo Freedom Grain Free Natural Puppy ...,0.806481,4.612963


Saving users and items profiles for recommendation

In [83]:
#Save user profiles, then item profiles
for profiles, target_path in (
    (users_profiles, '../outputs/CB/users_profiles.feather'),
    (items_profiles, '../outputs/CB/items_profiles.feather'),
):
    # Column labels are converted to strings before writing to feather
    profiles.columns = profiles.columns.map(str)
    profiles.to_feather(target_path)
#Save cosin similarity matrix
#np.save('../outputs/CB/similarity_matrix.npy',similarity_matrix)
