In [516]:
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from evaluation_metrics import precision_recall_at_k
from collections import namedtuple
import json

In [None]:
# Load a pre-trained Sentence Transformer model
sent_model = SentenceTransformer('all-MiniLM-L6-v2')
df = pd.read_csv('../data/preprocessed.csv',index_col=[0])

Content Based Model
*   First, I will generate embedding vector for the combination on itemName,brand and category
*   Embbedding will be calculated by using all-MiniLM-L6-v2 pretrained sentence transformer
*   By using embbedded vectors I will create the profile vector for each user and item by aggregating the each one of them.
*   Then , calculate cosine similarity matrix of U*I user and items matrices
*   Note , after generating embbedding I could have done PCA for reducing redundancy , but I didnt have the time.

In [459]:
# Function to concatenate itemName and description
def concatenate_textual_features(row):
    return str("itemName : " + row['itemName']) + " brand : " + str(row['brand'])+ " category : " + str(row['category'])

# Function to encode a batch of texts
def batch_encode(texts, batch_size=32):
    return sent_model.encode(texts, batch_size=batch_size, show_progress_bar=True)

#Calculate the user profile
def user_profile_calculate(user_input_rating,user_item_matrix):
    #calculate user_weight_items
    user_weighted_items = user_input_rating.values.reshape(-1,1)*user_item_matrix
    # Sum each column (axis 0 refers to rows)
    sum_user_embedd = np.sum(user_weighted_items, axis=0).T

    # Normalize the resulting vector
    norm = np.linalg.norm(sum_user_embedd)
    if norm == 0: 
        # Handle the case where the norm is 0
        user_profile = sum_user_embedd
    else:
        user_profile = sum_user_embedd / norm
    return pd.DataFrame(user_profile.values.reshape(1,-1))

#Calculate the item profile
def item_profile_calculate(item_avg_rating,item_vector):
    item_vector = item_vector*item_avg_rating
    norm = np.linalg.norm(item_vector)
    if norm == 0: 
        # Handle the case where the norm is 0
        item_profile = item_vector
    else:
        item_profile = item_vector / norm
    return pd.DataFrame(item_profile.values.reshape(1,-1))

#Clculate the user profile for each user#
def generate_users_profiles(embeddings_df):
    unique_users = embeddings_df['user_id'].unique()
    users_profiles = pd.DataFrame()
    for user_id in unique_users:
        user_input_rating = embeddings_df[(embeddings_df.user_id==user_id)]['rating']
        user_item_matrix = embeddings_df[(embeddings_df.user_id==user_id)].drop(columns=['user_id','item_id','rating'])
        user_p = user_profile_calculate(user_input_rating,user_item_matrix)
        users_profiles = pd.concat([users_profiles, user_p], ignore_index=True)
    users_profiles['user_id'] = unique_users
    return users_profiles

#Clculate the item profile for each user#
def generate_items_profiles(embeddings_df):
    unique_items = embeddings_df['item_id'].unique()
    item_profiles = pd.DataFrame()
    for item_id in unique_items:
        item_avg_rating = embeddings_df[embeddings_df.item_id==item_id]['rating'].mean()
        item_vector = embeddings_df[embeddings_df.item_id==item_id].drop(columns=['user_id','item_id','rating']).iloc[0]
        item_profile = item_profile_calculate(item_avg_rating,item_vector)
        item_profiles = pd.concat([item_profiles, item_profile], ignore_index=True)
    item_profiles['item_id'] = unique_items
    return item_profiles

All the process that i have described above is done in the following code.

*   Generateing embedding for all items (content)

In [460]:
# Concatenate the features
df['itemData'] = df.apply(concatenate_textual_features, axis=1)
# Split the dataframe into batches and process each batch
batch_size = 100  
embeddings = batch_encode(df['itemData'].tolist(), batch_size=batch_size)
embeddings_df = pd.concat([df[['user_id','item_id','rating']], pd.DataFrame(embeddings.tolist())], axis=1)

Batches: 100%|██████████| 1512/1512 [13:43<00:00,  1.84it/s]


In [461]:
#Generate users and items profiles
users_profiles = generate_users_profiles(embeddings_df)
items_profiles = generate_items_profiles(embeddings_df)

# Extract the features (excluding the user_id and item_id)
user_features = users_profiles.iloc[:, :-1].values
item_features = items_profiles.iloc[:, :-1].values

# Calculate cosine similarity between user vectors and item vectors
similarity_matrix = cosine_similarity(user_features, item_features)

Loading user-items ids

In [462]:
with open('../outputs/IDs/ids_items_dict.json', 'r') as file:
    ids_items_dict = json.load(file)
with open('../outputs/IDs/usernames_ids_dict.json', 'r') as file:
    usernames_ids_dict = json.load(file)

The following functions will be used for recommending top_k items for specific user with a given similarity metric according to the content.

In [539]:
#Return user_profile for given user_id
def get_user_profile_by_user_id(users_profiles,user_id):    
    user_data = users_profiles[users_profiles['user_id'] == user_id]
    
    if user_data.empty:
        return users_profiles.drop(columns=['user_id']).mean().to_frame().transpose()
    else:
        return user_data.drop(columns=['user_id'])


#Return user_profile for given user_id
def get_item_profile_by_item_id(items_profiles, item_id):
    item_data = items_profiles[items_profiles['item_id'] == item_id]
    
    if item_data.empty:
        return items_profiles.drop(columns=['item_id']).mean().to_frame().transpose()
    else:
        return item_data.drop(columns=['item_id'])

#Return item_name for given item_id
def get_items_names(ids_items_dict, items_list):
    top_items_names = [ids_items_dict[str(it_id)] for it_id in items_list]
    return pd.DataFrame(top_items_names,columns=['itemName'])

# Top items to recommend for specific user
def get_top_k_items_for_specific_user(top_k,user_profile,items_profiles,user_items_ids,dis_metric):
    #Filter items that the user didnt purchased#
    unseen_items = items_profiles[~items_profiles['item_id'].isin(user_items_ids)]
    
    if dis_metric=='Cosine':
        items_scores = cosine_similarity(user_profile,unseen_items.iloc[:, :-1].values)
        #Looking for the higer cosin simillarity#
        top_items_scores = np.sort(items_scores)[0][::-1][:top_k]
        ratings = [cosine_similarity_to_rating(score) for score in top_items_scores]
        top_items_indices = np.argsort(items_scores)[0][::-1][:top_k]
    elif dis_metric=='Euclidean':
        items_scores = euclidean_distances(user_profile,unseen_items.iloc[:, :-1].values)
        #Looking for the lower euclidean#
        top_items_scores = np.sort(items_scores)[0][:top_k]
        ratings = [euclidean_to_rating(score) for score in top_items_scores]
        top_items_indices = np.argsort(items_scores)[0][:top_k]
    
    top_items_ids = unseen_items.iloc[top_items_indices]['item_id'].values
    return top_items_ids,top_items_scores,ratings

# Convert the cosine similarity score to rating
def cosine_similarity_to_rating(cosine_similarity): 
    normalized_score = (cosine_similarity + 1) / 2
    rating = 1 + (normalized_score * 4)
    return rating

# Convert euclidean score to rating
def euclidean_to_rating(euclidean_dist):
    normalized_score = 1/(euclidean_dist + 1) 
    rating = 1 + (normalized_score * 4)
    return rating

#Recommen k items for a given user#
def model_recommend(user_name,user_items_ids,k,dis_metric):
    user_id = usernames_ids_dict.get(user_name,0)
    user_profile = get_user_profile_by_user_id(users_profiles,user_id)
    top_k_items_ids,top_items_scores,ratings = get_top_k_items_for_specific_user(k,user_profile,items_profiles,user_items_ids,dis_metric)
    recommendation_df = get_items_names(ids_items_dict,top_k_items_ids)
    recommendation_df['Metric_Score'] = top_items_scores
    recommendation_df['Est_Rating'] = ratings
    return recommendation_df

#Recommen k items for a given user#
def model_recommend_items(user_name,user_items_ids,k,dis_metric,df):
    user_id = usernames_ids_dict.get(user_name,0)
    user_profile = get_user_profile_by_user_id(users_profiles,user_id)
    top_k_items_ids,top_items_scores,ratings = get_top_k_items_for_specific_user(k,user_profile,items_profiles,user_items_ids,dis_metric)
    recommendation_df = get_items_names(ids_items_dict,top_k_items_ids)
    item_neighbors = (df[df['item_id']==rid].iloc[0,1:4] for rid in top_k_items_ids)
    top_k_df = pd.DataFrame()
    for item in item_neighbors:
        top_k_df = top_k_df.append(item, ignore_index=True)
    return top_k_df

#Get rating given a user_id and item_id
def get_user_movie_rating_est(users_profiles,items_profiles,user_id,item_id,dist_metric='Cosine'):
    user_profile = get_user_profile_by_user_id(users_profiles,user_id)
    item_profile = get_item_profile_by_item_id(items_profiles,item_id)
    if(dist_metric=='Cosine'):
        rating = cosine_similarity_to_rating(user_profile@item_profile.T).values[0][0]
    else:
        rating = euclidean_to_rating(euclidean_distances(user_profile,item_profile)[0][0])
    return rating

#Return rmse for given list of user and items with ratings and the embbedding profiles#
def rmse_cb_model(user_item_rating,users_profiles,items_profiles,dist_metric='Cosine'):
    # Assuming user_item_rating is a DataFrame
    errors = user_item_rating.apply(lambda row: (row['rating'] - get_user_movie_rating_est(users_profiles, items_profiles, row['user_id'], row['item_id'], dist_metric))**2, axis=1)
    rmse = np.sqrt(errors.mean())
    return rmse

Similar to the user that was test with SVD , I will check which items are recommended to 'kristina' according to CB cosie-sim

In [546]:
df = pd.read_csv('../data/preprocessed.csv',index_col=[0])
userName='Kristie D'
user_id = usernames_ids_dict[userName] 
#Get the items of specific userName#
df[df['userName']==userName]

Unnamed: 0,userName,itemName,brand,category,price,rating,vote,user_id,item_id
128729,Kristie D,quot Guinea Habitat rdquo Guinea Pig Cage amp ...,MidWest Homes Pets,Pet Supplies,55.77,5.0,0,7642,11046
138704,Kristie D,Kaytee Clean amp Cozy Colored Small Animal Bed...,Kaytee,Pet Supplies,8.27,5.0,0,7642,5003
138810,Kristie D,Oxbow Animal Health Hamster Gerbil Fortified Food,Oxbow,Pet Supplies,20.178349,5.0,0,7642,6831


Accoding to the recommendation below
*   Most of the items are related to Pet supplie
*   Embedding vector seemed to act well in terms of getting similar items to the general profile of the user.
*   This method is more conservative , items that arent ralted to the user list will not be recommended.


Getting top K items for user by using Conten-Based cosin

In [550]:
#Content Based#
user_items_ids = df[df['user_id']==user_id]['item_id']
model_recommend_items(userName,user_items_ids,5,'Cosine',df)

Unnamed: 0,itemName,brand,category
0,Oxbow Animal Health Cavy Cuisine Adult Guinea ...,Oxbow,Pet Supplies
1,Oxbow Animal Health Orchard Grass Hay Pets,Oxbow,Pet Supplies
2,Oxbow Carnivore Care Pet Supplement,Oxbow,Pet Supplies
3,Oxbow Simple Rewards Natural Oven Baked Treats...,Oxbow,Pet Supplies
4,Oxbow Natural Science JOINT Supplement small a...,Oxbow,Pet Supplies


Saving users and items profiles for recommendation

In [519]:
#Save user profiles    
users_profiles.columns = users_profiles.columns.astype(str)
users_profiles.to_feather('../outputs/CB/users_profiles.feather')
#Save item profiles
items_profiles.columns = items_profiles.columns.astype(str)
items_profiles.to_feather('../outputs/CB/items_profiles.feather')
#Save embbeddings
embeddings_df.columns = embeddings_df.columns.astype(str)
embeddings_df.to_feather('../outputs/CB/items_user_content_embbedding.feather')


Evaluation of content-based model
*   Model RMSE is 1.14 which is better than both collaborative-filtering models (user-based and item based).
*   SVD still perform better , however that can be imporved by updated embedding with more informative features.
*   In terms of precision@K and recall@K the results are close CF(item-based) with 0.77

In [467]:
embeddings_df = pd.read_feather('../outputs/CB/items_user_content_embbedding.feather')
#splite embbedd users and items to train and test
train_emb_df, test_emb_df = train_test_split(embeddings_df, test_size=0.25, random_state=42)

#Generate users and items profiles according to the train set
users_profiles = generate_users_profiles(train_emb_df)
items_profiles = generate_items_profiles(train_emb_df)
#Crate user_item_rating according to test set
user_item_rating_test =  test_emb_df.loc[:,['user_id','item_id','rating']]

In [468]:
rmse_cosine = rmse_cb_model(user_item_rating_test,users_profiles,items_profiles,'Cosine')
rmse_euclidean = rmse_cb_model(user_item_rating_test,users_profiles,items_profiles,'Eucl')

RMSE evalution
*   Euclidean gets 1.58 which is worse than random model
*   Cosine get 1.14 which is around the rmse of CF models and it can be exceptable. 

In [469]:
print(f'RMSE Content-Based Model (Cosine) is : {rmse_cosine}')
print(f'RMSE Content-Based Model (Euclidean) is : {rmse_euclidean}')

RMSE Content-Based Model (Cosine) is : 1.13226469756025
RMSE Content-Based Model (Euclidean) is : 1.5738855231744706


In [470]:

predictions_cosine=[]
Prediction = namedtuple("Prediction", ["uid", "iid", "r_ui", "est", "details"])
predictions_cosine = [
    Prediction(row.user_id, row.item_id, row.rating, get_user_movie_rating_est(users_profiles, items_profiles, row.user_id, row.item_id,'Cosine'), '')
    for _, row in user_item_rating_test.iterrows()
]

predictions_euc=[]
predictions_euc = [
    Prediction(row.user_id, row.item_id, row.rating, get_user_movie_rating_est(users_profiles, items_profiles, row.user_id, row.item_id,'Eucl'), '')
    for _, row in user_item_rating_test.iterrows()
]

In [471]:
precisions, recalls =  precision_recall_at_k(predictions_cosine, k=10, threshold=4)
precision_at_k = sum(prec for prec in precisions.values()) / len(precisions)
recall_at_k = sum(rec for rec in recalls.values()) / len(recalls)
# Precision and recall can then be averaged over all users
print(f'Preision@K for Content-Based Cosine  is  {precision_at_k}')
print(f'Recall@K for Content-Based Cosine is  {recall_at_k}')

Preision@K for Content-Based Cosine  is  0.8040670869293485
Recall@K for Content-Based Cosine is  0.8292484762694153


The normalization of the rating score with euclidean metric seems to be out of scale.
* May be caused by the fact that eculidean metric is not limit in a range.
* As we would have excpected cosine similarity metric is more sutiable for the current mission.
* Cosine measures the direction while Euclidean measures the magnitude.

In [472]:
precisions, recalls =  precision_recall_at_k(predictions_euc, k=10, threshold=4)
precision_at_k = sum(prec for prec in precisions.values()) / len(precisions)
recall_at_k = sum(rec for rec in recalls.values()) / len(recalls)
# Precision and recall can then be averaged over all users
print(f'Preision@K for Content-Based Euclidean is  {precision_at_k}')
print(f'Recall@K for Content-Based Euclidean is  {recall_at_k}')

Preision@K for Content-Based Euclidean is  0.07288650839606253
Recall@K for Content-Based Euclidean is  0.054635498524829726
