In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import norm
from scipy.sparse import csr_matrix, lil_matrix
from scipy.sparse import save_npz
from joblib import Parallel, delayed
import multiprocessing
from sklearn.preprocessing import normalize

In [2]:
data = pd.read_csv('./Data/interactions_train.csv')

In [3]:
recipe_data = pd.read_csv('./Data/RAW_recipes.csv')
recipe_data = recipe_data.rename(columns={"id": "recipe_id"})
recipe_data = recipe_data.iloc[:100000]

In [4]:
merged_recipe_interaction_data = data.merge(recipe_data, how='inner', on='recipe_id', sort=True)

In [5]:
user_recipe_matrix = merged_recipe_interaction_data[['user_id', 'recipe_id', 'u', 'i', 'date', 'name', 'rating']]
user_recipe_matrix = user_recipe_matrix.pivot_table(index='user_id', columns='recipe_id', values='rating')

In [6]:
recipe_data.reset_index(level = 0, inplace = True, drop=True)
ind = pd.Series(recipe_data.index, index = recipe_data['name'])
vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words='english')
tf_data = vectorizer.fit_transform(recipe_data['name'].values.astype('U'))
sim_matrix = cosine_similarity(tf_data)

In [7]:
def recommend_from_id(recipe_id, top_n):
    indx = recipe_data.index[recipe_data['recipe_id'] == recipe_id]
    sim = list(enumerate(sim_matrix[indx][0]))
    sim = sorted(sim, key=lambda x:x[1], reverse=True)
    sim = sim[1:top_n+1]
    item_list = []
    for item in sim:
        r_item = (recipe_data.iloc[item[0]]['recipe_id'], item[0], item[1])
        item_list.append(r_item)
    return item_list

In [8]:
def recommend_for_user(user_id, top_n=10):
    # Create set to store recommendations
    recs = []
    # Get recipe ids of recipes user rated
    user_vector = user_recipe_matrix.iloc[user_id]
    recipe_ids = user_vector[~np.isnan(user_vector)].index.tolist()
    # Generate recommendations for each recipe id and store it in a set
    for recipe_id in recipe_ids:
        rating = user_vector[recipe_id]
        rec_list = recommend_from_id(recipe_id, top_n)
        rec_new_list = [tuple([rating]+list(rec)) for rec in rec_list]
        recs.extend(rec_new_list)
    # sort the set based on the similarity
    sorted_recs = sorted(recs, key=lambda x:x[3], reverse=True)
    # recommend top n similar items
    final_recs = []
    for item in sorted_recs:
        rec_item = (recipe_data.iloc[item[2]]['name'], item[0], item[1], item[2], item[3], item[0]*item[3])
        final_recs.append(rec_item)
    item_rating_map_numer = dict()
    item_rating_map_denom = dict()
    ignore_list = []
    for rec in final_recs:
        if rec[4] != 0:
            if str(rec[2]) in item_rating_map_numer.keys() and str(rec[2]) in item_rating_map_denom.keys():
                item_rating_map_numer[str(rec[2])] += rec[5]
                item_rating_map_denom[str(rec[2])] += rec[4]
            else:
                item_rating_map_numer[str(rec[2])] = rec[5]
                item_rating_map_denom[str(rec[2])] = rec[4]
        else:
            ignore_list.append(rec[2])
    scored_recs = []
    for rec in final_recs:
        if rec[2] not in ignore_list:
            score_rec_item = (item_rating_map_numer[str(rec[2])]/item_rating_map_denom[str(rec[2])], rec[0], rec[1], rec[2], rec[3], rec[4], rec[5])
            scored_recs.append(score_rec_item)
    sorted_scored_recs = sorted(scored_recs, key=lambda x:x[0], reverse=True)
    return sorted_scored_recs[:top_n]

In [9]:
train_df = pd.read_csv('./Data/interactions_train.csv')
filtered_df = train_df[train_df['rating'] >= 4][['u', 'i']]
filtered_df['u'] = filtered_df['u'].astype('category')
filtered_df['i'] = filtered_df['i'].astype('category')
min_ratings = 2  # Need at least two users that have rated one item
counts = filtered_df['i'].value_counts()
filtered_df = filtered_df[filtered_df['i'].isin(counts[counts >= min_ratings].index)]

In [10]:
# Map categories to indices
user_ids = filtered_df['u'].cat.codes
item_ids = filtered_df['i'].cat.codes
item_user_matrix_csr = csr_matrix((np.ones(len(filtered_df)), (item_ids, user_ids)))
item_user_matrix_csr

<153630x24846 sparse matrix of type '<class 'numpy.float64'>'
	with 580819 stored elements in Compressed Sparse Row format>

In [11]:
# Normalize the item vectors
normalized_item_vectors = normalize(item_user_matrix_csr, axis=1, norm='l2')

In [12]:
# Function to compute the top 20 similarities for a single item
def compute_item_similarity(item_index, item_vectors):
    similarities = cosine_similarity(item_vectors[item_index], item_vectors).flatten()
    top_indices = np.argsort(-similarities)[1:21]  # Exclude the item itself and get top 20
    top_similarities = similarities[top_indices]
    return item_index, top_indices, top_similarities

# Function to compute top 20 similarity matrix using parallel processing
def compute_top_20_similarity_matrix_parallel(item_vectors, n_jobs=20):
    num_items = item_vectors.shape[0]
    similarity_matrix = lil_matrix((num_items, num_items))

    results = Parallel(n_jobs=n_jobs)(
        delayed(compute_item_similarity)(i, item_vectors) for i in range(num_items)
    )

    for item_index, top_indices, top_similarities in results:
        for index, similarity in zip(top_indices, top_similarities):
            similarity_matrix[item_index, index] = similarity

    return similarity_matrix.tocsr()

In [13]:
from scipy.sparse import load_npz
similarity_matrix = load_npz("./similarity_matrix_IB.npz")

user_item_matrix_csr = csr_matrix((np.ones(len(filtered_df)), (user_ids, item_ids)))

In [14]:
def get_ib_recommendations_with_scores(user_index, item_user_matrix, similarity_matrix, top_n=10):
    # Get the items that this user has interacted with
    interacted_items = item_user_matrix[:, user_index].nonzero()[0]

    # Dictionary to keep track of items and their highest similarity score
    item_scores = {}

    for item in interacted_items:
        # Get the top N similar items for this item
        top_similar_items = np.argsort(-similarity_matrix[item].toarray().ravel())[1:top_n+1]

        # Find the highest similarity score among these top similar items
        for similar_item in top_similar_items:
            if similar_item not in interacted_items:  # Skip items the user has already interacted with
                current_score = similarity_matrix[item, similar_item]
                if similar_item in item_scores:
                    item_scores[similar_item] = max(item_scores[similar_item], current_score)
                else:
                    item_scores[similar_item] = current_score

    # Sort items by their highest similarity score and get the top N
    top_items_and_scores = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

    return [(item, score) for item, score in top_items_and_scores]

In [63]:
def parallel_ib_recommendations_with_scores(user_index):
    return get_ib_recommendations_with_scores(user_index, item_user_matrix_csr, similarity_matrix, top_n=5)

In [17]:
def u_lookup(user_id):
    u = train_df[train_df['user_id'] == user_id]['u']
    return u.iloc[0]

In [122]:
def recipe_lookup(i):
    r_id = train_df[train_df['i'] == i]['recipe_id']
    return r_id.iloc[0]

In [146]:
def i_lookup(recipe_id):
    i = train_df[train_df['recipe_id'] == recipe_id]['i']
    if len(i) > 0:
        return i.iloc[0]    
    return -1

In [164]:
def hybrid_recommend(user_id):
    u = u_lookup(user_id)
    content_recs = recommend_for_user(user_id, top_n=5)
    reduced_content_recs = []
    for rec in content_recs:
        reduced_rec = (rec[3], rec[5])
        reduced_content_recs.append(reduced_rec)
    item_based_recs = parallel_ib_recommendations_with_scores(u)
    reduced_content_recs.extend(item_based_recs)
    sorted_recs = sorted(reduced_content_recs, key=lambda x:x[1], reverse=True)
    return sorted_recs[:10]

In [165]:
hybrid_recommend(1535)

[(62913, 0.632455532033676),
 (98165, 0.632455532033676),
 (87220, 0.632455532033676),
 (80451, 0.632455532033676),
 (315403, 0.5302631129228309),
 (30660, 0.5163977794943224),
 (353600, 0.4885146749447504),
 (135585, 0.4743302627134814),
 (458464, 0.4593491403505437),
 (504083, 0.4276582548319066)]