In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, lil_matrix
from scipy.sparse import save_npz
from joblib import Parallel, delayed
import multiprocessing
from sklearn.preprocessing import normalize

In [2]:
train_df = pd.read_csv('interactions_train.csv')
filtered_df = train_df[train_df['rating'] >= 4][['u', 'i']]
filtered_df['u'] = filtered_df['u'].astype('category')
filtered_df['i'] = filtered_df['i'].astype('category')
min_ratings = 2  # Need at least two users that have rated one item
counts = filtered_df['i'].value_counts()
filtered_df = filtered_df[filtered_df['i'].isin(counts[counts >= min_ratings].index)]
filtered_df

Unnamed: 0,u,i
0,22095,44367
1,22095,87844
2,24732,138181
3,24732,93054
4,22095,101723
...,...,...
698896,13681,141067
698897,14897,99787
698898,11605,76163
698899,3604,29101


In [3]:
# Map categories to indices
user_ids = filtered_df['u'].cat.codes
item_ids = filtered_df['i'].cat.codes
item_user_matrix_csr = csr_matrix((np.ones(len(filtered_df)), (item_ids, user_ids)))
item_user_matrix_csr

<153630x24846 sparse matrix of type '<class 'numpy.float64'>'
	with 580819 stored elements in Compressed Sparse Row format>

In [4]:
# Normalize the item vectors
normalized_item_vectors = normalize(item_user_matrix_csr, axis=1, norm='l2')

In [5]:
# Function to compute the top 20 similarities for a single item
def compute_item_similarity(item_index, item_vectors):
    similarities = cosine_similarity(item_vectors[item_index], item_vectors).flatten()
    top_indices = np.argsort(-similarities)[1:21]  # Exclude the item itself and get top 20
    top_similarities = similarities[top_indices]
    return item_index, top_indices, top_similarities

# Function to compute top 20 similarity matrix using parallel processing
def compute_top_20_similarity_matrix_parallel(item_vectors, n_jobs=20):
    num_items = item_vectors.shape[0]
    similarity_matrix = lil_matrix((num_items, num_items))

    results = Parallel(n_jobs=n_jobs)(
        delayed(compute_item_similarity)(i, item_vectors) for i in range(num_items)
    )

    for item_index, top_indices, top_similarities in results:
        for index, similarity in zip(top_indices, top_similarities):
            similarity_matrix[item_index, index] = similarity

    return similarity_matrix.tocsr()

In [6]:
# Compute the similarity matrix
similarity_matrix = compute_top_20_similarity_matrix_parallel(normalized_item_vectors, n_jobs=20)

In [7]:
similarity_matrix

<153630x153630 sparse matrix of type '<class 'numpy.float64'>'
	with 1768057 stored elements in Compressed Sparse Row format>

In [9]:
save_npz("similarity_matrix_IB.npz", similarity_matrix)

In [6]:
from scipy.sparse import load_npz
similarity_matrix = load_npz("similarity_matrix_IB.npz")

user_item_matrix_csr = csr_matrix((np.ones(len(filtered_df)), (user_ids, item_ids)))

In [17]:
def get_ib_recommendations_with_scores(user_index, item_user_matrix, similarity_matrix, top_n=10):
    # Get the items that this user has interacted with
    interacted_items = item_user_matrix[:, user_index].nonzero()[0]

    # Dictionary to keep track of items and their highest similarity score
    item_scores = {}

    for item in interacted_items:
        # Get the top N similar items for this item
        top_similar_items = np.argsort(-similarity_matrix[item].toarray().ravel())[1:top_n+1]

        # Find the highest similarity score among these top similar items
        for similar_item in top_similar_items:
            if similar_item not in interacted_items:  # Skip items the user has already interacted with
                current_score = similarity_matrix[item, similar_item]
                if similar_item in item_scores:
                    item_scores[similar_item] = max(item_scores[similar_item], current_score)
                else:
                    item_scores[similar_item] = current_score

    # Sort items by their highest similarity score and get the top N
    top_items_and_scores = sorted(item_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]

    return [(item, score) for item, score in top_items_and_scores]

In [18]:
def parallel_ib_recommendations_with_scores(user_index):
    return get_ib_recommendations_with_scores(user_index, item_user_matrix_csr, similarity_matrix, top_n=10)

# Generate recommendations for each user using parallel processing
score_results = Parallel(n_jobs=20)(delayed(parallel_ib_recommendations_with_scores)(i) for i in range(item_user_matrix_csr.shape[1]))

# Convert results to a DataFrame
score_recommendation_df = pd.DataFrame(score_results)
print(score_recommendation_df)

                                   0                             1  \
0        (80039, 0.5000000000000001)   (98679, 0.5000000000000001)   
1       (134459, 0.5773502691896258)   (31121, 0.5773502691896258)   
2       (121023, 0.7071067811865476)   (17664, 0.7071067811865476)   
3       (124114, 0.8164965809277261)  (136635, 0.8164965809277261)   
4        (89736, 0.8164965809277261)   (82098, 0.8164965809277261)   
...                              ...                           ...   
24841  (147807, 0.22360679774997902)  (79431, 0.22360679774997902)   
24842  (111387, 0.27854300726557785)   (2980, 0.26261286571944514)   
24843    (142461, 0.632455532033676)    (30666, 0.447213595499958)   
24844    (85391, 0.3651483716701108)  (144092, 0.3651483716701108)   
24845   (142593, 0.4082482904638631)   (54275, 0.4082482904638631)   

                                   2                              3  \
0       (137127, 0.5000000000000001)    (54545, 0.5000000000000001)   
1       (151405, 

In [19]:
# Save to CSV
score_recommendation_df.to_csv("item_recommendation_scores.csv", index=False, header=False)