In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz
from joblib import Parallel, delayed
import multiprocessing

train_df = pd.read_csv('./Data/interactions_train.csv')
train_df = train_df[['u', 'i', 'rating']]
train_df

Unnamed: 0,u,i,rating
0,22095,44367,5.0
1,22095,87844,5.0
2,24732,138181,5.0
3,24732,93054,4.0
4,22095,101723,5.0
...,...,...,...
698896,13681,141067,5.0
698897,14897,99787,5.0
698898,11605,76163,5.0
698899,3604,29101,5.0


In [2]:
filtered_df = train_df[train_df['rating'] >= 4]
filtered_df = filtered_df[['u', 'i']]
filtered_df['u'] = filtered_df['u'].astype('category')
filtered_df['i'] = filtered_df['i'].astype('category')
filtered_df

Unnamed: 0,u,i
0,22095,44367
1,22095,87844
2,24732,138181
3,24732,93054
4,22095,101723
...,...,...
698896,13681,141067
698897,14897,99787
698898,11605,76163
698899,3604,29101


In [3]:
# Keep only items rated by a certain number of users
min_ratings = 2  # Adjust as needed
counts = filtered_df['i'].value_counts()
filtered_df = filtered_df[filtered_df['i'].isin(counts[counts >= min_ratings].index)]
filtered_df

Unnamed: 0,u,i
0,22095,44367
1,22095,87844
2,24732,138181
3,24732,93054
4,22095,101723
...,...,...
698896,13681,141067
698897,14897,99787
698898,11605,76163
698899,3604,29101


In [4]:
# Map categories to indices
user_ids = filtered_df['u'].cat.codes
item_ids = filtered_df['i'].cat.codes

user_item_matrix_csr = csr_matrix((np.ones(len(filtered_df)), (user_ids, item_ids)))

In [5]:
def batch_cosine_similarity(matrix, batch_size=1):
    n_rows = matrix.shape[0]
    similarity_matrix = []

    for start_index in range(0, n_rows, batch_size):
        end_index = min(start_index + batch_size, n_rows)
        batch = matrix[start_index:end_index]

        # Compute similarities for the batch against the entire matrix
        similarities = cosine_similarity(batch, matrix)
        similarity_matrix.append(similarities)

    return np.vstack(similarity_matrix)

In [6]:
similarity_matrix = batch_cosine_similarity(user_item_matrix_csr)

KeyboardInterrupt: 

In [None]:
similarity_matrix

In [9]:
#np.savetxt("UB_similarity_matrix.csv", similarity_matrix, delimiter=",")

In [10]:
sparse_similarity_matrix = csr_matrix(similarity_matrix)
sparse_similarity_matrix

<24846x24846 sparse matrix of type '<class 'numpy.float64'>'
	with 19951700 stored elements in Compressed Sparse Row format>

In [11]:
save_npz("sparse_similarity_matrix.npz", sparse_similarity_matrix)

In [10]:
from scipy.sparse import load_npz
sparse_similarity_matrix = load_npz("sparse_similarity_matrix.npz")

In [9]:
def get_recommendations_scores(user_index, user_item_matrix, similarity_matrix, top_n=10):
    # Get the user's similarity scores with all items
    user_similarity_scores = similarity_matrix[user_index, :].toarray().flatten()

    # Exclude items the user has already interacted with
    user_interactions = user_item_matrix[user_index, :].nonzero()[1]
    user_similarity_scores[user_interactions] = 0

    # Get the indices of the top N items based on similarity scores
    top_item_indices = np.argsort(-user_similarity_scores)[:top_n]

    # Get the top N similarity scores
    top_similarity_scores = user_similarity_scores[top_item_indices]

    return top_similarity_scores

def parallel_recommendations_scores(user_index):
    return get_recommendations(user_index, user_item_matrix_csr, sparse_similarity_matrix, top_n=10)

results = Parallel(n_jobs=20)(delayed(parallel_recommendations)(i) for i in range(user_item_matrix_csr.shape[0]))

# Convert results to a DataFrame
u_recommendation_scores_df = pd.DataFrame(results, columns=[f'Score_{i+1}' for i in range(10)])
print(u_recommendation_scores_df)
u_recommendation_scores_df.to_csv("user_recommendation_scores.csv", index=False, header=False)

                             Score_1                        Score_2  \
0       (92869, 0.13608276348795434)   (92730, 0.13608276348795434)   
1       (75780, 0.20100756305184245)  (104256, 0.20100756305184245)   
2      (120065, 0.15811388300841894)   (60587, 0.15811388300841894)   
3          (63, 0.16402168874614093)     (145, 0.16402168874614093)   
4       (73240, 0.08519427513705971)   (20772, 0.08519427513705971)   
...                              ...                            ...   
24841    (86800, 0.4999999999999999)    (69167, 0.4999999999999999)   
24842    (72459, 0.4999999999999999)     (13629, 0.408248290463863)   
24843    (69786, 0.4999999999999999)   (30870, 0.35355339059327373)   
24844     (22190, 0.408248290463863)    (112590, 0.408248290463863)   
24845   (46186, 0.24999999999999994)   (53478, 0.24999999999999994)   

                             Score_3                        Score_4  \
0      (119725, 0.13608276348795434)   (26066, 0.13608276348795434)   
1    