## Item-Based Collaborative Filtering
For a given user $ u $ and item $ i $, the predicted rating $ \hat{r}_{ui} $ is the similarity-weighted average of the ratings that $ u $ has already given:

$$
\hat{r}_{ui} = \frac{\sum_{j \in R_u} \text{sim}(i, j) \cdot r_{uj}}{\sum_{j \in R_u} |\text{sim}(i, j)|}
$$

Where:
- $ R_u $: Items rated by user $ u $.
- $ \text{sim}(i, j) $: Similarity between items $ i $ and $ j $.
- $ r_{uj} $: Rating of user $ u $ for item $ j $.

---


In [2]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix, lil_matrix
import numpy as np
from tqdm import tqdm

In [3]:
# Training interactions, read from a path relative to the notebook's working directory.
train_df = pd.read_csv(filepath_or_buffer='../data/train.csv')

In [4]:
# Wide users-by-books table of ratings; (user, book) pairs absent from train_df become NaN.
user_item_matrix = train_df.pivot(values='rating', columns='book_id', index='user_id')
# NaN is replaced by 0 before the CSR conversion, so a missing rating and a
# rating of exactly 0 are indistinguishable from here on.
user_item_matrix_sparse = csr_matrix(user_item_matrix.fillna(0).to_numpy())
print(f"Sparse User-Item Matrix shape: {user_item_matrix_sparse.shape}")

Sparse User-Item Matrix shape: (18905, 15712)


In [5]:
def compute_item_similarity(user_item_matrix_sparse, batch_size=500):
    """Compute the item-item cosine similarity matrix in row batches.

    Parameters
    ----------
    user_item_matrix_sparse : sparse matrix or array-like, shape (n_rows, n_items)
        Rating matrix whose columns are the items to compare.
    batch_size : int, default 500
        Number of items whose similarity rows are computed per
        ``cosine_similarity`` call.

    Returns
    -------
    scipy.sparse.csr_matrix, shape (n_items, n_items)
        Entry ``[i, j]`` is the cosine similarity between columns ``i`` and
        ``j`` of the input. Zero similarities are not stored.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    # Function-scope import: vstack is not part of the notebook-level imports.
    from scipy.sparse import vstack

    if batch_size < 1:
        # A negative step would make the batch loop empty and silently yield
        # an all-zero similarity matrix.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Transpose once (items become rows) and keep it in CSR so that row slicing
    # is cheap and cosine_similarity does not have to re-convert it per batch.
    item_user = csr_matrix(user_item_matrix_sparse).T.tocsr()
    num_items = item_user.shape[0]
    if num_items == 0:
        return csr_matrix((0, 0))

    blocks = []
    for start in tqdm(range(0, num_items, batch_size), desc="Processing similarity batches"):
        end = min(start + batch_size, num_items)
        # dense_output=False keeps each block sparse instead of materialising a
        # dense (batch, num_items) array and copying it element by element.
        blocks.append(cosine_similarity(item_user[start:end], item_user, dense_output=False))

    similarity = vstack(blocks, format="csr", dtype=np.float64)
    # Drop explicitly stored zeros and sort column indices within each row.
    similarity.eliminate_zeros()
    similarity.sort_indices()
    return similarity

# Build the item-item similarity matrix, 32 items per batch.
item_similarity_sparse = compute_item_similarity(
    user_item_matrix_sparse=user_item_matrix_sparse,
    batch_size=32,
)
print(f"Sparse Item-Item Similarity Matrix shape: {item_similarity_sparse.shape}")


Processing similarity batches: 100%|██████████| 491/491 [00:06<00:00, 77.96it/s] 

Sparse Item-Item Similarity Matrix shape: (15712, 15712)





In [None]:
def predict_item_based(user_id, book_id, user_item_matrix_sparse, item_similarity_sparse, user_to_idx, book_to_idx):
    """Predict one rating as a similarity-weighted average of the user's ratings.

    Parameters
    ----------
    user_id, book_id : hashable
        Raw identifiers, looked up in ``user_to_idx`` / ``book_to_idx``.
    user_item_matrix_sparse : scipy sparse matrix
        Rating matrix; row ``user_to_idx[user_id]`` is read as that user's
        ratings, and only entries greater than 0 count as rated.
    item_similarity_sparse : scipy sparse matrix
        Item-item similarities; row ``book_to_idx[book_id]`` is read as the
        similarities of the target item to every item.
    user_to_idx, book_to_idx : dict
        Map raw ids to row positions of the two matrices above. They must use
        the same ordering as the matrices, otherwise the wrong rows are read.

    Returns
    -------
    float
        ``sum(sim * rating) / sum(|sim|)`` over the items the user rated, or
        ``nan`` when either id is unknown, the user has no rated items, or all
        similarities to the rated items are zero.
    """
    if user_id not in user_to_idx or book_id not in book_to_idx:
        return np.nan

    user_ratings = user_item_matrix_sparse[user_to_idx[user_id]].toarray().ravel()
    item_similarities = item_similarity_sparse[book_to_idx[book_id]].toarray().ravel()

    # Restrict both vectors to the items this user has rated.
    rated = user_ratings > 0
    sims = item_similarities[rated]

    denominator = np.abs(sims).sum()
    if denominator == 0:
        # No rated item carries any similarity weight, so the average is undefined.
        return np.nan

    return (sims @ user_ratings[rated]) / denominator

def predict_test_ratings(test_df, user_item_matrix_sparse, item_similarity_sparse, user_to_idx, book_to_idx):
    """Predict a rating for every row of ``test_df``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must contain the columns ``id``, ``user_id`` and ``book_id``.
    user_item_matrix_sparse, item_similarity_sparse, user_to_idx, book_to_idx
        Passed through unchanged to ``predict_item_based``.

    Returns
    -------
    list of [id, rating]
        One entry per test row, in row order. When ``predict_item_based``
        returns ``nan`` the mean of the values stored in
        ``user_item_matrix_sparse`` is used instead.
    """
    # The fallback does not depend on the row, so compute it once up front.
    stored_ratings = user_item_matrix_sparse.data
    fallback_rating = stored_ratings.mean() if stored_ratings.size else np.nan

    # Iterate the three needed columns directly: avoids building a Series per
    # row and keeps each column's own dtype (iterrows would upcast mixed rows).
    rows = zip(test_df['id'], test_df['user_id'], test_df['book_id'])

    predictions = []
    for row_id, user_id, book_id in tqdm(rows, total=len(test_df), desc="Predicting ratings"):
        predicted_rating = predict_item_based(
            user_id, book_id, user_item_matrix_sparse, item_similarity_sparse, user_to_idx, book_to_idx
        )
        if np.isnan(predicted_rating):
            predicted_rating = fallback_rating
        predictions.append([row_id, predicted_rating])

    return predictions

### Submit

In [9]:
# Rows to predict, read from a path relative to the notebook's working directory.
test_df = pd.read_csv(filepath_or_buffer='../data/test.csv')

In [None]:
# Map raw ids to row/column positions of user_item_matrix_sparse.
# The positions must come from the pivoted frame itself: pivot orders its index
# and columns by label, whereas train_df[...].unique() returns ids in order of
# first appearance, which would point at the wrong rows and columns.
user_to_idx = {user: idx for idx, user in enumerate(user_item_matrix.index)}
book_to_idx = {book: idx for idx, book in enumerate(user_item_matrix.columns)}

In [11]:
predictions = predict_test_ratings(test_df, user_item_matrix_sparse, item_similarity_sparse, user_to_idx, book_to_idx)

Predicting ratings: 100%|██████████| 29367/29367 [00:05<00:00, 5843.70it/s]


In [14]:
# Single source of truth for the output location, so the confirmation message
# always names the file that was actually written.
output_path = 'item_collaborative.csv'
output_df = pd.DataFrame(predictions, columns=['id', 'rating'])
output_df.to_csv(output_path, index=False)
print(f"Predictions saved to '{output_path}'")

Predictions saved to 'item_based_predictions.csv'
