A hybrid of item-based collaborative filtering and "content"-based methods.
For the "content"-based method, I combine `title`, `subtitle` and `description` to represent the content of each book and compute `content_similarity` from that text. In regular cases, with 3 or more ratings, I use the item-based method; in cold-start cases, with only 1 or 2 ratings, I use the content-based method. Note that the code below applies this threshold to the number of ratings each book has received (`book_rating_counts`), not to the number of ratings a user has given.

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load Datasets
# Three CSV inputs: the training ratings, the (book, user) pairs to predict, and the book metadata.
train = pd.read_csv('/kaggle/input/dis-project-2-recommender-systems-f2024/train.csv')  # book_id, user_id, rating
test = pd.read_csv('/kaggle/input/dis-project-2-recommender-systems-f2024/test.csv')    # id, book_id, user_id
metadata = pd.read_csv('/kaggle/input/books-metadata/books_augmented.csv')  # book_id, title, description, genre, etc.

# Book ids that appear in BOTH the training ratings and the metadata
common_book_ids = set(train['book_id']) & set(metadata['book_id'])

# Drop every training rating and every metadata row whose book is not shared,
# so the two similarity matrices built later cover the same set of books.
train = train.loc[train['book_id'].isin(common_book_ids)]
metadata = metadata.loc[metadata['book_id'].isin(common_book_ids)]

# Create User-Item Matrix
# Rows are book_ids, columns are user_ids; a (book, user) pair with no rating is NaN.
# pivot_table's default aggregation (mean) collapses duplicate (book, user) ratings.
ratings_matrix = train.pivot_table(index='book_id', columns='user_id', values='rating')

# Compute Item-Based Similarity
# Fill missing values with item mean (each book's mean over the users who rated it)
item_mean_ratings = ratings_matrix.mean(axis=1)
# Broadcast the per-book mean across the user axis in one vectorised step; this
# produces the same frame as a row-wise apply/fillna without a Python call per book.
ratings_matrix_filled = pd.DataFrame(
    np.where(
        ratings_matrix.isna().to_numpy(),
        item_mean_ratings.to_numpy()[:, None],
        ratings_matrix.to_numpy(),
    ),
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

# Compute cosine similarity on the filled matrix (book x book)
# NOTE(review): the vectors are not mean-centred and all entries are positive, so every
# similarity is non-negative; consider adjusted cosine if more contrast is wanted.
item_similarity = cosine_similarity(ratings_matrix_filled)
item_similarity_df = pd.DataFrame(item_similarity, index=ratings_matrix.index, columns=ratings_matrix.index)

# Compute Content-Based Similarity
# Combine metadata into a single textual feature
# .copy() gives an independent frame, so the column assignments below do not act on a
# filtered slice of the loaded metadata (avoids pandas' SettingWithCopyWarning).
metadata = metadata.drop_duplicates(subset=['book_id']).copy()
# Replace missing text with "" in ALL three columns: a NaN in any of them would make the
# concatenated string NaN, and TfidfVectorizer rejects NaN documents.
metadata['title'] = metadata['title'].fillna("")
metadata['description'] = metadata['description'].fillna("")
metadata['subtitle'] = metadata['subtitle'].fillna("")
metadata['combined_features'] = metadata['title'] + " " + metadata['subtitle'] + " " + metadata['description']

# Use TF-IDF to vectorize combined features (one row per book, English stop words removed)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(metadata['combined_features'])

# Compute cosine similarity based on metadata (book x book, labelled by book_id)
content_similarity = cosine_similarity(tfidf_matrix)
content_similarity_df = pd.DataFrame(content_similarity, index=metadata['book_id'], columns=metadata['book_id'])

# Book ids that are labelled in both the item-based and the content-based similarity matrix
common_book_ids = item_similarity_df.index.intersection(content_similarity_df.index)

# Align both similarity matrices: restrict rows, then columns, to the shared ids
# so the two frames carry identical labels in identical order.
item_similarity_df = item_similarity_df.loc[common_book_ids, :].loc[:, common_book_ids]
content_similarity_df = content_similarity_df.loc[common_book_ids, :].loc[:, common_book_ids]

# Define Adaptive Prediction Function
min_ratings = 3  # Threshold for cold start
# Number of training ratings per book, used to decide whether a book is a cold-start case
book_rating_counts = train['book_id'].value_counts()

def predict_rating(user_id, book_id, ratings_matrix, item_similarity_df, content_similarity_df, item_mean_ratings,
                   rating_counts=None, cold_start_threshold=None):
    """Predict the rating ``user_id`` would give ``book_id``.

    The prediction is a similarity-weighted average of the ratings found in the
    user's column of ``ratings_matrix``. Books with fewer than
    ``cold_start_threshold`` ratings use ``content_similarity_df`` for the
    weights; all other books use ``item_similarity_df``.

    Args:
        user_id: Column label of the user in ``ratings_matrix``.
        book_id: Row/column label of the target book in the similarity frames.
        ratings_matrix: DataFrame indexed by book with one column per user.
            Entries that are NaN or not greater than 0 are treated as
            "not rated" and ignored.
        item_similarity_df: Square book-by-book similarity frame used for
            books that are not cold-start.
        content_similarity_df: Square book-by-book similarity frame used for
            cold-start books.
        item_mean_ratings: Series of per-book mean ratings; its mean is the
            fallback prediction.
        rating_counts: Mapping/Series with ``.get(book_id, default)`` giving
            the number of ratings per book. Defaults to the module-level
            ``book_rating_counts``.
        cold_start_threshold: Books with fewer ratings than this are treated
            as cold-start. Defaults to the module-level ``min_ratings``.

    Returns:
        The predicted rating, or the mean of ``item_mean_ratings`` when the
        book or user is unknown, the user has no usable ratings, or all
        relevant similarities are zero.
    """
    # Fall back to the module-level settings so existing six-argument calls keep working.
    if rating_counts is None:
        rating_counts = book_rating_counts
    if cold_start_threshold is None:
        cold_start_threshold = min_ratings

    fallback = item_mean_ratings.mean()

    # Unknown book or unknown user: nothing to base a weighted average on.
    if book_id not in item_similarity_df.index or user_id not in ratings_matrix.columns:
        return fallback

    # Cold-start books rely on text similarity; the rest use rating-based similarity.
    is_cold_start = rating_counts.get(book_id, 0) < cold_start_threshold
    similarity_df = content_similarity_df if is_cold_start else item_similarity_df

    # Guard against a book missing from the selected matrix instead of raising KeyError.
    if book_id not in similarity_df.index:
        return fallback

    user_ratings = ratings_matrix.loc[:, user_id]
    # NaN > 0 is False, so unrated (NaN) cells are dropped here as well.
    rated_books = user_ratings[user_ratings > 0].index
    # Keep only rated books that have a similarity column to look up.
    rated_books = rated_books[rated_books.isin(similarity_df.columns)]
    if len(rated_books) == 0:
        return fallback

    similarities = similarity_df.loc[book_id, rated_books]
    ratings = user_ratings.loc[rated_books]

    # Compute weighted average; normalise by the absolute similarity mass.
    weighted_sum = np.dot(similarities, ratings)
    sim_sum = np.abs(similarities).sum()

    return weighted_sum / sim_sum if sim_sum > 0 else fallback

# Generate Predictions for Test Set
# Pass the raw ratings matrix (NaN where a user has not rated a book). predict_rating
# selects "books the user rated" with `user_ratings > 0`; on the mean-filled matrix that
# test is true for every book, so imputed item means would be averaged in as if they
# were the user's own ratings.
predictions = []
for row_id, book_id, user_id in zip(test['id'], test['book_id'], test['user_id']):
    pred_rating = predict_rating(user_id, book_id, ratings_matrix, item_similarity_df, content_similarity_df, item_mean_ratings)
    predictions.append({'id': row_id, 'rating': pred_rating})

# Save Predictions
# One row per test id with its predicted rating, written without the DataFrame index.
submission = pd.DataFrame(predictions)
submission.to_csv('submission.csv', index=False)
