# Content-Based Recommender

This notebook implements content-based filtering using item features and profiles.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import MinMaxScaler

# Notebook-wide display settings.
# max_columns=None lifts pandas' column limit so wide DataFrames print in full.
pd.set_option('display.max_columns', None)
# Apply seaborn's 'whitegrid' style to plots drawn after this point.
sns.set_style('whitegrid')

## Load Data

In [None]:
# Load item features/metadata
# items = pd.read_csv('../data/items.csv')
# print(f"Items shape: {items.shape}")
# items.head()

## Feature Engineering

In [None]:
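# Create combined features (e.g., concatenate text columns)
# NOTE: concatenating with `+` yields NaN for a row if ANY of its columns is
# missing, so the fillna('') below blanks that row entirely. Fill each column
# with '' before concatenating if the remaining text should be kept.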
# items['combined_features'] = items['title'] + ' ' + items['genre'] + ' ' + items['description']
# items['combined_features'] = items['combined_features'].fillna('')

# print("Sample combined features:")
# print(items['combined_features'].head())

## TF-IDF Vectorization

In [None]:
# Create TF-IDF matrix
# tfidf = TfidfVectorizer(
#     max_features=5000,
#     stop_words='english',
#     ngram_range=(1, 2)
# )

# tfidf_matrix = tfidf.fit_transform(items['combined_features'])
# print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# # Show the first few feature names (vocabulary order, not ranked by weight)
# feature_names = tfidf.get_feature_names_out()
# print(f"\nTop 10 features: {feature_names[:10]}")

## Calculate Item Similarity

In [None]:
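# Calculate cosine similarity between items.
# TfidfVectorizer L2-normalizes each row by default, so the plain dot product
# computed by linear_kernel equals cosine similarity and is cheaper to compute.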
# cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# print(f"Similarity matrix shape: {cosine_sim.shape}")

# # Create DataFrame for easier access
# item_similarity_df = pd.DataFrame(
#     cosine_sim,
#     index=items['item_id'],
#     columns=items['item_id']
# )

## Build Recommendation Function

In [None]:
def get_content_based_recommendations(item_id, item_similarity, items_df, top_n=10):
    """
    Recommend the items whose content is most similar to a given item.

    Args:
        item_id: ID of the item to find neighbours for; used as a column
            label of ``item_similarity``
        item_similarity: Item-by-item similarity DataFrame labelled by item ID
        items_df: DataFrame with ``item_id``, ``title`` and ``genre`` columns
        top_n: Maximum number of recommendations to return

    Returns:
        DataFrame with columns ``item_id``, ``similarity_score``, ``title``
        and ``genre``, ordered from most to least similar
    """
    # Rank every item by its similarity to the target, best first
    ranked = item_similarity[item_id].sort_values(ascending=False)

    # Exclude the target itself, then keep only the strongest matches
    is_other_item = ranked.index != item_id
    nearest = ranked[is_other_item].head(top_n)

    scored = pd.DataFrame({
        'item_id': nearest.index,
        'similarity_score': nearest.values
    })

    # Left join keeps every recommendation even if its details are missing
    item_details = items_df[['item_id', 'title', 'genre']]
    return pd.merge(scored, item_details, on='item_id', how='left')

# Example usage
# sample_item = items['item_id'].iloc[0]
# recommendations = get_content_based_recommendations(sample_item, item_similarity_df, items)
# print(f"\nRecommendations for item {sample_item}:")
# print(recommendations)

## User Profile-Based Recommendations

In [None]:
def get_user_profile_recommendations(user_id, user_item_matrix, item_similarity, items_df, top_n=10):
    """
    Get recommendations based on user's profile (items they've rated).

    Each unrated item is scored with the similarity-weighted average of the
    user's ratings: sum(similarity * rating) / sum(similarity), taken over
    the items the user has rated.

    Args:
        user_id: Target user ID (row label of ``user_item_matrix``)
        user_item_matrix: User-item rating matrix; a value > 0 counts as rated
        item_similarity: Item-by-item similarity DataFrame labelled by item ID
        items_df: DataFrame with ``item_id``, ``title`` and ``genre`` columns
        top_n: Number of recommendations

    Returns:
        DataFrame with columns ``item_id``, ``predicted_rating``, ``title``
        and ``genre``, ordered by descending predicted rating. Items whose
        total similarity to the rated items is not positive are omitted.

    Raises:
        KeyError: If ``user_id`` is not a row of ``user_item_matrix``.
    """
    # Get user's rated items (NaN compares False, so missing ratings drop out)
    user_ratings = user_item_matrix.loc[user_id]
    rated_items = user_ratings[user_ratings > 0]

    # A rated item with no similarity column cannot contribute to any score;
    # ignore it instead of failing the whole request with a KeyError.
    usable_ratings = rated_items[rated_items.index.isin(item_similarity.columns)]

    # Candidates are all known items the user has not rated yet
    candidates = [item for item in item_similarity.columns if item not in rated_items.index]

    scores = {}
    if len(usable_ratings) > 0 and candidates:
        # One label-based slice (rows: candidates, columns: rated items)
        # replaces a scalar lookup per (candidate, rated item) pair.
        sims = item_similarity.loc[candidates, usable_ratings.index].to_numpy(dtype=float)
        ratings = usable_ratings.to_numpy(dtype=float)

        weighted_sum = sims @ ratings
        weight_sum = sims.sum(axis=1)

        # Only score items with positive total similarity; this also avoids
        # dividing by zero and drops rows containing NaN similarities.
        has_signal = weight_sum > 0
        predicted = weighted_sum[has_signal] / weight_sum[has_signal]
        kept = [item for item, keep in zip(candidates, has_signal) if keep]
        scores = dict(zip(kept, predicted.tolist()))

    # Get top-N recommendations (stable sort: ties keep similarity-column order)
    top_items = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:top_n]

    recommendations = pd.DataFrame(top_items, columns=['item_id', 'predicted_rating'])
    recommendations = recommendations.merge(
        items_df[['item_id', 'title', 'genre']],
        on='item_id',
        how='left'
    )

    return recommendations

# Example usage
# user_item_matrix = pd.read_csv('../results/user_item_matrix.csv', index_col=0)
# user_id = user_item_matrix.index[0]
# recommendations = get_user_profile_recommendations(user_id, user_item_matrix, item_similarity_df, items)
# print(f"\nContent-based recommendations for user {user_id}:")
# print(recommendations)

## Visualize Results

In [None]:
# Visualize item similarity heatmap
# plt.figure(figsize=(12, 10))
# sample_items = item_similarity_df.iloc[:30, :30]
# sns.heatmap(sample_items, cmap='YlOrRd', annot=False)
# plt.title('Item Similarity Matrix (Sample)')
# plt.tight_layout()
# plt.show()

## Save Results

In [None]:
# Save the fitted TF-IDF vectorizer and the item similarity matrix
# import pickle

# with open('../results/tfidf_vectorizer.pkl', 'wb') as f:
#     pickle.dump(tfidf, f)

# item_similarity_df.to_csv('../results/content_item_similarity.csv')
# print("Saved content-based models and results")