In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', '{:.4f}'.format)

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

np.random.seed(42)

In [43]:

df = pd.read_csv('data/processed/amazon_reviews_with_metadata.csv')

print(f"  Initial load: {len(df):,} records")

  Initial load: 168,919 records


In [None]:
df = df[
    df['title_clean'].notna() & 
    (df['title_clean'].astype(str) != 'nan') &
    (df['title_clean'].astype(str).str.strip() != '') &
    (df['title_clean'] != 'Unknown Product') & 
    df['reviewText_clean'].notna() &
    (df['reviewText_clean'].astype(str) != 'nan') &
    (df['reviewText_clean'].astype(str).str.strip() != '')
].copy()

print(f"  Final records: {len(df):,}")
print(f"  Shape: {df.shape}")
print(f"  Filtering removed NaN, empty strings, and 'Unknown Product' titles")

  Final records: 16,379
  Shape: (16379, 19)
  Filtering removed NaN, empty strings, and 'Unknown Product' titles


In [45]:
title_check = df['title_clean'].notna().all() and (df['title_clean'].astype(str) != 'nan').all()
review_check = df['reviewText_clean'].notna().all() and (df['reviewText_clean'].astype(str) != 'nan').all()
print(f"  All rows have title: {title_check}")
print(f"  All rows have review text: {review_check}")

  All rows have title: True
  All rows have review text: True


In [None]:
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,reviewText_clean,summary_clean,reviewTime_dt,title_clean,description_clean,price,brand,main_category,subcategory,data_source
0,AES2AZFVTXKBZ,1400532620,"Amazon Customer ""Charge It""","[2, 3]",Highly disappointed. I purchased the new B/W t...,1.0,Not for Students and Professors,1310947200,"07 18, 2011",Highly disappointed. I purchased the new B/W t...,Not for Students and Professors,2011-07-18,Barnes & Noble Nook eReader - no 3G,Barnes & Noble Nook eReader - no 3GMeet nook. ...,74.95,Barnes &amp; Noble,eBook Readers & Accessories,Unknown,Amazon
1,A1QPUBF6U5EQM6,1400532620,Dare2Dream,"[1, 1]",When I purchased this item it was still runnin...,4.0,Not Sure I'd Do It Over Again,1322265600,"11 26, 2011",When I purchased this item it was still runnin...,Not Sure I'd Do It Over Again,2011-11-26,Barnes & Noble Nook eReader - no 3G,Barnes & Noble Nook eReader - no 3GMeet nook. ...,74.95,Barnes &amp; Noble,eBook Readers & Accessories,Unknown,Amazon
2,A3OEKYU0C8ZXAO,1400532620,joon,"[0, 0]",I loved using this nook reader. The bottom tou...,5.0,Great reader,1307059200,"06 3, 2011",I loved using this nook reader. The bottom tou...,Great reader,2011-06-03,Barnes & Noble Nook eReader - no 3G,Barnes & Noble Nook eReader - no 3GMeet nook. ...,74.95,Barnes &amp; Noble,eBook Readers & Accessories,Unknown,Amazon
3,A1S71JIN40YHXK,1400532620,Pamela,"[0, 0]",I ordered this product and it was not what I w...,5.0,Nook,1369872000,"05 30, 2013",I ordered this product and it was not what I w...,Nook,2013-05-30,Barnes & Noble Nook eReader - no 3G,Barnes & Noble Nook eReader - no 3GMeet nook. ...,74.95,Barnes &amp; Noble,eBook Readers & Accessories,Unknown,Amazon
4,A10BOETDPAFJ4C,1400532620,weapon x,"[1, 1]",the battery is great-about 12-14 hours from 80...,5.0,Love it!!,1302480000,"04 11, 2011",the battery is great-about 12-14 hours from 80...,Love it!!,2011-04-11,Barnes & Noble Nook eReader - no 3G,Barnes & Noble Nook eReader - no 3GMeet nook. ...,74.95,Barnes &amp; Noble,eBook Readers & Accessories,Unknown,Amazon


In [47]:
# Convert reviewTime to datetime
df['reviewTime_dt'] = pd.to_datetime(df['reviewTime'], format='%m %d, %Y')
df['review_year'] = df['reviewTime_dt'].dt.year
df['review_month'] = df['reviewTime_dt'].dt.month
df['review_day_of_week'] = df['reviewTime_dt'].dt.dayofweek

In [None]:
min_date = df['reviewTime_dt'].min()
df['days_since_start'] = (df['reviewTime_dt'] - min_date).dt.days

In [None]:
max_days = df['days_since_start'].max()
df['time_weight'] = (df['days_since_start'] / max_days)

In [50]:
df['helpful_votes'] = df['helpful'].apply(
    lambda x: eval(x)[0] if isinstance(x, str) else (x[0] if isinstance(x, list) else 0)
)
df['total_votes'] = df['helpful'].apply(
    lambda x: eval(x)[1] if isinstance(x, str) else (x[1] if isinstance(x, list) else 0)
)
df['helpful_ratio'] = df.apply(
    lambda row: row['helpful_votes'] / row['total_votes'] if row['total_votes'] > 0 else 0,
    axis=1
)

In [51]:
df['review_length'] = df['reviewText_clean'].str.len()
df['review_length_normalized'] = (df['review_length'] - df['review_length'].min()) / \
                                  (df['review_length'].max() - df['review_length'].min())

df['implicit_rating'] = (
    df['overall'] * 0.7 +
    df['helpful_ratio'] * 5 * 0.15 +
    df['review_length_normalized'] * 5 * 0.15
)

In [52]:
print(f"Average review length: {df['review_length'].mean():.0f} characters")
print(f"Average helpful ratio: {df['helpful_ratio'].mean():.3f}")
print(f"Implicit rating range: {df['implicit_rating'].min():.2f} - {df['implicit_rating'].max():.2f}")

Average review length: 652 characters
Average helpful ratio: 0.331
Implicit rating range: 0.70 - 4.93


In [53]:
# User-level features
user_stats = df.groupby('reviewerID').agg({
    'overall': ['mean', 'std', 'count'],
    'asin': 'nunique'
}).reset_index()
user_stats.columns = ['reviewerID', 'user_avg_rating', 'user_rating_std', 
                      'user_review_count', 'user_products_reviewed']
user_stats['user_rating_std'] = user_stats['user_rating_std'].fillna(0)

df = df.merge(user_stats, on='reviewerID', how='left')

In [54]:
# Product-level features
product_stats = df.groupby('asin').agg({
    'overall': ['mean', 'std', 'count'],
    'reviewerID': 'nunique'
}).reset_index()
product_stats.columns = ['asin', 'product_avg_rating', 'product_rating_std',
                         'product_review_count', 'product_unique_reviewers']
product_stats['product_rating_std'] = product_stats['product_rating_std'].fillna(0)

df = df.merge(product_stats, on='asin', how='left')

print(f"Average reviews per user: {df['user_review_count'].mean():.2f}")
print(f"Average reviews per product: {df['product_review_count'].mean():.2f}")
print(f"Users who reviewed multiple products: {(df['user_products_reviewed'] > 1).sum():,}")

Average reviews per user: 1.16
Average reviews per product: 20.80
Users who reviewed multiple products: 2,022


In [None]:
# Combine title and description
df['product_text'] = df['title_clean'].fillna('') + ' ' + df['description_clean'].fillna('')

# Create TF-IDF vectorizer for products
product_tfidf = TfidfVectorizer(
    max_features=500,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2, 
    max_df=0.8
)

product_texts = df.groupby('asin')['product_text'].first().reset_index()
product_tfidf_matrix = product_tfidf.fit_transform(product_texts['product_text'])

print(f"Matrix shape: {product_tfidf_matrix.shape}")
print(f"Vocabulary size: {len(product_tfidf.vocabulary_)}")
print(f"Sparsity: {(1 - product_tfidf_matrix.nnz / (product_tfidf_matrix.shape[0] * product_tfidf_matrix.shape[1]))*100:.2f}%")

product_to_idx = {asin: idx for idx, asin in enumerate(product_texts['asin'])}
idx_to_product = {idx: asin for asin, idx in product_to_idx.items()}

Matrix shape: (5231, 500)
Vocabulary size: 500
Sparsity: 94.57%


In [None]:
print("  Creating TF-IDF vectors for review text...")
review_tfidf = TfidfVectorizer(
    max_features=300,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.7
)

review_sample = df['reviewText_clean'].head(min(10000, len(df)))
review_tfidf_matrix = review_tfidf.fit_transform(review_sample)

print(f"  Sample size: {len(review_sample):,}")
print(f"  Matrix shape: {review_tfidf_matrix.shape}")
print(f"  Vocabulary size: {len(review_tfidf.vocabulary_)}")

  Creating TF-IDF vectors for review text...
  Sample size: 10,000
  Matrix shape: (10000, 300)
  Vocabulary size: 300


In [57]:
user_ids = df['reviewerID'].unique()
product_ids = df['asin'].unique()

user_to_idx = {user: idx for idx, user in enumerate(user_ids)}
product_to_idx_matrix = {product: idx for idx, product in enumerate(product_ids)}

print(f"\nMatrix dimensions:")
print(f"  Users: {len(user_ids):,}")
print(f"  Products: {len(product_ids):,}")
print(f"  Possible interactions: {len(user_ids) * len(product_ids):,}")
print(f"  Actual interactions: {len(df):,}")


Matrix dimensions:
  Users: 15,290
  Products: 5,231
  Possible interactions: 79,981,990
  Actual interactions: 16,379


In [None]:
sparsity = 1 - (len(df) / (len(user_ids) * len(product_ids)))
print(f"  Sparsity: {sparsity * 100:.2f}%")

df['user_idx'] = df['reviewerID'].map(user_to_idx)
df['product_idx'] = df['asin'].map(product_to_idx_matrix)

  Sparsity: 99.98%


In [None]:
test_size=0.2 
min_user_interactions=2

user_counts = df['reviewerID'].value_counts()
valid_users = user_counts[user_counts >= min_user_interactions].index
df_filtered = df[df['reviewerID'].isin(valid_users)].copy()

df_filtered = df_filtered.sort_values('reviewTime_dt').reset_index(drop=True)

In [None]:

train_data = []
test_data = []

for user_id in df_filtered['reviewerID'].unique():
    user_data = df_filtered[df_filtered['reviewerID'] == user_id].sort_values('reviewTime_dt')
    n_user_items = len(user_data)
    n_test = max(1, int(n_user_items * test_size))
    
    train_data.append(user_data.iloc[:-n_test])
    test_data.append(user_data.iloc[-n_test:])

train_df = pd.concat(train_data, ignore_index=True)
test_df = pd.concat(test_data, ignore_index=True)

print(f"\nSplit results:")
print(f"  Train: {len(train_df):,} ({len(train_df)/len(df_filtered)*100:.1f}%)")
print(f"  Test:  {len(test_df):,} ({len(test_df)/len(df_filtered)*100:.1f}%)")
print(f"\n  Train users: {train_df['reviewerID'].nunique():,}")
print(f"  Test users:  {test_df['reviewerID'].nunique():,}")
print(f"  Train products: {train_df['asin'].nunique():,}")
print(f"  Test products:  {test_df['asin'].nunique():,}")

train_products = set(train_df['asin'].unique())
test_products = set(test_df['asin'].unique())
overlap = len(train_products & test_products)
print(f"\n  Product overlap: {overlap:,} ({overlap/len(test_products)*100:.1f}% of test products)")



Split results:
  Train: 1,089 (53.9%)
  Test:  933 (46.1%)

  Train users: 933
  Test users:  933
  Train products: 823
  Test products:  684

  Product overlap: 188 (27.5% of test products)


### Model 1: Popularity-Based Recommender

In [None]:
class PopularityRecommender:
    def __init__(self):
        self.popular_items = None
    
    def fit(self, df):
        popularity = df.groupby('asin').agg({
            'overall': ['mean', 'count']
        }).reset_index()
        popularity.columns = ['asin', 'avg_rating', 'review_count']
        
        # Weighted score: (avg_rating * log(count))
        popularity['popularity_score'] = (
            popularity['avg_rating'] * np.log1p(popularity['review_count'])
        )
        
        self.popular_items = popularity.sort_values(
            'popularity_score', ascending=False
        )
        
        return self
    
    def recommend(self, user_id=None, n=10, exclude_items=None):
        recommendations = self.popular_items.head(n + (len(exclude_items) if exclude_items else 0))
        
        if exclude_items:
            recommendations = recommendations[~recommendations['asin'].isin(exclude_items)]
        
        return recommendations.head(n)['asin'].tolist()

pop_model = PopularityRecommender()
pop_model.fit(train_df)

print(f"\nTop 10 Popular Products:")
top_10 = pop_model.popular_items.head(10)
for idx, row in top_10.iterrows():
    product_name = train_df[train_df['asin'] == row['asin']]['title_clean'].iloc[0][:60]
    print(f"  {product_name}...")
    print(f"    Rating: {row['avg_rating']:.2f}, Reviews: {row['review_count']}, Score: {row['popularity_score']:.2f}")


Top 10 Popular Products:
  Roku 3 Streaming Media Player...
    Rating: 4.88, Reviews: 8, Score: 10.71
  Belkin 3-Outlet Mini Travel Swivel Charger Surge Protector w...
    Rating: 4.62, Reviews: 8, Score: 10.16
  Transcend 4 GB Class 6 SDHC Flash Memory Card TS4GSDHC6...
    Rating: 4.71, Reviews: 7, Score: 9.80
  Crucial m4 64GB 2.5-Inch (9.5mm) SATA 6Gb/s Solid State Driv...
    Rating: 5.00, Reviews: 6, Score: 9.73
  Garmin nuvi 350 3.5-Inch Portable GPS Navigator (Discontinue...
    Rating: 4.22, Reviews: 9, Score: 9.72
  AmazonBasics Hard Carrying Case for My Passport Essential - ...
    Rating: 4.57, Reviews: 7, Score: 9.51
  Linksys E1200 Wireless-N300 Router...
    Rating: 4.11, Reviews: 9, Score: 9.47
  eneloop SEC-CSPACER4PK C Size Spacers for use with AA batter...
    Rating: 4.50, Reviews: 6, Score: 8.76
  Logitech Wireless Touch Keyboard K400 with Built-In Multi-To...
    Rating: 4.80, Reviews: 5, Score: 8.60
  Flip MinoHD Video Camera - Brushed Metal, 8 GB, 2 Hours (2nd

### Model 2: Collaborative Filtering (SVD)

In [None]:
class MatrixFactorizationSVD:

    def __init__(self, n_factors=5):
        self.n_factors = 5
        self.user_factors = None
        self.item_factors = None
        self.global_mean = None
        self.user_bias = None
        self.item_bias = None
        self.user_to_idx = None
        self.item_to_idx = None
        self.idx_to_user = None
        self.idx_to_item = None
    
    def fit(self, train_df, user_col='reviewerID', item_col='asin', rating_col='overall'):
        """
        Train SVD model on training data
        """
        print("\nPreparing data for SVD...")
        
        # Create user and item mappings
        users = train_df[user_col].unique()
        items = train_df[item_col].unique()
        
        self.user_to_idx = {user: idx for idx, user in enumerate(users)}
        self.item_to_idx = {item: idx for idx, item in enumerate(items)}
        self.idx_to_user = {idx: user for user, idx in self.user_to_idx.items()}
        self.idx_to_item = {idx: item for item, idx in self.item_to_idx.items()}
        
        n_users = len(users)
        n_items = len(items)
        
        print(f"  Users: {n_users:,}")
        print(f"  Items: {n_items:,}")
        print(f"  Ratings: {len(train_df):,}")
        
        # Create user-item rating matrix
        user_indices = train_df[user_col].map(self.user_to_idx).values
        item_indices = train_df[item_col].map(self.item_to_idx).values
        ratings = train_df[rating_col].values
        
        rating_matrix = csr_matrix(
            (ratings, (user_indices, item_indices)),
            shape=(n_users, n_items)
        )
        
        # Calculate global mean and biases
        self.global_mean = ratings.mean()
        
        # User biases
        user_ratings = train_df.groupby(user_col)[rating_col].mean()
        self.user_bias = {user: rating - self.global_mean 
                         for user, rating in user_ratings.items()}
        
        # Item biases
        item_ratings = train_df.groupby(item_col)[rating_col].mean()
        self.item_bias = {item: rating - self.global_mean 
                         for item, rating in item_ratings.items()}
        
        # Perform SVD
        print("\nTraining SVD model...")
        print(f"  Latent factors: {self.n_factors}")
        
        U, sigma, Vt = svds(rating_matrix, k=self.n_factors)
        
        self.user_factors = U
        self.item_factors = Vt.T
        self.sigma = sigma
        
        print("\n SVD model trained")
        print(f"  User factors shape: {self.user_factors.shape}")
        print(f"  Item factors shape: {self.item_factors.shape}")
        
        return self
    
    def predict(self, user_id, item_id):
        """
        Predict rating for a user-item pair
        """
        if user_id not in self.user_to_idx:
            return self.global_mean + self.item_bias.get(item_id, 0)
        
        if item_id not in self.item_to_idx:
            return self.global_mean + self.user_bias.get(user_id, 0)
        
        user_idx = self.user_to_idx[user_id]
        item_idx = self.item_to_idx[item_id]
        
        baseline = self.global_mean + \
                  self.user_bias.get(user_id, 0) + \
                  self.item_bias.get(item_id, 0)
        
        interaction = np.dot(self.user_factors[user_idx], self.item_factors[item_idx])
        
        prediction = baseline + interaction
        
        return np.clip(prediction, 1, 5)

print("\nTraining collaborative filtering model...")
svd_model = MatrixFactorizationSVD(n_factors=50)
svd_model.fit(train_df)


Training collaborative filtering model...

Preparing data for SVD...
  Users: 933
  Items: 823
  Ratings: 1,089

Training SVD model...
  Latent factors: 5

 SVD model trained
  User factors shape: (933, 5)
  Item factors shape: (823, 5)


<__main__.MatrixFactorizationSVD at 0x14f878640>

In [None]:
def get_svd_recommendations(user_id, n=10, exclude_items=None):
    """
    Get top N recommendations for a user using SVD
    """
    all_products = train_df['asin'].unique()
    
    # Get products user hasn't interacted with
    user_items = set(train_df[train_df['reviewerID'] == user_id]['asin'])
    unseen_products = [p for p in all_products if p not in user_items]
    
    if exclude_items:
        unseen_products = [p for p in unseen_products if p not in exclude_items]
    
    predictions = []
    for product_id in unseen_products:
        pred_rating = svd_model.predict(user_id, product_id)
        predictions.append((product_id, pred_rating))
    
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    return [p[0] for p in predictions[:n]]

sample_user = train_df['reviewerID'].iloc[0]
svd_recs = get_svd_recommendations(sample_user, n=10)

print(f"\nTop 10 recommendations for user '{sample_user}':")
for i, asin in enumerate(svd_recs, 1):
    if asin in df['asin'].values:
        product_name = df[df['asin'] == asin]['title_clean'].iloc[0][:60]
        pred_rating = svd_model.predict(sample_user, asin)
        print(f"  {i}. {product_name}... (predicted rating: {pred_rating:.2f})")


Top 10 recommendations for user 'A3TUZOJZM9008Y':
  1. Cisco-Linksys PS2KVMSK ProConnect 2-Port Compact KVM Switch ... (predicted rating: 5.00)
  2. Lexar Media 128 MB Memory Stick... (predicted rating: 5.00)
  3. Canon EF 135mm f/2L USM Lens for Canon SLR Cameras... (predicted rating: 5.00)
  4. Winegard FV-HD30 FreeVision HDTV Antenna (Discontinued by Ma... (predicted rating: 5.00)
  5. Kanex HDMIMINI6feet High Speed Mini HDMI Cable (6 feet)... (predicted rating: 5.00)
  6. SanDisk SDSM-128-A10 SmartMedia 128 MB... (predicted rating: 5.00)
  7. Canon EOS-1Ds 11.1MP Digital SLR Camera (Body Only)... (predicted rating: 5.00)
  8. Apple iPod shuffle 512 MB White (1st Generation) (Discontinu... (predicted rating: 5.00)
  9. Logitech Quickcam Orbit WebCam... (predicted rating: 5.00)
  10. Logitech Harmony H-688 Universal Remote Control (Silver) (Di... (predicted rating: 5.00)


### Model 3: Content-Based Filtering (KNN on TF-IDF)

In [None]:
class ContentBasedRecommender:
    def __init__(self, tfidf_matrix, product_mapping):
        self.tfidf_matrix = tfidf_matrix
        self.product_mapping = product_mapping
        self.knn_model = None
    
    def fit(self):
        self.knn_model = NearestNeighbors(
            n_neighbors=20,
            metric='cosine',
            algorithm='brute'
        )
        self.knn_model.fit(self.tfidf_matrix)
        return self
    
    def recommend_similar(self, product_id, n=10):
        if product_id not in self.product_mapping:
            return []
        
        product_idx = self.product_mapping[product_id]
        product_vector = self.tfidf_matrix[product_idx]
        
        # Find k nearest neighbors
        distances, indices = self.knn_model.kneighbors(
            product_vector, 
            n_neighbors=n+1
        )
        
        similar_indices = indices[0][1:]
        
        idx_to_product_local = {idx: pid for pid, idx in self.product_mapping.items()}
        similar_products = [idx_to_product_local[idx] for idx in similar_indices]
        
        return similar_products

content_model = ContentBasedRecommender(product_tfidf_matrix, product_to_idx)
content_model.fit()

print(f"  Using TF-IDF matrix: {product_tfidf_matrix.shape}")
print(f"  KNN neighbors: 20")

  Using TF-IDF matrix: (5231, 500)
  KNN neighbors: 20


In [None]:
sample_product = df['asin'].iloc[0]
sample_title = df[df['asin'] == sample_product]['title_clean'].iloc[0][:60]

content_recs = content_model.recommend_similar(sample_product, n=10)

print(f"\nFor product: '{sample_title}...'")
print(f"\nTop 10 similar products:")
for i, asin in enumerate(content_recs, 1):
    if asin in df['asin'].values:
        product_name = df[df['asin'] == asin]['title_clean'].iloc[0][:60]
        print(f"  {i}. {product_name}...")


For product: 'Barnes & Noble Nook eReader - no 3G...'

Top 10 similar products:
  1. NETGEAR AC1600 Dual Band Wi-Fi Gigabit Router (R6250)...
  2. Honeywell L5100-WIFI - L5100 Wifi Module for Lynx Touch 5100...
  3. Patriot Box Office Wireless N USB Adapter PCBOWAU2-N...
  4. Grace Digital Wi-Fi Music Player with 3.5-Inch Color Display...
  5. Amped Wireless High Power 1000mW Wi-Fi Signal Booster (SB100...
  6. TRENDnet N300 Wireless High Power Easy-N Range Stand Alone W...
  7. NETGEAR N300 Wi-Fi Range Extender - Wall Plug Version (WN300...
  8. D-Link Wireless Dual Band N 300+ Mbps Wi-Fi Gigabit Range Ex...
  9. NETGEAR Dual Band Wi-Fi Range Extender - Desktop Version wit...
  10. MSI X320-037US 13.4-Inch Laptop - Black...


In [None]:
def evaluate_recommendations(model, train_df, test_df, model_type='svd', 
                            k_values=[5, 10, 20], min_rating_threshold=3.5):
    
    test_users = test_df['reviewerID'].unique()
    print(f"\nEvaluating on {len(test_users):,} users")
    
    results = {
        'predictions': [],
        'actuals': [],
        'hit_rates': {k: [] for k in k_values},
        'precisions': {k: [] for k in k_values},
        'recalls': {k: [] for k in k_values},
        'ndcg': {k: [] for k in k_values}
    }
    
    users_evaluated = 0
    
    for user_id in test_users:
        user_test = test_df[test_df['reviewerID'] == user_id]
        user_train = train_df[train_df['reviewerID'] == user_id]
        
        if len(user_train) == 0:
            continue
        
        try:
            if model_type == 'svd':
                train_items = set(user_train['asin'])
                all_train_items = set(train_df['asin'].unique())
                candidate_items = list(all_train_items - train_items)
                
                predictions = []
                for item in candidate_items:
                    pred_rating = model.predict(user_id, item)
                    predictions.append((item, pred_rating))
                
                predictions.sort(key=lambda x: x[1], reverse=True)
                recommended_items = [item for item, rating in predictions[:max(k_values)]]
                
            elif model_type == 'popularity':
                train_items = set(user_train['asin'])
                recommended_items = model.recommend(n=max(k_values), exclude_items=train_items)
                
            else:
                seed_items = user_train[user_train['overall'] >= 4.0]['asin'].values
                if len(seed_items) == 0:
                    continue
                seed_item = seed_items[-1]
                recommended_items = model.recommend_similar(seed_item, n=max(k_values))
            
        except Exception as e:
            continue
        
        relevant_items = set(user_test[user_test['overall'] >= min_rating_threshold]['asin'])
        
        if len(relevant_items) == 0:
            continue
        
        for k in k_values:
            recs_at_k = set(recommended_items[:k])
            
            hit = 1 if len(recs_at_k & relevant_items) > 0 else 0
            results['hit_rates'][k].append(hit)
            
            # Precision@K
            precision = len(recs_at_k & relevant_items) / k if k > 0 else 0
            results['precisions'][k].append(precision)
            
            # Recall@K
            recall = len(recs_at_k & relevant_items) / len(relevant_items)
            results['recalls'][k].append(recall)
            
            # NDCG@K
            dcg = 0
            idcg = sum([1/math.log2(i+2) for i in range(min(k, len(relevant_items)))])
            for i, item in enumerate(recommended_items[:k]):
                if item in relevant_items:
                    dcg += 1 / math.log2(i + 2)
            ndcg = dcg / idcg if idcg > 0 else 0
            results['ndcg'][k].append(ndcg)
        
        if model_type == 'svd':
            for _, row in user_test.iterrows():
                pred = model.predict(user_id, row['asin'])
                results['predictions'].append(pred)
                results['actuals'].append(row['overall'])
        
        users_evaluated += 1
    
    print(f"\nSuccessfully evaluated {users_evaluated} users")
    
    if model_type == 'svd' and len(results['predictions']) > 0:
        rmse = math.sqrt(mean_squared_error(results['actuals'], results['predictions']))
        mae = mean_absolute_error(results['actuals'], results['predictions'])
        print(f"\nRating Prediction Metrics:")
        print(f"  RMSE: {rmse:.4f}")
        print(f"  MAE:  {mae:.4f}")
    
    print(f"\nRanking Metrics:")
    for k in k_values:
        if len(results['hit_rates'][k]) > 0:
            hit_rate = np.mean(results['hit_rates'][k])
            precision = np.mean(results['precisions'][k])
            recall = np.mean(results['recalls'][k])
            ndcg = np.mean(results['ndcg'][k])
            
            print(f"\n  @{k}:")
            print(f"    Hit Rate:  {hit_rate:.4f}")
            print(f"    Precision: {precision:.4f}")
            print(f"    Recall:    {recall:.4f}")
            print(f"    NDCG:      {ndcg:.4f}")
    
    return results

In [None]:
svd_model = MatrixFactorizationSVD(n_factors=50)
svd_model.fit(train_df)

pop_model = PopularityRecommender()
pop_model.fit(train_df)

svd_results = evaluate_recommendations(
    svd_model, 
    train_df, 
    test_df, 
    model_type='svd',
    k_values=[5, 10, 20],
    min_rating_threshold=3.5
)

pop_results = evaluate_recommendations(
    pop_model,
    train_df,
    test_df,
    model_type='popularity',
    k_values=[5, 10, 20],
    min_rating_threshold=3.5
)


Preparing data for SVD...
  Users: 933
  Items: 823
  Ratings: 1,089

Training SVD model...
  Latent factors: 5

 SVD model trained
  User factors shape: (933, 5)
  Item factors shape: (823, 5)

Evaluating on 933 users

Successfully evaluated 745 users

Rating Prediction Metrics:
  RMSE: 1.1020
  MAE:  0.6658

Ranking Metrics:

  @5:
    Hit Rate:  0.0027
    Precision: 0.0005
    Recall:    0.0027
    NDCG:      0.0022

  @10:
    Hit Rate:  0.0054
    Precision: 0.0005
    Recall:    0.0054
    NDCG:      0.0030

  @20:
    Hit Rate:  0.0161
    Precision: 0.0008
    Recall:    0.0161
    NDCG:      0.0056

Evaluating on 933 users

Successfully evaluated 745 users

Ranking Metrics:

  @5:
    Hit Rate:  0.0255
    Precision: 0.0051
    Recall:    0.0255
    NDCG:      0.0185

  @10:
    Hit Rate:  0.0644
    Precision: 0.0064
    Recall:    0.0644
    NDCG:      0.0309

  @20:
    Hit Rate:  0.0926
    Precision: 0.0046
    Recall:    0.0926
    NDCG:      0.0382


In [98]:

import pickle
import os

os.makedirs('models', exist_ok=True)

print("\nSaving models...")

with open('models/popularity_model.pkl', 'wb') as f:
    pickle.dump(pop_model, f)
print("  Saved: models/popularity_model.pkl")

with open('models/svd_model.pkl', 'wb') as f:
    pickle.dump(svd_model, f)
print("  Saved: models/svd_model.pkl")

with open('models/content_model.pkl', 'wb') as f:
    pickle.dump(content_model, f)
print("  Saved: models/content_model.pkl")


Saving models...
  Saved: models/popularity_model.pkl
  Saved: models/svd_model.pkl
  Saved: models/content_model.pkl


In [99]:
print("\nSaving TF-IDF artifacts...")
with open('models/product_tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(product_tfidf, f)
print("  Saved: models/product_tfidf_vectorizer.pkl")

with open('models/product_tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(product_tfidf_matrix, f)
print("  Saved: models/product_tfidf_matrix.pkl")


Saving TF-IDF artifacts...
  Saved: models/product_tfidf_vectorizer.pkl
  Saved: models/product_tfidf_matrix.pkl


In [100]:
with open('models/user_to_idx.pkl', 'wb') as f:
    pickle.dump(user_to_idx, f)
print("  Saved: models/user_to_idx.pkl")

with open('models/product_to_idx.pkl', 'wb') as f:
    pickle.dump(product_to_idx, f)
print("  Saved: models/product_to_idx.pkl")

  Saved: models/user_to_idx.pkl
  Saved: models/product_to_idx.pkl


In [101]:
print("\nSaving processed datasets...")
df.to_csv('data/processed/features_complete.csv', index=False)
print("  Saved: data/processed/features_complete.csv")

train_df.to_csv('data/processed/train_set.csv', index=False)
test_df.to_csv('data/processed/test_set.csv', index=False)
print("  Saved: data/processed/train_set.csv")
print("  Saved: data/processed/test_set.csv")


Saving processed datasets...
  Saved: data/processed/features_complete.csv
  Saved: data/processed/train_set.csv
  Saved: data/processed/test_set.csv
