In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw train/test rating files.
train_path = "data/amazon_elec_500k/train_ratings.csv"
test_path = "data/amazon_elec_500k/test_ratings.csv"
train_raw = pd.read_csv(train_path)
test_raw = pd.read_csv(test_path)

# Report (rows, columns) of each split in a single call.
print(
    f"Train dataset shape: {train_raw.shape}",
    f"Test dataset shape: {test_raw.shape}",
    sep="\n",
)

In [None]:
# Keep only the three columns used downstream; .copy() detaches the
# subsets from the raw frames so later edits do not touch the originals.
rating_columns = ['user_id', 'item_id', 'rating']
train_subset = train_raw.loc[:, rating_columns].copy()
test_subset = test_raw.loc[:, rating_columns].copy()

print(
    f"Train subset shape: {train_subset.shape}",
    f"Test subset shape: {test_subset.shape}",
    sep="\n",
)

In [None]:
# Stack the train rows on top of the test rows with a fresh 0..n-1 index.
split_frames = [train_subset, test_subset]
merged_ratings = pd.concat(split_frames, ignore_index=True)

merged_user_count = merged_ratings['user_id'].nunique()
merged_item_count = merged_ratings['item_id'].nunique()

print(
    f"Merged dataset shape: {merged_ratings.shape}",
    f"Unique users: {merged_user_count}",
    f"Unique items: {merged_item_count}",
    sep="\n",
)

In [None]:
# Check for duplicate (user_id, item_id) pairs, both overall and across
# the train/test boundary.
pair_cols = ['user_id', 'item_id']

# keep=False flags EVERY row whose pair occurs more than once, so
# len(duplicates) is a row count, not a count of distinct pairs.
duplicate_mask = merged_ratings.duplicated(subset=pair_cols, keep=False)
duplicates = merged_ratings[duplicate_mask]

if len(duplicates) > 0:
    n_duplicate_pairs = len(duplicates.drop_duplicates(subset=pair_cols))
    print(f"Found {len(duplicates)} rows covering {n_duplicate_pairs} duplicate user-item pairs")

    # merged_ratings has no column recording which split a row came from
    # (only user_id, item_id, rating), so the train/test overlap is computed
    # from the two source frames: de-duplicate the pairs within each split,
    # then inner-join them so each shared pair is counted exactly once.
    train_pairs = train_subset[pair_cols].drop_duplicates()
    test_pairs = test_subset[pair_cols].drop_duplicates()
    cross_set_dups = len(train_pairs.merge(test_pairs, on=pair_cols, how='inner'))
    print(f"User-item pairs in both train and test: {cross_set_dups}")
else:
    print("No duplicates found")

In [None]:
# Persist the merged ratings without the positional index column.
merged_path = "data/amazon_elec_500k/ratings.csv"
merged_ratings.to_csv(merged_path, index=False)

saved_row_count = len(merged_ratings)
print(f"Saved merged ratings to ratings.csv with {saved_row_count} rows")

In [None]:
# Integrity checks on the merged frame: no missing values in any of the
# three columns, ratings inside 1..5, and no rows lost or gained by the merge.
merged_null_checks = (
    ('user_id', "Missing user IDs found"),
    ('item_id', "Missing item IDs found"),
    ('rating', "Missing ratings found"),
)
for column, message in merged_null_checks:
    assert not merged_ratings[column].isna().any(), message

assert merged_ratings['rating'].between(1, 5).all(), "Rating values outside expected range (1-5)"

expected_row_count = len(train_subset) + len(test_subset)
assert len(merged_ratings) == expected_row_count, "Row count mismatch after merging"
print("Data integrity checks passed!")

# 2. EDA

In [None]:
import pandas as pd
import numpy as np
# Reload the merged ratings file written earlier in the notebook.
ratings_path = "data/amazon_elec_500k/ratings.csv"
ratings = pd.read_csv(ratings_path)
print(ratings.shape)

In [None]:
# Number of distinct users and items in the ratings table.
print(
    f"Unique users: {ratings['user_id'].nunique()}",
    f"Unique items: {ratings['item_id'].nunique()}",
    sep="\n",
)

In [None]:
# Number of rating rows per user and per item
# (value_counts returns them sorted by frequency, most frequent first).
user_counts = ratings['user_id'].value_counts()
item_counts = ratings['item_id'].value_counts()

# Summary statistics of the per-user interaction counts.
print(
    f"Interactions per user (mean): {user_counts.mean():.2f}",
    f"Interactions per user (median): {user_counts.median():.2f}",
    f"Interactions per user (min): {user_counts.min()}",
    f"Interactions per user (max): {user_counts.max()}",
    sep="\n",
)

# Summary statistics of the per-item interaction counts.
print(
    f"Interactions per item (mean): {item_counts.mean():.2f}",
    f"Interactions per item (median): {item_counts.median():.2f}",
    f"Interactions per item (min): {item_counts.min()}",
    f"Interactions per item (max): {item_counts.max()}",
    sep="\n",
)

In [None]:
# Density of the user x item utility matrix: rating rows divided by the
# number of cells in a (distinct users) x (distinct items) grid.
n_users, n_items = (ratings[column].nunique() for column in ('user_id', 'item_id'))
n_actual_interactions = len(ratings)
n_possible_interactions = n_users * n_items

density = n_actual_interactions / n_possible_interactions
density_percent = density * 100
print(f"Utility matrix density: {density:.6f} ({density_percent:.4f}%)")

In [None]:
# Mean rating of every user and of every item.
rating_values = ratings['rating']
avg_rating_per_user = rating_values.groupby(ratings['user_id']).mean()
avg_rating_per_item = rating_values.groupby(ratings['item_id']).mean()

# Spread of the per-user averages.
print(
    f"Average rating per user (mean): {avg_rating_per_user.mean():.2f}",
    f"Average rating per user (std): {avg_rating_per_user.std():.2f}",
    f"Average rating per user (min): {avg_rating_per_user.min():.2f}",
    f"Average rating per user (max): {avg_rating_per_user.max():.2f}",
    sep="\n",
)

# Spread of the per-item averages.
print(
    f"Average rating per item (mean): {avg_rating_per_item.mean():.2f}",
    f"Average rating per item (std): {avg_rating_per_item.std():.2f}",
    f"Average rating per item (min): {avg_rating_per_item.min():.2f}",
    f"Average rating per item (max): {avg_rating_per_item.max():.2f}",
    sep="\n",
)

In [None]:
# Frequency of each rating value, ordered by the rating value itself.
rating_counts = ratings['rating'].value_counts().sort_index()
total_ratings = len(ratings)

# One formatted line per rating value: absolute count plus share of all rows.
distribution_lines = [
    f"Rating {rating}: {count} ({count / total_ratings * 100:.2f}%)"
    for rating, count in rating_counts.items()
]
print("Rating distribution:", *distribution_lines, sep="\n")

In [None]:
# Count the users and items that have fewer than five rating rows.
low_activity_users = (user_counts < 5).sum()
low_activity_items = (item_counts < 5).sum()

print(
    f"Users with less than 5 ratings: {low_activity_users} ({low_activity_users/n_users*100:.2f}%)",
    f"Items with less than 5 ratings: {low_activity_items} ({low_activity_items/n_items*100:.2f}%)",
    sep="\n",
)

In [None]:
# Cold-start indicator: users and items that appear in exactly one rating row.
single_interaction_users = user_counts.eq(1).sum()
single_interaction_items = item_counts.eq(1).sum()

print(
    f"Users with exactly 1 rating: {single_interaction_users} ({single_interaction_users/n_users*100:.2f}%)",
    f"Items with exactly 1 rating: {single_interaction_items} ({single_interaction_items/n_items*100:.2f}%)",
    sep="\n",
)

In [None]:
# Sanity checks on the loaded ratings and on the statistics derived from them.
eda_null_checks = (
    ('user_id', "Missing user IDs found"),
    ('item_id', "Missing item IDs found"),
    ('rating', "Missing ratings found"),
)
for column, message in eda_null_checks:
    assert not ratings[column].isna().any(), message

assert ratings['rating'].between(1, 5).all(), "Rating values outside expected range (1-5)"
assert n_users > 0, "No users found in dataset"
assert n_items > 0, "No items found in dataset"
assert n_actual_interactions == len(ratings), "Interaction count mismatch"

# Every per-user and per-item mean must itself lie inside the 1..5 range.
for averages, message in (
    (avg_rating_per_user, "Invalid user average ratings"),
    (avg_rating_per_item, "Invalid item average ratings"),
):
    assert averages.between(1, 5).all(), message

print("All data integrity checks passed!")

In [None]:
# Distinct raw IDs in order of first appearance.
unique_users = ratings['user_id'].unique()
unique_items = ratings['item_id'].unique()

# Map each raw ID to its position in the distinct-ID array (0, 1, 2, ...).
user_id_map = dict(zip(unique_users, range(len(unique_users))))
item_id_map = dict(zip(unique_items, range(len(unique_items))))

print(
    f"Number of unique users: {len(user_id_map)}",
    f"Number of unique items: {len(item_id_map)}",
    sep="\n",
)

In [None]:
# Add the contiguous index columns next to the raw ID columns.
for raw_column, idx_column, id_map in (
    ('user_id', 'user_idx', user_id_map),
    ('item_id', 'item_idx', item_id_map),
):
    ratings[idx_column] = ratings[raw_column].map(id_map)

In [None]:
# Order rows by user index, renumber the row index, and keep only the
# index-based columns plus the rating.
output_columns = ['user_idx', 'item_idx', 'rating']
mapped_ratings = (
    ratings
    .sort_values(by='user_idx')
    .reset_index(drop=True)
    .loc[:, output_columns]
)

print(f"Mapped ratings shape: {mapped_ratings.shape}")
mapped_ratings.head()

In [None]:
# Write the index-mapped ratings without the positional index column.
cleaned_path = "data/amazon_elec_500k/cleaned_ratings.csv"
mapped_ratings.to_csv(cleaned_path, index=False)