In [3]:
import pandas as pd

# Build a user x product rating matrix from the raw product-review CSV.
#
# The cleaning and pivoting steps live in small pure functions so they can be
# exercised on an in-memory DataFrame; the file I/O stays in the guarded
# script section at the bottom. When this cell is run in a notebook or as a
# script, __name__ is "__main__", so the section executes and defines the
# usual globals (df, df_clean, user_item_matrix).

# Maps the dataset's own column names to the standard names used from here on.
essential_cols = {
    'reviews.username': 'userId',
    'id': 'productId',
    'reviews.rating': 'rating'
}


def clean_reviews(raw):
    """Return a new frame with only userId, productId and rating columns.

    Columns are selected and renamed according to ``essential_cols``.
    Ratings are coerced to numbers; values that cannot be parsed become NaN.
    Rows missing any of the three fields are then dropped.

    ``raw`` itself is not modified. A ``KeyError`` is raised if any of the
    source columns named in ``essential_cols`` is absent from ``raw``.
    """
    return (
        raw[list(essential_cols)]
        .rename(columns=essential_cols)
        # A stray non-numeric rating would otherwise make the whole column
        # object-typed and break the mean aggregation in the pivot step.
        .assign(rating=lambda frame: pd.to_numeric(frame['rating'], errors='coerce'))
        # productId is included because pivot_table silently discards rows
        # whose column key is NaN; dropping them here keeps the reported row
        # count equal to what actually enters the matrix.
        .dropna(subset=['userId', 'productId', 'rating'])
    )


def build_user_item_matrix(ratings):
    """Pivot tidy ratings into a userId x productId matrix.

    Repeated ratings by the same user for the same product are averaged.
    User/product pairs with no rating are filled with 0.
    """
    # pivot_table (rather than pivot) tolerates duplicate (user, product) pairs.
    return ratings.pivot_table(
        index='userId',
        columns='productId',
        values='rating',
        aggfunc='mean',
    ).fillna(0)


if __name__ == "__main__":
    # 1. LOAD DATA
    # Only the first 10,000 rows of the file are read.
    df = pd.read_csv('products.csv', nrows=10000)
    print("Dataset loaded successfully!")

    # 2. DATA PREPARATION (Cleaning & Structuring)
    df_clean = clean_reviews(df)
    print(f"Cleaned data has {df_clean.shape[0]} rows and is now structured.")

    # 3. BUILD THE MATRIX (Evaluation Criteria)
    user_item_matrix = build_user_item_matrix(df_clean)

    # 4. SAVE THE OUTPUT
    user_item_matrix.to_csv('user_item_matrix.csv')
    print("Success! 'user_item_matrix.csv' has been created in your sidebar.")
    print(f"Matrix Size: {user_item_matrix.shape[0]} Users x {user_item_matrix.shape[1]} Products")

Dataset loaded successfully!
Cleaned data has 1177 rows and is now structured.
Success! 'user_item_matrix.csv' has been created in your sidebar.
Matrix Size: 836 Users x 66 Products
