# Data Preprocessing - Amazon Beauty Ratings

This notebook applies a preprocessing pipeline to prepare the data for a recommendation system:
- Handle missing values
- Detect and handle outliers
- Normalization and standardization
- Feature engineering
- Filter sparse data


## 1. Setup and Load Data


In [None]:
import sys
import os
sys.path.insert(0, os.path.abspath('../src'))

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_loader import load_csv_numpy, validate_data, get_basic_stats
from data_processing import (
    detect_missing_values, impute_missing_mean, impute_missing_median,
    detect_outliers_iqr, detect_outliers_zscore, remove_outliers,
    normalize_minmax, normalize_log, standardize_zscore,
    unix_to_datetime_features, filter_by_min_ratings
)
from feature_engineering import (
    compute_user_stats, compute_product_stats,
    compute_recency_score, compute_rating_velocity
)

# Seed NumPy's legacy global RNG so any np.random-based step in this notebook is repeatable.
np.random.seed(42)
# Notebook-wide plotting defaults: seaborn's 'whitegrid' style and a (12, 6) default figure size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)


In [None]:
# Location of the raw ratings CSV, relative to the notebook directory.
raw_csv_path = '../data/raw/ratings_Beauty.csv'

# The project loader hands back the records together with the column header.
data, header = load_csv_numpy(raw_csv_path)

print(f"Original data shape: {data.shape}")
print(f"Columns: {header}")


## 2. Missing Values Handling


In [None]:
# Per-field missing-value masks reported by the project helper.
missing_report = detect_missing_values(data)
print("Missing values check:")
for field, missing_mask in missing_report.items():
    missing_count = np.sum(missing_mask)
    print(f"  {field}: {missing_count} missing values")

# Dataset-wide null summary.
validation = validate_data(data)
total_nulls = validation['total_nulls']
print(f"\nTotal nulls: {total_nulls}")
print(f"Null percentage: {validation['null_percentage']:.2f}%")

if not (total_nulls > 0):
    print("\nNo missing values found. Data is clean!")
else:
    print("\nApplying mean imputation for numeric columns...")
    # Only numeric fields of the structured array are imputed; other fields are left untouched.
    numeric_fields = [
        name for name in data.dtype.names
        if np.issubdtype(data[name].dtype, np.number)
    ]
    for field in numeric_fields:
        data[field] = impute_missing_mean(data, field)


## 3. Outlier Detection and Handling


In [None]:
ratings = data['Rating']

print("=== Outlier Detection ===\n")

# Boolean outlier masks from the two detection methods (IQR fences and |z| > 3).
outliers_iqr = detect_outliers_iqr(ratings)
outliers_zscore = detect_outliers_zscore(ratings, threshold=3.0)

# Count flagged entries once and reuse the counts for both the total and the percentage.
n_ratings = len(ratings)
n_iqr = np.sum(outliers_iqr)
print(f"IQR method: {n_iqr} outliers ({n_iqr/n_ratings*100:.2f}%)")
n_zscore = np.sum(outliers_zscore)
print(f"Z-score method: {n_zscore} outliers ({n_zscore/n_ratings*100:.2f}%)")

print(f"\nRating statistics:")
# (label, reducer, format spec) for each summary line, in display order.
summary_rows = (
    ("Mean", np.mean, ".4f"),
    ("Std", np.std, ".4f"),
    ("Min", np.min, ".1f"),
    ("Max", np.max, ".1f"),
)
for label, reducer, spec in summary_rows:
    print(f"  {label}: {format(reducer(ratings), spec)}")

# Ratings are deliberately left untouched: flagged values are still legitimate opinions.
print("\nNote: For ratings (1-5 scale), outliers are expected and may be valid.")
print("We will NOT remove outliers as they represent genuine user opinions.")


## 4. Normalization and Standardization


In [None]:
print("=== Normalization Examples ===\n")

# Demonstration subset: the first 1,000 ratings (a head slice, not a random draw).
ratings_sample = ratings[:1000]

# Three alternative rescalings of the same subset, shown for comparison only.
minmax_normalized = normalize_minmax(ratings_sample, feature_min=0, feature_max=1)
log_normalized = normalize_log(ratings_sample, base='e')
zscore_standardized = standardize_zscore(ratings_sample)

print("Original ratings (sample):")
sample_lo, sample_hi = np.min(ratings_sample), np.max(ratings_sample)
print(f"  Range: [{sample_lo:.2f}, {sample_hi:.2f}]")
sample_mean, sample_std = np.mean(ratings_sample), np.std(ratings_sample)
print(f"  Mean: {sample_mean:.4f}, Std: {sample_std:.4f}")

print("\nMin-Max Normalized:")
minmax_lo, minmax_hi = np.min(minmax_normalized), np.max(minmax_normalized)
print(f"  Range: [{minmax_lo:.4f}, {minmax_hi:.4f}]")

print("\nLog Normalized:")
log_lo, log_hi = np.min(log_normalized), np.max(log_normalized)
print(f"  Range: [{log_lo:.4f}, {log_hi:.4f}]")

print("\nZ-score Standardized:")
z_mean, z_std = np.mean(zscore_standardized), np.std(zscore_standardized)
print(f"  Mean: {z_mean:.4f}, Std: {z_std:.4f}")

# None of the rescaled arrays is written back to `data`.
print("\nNote: For recommendation systems, we typically keep original ratings.")
print("Normalization may be applied to derived features if needed.")


## 5. Timestamp Feature Engineering


In [None]:
timestamps = data['Timestamp']
# Calendar components derived from the raw timestamps; read below via 'year', 'month', 'weekday'.
datetime_features = unix_to_datetime_features(timestamps)

print("=== Extracted Datetime Features ===\n")
years = datetime_features['year']
print(f"Year range: {np.min(years)} - {np.max(years)}")
months = datetime_features['month']
print(f"Month range: {np.min(months)} - {np.max(months)}")
print(f"Weekday distribution:")
unique_weekdays, weekday_counts = np.unique(datetime_features['weekday'], return_counts=True)
# One line per distinct weekday value with its count and share of all ratings.
for idx in range(min(len(unique_weekdays), len(weekday_counts))):
    wd = unique_weekdays[idx]
    count = weekday_counts[idx]
    share = count/len(timestamps)*100
    print(f"  Weekday {wd}: {count:,} ratings ({share:.2f}%)")


## 6. Filter Sparse Data


In [None]:
print("=== Filtering by Minimum Ratings ===\n")
n_before = len(data)
print(f"Original data size: {n_before:,} ratings")

# Apply the thresholds of at least 5 ratings per user and per product.
filtered_data = filter_by_min_ratings(data, min_user_ratings=5, min_product_ratings=5)

n_after = len(filtered_data)
print(f"Filtered data size: {n_after:,} ratings")
reduction_pct = (1 - n_after/n_before)*100
print(f"Reduction: {reduction_pct:.2f}%")

remaining_users = np.unique(filtered_data['UserId'])
print(f"\nUnique users: {len(remaining_users):,}")
remaining_products = np.unique(filtered_data['ProductId'])
print(f"Unique products: {len(remaining_products):,}")

# NOTE(review): from here on `data` refers to the filtered records, so re-running this
# cell without reloading reports the already-filtered size as "Original data size".
data = filtered_data


## 7. Compute User and Product Features


In [None]:
def _print_stats_summary(stats, id_key, total_label, count_label, rating_label):
    """Print entity count, mean ratings per entity and mean average rating for `stats`."""
    print(f"{total_label}: {len(stats[id_key]):,}")
    print(f"{count_label}: {np.mean(stats['total_ratings']):.2f}")
    print(f"{rating_label}: {np.mean(stats['avg_rating']):.4f}")


print("=== Computing User Statistics ===\n")
user_stats = compute_user_stats(data)
_print_stats_summary(
    user_stats, 'user_id',
    "Total users", "Average ratings per user", "Average user rating",
)

print("\n=== Computing Product Statistics ===\n")
product_stats = compute_product_stats(data)
_print_stats_summary(
    product_stats, 'product_id',
    "Total products", "Average ratings per product", "Average product rating",
)


## 8. Compute Recency and Velocity Features


In [None]:
# Timestamps of the records currently held in `data`.
timestamps = data['Timestamp']

# One recency score per timestamp, computed with decay factor 0.1.
recency_scores = compute_recency_score(timestamps, decay_factor=0.1)
print("=== Recency Scores ===\n")
recency_lo, recency_hi = np.min(recency_scores), np.max(recency_scores)
print(f"Recency score range: [{recency_lo:.4f}, {recency_hi:.4f}]")
recency_mean = np.mean(recency_scores)
print(f"Mean recency: {recency_mean:.4f}")

# Rating counts aggregated over 30-day windows.
rating_velocity = compute_rating_velocity(timestamps, window_days=30)
print(f"\n=== Rating Velocity (30-day windows) ===\n")
n_windows = len(rating_velocity)
print(f"Number of windows: {n_windows}")
velocity_mean = np.mean(rating_velocity)
print(f"Average ratings per window: {velocity_mean:.2f}")
velocity_peak = np.max(rating_velocity)
print(f"Max ratings in a window: {velocity_peak:.0f}")


## 9. Save Processed Data


In [None]:
print("=== Saving Processed Data ===\n")

output_dir = '../data/processed'
os.makedirs(output_dir, exist_ok=True)

# Output path -> object to persist, in the order they are written and then listed.
# NOTE(review): user_stats / product_stats are indexed by string keys earlier; if they are
# plain dicts rather than structured arrays, np.save will store them pickled — confirm.
artifacts = {
    f'{output_dir}/filtered_data.npy': data,
    f'{output_dir}/user_stats.npy': user_stats,
    f'{output_dir}/product_stats.npy': product_stats,
}
for artifact_path, payload in artifacts.items():
    np.save(artifact_path, payload)

print(f"Saved:")
for artifact_path in artifacts:
    print(f"  - {artifact_path}")

# Closing summary of what was written.
final_users = np.unique(data['UserId'])
final_products = np.unique(data['ProductId'])
print(f"\nFinal processed data shape: {data.shape}")
print(f"Final unique users: {len(final_users):,}")
print(f"Final unique products: {len(final_products):,}")


## Summary

Preprocessing pipeline completed:
1. ✅ Missing values checked (none found; numeric columns would be mean-imputed otherwise)
2. ✅ Outliers detected (kept as valid ratings)
3. ✅ Normalization methods demonstrated
4. ✅ Datetime features extracted
5. ✅ Sparse data filtered (min 5 ratings per user/product)
6. ✅ User and product statistics computed
7. ✅ Recency and velocity features created
8. ✅ Processed data saved

Data is now ready for recommendation system modeling!
