In [1]:
# Import libraries
import polars as pl
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

# Notebook-wide display defaults: a shared plot theme and figure size,
# plus longer Polars table previews.
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})
pl.Config.set_tbl_rows(20)

## 1. Data Loading & Overview

In [2]:
# Pull the three datasets through the project's loader helpers.
# The parent directory is added to sys.path so that `src` is importable.
import sys
sys.path.append('..')

from src.recommender import load_transactions, load_items, load_users

print("Loading data from chunks...")
print("This may take a moment as we're loading ~36M transactions...")

# Step 1: build the lazy queries (nothing is read until .collect()).
transactions_lazy = load_transactions()
items_lazy = load_items()
users_lazy = load_users()

# Step 2: execute each query and materialise it as a DataFrame.
transactions = transactions_lazy.collect()
items = items_lazy.collect()
users = users_lazy.collect()

print("\nData loaded successfully!")
for dataset_label, loaded_frame in (
    ("Transactions", transactions),
    ("Items", items),
    ("Users", users),
):
    print(f"{dataset_label}: {loaded_frame.shape}")

Loading data from chunks...
This may take a moment as we're loading ~36M transactions...


FileNotFoundError: The system cannot find the file specified. (os error 2): E:\Nam_3_HK1\PythonMayHoc\dataset\transactions.parquet

This error occurred with the following context stack:
	[1] 'parquet scan'
	[2] 'sink'


In [None]:
# Column names and dtypes of the transactions table, followed by a row preview.
print("\n=== TRANSACTIONS SCHEMA ===", transactions.schema, "\nSample:", sep="\n")
transactions.head()

In [None]:
# Column names and dtypes of the items table, followed by a row preview.
print("\n=== ITEMS SCHEMA ===", items.schema, "\nSample:", sep="\n")
items.head()

In [None]:
# Column names and dtypes of the users table, followed by a row preview.
print("\n=== USERS SCHEMA ===", users.schema, "\nSample:", sep="\n")
users.head()

## 2. Task 1: Univariate Analysis

### 2.1 Basic Statistics & Null Rates

In [None]:
def compute_null_rates(df: pl.DataFrame, name: str) -> pl.DataFrame:
    """Compute, print, and return per-column null counts and null rates.

    Args:
        df: Frame to profile.
        name: Label shown in the printed banner.

    Returns:
        A frame with one row per column of ``df`` and the columns
        ``column``, ``null_count`` and ``null_rate`` (null count divided by
        the number of rows), sorted by ``null_rate`` in descending order.
        For a frame with zero rows every ``null_rate`` is reported as 0.0.
    """
    total = df.height

    # null_count() returns a single-row frame with one count per column;
    # a frame without columns has no such row to read.
    counts = list(df.null_count().row(0)) if df.width > 0 else []

    # Avoid dividing by zero when the frame has no rows.
    rates = [n / total if total else 0.0 for n in counts]

    null_rates = pl.DataFrame({
        'column': df.columns,
        'null_count': counts,
        'null_rate': rates,
    }).sort('null_rate', descending=True)

    print(f"\n{'='*50}")
    print(f"NULL RATES - {name}")
    print(f"{'='*50}")
    print(null_rates)

    return null_rates

# Profile each dataset in turn; every call prints its own report.
null_txns, null_items, null_users = (
    compute_null_rates(dataset, banner_label)
    for dataset, banner_label in (
        (transactions, 'TRANSACTIONS'),
        (items, 'ITEMS'),
        (users, 'USERS'),
    )
)

In [None]:
# One horizontal-bar panel per dataset; a dataset without any nulls
# gets a centred text note instead of bars.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
null_panels = [
    (null_txns, 'Transactions'),
    (null_items, 'Items'),
    (null_users, 'Users'),
]

for ax, (df_null, title) in zip(axes, null_panels):
    ax.set_title(f'{title} - Null Rates')

    has_nulls = df_null.shape[0] > 0 and df_null['null_rate'].max() > 0
    if not has_nulls:
        ax.text(0.5, 0.5, 'No nulls', ha='center', va='center', transform=ax.transAxes)
        continue

    ax.barh(df_null['column'].to_list(), df_null['null_rate'].to_list())
    ax.set_xlabel('Null Rate')
    ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

### 2.2 Customer Statistics

In [None]:
# Per-customer activity summary: purchase rows, distinct items, distinct
# orders, and the whole-day span between first and last transaction.
first_seen = pl.col('created_at').min()
last_seen = pl.col('created_at').max()

customer_stats = (
    transactions
    .group_by('customer_id')
    .agg(
        pl.col('item_id').count().alias('num_purchases'),
        pl.col('item_id').n_unique().alias('num_unique_items'),
        pl.col('order_id').n_unique().alias('num_orders'),
        (last_seen - first_seen).dt.total_days().alias('days_active'),
    )
)

print("\nCustomer Statistics:")
print(customer_stats.describe())

In [None]:
# 2x2 grid of log-scaled histograms, one per customer metric.
# Each spec: (metric column, x-axis label, panel title, extra hist kwargs).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

customer_hist_specs = [
    ('num_purchases', 'Number of Purchases', 'Distribution: Purchases per Customer', {}),
    ('num_unique_items', 'Number of Unique Items', 'Distribution: Unique Items per Customer', {'color': 'green'}),
    ('num_orders', 'Number of Orders', 'Distribution: Orders per Customer', {'color': 'orange'}),
    ('days_active', 'Days Active', 'Distribution: Customer Activity Duration', {'color': 'red'}),
]

for ax, (metric, x_label, panel_title, extra_kwargs) in zip(axes.flat, customer_hist_specs):
    ax.hist(customer_stats[metric].to_numpy(), bins=50, edgecolor='black', **extra_kwargs)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title)
    ax.set_yscale('log')

plt.tight_layout()
plt.show()

In [None]:
# The 20 customers with the most purchase rows, drawn as ranked horizontal bars.
top_customers = customer_stats.sort('num_purchases', descending=True).head(20)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(range(len(top_customers)), top_customers['num_purchases'].to_list())
ax.set_xlabel('Number of Purchases')
ax.set_ylabel('Customer Rank')
ax.set_title('Top 20 Customers by Purchase Volume')
ax.invert_yaxis()  # rank 0 (largest) at the top
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

### 2.3 Item Statistics

In [None]:
# Per-item purchase rows and distinct buyers, left-joined with the item attributes.
item_activity = [
    pl.count().alias('num_purchases'),
    pl.col('customer_id').n_unique().alias('num_customers'),
]
item_stats = (
    transactions
    .group_by('item_id')
    .agg(item_activity)
    .join(items, on='item_id', how='left')
)

print("\nItem Statistics:")
print(item_stats.select('num_purchases', 'num_customers').describe())

In [None]:
# Side-by-side log-scaled histograms of the two item popularity metrics.
# Each spec: (metric column, x-axis label, panel title, extra hist kwargs).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

item_hist_specs = [
    ('num_purchases', 'Number of Purchases', 'Distribution: Purchases per Item', {}),
    ('num_customers', 'Number of Unique Customers', 'Distribution: Customers per Item', {'color': 'purple'}),
]

for ax, (metric, x_label, panel_title, extra_kwargs) in zip(axes, item_hist_specs):
    ax.hist(item_stats[metric].to_numpy(), bins=50, edgecolor='black', **extra_kwargs)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency (Items)')
    ax.set_title(panel_title)
    ax.set_yscale('log')

plt.tight_layout()
plt.show()

In [None]:
# The 20 items with the most purchase rows, drawn as ranked horizontal bars.
top_items = item_stats.sort('num_purchases', descending=True).head(20)

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(range(len(top_items)), top_items['num_purchases'].to_list())
ax.set_xlabel('Number of Purchases')
ax.set_ylabel('Item Rank')
ax.set_title('Top 20 Most Popular Items')
ax.invert_yaxis()  # rank 0 (largest) at the top
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

### 2.4 Categorical Feature Distributions

In [None]:
# Number of catalogue items per brand, largest first.
brand_dist = (
    items
    .group_by('brand')
    .agg(pl.count().alias('count'))
    .sort('count', descending=True)
)

print("\nTop 10 Brands:")
print(brand_dist.head(10))

# Bar chart of the 15 largest brands, positioned by rank and labelled by name.
top_brands = brand_dist.head(15)
brand_positions = range(len(top_brands))

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(brand_positions, top_brands['count'].to_list())
ax.set_xticks(brand_positions)
ax.set_xticklabels(top_brands['brand'].to_list(), rotation=45, ha='right')
ax.set_xlabel('Brand')
ax.set_ylabel('Number of Items')
ax.set_title('Top 15 Brands by Item Count')
plt.tight_layout()
plt.show()

In [None]:
# Number of catalogue items per age group, largest first.
age_group_dist = (
    items
    .group_by('age_group')
    .agg(pl.count().alias('count'))
    .sort('count', descending=True)
)

print("\nAge Group Distribution:")
print(age_group_dist)

# Draw the bars by position and attach the group names as tick labels
# (same approach as the brand and category charts). Passing the raw values
# as categorical x data fails when age_group contains a null, because
# matplotlib's categorical axis only accepts str/bytes values.
age_group_labels = [
    '(null)' if group is None else str(group)
    for group in age_group_dist['age_group'].to_list()
]
age_group_positions = range(len(age_group_labels))

plt.figure(figsize=(10, 6))
plt.bar(age_group_positions, age_group_dist['count'].to_list())
plt.xticks(age_group_positions, age_group_labels, rotation=45, ha='right')
plt.xlabel('Age Group')
plt.ylabel('Number of Items')
plt.title('Distribution: Age Groups')
plt.tight_layout()
plt.show()

In [None]:
# Number of catalogue items per category, largest first.
category_dist = (
    items
    .group_by('category')
    .agg(pl.count().alias('count'))
    .sort('count', descending=True)
)

print("\nTop 10 Categories:")
print(category_dist.head(10))

# Bar chart of the 15 largest categories, positioned by rank and labelled by name.
top_categories = category_dist.head(15)
category_positions = range(len(top_categories))

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(category_positions, top_categories['count'].to_list(), color='coral')
ax.set_xticks(category_positions)
ax.set_xticklabels(top_categories['category'].to_list(), rotation=45, ha='right')
ax.set_xlabel('Category')
ax.set_ylabel('Number of Items')
ax.set_title('Top 15 Categories by Item Count')
plt.tight_layout()
plt.show()

### 2.5 Temporal Patterns

In [None]:
# Earliest and latest transaction timestamps.
created_at = pl.col('created_at')
time_range = transactions.select(
    first_transaction=created_at.min(),
    last_transaction=created_at.max(),
)

print("\nTransaction Time Range:")
print(time_range)

# Transaction rows per calendar day, in chronological order.
txn_date = created_at.dt.date().alias('date')
daily_txns = (
    transactions
    .group_by(txn_date)
    .agg(pl.count().alias('num_transactions'))
    .sort('date')
)

# Line chart of the daily volume.
plt.figure(figsize=(14, 5))
plt.plot(daily_txns['date'].to_list(), daily_txns['num_transactions'].to_list())
plt.title('Daily Transaction Volume Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Transactions')
plt.xticks(rotation=45)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

## 3. Task 2: Multivariate Analysis

### 3.1 Sparsity Analysis

In [None]:
# Sparsity of the customer x item interaction matrix.
num_customers = transactions['customer_id'].n_unique()
num_items = transactions['item_id'].n_unique()

# A matrix cell is either filled or empty, so each (customer, item) pair must
# be counted once. Counting raw transaction rows would count repeat purchases
# of the same item again and overstate the density (it could even exceed 100%).
num_transaction_rows = transactions.height
num_interactions = transactions.select(['customer_id', 'item_id']).unique().height

total_possible = num_customers * num_items
# With no customers or no items the matrix has no cells; report NaN instead of
# dividing by zero.
sparsity = 1 - (num_interactions / total_possible) if total_possible else float('nan')

print(f"\n{'='*50}")
print("INTERACTION MATRIX SPARSITY")
print(f"{'='*50}")
print(f"Unique customers: {num_customers:,}")
print(f"Unique items: {num_items:,}")
print(f"Transaction rows: {num_transaction_rows:,}")
print(f"Unique customer-item interactions: {num_interactions:,}")
print(f"Possible interactions: {total_possible:,}")
print(f"Sparsity: {sparsity:.4%}")
print(f"Density: {(1-sparsity):.4%}")

In [None]:
# Pie chart of the filled vs. empty share of the interaction matrix.
categories = ['Filled', 'Empty']
colors = ['#2ecc71', '#e74c3c']
values = [1-sparsity, sparsity]

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    values,
    labels=categories,
    colors=colors,
    autopct='%1.2f%%',
    startangle=90,
)
ax.set_title('Customer-Item Interaction Matrix Sparsity')
plt.tight_layout()
plt.show()

### 3.2 Purchase Patterns by Category

In [None]:
# Attach item attributes to every transaction row (left join keeps
# transactions whose item is missing from the items table).
txns_with_items = transactions.join(items, on='item_id', how='left')

# Transaction rows per category, largest first.
category_purchases = (
    txns_with_items
    .group_by('category')
    .agg(pl.count().alias('num_purchases'))
    .sort('num_purchases', descending=True)
)

print("\nTop Categories by Purchase Volume:")
print(category_purchases.head(10))

# Ranked horizontal bars for the 15 most purchased categories.
top_cat = category_purchases.head(15)
cat_positions = range(len(top_cat))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(cat_positions, top_cat['num_purchases'].to_list())
ax.set_yticks(cat_positions)
ax.set_yticklabels(top_cat['category'].to_list())
ax.set_xlabel('Number of Purchases')
ax.set_title('Top 15 Categories by Purchase Volume')
ax.invert_yaxis()  # largest category at the top
plt.tight_layout()
plt.show()

### 3.3 Brand vs Age Group Analysis

In [None]:
# Item counts for every (brand, age_group) combination in the catalogue.
brand_age = (
    items
    .group_by(['brand', 'age_group'])
    .agg(pl.count().alias('item_count'))
)

# The ten brands with the most items (brand_dist is already sorted by count).
top_10_brands = brand_dist['brand'].head(10).to_list()

# Keep only those brands, then spread age groups across columns;
# combinations that never occur become 0.
top_brand_rows = brand_age.filter(pl.col('brand').is_in(top_10_brands))
brand_age_pivot = top_brand_rows.pivot(
    index='brand',
    columns='age_group',
    values='item_count',
    aggregate_function='sum',
).fill_null(0)

print("\nBrand vs Age Group Matrix (Top 10 Brands):")
print(brand_age_pivot)

In [None]:
# Heatmap-style visualization
pivot_np = brand_age_pivot.select(pl.exclude('brand')).to_numpy()

fig, ax = plt.subplots(figsize=(12, 8))
im = ax.imshow(pivot_np, aspect='auto', cmap='YlOrRd')

# Set ticks
ax.set_xticks(range(len(brand_age_pivot.columns[1:])))
ax.set_yticks(range(len(brand_age_pivot)))
ax.set_xticklabels(brand_age_pivot.columns[1:], rotation=45, ha='right')
ax.set_yticklabels(brand_age_pivot['brand'].to_list())

# Add colorbar
plt.colorbar(im, ax=ax, label='Item Count')

ax.set_title('Brand vs Age Group Heatmap (Top 10 Brands)')
ax.set_xlabel('Age Group')
ax.set_ylabel('Brand')

plt.tight_layout()
plt.show()

### 3.4 Cohort-Like Time Analysis

In [None]:
# Customer first purchase month
customer_first_purchase = transactions.group_by('customer_id').agg(
    pl.col('created_at').min().alias('first_purchase')
).with_columns(
    pl.col('first_purchase').dt.month_start().alias('cohort_month')
)

# Join back to transactions
cohort_txns = transactions.join(
    customer_first_purchase.select(['customer_id', 'cohort_month']),
    on='customer_id',
    how='left'
).with_columns(
    pl.col('created_at').dt.month_start().alias('purchase_month')
)

# Calculate months since first purchase
cohort_analysis = cohort_txns.with_columns(
    ((pl.col('purchase_month') - pl.col('cohort_month')).dt.total_days() / 30).floor().cast(pl.Int32).alias('months_since_first')
).group_by(['cohort_month', 'months_since_first']).agg(
    pl.n_unique('customer_id').alias('active_customers')
).sort(['cohort_month', 'months_since_first'])

print("\nCohort Analysis (Sample):")
print(cohort_analysis.head(20))

In [None]:
# Active-customer curves for the six earliest cohorts.
sample_cohorts = cohort_analysis['cohort_month'].unique().sort().head(6).to_list()

fig, ax = plt.subplots(figsize=(14, 6))

for cohort in sample_cohorts:
    # Rows of this cohort only; already ordered by months_since_first.
    cohort_data = cohort_analysis.filter(pl.col('cohort_month') == cohort)
    months = cohort_data['months_since_first'].to_list()
    active = cohort_data['active_customers'].to_list()
    ax.plot(months, active, marker='o', label=f'Cohort {cohort}')

ax.set_xlabel('Months Since First Purchase')
ax.set_ylabel('Active Customers')
ax.set_title('Customer Retention by Cohort')
ax.legend()
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

### 3.5 Purchase Frequency Correlation

In [None]:
# Analyze relationship between customer activity metrics
customer_metrics = customer_stats.select([
    'num_purchases',
    'num_unique_items',
    'num_orders',
    'days_active'
])

# Compute correlation matrix
corr_matrix = customer_metrics.to_pandas().corr()

print("\nCustomer Metrics Correlation Matrix:")
print(corr_matrix)

In [None]:
# Scatter plots
fig, axes = plt.subplots(2, 2, figsize=(14, 12))

# Purchases vs Unique Items
axes[0, 0].scatter(
    customer_stats['num_purchases'].to_numpy(),
    customer_stats['num_unique_items'].to_numpy(),
    alpha=0.3, s=20
)
axes[0, 0].set_xlabel('Number of Purchases')
axes[0, 0].set_ylabel('Number of Unique Items')
axes[0, 0].set_title('Purchases vs Unique Items')
axes[0, 0].grid(alpha=0.3)

# Purchases vs Orders
axes[0, 1].scatter(
    customer_stats['num_purchases'].to_numpy(),
    customer_stats['num_orders'].to_numpy(),
    alpha=0.3, s=20, color='green'
)
axes[0, 1].set_xlabel('Number of Purchases')
axes[0, 1].set_ylabel('Number of Orders')
axes[0, 1].set_title('Purchases vs Orders')
axes[0, 1].grid(alpha=0.3)

# Days Active vs Purchases
axes[1, 0].scatter(
    customer_stats['days_active'].to_numpy(),
    customer_stats['num_purchases'].to_numpy(),
    alpha=0.3, s=20, color='orange'
)
axes[1, 0].set_xlabel('Days Active')
axes[1, 0].set_ylabel('Number of Purchases')
axes[1, 0].set_title('Activity Duration vs Purchases')
axes[1, 0].grid(alpha=0.3)

# Items per Order
items_per_order = (customer_stats['num_unique_items'] / customer_stats['num_orders']).to_numpy()
axes[1, 1].hist(items_per_order[~np.isnan(items_per_order)], bins=30, edgecolor='black', color='purple')
axes[1, 1].set_xlabel('Unique Items per Order')
axes[1, 1].set_ylabel('Frequency')
axes[1, 1].set_title('Distribution: Items per Order')
axes[1, 1].set_yscale('log')

plt.tight_layout()
plt.show()

## 4. Task 3: Preprocessing Recommendations

### 4.1 Outlier Detection

In [None]:
def detect_outliers_iqr(df: pl.DataFrame, column: str) -> dict:
    """Count the values of ``column`` that fall outside the 1.5 * IQR fences.

    Args:
        df: Frame containing the column to inspect.
        column: Name of a numeric column in ``df``.

    Returns:
        A dict with the keys ``column``, ``q1``, ``q3``, ``iqr``,
        ``lower_bound``, ``upper_bound``, ``num_outliers`` and
        ``outlier_rate`` (outliers divided by the total row count of ``df``).
        When the column has no non-null values, the numeric statistics are
        NaN, ``num_outliers`` is 0 and ``outlier_rate`` is 0.0.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)

    # quantile() yields None when there is nothing to rank (empty frame or
    # all-null column); return a well-formed, formattable result instead of
    # failing on `q3 - q1` or dividing by a zero row count.
    if q1 is None or q3 is None:
        nan = float('nan')
        return {
            'column': column,
            'q1': nan,
            'q3': nan,
            'iqr': nan,
            'lower_bound': nan,
            'upper_bound': nan,
            'num_outliers': 0,
            'outlier_rate': 0.0,
        }

    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Rows strictly below the lower fence or strictly above the upper fence.
    outliers = df.filter(
        (pl.col(column) < lower_bound) | (pl.col(column) > upper_bound)
    ).height

    return {
        'column': column,
        'q1': q1,
        'q3': q3,
        'iqr': iqr,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound,
        'num_outliers': outliers,
        # A non-None quantile implies at least one row, so height is non-zero.
        'outlier_rate': outliers / df.height,
    }

# Print an IQR outlier report for each per-customer metric.
outlier_banner = "=" * 60
print(f"\n{outlier_banner}")
print("OUTLIER DETECTION - Customer Metrics")
print(outlier_banner)

for metric_name in ('num_purchases', 'num_unique_items', 'num_orders', 'days_active'):
    report = detect_outliers_iqr(customer_stats, metric_name)
    report_lines = [
        f"\n{metric_name}:",
        f"  Q1: {report['q1']:.2f}",
        f"  Q3: {report['q3']:.2f}",
        f"  IQR: {report['iqr']:.2f}",
        f"  Bounds: [{report['lower_bound']:.2f}, {report['upper_bound']:.2f}]",
        f"  Outliers: {report['num_outliers']} ({report['outlier_rate']:.2%})",
    ]
    print("\n".join(report_lines))

In [None]:
# Visualize outliers with box plots
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
metrics = ['num_purchases', 'num_unique_items', 'num_orders', 'days_active']
titles = ['Purchases', 'Unique Items', 'Orders', 'Days Active']

for ax, metric, title in zip(axes.flat, metrics, titles):
    data = customer_stats[metric].to_numpy()
    ax.boxplot([data], vert=False)
    ax.set_xlabel(title)
    ax.set_title(f'Box Plot: {title} per Customer')
    ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

### 4.2 Data Quality Summary

In [None]:
# Three quick integrity checks: exact duplicate rows, IDs referenced by
# transactions but absent from the lookup tables, and birth dates in the future.
quality_banner = "=" * 70
print(f"\n{quality_banner}")
print("DATA QUALITY SUMMARY")
print(quality_banner)

print("\n1. DUPLICATES:")
for table_label, table in (("Transactions", transactions), ("Items", items), ("Users", users)):
    # Rows removed by full-row deduplication.
    duplicate_rows = table.height - table.unique().height
    print(f"   {table_label}: {duplicate_rows} duplicate rows")

print("\n2. ID CONSISTENCY:")
txn_customers = set(transactions['customer_id'].unique().to_list())
user_ids = set(users['customer_id'].unique().to_list())
txn_items = set(transactions['item_id'].unique().to_list())
item_ids = set(items['item_id'].unique().to_list())

orphan_customers = txn_customers - user_ids
orphan_items = txn_items - item_ids
print(f"   Customers in txns not in users: {len(orphan_customers)}")
print(f"   Items in txns not in items table: {len(orphan_items)}")

print("\n3. TEMPORAL CONSISTENCY:")
if 'date_of_birth' in users.columns:
    today = datetime.now().date()
    future_dob = users.filter(pl.col('date_of_birth') > pl.lit(today)).height
    print(f"   Users with future date_of_birth: {future_dob}")

### 4.3 Preprocessing Recommendations

In [None]:
# Static, hand-written checklist of preprocessing advice. The text is not
# derived from the statistics computed above; it is printed verbatim below.
recommendations = """
═══════════════════════════════════════════════════════════════════
                   PREPROCESSING RECOMMENDATIONS
═══════════════════════════════════════════════════════════════════

1. COLUMNS TO DROP/HANDLE:
   ✓ Check null rates above 50% - consider dropping or imputing
   ✓ If 'order_id' has high nulls but not needed, can drop
   ✓ Redundant ID columns should be removed after joining

2. NULL HANDLING PLAN:
   Transactions:
   - customer_id, item_id, created_at: DROP rows (critical fields)
   - order_id: Fill with auto-generated ID or keep null if not used
   
   Items:
   - item_id: DROP rows (primary key)
   - brand/category/age_group: Fill with 'Unknown' or mode
   
   Users:
   - customer_id: DROP rows (primary key)
   - date_of_birth: Keep null or impute with median/mode

3. OUTLIER HANDLING:
   Customer Metrics:
   - Cap extreme purchase counts at 99th percentile
   - Consider log transformation for skewed distributions
   - Flag power users (top 1%) for separate analysis
   
   Item Metrics:
   - Handle cold-start items (< 5 purchases) separately
   - Consider minimum support thresholds

4. FEATURE ENGINEERING SUGGESTIONS:
   Time-based:
   - Recency: days since last purchase
   - Frequency: purchases per active day
   - Seasonality: month, day of week, holidays
   
   Interaction:
   - Purchase velocity: purchases / days_active
   - Category diversity: unique categories purchased
   - Repeat purchase rate: items bought multiple times
   
   Categorical:
   - One-hot encode brand, category, age_group
   - Or use target encoding for high cardinality

5. DATA SPLITS:
   Recommendation: Time-based split (not random!)
   - Training: transactions before date T
   - Validation: T to T+14 days
   - Test: T+14 to T+30 days
   
   Prevents data leakage and mimics production scenario

6. COLD START HANDLING:
   - New users: Use popularity-based recommendations
   - New items: Require minimum interaction threshold
   - Consider hybrid approach with content-based features

7. SPARSITY MITIGATION:
   Given high sparsity (>99%):
   - Use negative sampling for training
   - Consider matrix factorization or embedding methods
   - Implement candidate generation stage (top-K retrieval)

═══════════════════════════════════════════════════════════════════
"""

print(recommendations)

### 4.4 Quick Preprocessing Pipeline Example

In [None]:
def preprocess_transactions(df: pl.DataFrame) -> pl.DataFrame:
    """Clean a transactions frame for downstream use.

    Keeps only rows where customer_id, item_id and created_at are all
    present, removes exact duplicate rows, and returns the result ordered
    by created_at.
    """
    has_customer = pl.col('customer_id').is_not_null()
    has_item = pl.col('item_id').is_not_null()
    has_timestamp = pl.col('created_at').is_not_null()

    # Rows missing any of the three critical fields are discarded.
    complete_rows = df.filter(has_customer & has_item & has_timestamp)

    # Full-row deduplication, then chronological ordering.
    deduplicated = complete_rows.unique()
    return deduplicated.sort('created_at')

def preprocess_items(df: pl.DataFrame) -> pl.DataFrame:
    """Clean an items frame for downstream use.

    Drops rows without an item_id, replaces missing brand / category /
    age_group values with the literal 'Unknown', and keeps one row per
    item_id.

    Args:
        df: Raw items frame with at least the columns item_id, brand,
            category and age_group.

    Returns:
        A frame with a unique, non-null item_id per row, in the original
        row order.
    """
    return (
        df
        .filter(pl.col('item_id').is_not_null())
        # Fill categorical nulls
        .with_columns([
            pl.col('brand').fill_null('Unknown'),
            pl.col('category').fill_null('Unknown'),
            pl.col('age_group').fill_null('Unknown'),
        ])
        # Keep the first occurrence and preserve row order so that the
        # surviving row for a duplicated item_id is the same on every run
        # (the default may keep an arbitrary one).
        .unique(subset=['item_id'], keep='first', maintain_order=True)
    )

def preprocess_users(df: pl.DataFrame) -> pl.DataFrame:
    """Clean a users frame for downstream use.

    Drops rows without a customer_id and keeps one row per customer_id.

    Args:
        df: Raw users frame with at least the column customer_id.

    Returns:
        A frame with a unique, non-null customer_id per row, in the
        original row order.
    """
    return (
        df
        .filter(pl.col('customer_id').is_not_null())
        # Keep the first occurrence and preserve row order so that the
        # surviving row for a duplicated customer_id is the same on every
        # run (the default may keep an arbitrary one).
        .unique(subset=['customer_id'], keep='first', maintain_order=True)
    )

# Confirmation message plus a usage reminder for the three helpers above.
usage_lines = (
    "Preprocessing pipeline functions defined.",
    "\nUsage:",
    "  clean_txns = preprocess_transactions(transactions)",
    "  clean_items = preprocess_items(items)",
    "  clean_users = preprocess_users(users)",
)
for usage_line in usage_lines:
    print(usage_line)

## Summary

This EDA notebook covered:
- **Task 1**: Univariate analysis (null rates, distributions, top items/customers)
- **Task 2**: Multivariate analysis (sparsity, correlations, cohort retention)
- **Task 3**: Preprocessing recommendations (outliers, null handling, feature engineering)

Next steps:
1. Apply preprocessing pipeline
2. Implement time-based train/test split
3. Generate candidates using methods from `candidates.py`
4. Build features using `build_feature_label_table()`
5. Train and evaluate recommender model