
# Data Exploration for MovieLens 100K Dataset

This notebook explores the MovieLens dataset to understand:
- Rating distribution and patterns
- User behavior
- Item popularity
- Data sparsity


In [None]:

# ============================================================================
# Cell 2: Load Data
# ============================================================================
print("Loading MovieLens 100K dataset...")
loader = MovieLensLoader()

# Download if needed
loader.download_data()

# Load datasets
ratings = loader.load_ratings()
movies = loader.load_movies()
users = loader.load_users()

print("\n✓ Data loaded successfully")


In [None]:
# ============================================================================
# Cell 1: Imports and Setup
# ============================================================================
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_loader import MovieLensLoader
from src.utils import (
    plot_rating_distribution,
    plot_sparsity_heatmap,
    analyze_user_behavior,
    analyze_item_popularity
)

# Styling
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✓ Imports successful")


In [None]:
# ============================================================================
# Cell 3: Basic Statistics
# ============================================================================
# Headline numbers for the three loaded frames (ratings, movies, users).
overview_rule = "=" * 70
print(overview_rule)
print("DATASET OVERVIEW")
print(overview_rule)

# --- ratings ---------------------------------------------------------------
print("\n📊 Ratings Dataset")
print(f"   Total ratings: {len(ratings):,}")
distinct_users = ratings['user_id'].nunique()
print(f"   Unique users: {distinct_users:,}")
distinct_items = ratings['item_id'].nunique()
print(f"   Unique items: {distinct_items:,}")
print(f"   Rating range: {ratings['rating'].min()} - {ratings['rating'].max()}")
print(f"   Average rating: {ratings['rating'].mean():.2f}")
# Sparsity = share of (user, item) pairs that carry no rating.
print(f"   Sparsity: {1 - len(ratings) / (distinct_users * distinct_items):.4f}")

# --- movies ----------------------------------------------------------------
print("\n🎬 Movies Dataset")
print(f"   Total movies: {len(movies):,}")
# A movie counts as "with genres" when its genres entry has non-zero length.
movies_with_genres = (movies['genres'].apply(len) > 0).sum()
print(f"   Movies with genres: {movies_with_genres:,}")

# --- users -----------------------------------------------------------------
print("\n👤 Users Dataset")
print(f"   Total users: {len(users):,}")
print(f"   Age range: {users['age'].min()} - {users['age'].max()}")
print("   Gender distribution:")
print(users['gender'].value_counts())

In [None]:
# ============================================================================
# Cell 4: Sample Data
# ============================================================================
# Show the first ten rows of each loaded frame, each under its own heading.
print("\n" + "=" * 70)
print("SAMPLE DATA")
print("=" * 70)

for heading, frame in (
    ("\n📝 Sample Ratings:", ratings),
    ("\n🎬 Sample Movies:", movies),
    ("\n👤 Sample Users:", users),
):
    print(heading)
    display(frame.head(10))


In [None]:
# ============================================================================
# Cell 5: Rating Distribution Analysis
# ============================================================================
# Counts per rating value, the project's distribution plot, then summary stats.
section_rule = "=" * 70
print("\n" + section_rule)
print("RATING DISTRIBUTION ANALYSIS")
print(section_rule)

# Frequency of each rating value, listed in ascending rating order
print("\n Rating value counts:")
counts_by_value = ratings['rating'].value_counts()
print(counts_by_value.sort_index())

# Project helper draws the distribution figure
plot_rating_distribution(ratings)

# describe() output for the rating column
print("\n Rating statistics:")
print(ratings['rating'].describe())



In [None]:
# ============================================================================
# Cell 6: User Behavior Analysis
# ============================================================================
# Per-user statistics from the project helper, then three top-10 views.
print("\n" + "=" * 70)
print("USER BEHAVIOR ANALYSIS")
print("=" * 70)

user_stats = analyze_user_behavior(ratings)

# Columns shown in the two "extreme rater" tables below
rater_columns = ['num_ratings', 'avg_rating', 'std_rating']

# Users ranked by how many ratings they submitted
print("\n🏆 Top 10 Most Active Users:")
most_active = user_stats.nlargest(10, 'num_ratings')
display(most_active)

# Users at either end of the average-rating scale
print("\n😊 Most Generous Users (highest avg rating):")
most_generous = user_stats.nlargest(10, 'avg_rating')
display(most_generous[rater_columns])

print("\n😠 Most Critical Users (lowest avg rating):")
most_critical = user_stats.nsmallest(10, 'avg_rating')
display(most_critical[rater_columns])


In [None]:
# ============================================================================
# Cell 7: Item Popularity Analysis
# ============================================================================
# Per-item statistics from the project helper, joined to movie titles.
print("\n" + "=" * 70)
print("ITEM POPULARITY ANALYSIS")
print("=" * 70)

item_stats = analyze_item_popularity(ratings)

# Attach titles: the stats index is matched against movies['item_id'].
title_lookup = movies[['item_id', 'title']]
item_stats_with_titles = item_stats.merge(
    title_lookup,
    left_index=True,
    right_on='item_id',
)

# Columns shown in both leaderboards
leaderboard_columns = ['title', 'num_ratings', 'avg_rating']

print("\n🌟 Top 10 Most Popular Movies:")
most_rated = item_stats_with_titles.nlargest(10, 'num_ratings')
display(most_rated[leaderboard_columns])

print("\n⭐ Top 10 Highest Rated Movies (min 50 ratings):")
# Restrict to items with at least 50 ratings before ranking by average.
has_enough_ratings = item_stats_with_titles['num_ratings'] >= 50
popular_items = item_stats_with_titles[has_enough_ratings]
best_rated = popular_items.nlargest(10, 'avg_rating')
display(best_rated[leaderboard_columns])


In [None]:
# ============================================================================
# Cell 8: Genre Analysis
# ============================================================================
# Two views of the genre data:
#   1. genre_counts      - how many movies carry each genre label
#   2. genre_avg_ratings - mean of all ratings given to movies of each genre
# Both names are read again by the "Key Insights" cell.
print("\n" + "="*70)
print("GENRE ANALYSIS")
print("="*70)

# Flatten the per-movie genre collections into one list of labels
all_genres = [genre for genres_list in movies['genres'] for genre in genres_list]

genre_counts = pd.Series(all_genres).value_counts()

print("\n📊 Genre Distribution:")
print(genre_counts)

# Plot genre distribution
plt.figure(figsize=(12, 6))
genre_counts.plot(kind='bar', color='teal', edgecolor='black')
plt.title('Movie Genre Distribution', fontsize=14, fontweight='bold')
plt.xlabel('Genre')
plt.ylabel('Number of Movies')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

# Genre popularity by ratings
print("\n⭐ Average Rating by Genre:")

# One row per (movie, genre) pair. A movie with an empty genre list explodes
# to a single NaN row, which is dropped so it contributes to no genre.
movie_genres = (
    movies[['item_id', 'genres']]
    .explode('genres')
    .dropna(subset=['genres'])
)

# Join every rating to the genres of its movie and average per genre.
# This replaces a per-movie scan of the whole ratings frame with one merge.
# Genres whose movies received no ratings are absent from the result; the
# earlier loop reported them as NaN (with a NumPy empty-mean warning).
genre_avg_ratings = (
    ratings[['item_id', 'rating']]
    .merge(movie_genres, on='item_id')
    .groupby('genres')['rating']
    .mean()
    .sort_values(ascending=False)
)
# Clear the labels inherited from groupby so the printed Series is unlabelled.
genre_avg_ratings.name = None
genre_avg_ratings.index.name = None

print(genre_avg_ratings)

plt.figure(figsize=(12, 6))
genre_avg_ratings.plot(kind='bar', color='coral', edgecolor='black')
plt.title('Average Rating by Genre', fontsize=14, fontweight='bold')
plt.xlabel('Genre')
plt.ylabel('Average Rating')
plt.xticks(rotation=45, ha='right')
# Reference line: the mean over all ratings, for comparison with each genre
plt.axhline(y=ratings['rating'].mean(), color='red', linestyle='--', 
            label=f'Overall Avg: {ratings["rating"].mean():.2f}')
plt.legend()
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# ============================================================================
# Cell 9: Sparsity Visualization
# ============================================================================
# Size of the user x item matrix and how much of it is filled.
# `sparsity` is read again by the "Key Insights" cell.
print("\n" + "=" * 70)
print("SPARSITY ANALYSIS")
print("=" * 70)

# Matrix dimensions and the fraction of cells with no rating
n_ratings = len(ratings)
n_users = ratings['user_id'].nunique()
n_items = ratings['item_id'].nunique()
possible_ratings = n_users * n_items
sparsity = 1 - (n_ratings / possible_ratings)

summary_lines = [
    f"\n Matrix dimensions: {n_users} users × {n_items} items",
    f" Total possible ratings: {possible_ratings:,}",
    f" Actual ratings: {n_ratings:,}",
    f" Sparsity: {sparsity:.4%}",
    f" Density: {1-sparsity:.4%}",
]
for line in summary_lines:
    print(line)

# Project helper draws a heatmap for a 100 x 100 sample of the matrix
plot_sparsity_heatmap(ratings, sample_users=100, sample_items=100)



In [None]:
# ============================================================================
# Cell 10: Temporal Patterns
# ============================================================================
# Adds calendar columns to `ratings` (in place) and plots rating volume per
# day and per weekday. The added columns are written out by the save cell.
print("\n" + "=" * 70)
print("TEMPORAL PATTERNS")
print("=" * 70)

# Interpret the raw timestamps as seconds since the epoch
rated_at = pd.to_datetime(ratings['timestamp'], unit='s')
ratings['datetime'] = rated_at
ratings['date'] = rated_at.dt.date
ratings['year'] = rated_at.dt.year
ratings['month'] = rated_at.dt.month
ratings['day_of_week'] = rated_at.dt.day_name()

first_rating = ratings['datetime'].min()
last_rating = ratings['datetime'].max()
print(f"\n📅 Date range: {first_rating} to {last_rating}")

# Number of ratings submitted on each calendar date
ratings_over_time = ratings.groupby('date').size()

plt.figure(figsize=(14, 6))
plt.plot(ratings_over_time.index, ratings_over_time.values, linewidth=1.5)
plt.title('Ratings Over Time', fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Number of Ratings')
plt.xticks(rotation=45)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

# Ratings per weekday, reindexed so the bars run Monday through Sunday
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_totals = ratings['day_of_week'].value_counts()
day_counts = weekday_totals.reindex(day_order)

plt.figure(figsize=(10, 6))
day_counts.plot(kind='bar', color='mediumpurple', edgecolor='black')
plt.title('Ratings by Day of Week', fontsize=14, fontweight='bold')
plt.xlabel('Day of Week')
plt.ylabel('Number of Ratings')
plt.xticks(rotation=45)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# Cell 11: Cold Start Analysis
# ============================================================================
# Counts users and items that fall below a rating-count threshold and plots
# both count distributions. The four Series defined here are read again by
# the "Key Insights" cell.
print("\n" + "=" * 70)
print("COLD START ANALYSIS")
print("=" * 70)

# Cold start users: fewer than 20 ratings
user_rating_counts = ratings.groupby('user_id').size()
is_cold_user = user_rating_counts < 20
cold_start_users = user_rating_counts[is_cold_user]

print(f"\n Users with < 20 ratings: {len(cold_start_users)} ({len(cold_start_users)/len(user_rating_counts):.1%})")

# Cold start items: fewer than 5 ratings
item_rating_counts = ratings.groupby('item_id').size()
is_cold_item = item_rating_counts < 5
cold_start_items = item_rating_counts[is_cold_item]

print(f" Items with < 5 ratings: {len(cold_start_items)} ({len(cold_start_items)/len(item_rating_counts):.1%})")

# Side-by-side histograms, each with its threshold marked
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axis, counts, threshold, bar colour, title, y label, log-scaled y axis?)
panels = (
    (axes[0], user_rating_counts, 20, 'skyblue',
     'Distribution of Ratings per User', 'Number of Users', False),
    (axes[1], item_rating_counts, 5, 'lightcoral',
     'Distribution of Ratings per Item', 'Number of Items', True),
)
for panel, counts, threshold, bar_color, panel_title, y_label, log_y in panels:
    panel.hist(counts, bins=50, color=bar_color, edgecolor='black')
    panel.axvline(x=threshold, color='red', linestyle='--', linewidth=2, label='Cold Start Threshold')
    panel.set_title(panel_title)
    panel.set_xlabel('Number of Ratings')
    panel.set_ylabel(y_label)
    if log_y:
        # Only the item panel uses a log-scaled y axis
        panel.set_yscale('log')
    panel.legend()
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:

# ============================================================================
# Cell 12: Key Insights Summary
# ============================================================================
# Prints a text summary assembled from values computed in earlier cells.
# This cell defines no data of its own; it needs these names to exist already:
#   - sparsity                                  (Sparsity Visualization cell)
#   - user_rating_counts, item_rating_counts,
#     cold_start_users, cold_start_items        (Cold Start Analysis cell)
#   - genre_counts, genre_avg_ratings           (Genre Analysis cell)
#   - ratings                                   (Load Data cell)
print("\n" + "="*70)
print("KEY INSIGHTS")
print("="*70)

# Only the {...} placeholders are computed from the data. The qualitative
# bullets (e.g. the skew remark and the recommendation strategy) are fixed text.
# genre_counts comes from value_counts() and genre_avg_ratings was sorted in
# descending order, so index[0] of each is the top entry.
insights = f"""
📊 DATA CHARACTERISTICS:
   • Dataset is highly sparse ({sparsity:.2%})
   • Users rated an average of {user_rating_counts.mean():.1f} movies
   • Items received an average of {item_rating_counts.mean():.1f} ratings
   
⭐ RATING PATTERNS:
   • Most common rating: {ratings['rating'].mode()[0]}
   • Average rating: {ratings['rating'].mean():.2f}
   • Rating distribution is skewed towards positive ratings
   
❄️ COLD START CHALLENGES:
   • {len(cold_start_users)/len(user_rating_counts):.1%} of users have < 20 ratings
   • {len(cold_start_items)/len(item_rating_counts):.1%} of items have < 5 ratings
   • Content-based methods may help with item cold start
   
🎭 GENRE INSIGHTS:
   • Most common genre: {genre_counts.index[0]}
   • Highest rated genre: {genre_avg_ratings.index[0]}
   
🚀 RECOMMENDATION STRATEGY:
   • High sparsity suggests collaborative filtering may struggle
   • Hybrid approach (CF + content-based) recommended
   • Need to handle cold start for new users/items
   • Consider popularity bias in evaluation
"""

print(insights)



In [None]:
# ============================================================================
# Cell 13: Save Processed Data
# ============================================================================
# Writes the enriched ratings frame and the per-user / per-item statistics
# to CSV files under ../data/processed, creating the directory if needed.
print("\n" + "=" * 70)
print("SAVING PROCESSED DATA")
print("=" * 70)

# Make sure the output directory exists before writing
import os
os.makedirs('../data/processed', exist_ok=True)

# (frame, destination, to_csv options): ratings is written without its index,
# the two statistics frames keep theirs.
exports = (
    (ratings, '../data/processed/ratings_with_datetime.csv', {'index': False}),
    (user_stats, '../data/processed/user_statistics.csv', {}),
    (item_stats, '../data/processed/item_statistics.csv', {}),
)
for export_frame, destination, csv_options in exports:
    export_frame.to_csv(destination, **csv_options)

print("\n✓ Processed data saved to data/processed/")
print("\n✅ Data exploration complete!")