# MovieLens 100K Comprehensive Exploratory Data Analysis

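This notebook explores the MovieLens 100K dataset to inform recommendation-system design. It covers basic statistics, rating distributions, sparsity and cold start, rating bias, genres, temporal patterns, and user similarity.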

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
import ast

# Select matplotlib's 'default' style sheet for the figures drawn below
plt.style.use('default')
# Make seaborn's 'husl' palette the active colour palette
sns.set_palette('husl')

In [None]:
# Read the combined CSV into `df`, the frame every later cell works on.
DATA_PATH = '../data/ml100k_combined.csv'
df = pd.read_csv(DATA_PATH)

# Quick sanity check of what was loaded: dimensions, column names, first rows
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
df.head()

## 1. Basic Dataset Statistics

In [None]:
# Headline numbers for the ratings table: volume, cardinalities, rating scale
# and spread, and the raw timestamp extent.
n_ratings = len(df)
n_users = df['user_id'].nunique()
n_movies = df['item_id'].nunique()
rating_col = df['rating']
timestamp_col = df['timestamp']

print("Dataset Info:")
print(f"Total ratings: {n_ratings:,}")
print(f"Unique users: {n_users:,}")
print(f"Unique movies: {n_movies:,}")
print(f"Rating range: {rating_col.min()} - {rating_col.max()}")
print(f"Average rating: {rating_col.mean():.2f}")
print(f"Rating std: {rating_col.std():.2f}")
print(f"Date range: {timestamp_col.min()} to {timestamp_col.max()}")

# Sparsity: percentage of the (user x movie) grid that has no rating.
# `sparsity` is read again by the final insights cell.
total_possible = n_users * n_movies
sparsity = (1 - n_ratings / total_possible) * 100
print(f"\nSparsity: {sparsity:.2f}%")

## 2. Rating Distribution Analysis

In [None]:
# Four-panel overview: rating histogram, user age, user gender, movie release year.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# `df` holds one row per rating, so user and movie attributes repeat on every
# row. Collapse to one row per user / per movie first, so the demographic
# panels count users and movies (as their titles say) instead of being
# weighted by how many ratings each user gave or each movie received.
users = df.drop_duplicates(subset='user_id')
df_clean = df.dropna(subset=['year'])
movies = df_clean.drop_duplicates(subset='item_id')

# Overall rating distribution (this panel is intentionally per rating)
rating_counts = df['rating'].value_counts().sort_index()
axes[0,0].bar(rating_counts.index, rating_counts.values)
axes[0,0].set_title('Rating Distribution')
axes[0,0].set_xlabel('Rating')
axes[0,0].set_ylabel('Count')
# Anchor each label at its bar's own rating value rather than at position
# i+1, which is only right when the ratings are exactly 1, 2, 3, ...
# va='bottom' sits the text on top of the bar without a hard-coded offset.
for rating_value, count in zip(rating_counts.index, rating_counts.values):
    axes[0,0].text(rating_value, count, str(count), ha='center', va='bottom')

# User demographics: one age per user
axes[0,1].hist(users['age'], bins=20, edgecolor='black', alpha=0.7)
axes[0,1].set_title('User Age Distribution')
axes[0,1].set_xlabel('Age')
axes[0,1].set_ylabel('Count')

# Gender distribution: share of users, not share of ratings
gender_counts = users['gender'].value_counts()
axes[1,0].pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%')
axes[1,0].set_title('Gender Distribution')

# Movie year distribution: one release year per movie, rows without a year dropped
axes[1,1].hist(movies['year'], bins=20, edgecolor='black', alpha=0.7)
axes[1,1].set_title('Movie Release Year Distribution')
axes[1,1].set_xlabel('Year')
axes[1,1].set_ylabel('Count')

plt.tight_layout()
plt.show()

## 3. Data Sparsity and Cold Start Analysis

In [None]:
# Ratings per movie and per user, ordered from most to least rated / active.
# Both Series are reused by the final insights cell.
movie_rating_counts = df['item_id'].value_counts().sort_values(ascending=False)
user_rating_counts = df['user_id'].value_counts().sort_values(ascending=False)

sparsity_fig, sparsity_axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_movie_hist, ax_movie_rank), (ax_user_hist, ax_user_rank) = sparsity_axes

# Top-left: how many movies received a given number of ratings (log count axis)
ax_movie_hist.hist(movie_rating_counts.values, bins=50, edgecolor='black', alpha=0.7)
ax_movie_hist.set_xlabel('Number of Ratings per Movie')
ax_movie_hist.set_ylabel('Number of Movies')
ax_movie_hist.set_title('Distribution of Ratings per Movie')
ax_movie_hist.set_yscale('log')

# Top-right: rating count against popularity rank (log y axis)
ax_movie_rank.plot(np.arange(len(movie_rating_counts)), movie_rating_counts.values)
ax_movie_rank.set_xlabel('Movie Rank (Most to Least Rated)')
ax_movie_rank.set_ylabel('Number of Ratings')
ax_movie_rank.set_title('Movie Popularity (Long Tail Distribution)')
ax_movie_rank.set_yscale('log')

# Bottom-left: how many users gave a given number of ratings
ax_user_hist.hist(user_rating_counts.values, bins=50, edgecolor='black', alpha=0.7)
ax_user_hist.set_xlabel('Number of Ratings per User')
ax_user_hist.set_ylabel('Number of Users')
ax_user_hist.set_title('User Activity Distribution')

# Bottom-right: rating count against activity rank
ax_user_rank.plot(np.arange(len(user_rating_counts)), user_rating_counts.values)
ax_user_rank.set_xlabel('User Rank (Most to Least Active)')
ax_user_rank.set_ylabel('Number of Ratings')
ax_user_rank.set_title('User Activity (Power Law Distribution)')

plt.tight_layout()
plt.show()

# Threshold counts for the text summary
n_movies_total = len(movie_rating_counts)
n_users_total = len(user_rating_counts)
movies_single = int((movie_rating_counts == 1).sum())
movies_sparse = int((movie_rating_counts <= 5).sum())
movies_dense = int((movie_rating_counts >= 100).sum())
users_sparse = int((user_rating_counts <= 5).sum())
users_dense = int((user_rating_counts >= 100).sum())

print("Cold Start Analysis:")
print(f"Movies with only 1 rating: {movies_single}")
print(f"Movies with ≤5 ratings: {movies_sparse} ({movies_sparse/n_movies_total*100:.1f}%)")
print(f"Movies with ≥100 ratings: {movies_dense} ({movies_dense/n_movies_total*100:.1f}%)")
print(f"Users with ≤5 ratings: {users_sparse} ({users_sparse/n_users_total*100:.1f}%)")
print(f"Users with ≥100 ratings: {users_dense} ({users_dense/n_users_total*100:.1f}%)")

## 4. Rating Bias Analysis

In [None]:
# Per-user and per-movie rating statistics: the mean captures bias
# (generous vs. harsh raters, liked vs. disliked movies), the standard
# deviation captures spread.
ratings_by_user = df.groupby('user_id')['rating']
ratings_by_movie = df.groupby('item_id')['rating']

user_avg_ratings = ratings_by_user.mean()
movie_avg_ratings = ratings_by_movie.mean()
user_rating_std = ratings_by_user.std()
movie_rating_std = ratings_by_movie.std()

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axis, data, title, x label, y label, draw a mean marker?)
# The std Series go through dropna() before plotting; the means are plotted as is.
panel_specs = [
    (axes[0,0], user_avg_ratings, 'User Rating Bias Distribution',
     'Average Rating Given by User', 'Number of Users', True),
    (axes[0,1], movie_avg_ratings, 'Movie Rating Bias Distribution',
     'Average Rating Received by Movie', 'Number of Movies', True),
    (axes[1,0], user_rating_std.dropna(), 'User Rating Variance',
     'Standard Deviation of User Ratings', 'Number of Users', False),
    (axes[1,1], movie_rating_std.dropna(), 'Movie Rating Variance',
     'Standard Deviation of Movie Ratings', 'Number of Movies', False),
]

for ax, data, title, xlabel, ylabel, mark_mean in panel_specs:
    ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if mark_mean:
        centre = data.mean()
        ax.axvline(centre, color='red', linestyle='--', label=f'Mean: {centre:.2f}')
        ax.legend()

plt.tight_layout()
plt.show()

# Extremes: very generous / very harsh users, and movies raters agree or disagree on
generous_users = int((user_avg_ratings > 4.5).sum())
harsh_users = int((user_avg_ratings < 2.5).sum())
consensus_movies = int((movie_rating_std < 0.5).sum())
divisive_movies = int((movie_rating_std > 1.5).sum())

print(f"Users who rate everything highly (avg > 4.5): {generous_users}")
print(f"Users who rate everything lowly (avg < 2.5): {harsh_users}")
print(f"Movies with high consensus (std < 0.5): {consensus_movies}")
print(f"Movies with low consensus (std > 1.5): {divisive_movies}")

## 5. Genre Analysis

In [None]:
# Genre preference analysis: mean rating and rating volume per genre flag column.
genre_cols = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 
              'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 
              'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Only genres whose column exists in df are summarised; a row belongs to a
# genre when that column equals 1.
genre_ratings = {}
genre_counts = {}
for genre in (name for name in genre_cols if name in df.columns):
    ratings_in_genre = df.loc[df[genre] == 1, 'rating']
    genre_ratings[genre] = ratings_in_genre.mean()
    genre_counts[genre] = len(ratings_in_genre)

genres = list(genre_ratings)
ratings = [genre_ratings[name] for name in genres]
counts = [genre_counts[name] for name in genres]
colors = plt.cm.viridis(np.linspace(0, 1, len(genres)))

genre_fig, (ax_avg, ax_volume) = plt.subplots(2, 1, figsize=(15, 10))

# Top panel: average rating per genre against the overall average
bars = ax_avg.bar(genres, ratings, color=colors)
ax_avg.set_title('Average Rating by Genre')
ax_avg.set_xlabel('Genre')
ax_avg.set_ylabel('Average Rating')
ax_avg.tick_params(axis='x', rotation=45)
ax_avg.axhline(df['rating'].mean(), color='red', linestyle='--', label='Overall Average')
ax_avg.legend()

# Add value labels just above each bar
for bar, rating in zip(bars, ratings):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.01
    ax_avg.text(label_x, label_y, f'{rating:.2f}', ha='center', va='bottom', fontsize=8)

# Bottom panel: number of ratings per genre
bars = ax_volume.bar(genres, counts, color=colors)
ax_volume.set_title('Number of Ratings by Genre')
ax_volume.set_xlabel('Genre')
ax_volume.set_ylabel('Number of Ratings')
ax_volume.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("Top 5 highest rated genres:")
sorted_genres = sorted(genre_ratings.items(), key=lambda item: item[1], reverse=True)
for genre, rating in sorted_genres[:5]:
    print(f"{genre}: {rating:.3f} ({genre_counts[genre]} ratings)")

print("\nTop 5 most popular genres:")
sorted_popularity = sorted(genre_counts.items(), key=lambda item: item[1], reverse=True)
for genre, count in sorted_popularity[:5]:
    print(f"{genre}: {count} ratings (avg: {genre_ratings[genre]:.3f})")

## 6. Temporal Analysis

In [None]:
# Temporal analysis - rating patterns over time
# Adds `rating_date`, `rating_month` and `movie_age` columns to df and converts
# `timestamp` to datetime in place; the insights cell relies on both.

# Year that movie age is measured against.
REFERENCE_YEAR = 1998

# pd.to_datetime reads bare numbers as nanoseconds since the epoch, which would
# place Unix-second timestamps within the first seconds of 1970. When the
# column is numeric, parse it as epoch seconds instead.
# NOTE(review): assumes numeric timestamps are in seconds - confirm against the CSV.
# Strings and already-converted datetimes take the plain path, so re-running
# this cell is safe.
raw_timestamps = df['timestamp']
if pd.api.types.is_numeric_dtype(raw_timestamps):
    df['timestamp'] = pd.to_datetime(raw_timestamps, unit='s')
else:
    df['timestamp'] = pd.to_datetime(raw_timestamps)

df['rating_date'] = df['timestamp'].dt.date
df['rating_month'] = df['timestamp'].dt.to_period('M')
df['movie_age'] = REFERENCE_YEAR - df['year']

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Ratings over time: number of ratings per calendar day
daily_ratings = df.groupby('rating_date').size()
axes[0,0].plot(daily_ratings.index, daily_ratings.values, alpha=0.7)
axes[0,0].set_title('Number of Ratings Over Time')
axes[0,0].set_xlabel('Date')
axes[0,0].set_ylabel('Number of Ratings')
axes[0,0].tick_params(axis='x', rotation=45)

# Average rating over time: monthly mean, with periods rendered as strings for the x axis
monthly_avg_rating = df.groupby('rating_month')['rating'].mean()
axes[0,1].plot(monthly_avg_rating.index.astype(str), monthly_avg_rating.values, marker='o')
axes[0,1].set_title('Average Rating Over Time')
axes[0,1].set_xlabel('Month')
axes[0,1].set_ylabel('Average Rating')
axes[0,1].tick_params(axis='x', rotation=45)

# Movie age vs rating: one point per age, marker area scaled by rating count
movie_age_rating = df.groupby('movie_age')['rating'].agg(['mean', 'count']).reset_index()
movie_age_rating = movie_age_rating[movie_age_rating['count'] >= 10]  # Drop ages backed by fewer than 10 ratings
axes[1,0].scatter(movie_age_rating['movie_age'], movie_age_rating['mean'], 
                 s=movie_age_rating['count']/10, alpha=0.6)
axes[1,0].set_title('Movie Age vs Average Rating')
axes[1,0].set_xlabel('Movie Age (years)')
axes[1,0].set_ylabel('Average Rating')

# User age vs rating behavior: same construction, grouped by the rater's age
age_rating = df.groupby('age')['rating'].agg(['mean', 'count']).reset_index()
age_rating = age_rating[age_rating['count'] >= 10]
axes[1,1].scatter(age_rating['age'], age_rating['mean'], 
                 s=age_rating['count']/10, alpha=0.6)
axes[1,1].set_title('User Age vs Average Rating Given')
axes[1,1].set_xlabel('User Age')
axes[1,1].set_ylabel('Average Rating Given')

plt.tight_layout()
plt.show()

print(f"Rating period: {df['timestamp'].min()} to {df['timestamp'].max()}")
print(f"Most active day: {daily_ratings.idxmax()} ({daily_ratings.max()} ratings)")
print(f"Movie age range: {df['movie_age'].min():.0f} to {df['movie_age'].max():.0f} years")

## 7. User Similarity Analysis for Collaborative Filtering

In [None]:
# User similarity analysis for collaborative filtering
# Builds a dense user x item rating matrix, then inspects pairwise cosine
# similarity and co-rated item counts on a leading sample of users.
# `similarity_values` and `common_ratings` are reused by the insights cell.

# Unrated cells are filled with 0, so 0 doubles as the "not rated" marker
# below. cosine_similarity treats those zeros as ordinary values.
user_item_matrix = df.pivot_table(index='user_id', columns='item_id', values='rating', fill_value=0)

print(f"User-Item Matrix Shape: {user_item_matrix.shape}")
print(f"Matrix Sparsity: {(user_item_matrix == 0).sum().sum() / (user_item_matrix.shape[0] * user_item_matrix.shape[1]) * 100:.2f}%")

# Sample similarity calculation for first 100 users (for performance)
sample_size = min(100, len(user_item_matrix))
sample_matrix = user_item_matrix.iloc[:sample_size]
user_similarity = cosine_similarity(sample_matrix)

# Keep only the strict upper triangle: each unordered pair once, no self-similarity
similarity_values = user_similarity[np.triu_indices_from(user_similarity, k=1)]
mean_similarity = np.mean(similarity_values)

# Common ratings analysis for the first 50 users.
# With a 0/1 "has rated" matrix R, entry (i, j) of R @ R.T is the number of
# items both user i and user j rated. One matrix product replaces the nested
# Python loop over user pairs, and the upper triangle is read in the same
# (i, j > i) row-major order the loop produced.
pair_sample_size = min(50, len(user_item_matrix))
has_rated = (user_item_matrix.iloc[:pair_sample_size].to_numpy() > 0).astype(np.int64)
co_rated = has_rated @ has_rated.T
common_ratings = co_rated[np.triu_indices_from(co_rated, k=1)].tolist()
mean_common = np.mean(common_ratings)

plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.hist(similarity_values, bins=50, edgecolor='black', alpha=0.7)
plt.title('User Similarity Distribution')
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.axvline(mean_similarity, color='red', linestyle='--', label=f'Mean: {mean_similarity:.3f}')
plt.legend()

plt.subplot(1, 3, 2)
plt.hist(common_ratings, bins=30, edgecolor='black', alpha=0.7)
plt.title('Common Ratings Between Users')
plt.xlabel('Number of Common Rated Movies')
plt.ylabel('Frequency')
plt.axvline(mean_common, color='red', linestyle='--', label=f'Mean: {mean_common:.1f}')
plt.legend()

plt.subplot(1, 3, 3)
# Similarity heatmap for first 20 users
plt.imshow(user_similarity[:20, :20], cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(label='Cosine Similarity')
plt.title('User Similarity Heatmap (First 20 Users)')
plt.xlabel('User Index')
plt.ylabel('User Index')

plt.tight_layout()
plt.show()

# Threshold counts, computed once and reused in the messages below
n_pairs = len(similarity_values)
high_pairs = int((similarity_values > 0.5).sum())
very_high_pairs = int((similarity_values > 0.8).sum())
overlap_pairs = int((np.asarray(common_ratings) >= 10).sum())

print(f"Average user similarity: {mean_similarity:.3f}")
print(f"High similarity pairs (>0.5): {high_pairs} out of {n_pairs} ({high_pairs/n_pairs*100:.1f}%)")
print(f"Very high similarity pairs (>0.8): {very_high_pairs} out of {n_pairs} ({very_high_pairs/n_pairs*100:.1f}%)")
print(f"Average common ratings: {mean_common:.1f}")
print(f"Pairs with ≥10 common ratings: {overlap_pairs} ({overlap_pairs/len(common_ratings)*100:.1f}%)")

## 8. Recommendation System Insights

In [None]:
# Text summary of the findings. This cell computes nothing new from the raw
# data: it reads `sparsity`, `user_rating_counts`, `movie_rating_counts`,
# `similarity_values`, `common_ratings`, `genre_ratings` and the datetime
# `timestamp` / `movie_age` columns produced by the earlier cells, so those
# cells must have run first.
print("MovieLens 100K Recommendation System Insights:")
print("=" * 50)

# Data characteristics
print("\n1. DATA CHARACTERISTICS:")
print(f"   • High sparsity ({sparsity:.1f}%) - typical for recommendation datasets")
print("   • Power law distribution in both users and items")
print(f"   • Rating bias: users tend to rate movies they like (avg: {df['rating'].mean():.2f})")
print(f"   • Temporal span: {(df['timestamp'].max() - df['timestamp'].min()).days} days")

# Cold start challenges
print("\n2. COLD START CHALLENGES:")
new_users = int((user_rating_counts <= 5).sum())
new_items = int((movie_rating_counts <= 5).sum())
print(f"   • New users (≤5 ratings): {new_users} ({new_users/len(user_rating_counts)*100:.1f}%)")
print(f"   • New items (≤5 ratings): {new_items} ({new_items/len(movie_rating_counts)*100:.1f}%)")
print("   • Requires content-based or demographic filtering for cold start")

# Collaborative filtering potential
print("\n3. COLLABORATIVE FILTERING POTENTIAL:")
high_similarity_share = (similarity_values > 0.5).sum() / len(similarity_values) * 100
print(f"   • Average user similarity: {np.mean(similarity_values):.3f} (moderate)")
print(f"   • High similarity pairs: {high_similarity_share:.1f}%")
print(f"   • Average common ratings: {np.mean(common_ratings):.1f}")
print("   • Sufficient overlap for neighborhood-based CF")

# Content-based opportunities
print("\n4. CONTENT-BASED OPPORTUNITIES:")
# Count the genres that were actually found in df (the keys of genre_ratings),
# not the full candidate list, so the figure matches the genre analysis.
print(f"   • {len(genre_ratings)} genres available for content filtering")
print(f"   • Genre rating variance: {np.std(list(genre_ratings.values())):.3f}")
print(f"   • Movie age spans {df['movie_age'].max():.0f} years")
print("   • User demographics: age, gender, occupation available")

# Hybrid approach recommendations
print("\n5. HYBRID APPROACH RECOMMENDATIONS:")
power_users = int((user_rating_counts >= 100).sum())
popular_movies = int((movie_rating_counts >= 100).sum())
print(f"   • Use CF for {power_users} power users ({power_users/len(user_rating_counts)*100:.1f}%)")
print(f"   • Use CF for {popular_movies} popular movies ({popular_movies/len(movie_rating_counts)*100:.1f}%)")
print("   • Use content-based for cold start scenarios")
print("   • Consider matrix factorization for scalability")
print("   • Implement rating bias correction (user/item means)")