# MovieLens 100K Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import ast

# Global plotting defaults for the notebook.
# Keep this order: the seaborn palette is applied after the matplotlib style
# so that the style sheet does not replace the colour cycle.
plt.style.use(style='default')
sns.set_palette(palette='husl')

In [None]:
# Read the combined CSV into the DataFrame that the later cells work on
df = pd.read_csv('ml100k_combined.csv')

# Report what was loaded: the (rows, columns) shape and the column labels
n_rows_by_cols = df.shape
column_names = df.columns.tolist()
print(f"Dataset shape: {n_rows_by_cols}")
print(f"Columns: {column_names}")

# Last expression of the cell, so the notebook displays the first rows
df.head()

In [None]:
# Basic statistics: headline counts and value ranges for the ratings table
print("Dataset Info:")
print(f"Total ratings: {len(df):,}")
print(f"Unique users: {df['user_id'].nunique():,}")
print(f"Unique movies: {df['item_id'].nunique():,}")
print(f"Rating range: {df['rating'].min()} - {df['rating'].max()}")
print(f"Average rating: {df['rating'].mean():.2f}")

# A numeric timestamp column would otherwise be printed as raw integers under
# the "Date range" label, so convert it to datetimes for display only
# (df itself is left untouched).
# NOTE(review): assumes numeric timestamps are Unix epoch seconds — confirm
# against how ml100k_combined.csv was produced.
timestamps = df['timestamp']
if pd.api.types.is_numeric_dtype(timestamps):
    timestamps = pd.to_datetime(timestamps, unit='s')
print(f"Date range: {timestamps.min()} to {timestamps.max()}")

In [None]:
# Overview distributions: ratings, user ages, user genders, movie release years
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# df holds one row per rating, so every user/movie attribute is repeated once
# per rating. Count each user and each movie once; otherwise the "user" and
# "movie" plots are weighted by how many ratings each one has.
# NOTE(review): assumes a user's age/gender and a movie's year are identical
# on all of their rows — confirm against how the CSV was combined.
users = df.drop_duplicates(subset='user_id')

# Rating distribution (one observation per rating row, as intended)
axes[0,0].hist(df['rating'], bins=5, edgecolor='black', alpha=0.7)
axes[0,0].set_title('Rating Distribution')
axes[0,0].set_xlabel('Rating')
axes[0,0].set_ylabel('Count')

# Age distribution (one observation per user; missing ages are skipped
# because a histogram cannot bin NaN)
axes[0,1].hist(users['age'].dropna(), bins=20, edgecolor='black', alpha=0.7)
axes[0,1].set_title('User Age Distribution')
axes[0,1].set_xlabel('Age')
axes[0,1].set_ylabel('Count')

# Gender distribution (one observation per user)
gender_counts = users['gender'].value_counts()
axes[1,0].pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%')
axes[1,0].set_title('Gender Distribution')

# Movie year distribution (one observation per movie with a known year)
df_clean = df.dropna(subset=['year'])
movie_years = df_clean.drop_duplicates(subset='item_id')['year']
axes[1,1].hist(movie_years, bins=20, edgecolor='black', alpha=0.7)
axes[1,1].set_title('Movie Release Year Distribution')
axes[1,1].set_xlabel('Year')
axes[1,1].set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Top occupations.
# df holds one row per rating, so counting the 'occupation' column directly
# would rank occupations by number of ratings, not by number of users.
# Keep one row per user before counting so the chart matches its title.
# NOTE(review): assumes a user's occupation is identical on all of their rows.
occupation_counts = (
    df.drop_duplicates(subset='user_id')['occupation']
    .value_counts()
    .head(10)
)
plt.figure(figsize=(12, 6))
occupation_counts.plot(kind='bar')
plt.title('Top 10 User Occupations')
plt.xlabel('Occupation')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Genre analysis
def parse_genres(genre_str):
    """Parse a stringified Python list of genres into a real list.

    Args:
        genre_str: Text holding a Python literal, e.g. "['Action', 'Comedy']".

    Returns:
        The genres as a list. An empty list is returned when the input
        cannot be parsed as a literal, or when it parses to something other
        than a list, tuple or set (so callers can always iterate/extend).
    """
    try:
        parsed = ast.literal_eval(genre_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the failures literal_eval raises for malformed input are
        # treated as "no genres"; anything else (e.g. KeyboardInterrupt)
        # propagates instead of being silently swallowed.
        return []

    # A scalar or string literal is not a genre list; returning it would make
    # the caller's list.extend() fail or split a string into characters.
    if not isinstance(parsed, (list, tuple, set)):
        return []
    return list(parsed)

# Flatten the parsed genre list of every non-null row into one long list
genre_column = df['genres'].dropna()
all_genres = [genre for raw in genre_column for genre in parse_genres(raw)]

# Tally the labels and keep the ten most frequent, highest count first
genre_counts = Counter(all_genres)
top_genres = dict(genre_counts.most_common(10))

# Bar chart of the ten most frequent genres
genre_names = list(top_genres)
genre_totals = list(top_genres.values())
plt.figure(figsize=(12, 6))
plt.bar(genre_names, genre_totals)
plt.title('Top 10 Movie Genres')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Rating patterns by demographics, plus the best-rated movies and busiest users
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Average rating by gender
gender_ratings = df.groupby('gender')['rating'].mean()
axes[0,0].bar(gender_ratings.index, gender_ratings.values)
axes[0,0].set_title('Average Rating by Gender')
axes[0,0].set_ylabel('Average Rating')

# Average rating by age group.
# right=False makes every bin [lower, upper), so the labels are truthful:
# '<25' excludes age 25 and '50+' includes age 50. The open-ended last bin
# keeps any age of 100 or more instead of turning it into NaN.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 25, 35, 50, np.inf],
    labels=['<25', '25-35', '35-50', '50+'],
    right=False,
)
# observed=False is spelled out: it keeps all four groups on the axis even if
# one is empty, and avoids the pandas warning about the changing default for
# categorical groupers.
age_ratings = df.groupby('age_group', observed=False)['rating'].mean()
axes[0,1].bar(age_ratings.index.astype(str), age_ratings.values)
axes[0,1].set_title('Average Rating by Age Group')
axes[0,1].set_ylabel('Average Rating')

# Top rated movies: mean rating per movie, restricted to movies with at least
# 50 ratings so a handful of votes cannot dominate the ranking
top_movies = df.groupby(['item_id', 'title'])['rating'].agg(['mean', 'count']).reset_index()
top_movies = top_movies[top_movies['count'] >= 50].nlargest(10, 'mean')
bar_positions = range(len(top_movies))
axes[1,0].barh(bar_positions, top_movies['mean'])
axes[1,0].set_yticks(bar_positions)
# Long titles are cut to 30 characters so the tick labels stay readable
axes[1,0].set_yticklabels([title[:30] + '...' if len(title) > 30 else title for title in top_movies['title']])
# nlargest lists the best movie first; flip the axis so it is drawn at the top
axes[1,0].invert_yaxis()
axes[1,0].set_title('Top 10 Highest Rated Movies (≥50 ratings)')
axes[1,0].set_xlabel('Average Rating')

# Most active users: the x axis shows the rank (1 = most ratings), not the user id
user_activity = df['user_id'].value_counts().head(10)
user_ranks = range(1, len(user_activity) + 1)
axes[1,1].bar(user_ranks, user_activity.values)
axes[1,1].set_xticks(list(user_ranks))
axes[1,1].set_title('Top 10 Most Active Users')
axes[1,1].set_xlabel('User Rank')
axes[1,1].set_ylabel('Number of Ratings')

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis: pairwise correlations between the numeric columns
numeric_cols = ['rating', 'age', 'year']
corr_data = df.loc[:, numeric_cols].corr()

# Annotated heatmap with the colour scale centred on zero correlation
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_data, annot=True, cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
corr_fig.tight_layout()
plt.show()

In [None]:
# Sparsity analysis: share of the user x item grid that carries no rating
print("Data Sparsity Analysis:")
actual_ratings = len(df)
n_users = df['user_id'].nunique()
n_items = df['item_id'].nunique()
total_possible_ratings = n_users * n_items
fill_ratio = actual_ratings / total_possible_ratings
sparsity = (1 - fill_ratio) * 100
print(f"Total possible user-item pairs: {total_possible_ratings:,}")
print(f"Actual ratings: {actual_ratings:,}")
print(f"Sparsity: {sparsity:.2f}%")

# Number of ratings contributed by each user and received by each item
user_rating_counts = df['user_id'].value_counts()
item_rating_counts = df['item_id'].value_counts()

# Per-user summary
print("\nUser rating statistics:")
user_mean = user_rating_counts.mean()
user_min = user_rating_counts.min()
user_max = user_rating_counts.max()
print(f"Average ratings per user: {user_mean:.1f}")
print(f"Min ratings per user: {user_min}")
print(f"Max ratings per user: {user_max}")

# Per-item summary
print("\nItem rating statistics:")
item_mean = item_rating_counts.mean()
item_min = item_rating_counts.min()
item_max = item_rating_counts.max()
print(f"Average ratings per item: {item_mean:.1f}")
print(f"Min ratings per item: {item_min}")
print(f"Max ratings per item: {item_max}")