# Exploratory Data Analysis (EDA)

In this notebook, we analyze the MovieLens `ml-latest-small` dataset to understand user–item interaction patterns, the sparsity of the rating matrix, and the popularity distribution of movies.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Read the raw MovieLens CSVs; the directory is relative to the notebook's
# working directory.
data_dir = '../data/raw/ml-latest-small'
ratings_path = os.path.join(data_dir, 'ratings.csv')
movies_path = os.path.join(data_dir, 'movies.csv')

ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)

# Report (rows, columns) for each table as a quick sanity check on the load.
print(f'Ratings shape: {ratings.shape}')
print(f'Movies shape: {movies.shape}')

## 1. Data Overview

In [None]:
# Preview the first five rows of the ratings table (rendered as the cell output).
ratings.head(n=5)

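## 2. Sparsity Calculation

Sparsity is the fraction of the user–item matrix that has no observed rating: $\text{sparsity} = 1 - \frac{\text{number of ratings}}{\text{number of users} \times \text{number of items}}$, where users and items are the distinct `userId` and `movieId` values that appear in the ratings table.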

In [None]:
# Dimensions of the user x movie matrix, counting only ids that actually
# appear in the ratings table.
n_users, n_items = ratings['userId'].nunique(), ratings['movieId'].nunique()
n_ratings = ratings.shape[0]  # one row per rating

# Density is the filled share of the matrix; sparsity is its complement.
density = n_ratings / (n_users * n_items)
sparsity = 1 - density
print(f'Sparsity: {sparsity:.4f}')

## 3. Rating Distribution

In [None]:
# Bar chart with one bar per distinct rating value, showing how often each occurs.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='rating', data=ratings, ax=ax)
ax.set_title('Distribution of Ratings')
plt.show()

## 4. Popularity Distribution (Long Tail)

In [None]:
# Number of ratings per movie, ordered from most- to least-rated.
ratings_per_movie = ratings.groupby('movieId').size()
movie_counts = ratings_per_movie.sort_values(ascending=False)

# Plot the counts against rank position (0 = most-rated movie) to expose the tail.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(movie_counts.values)
ax.set(
    title='Long Tail Distribution of Movie Ratings',
    xlabel='Movie Rank',
    ylabel='Number of Ratings',
)
plt.show()