# 03 - Exploratory Data Analysis & Visualization

Notebook này thực hiện phân tích khám phá dữ liệu (EDA) và trực quan hóa.

## Mục Tiêu
- Phân tích phân bố ratings
- Phân tích tần suất genres
- Phân tích top movies
- Tạo heatmaps và correlation matrices
- Tạo word clouds từ titles
- Phân tích temporal trends

## 1. Import Libraries

In [None]:
import sys
import os

# Append the absolute form of '../src' to the import search path. The relative
# path is resolved against the current working directory at execution time.
# NOTE(review): no module from src is imported in the cells visible here -
# confirm this line is still needed.
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
# Suppress every warning category for the rest of the session; this also
# hides deprecation notices emitted by the plotting and data libraries.
warnings.filterwarnings('ignore')

# Pandas display: never truncate columns, show up to 100 rows when printing
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Global plot defaults: seaborn 'whitegrid' style, 14x6 figures, font size 10
# (individual cells override the figure size via figsize=...)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 10

print("Libraries imported successfully")

## 2. Load Processed Data

In [None]:
# Read the two processed CSV tables used by every later cell.
data_dir = '../data/processed'
movies_csv = f'{data_dir}/movies_enriched.csv'
ratings_csv = f'{data_dir}/ratings.csv'

movies = pd.read_csv(movies_csv)
ratings = pd.read_csv(ratings_csv)

# Report row counts, shapes and the full movie column list as a quick sanity check.
for report_line in (
    f"Loaded {len(movies)} movies, {len(ratings)} ratings",
    f"\nMovies shape: {movies.shape}",
    f"Ratings shape: {ratings.shape}",
    f"\nMovies columns ({len(movies.columns)}): {list(movies.columns)}",
):
    print(report_line)

## 3. Rating Analysis

In [None]:
banner = "=" * 70
print(banner)
print("RATING DISTRIBUTION ANALYSIS")
print(banner)

# Work on the rating column once instead of re-selecting it for each summary.
rating_values = ratings['rating']

# describe(): count, mean, std, min, quartiles, max
print("\nRating Statistics:")
print(rating_values.describe())

# Frequency of each distinct rating value, ordered by the rating itself
print("\nRating Value Counts:")
print(rating_values.value_counts().sort_index())

In [None]:
# Visualize the rating distribution from four angles in a 2x2 grid:
# histogram, per-value bar chart, boxplot and a pie chart of rating ranges.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# 1. Histogram
axes[0, 0].hist(ratings['rating'], bins=10, edgecolor='black', color='skyblue')
axes[0, 0].set_xlabel('Rating')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Rating Distribution (Histogram)')
axes[0, 0].grid(True, alpha=0.3)

# 2. Bar chart
rating_counts = ratings['rating'].value_counts().sort_index()
# Derive the bar width from the smallest gap between rating values. The
# matplotlib default (0.8) makes neighbouring bars overlap when ratings are
# spaced less than 0.8 apart (e.g. a half-star scale); on a whole-star scale
# this evaluates to the same 0.8 as before.
rating_gaps = np.diff(rating_counts.index.to_numpy(dtype=float))
bar_width = 0.8 * rating_gaps.min() if rating_gaps.size else 0.8
axes[0, 1].bar(rating_counts.index, rating_counts.values, width=bar_width,
               color='coral', edgecolor='black')
axes[0, 1].set_xlabel('Rating')
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_title('Rating Distribution (Bar Chart)')
axes[0, 1].grid(True, alpha=0.3, axis='y')

# 3. Boxplot
axes[1, 0].boxplot(ratings['rating'], vert=False)
axes[1, 0].set_xlabel('Rating')
axes[1, 0].set_title('Rating Distribution (Boxplot)')
axes[1, 0].grid(True, alpha=0.3)

# 4. Pie chart (rating ranges)
range_labels = ['Low (0-2)', 'Medium (2-3)', 'Good (3-4)', 'Excellent (4-5)']
# Bind each range to its colour explicitly so the mapping cannot drift with
# the order in which the slices are drawn.
range_colors = dict(zip(range_labels, ['red', 'yellow', 'lightgreen', 'green']))
# include_lowest=True makes the first bin closed on the left, so a rating of
# exactly 0 is counted as 'Low (0-2)' instead of being dropped.
rating_ranges = pd.cut(ratings['rating'], bins=[0, 2, 3, 4, 5],
                       labels=range_labels, include_lowest=True)
# value_counts() alone orders by frequency, which previously paired the
# Low->Excellent colour list with the wrong slices; sort_index() restores the
# categorical (Low -> Excellent) order.
range_counts = rating_ranges.value_counts().sort_index()
# Empty ranges would only add overlapping 0.0% labels to the pie.
range_counts = range_counts[range_counts > 0]
axes[1, 1].pie(range_counts.values, labels=range_counts.index, autopct='%1.1f%%',
               startangle=90, colors=[range_colors[label] for label in range_counts.index])
axes[1, 1].set_title('Rating Ranges Distribution')

plt.tight_layout()
plt.show()

## 4. Genre Analysis

In [None]:
banner = "=" * 70
print(banner)
print("GENRE ANALYSIS")
print(banner)

# Split the pipe-delimited genre string of each movie, then flatten to one
# row per (movie, genre) pair so genres can be counted individually.
genre_lists = movies['genres'].str.split('|')
all_genres = genre_lists.explode()
genre_counts = all_genres.value_counts()

print(f"\nTotal unique genres: {len(genre_counts)}")
print(f"\nTop 10 genres:")
print(genre_counts.head(10))

In [None]:
# Visualize genre distribution: most frequent genres (left) and how many
# genres a single movie carries (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# 1. Bar chart - Top genres
top_genres = genre_counts.head(15)
axes[0].barh(range(len(top_genres)), top_genres.values, color='steelblue')
axes[0].set_yticks(range(len(top_genres)))
axes[0].set_yticklabels(top_genres.index)
axes[0].set_xlabel('Number of Movies')
axes[0].set_title('Top 15 Genres by Frequency')
axes[0].invert_yaxis()  # most frequent genre at the top
axes[0].grid(True, alpha=0.3, axis='x')

# 2. Genre count distribution
# Drop missing genre strings first: str.split leaves NaN untouched and
# apply(len) would raise TypeError on it. The word-cloud cell already guards
# the same column with dropna().
genre_per_movie = movies['genres'].dropna().str.split('|').str.len()
max_genres = int(genre_per_movie.max())
# One unit-wide bin per integer count; align='left' centres each bar on its
# count instead of drawing it between two ticks.
axes[1].hist(genre_per_movie, bins=range(1, max_genres + 2), align='left',
             edgecolor='black', color='teal')
axes[1].set_xticks(range(1, max_genres + 1))
axes[1].set_xlabel('Number of Genres per Movie')
axes[1].set_ylabel('Number of Movies')
axes[1].set_title('Distribution of Genre Count per Movie')
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print(f"\nAverage genres per movie: {genre_per_movie.mean():.2f}")
print(f"Max genres per movie: {genre_per_movie.max()}")

## 5. Top Movies Analysis

In [None]:
banner = "=" * 70
print(banner)
print("TOP MOVIES ANALYSIS")
print(banner)

# Keep only movies with at least 50 ratings so averages are not dominated by
# a handful of votes; copy() detaches the subset from the full table.
has_enough_ratings = movies['num_ratings'].ge(50)
popular_movies = movies.loc[has_enough_ratings].copy()
print(f"\nMovies with ≥50 ratings: {len(popular_movies)}")

# Columns shown in both ranking tables
ranking_cols = ['title_clean', 'year', 'avg_rating', 'num_ratings', 'genres']

# Ranking by average rating
print("\nTop 10 Highest Rated Movies (≥50 ratings):")
top_rated = popular_movies.nlargest(10, 'avg_rating')[ranking_cols]
print(top_rated)

# Ranking by number of ratings received
print("\n\nTop 10 Most Rated Movies:")
most_rated = popular_movies.nlargest(10, 'num_ratings')[ranking_cols]
print(most_rated)

In [None]:
# Two stacked horizontal bar charts: best-rated and most-rated popular movies.
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

top_10_rated = popular_movies.nlargest(10, 'avg_rating')
top_10_popular = popular_movies.nlargest(10, 'num_ratings')

# (axis, ranked frame, value column, bar colour, x label, title) per panel
ranking_panels = (
    (axes[0], top_10_rated, 'avg_rating', 'gold',
     'Average Rating', 'Top 10 Highest Rated Movies (≥50 ratings)'),
    (axes[1], top_10_popular, 'num_ratings', 'purple',
     'Number of Ratings', 'Top 10 Most Rated Movies'),
)

for panel_ax, ranked, value_col, bar_color, x_label, panel_title in ranking_panels:
    bar_positions = range(len(ranked))
    panel_ax.barh(bar_positions, ranked[value_col].values, color=bar_color)
    panel_ax.set_yticks(bar_positions)
    panel_ax.set_yticklabels(ranked['title_clean'].values)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)
    panel_ax.invert_yaxis()  # rank 1 at the top
    panel_ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 6. Temporal Analysis

In [None]:
banner = "=" * 70
print(banner)
print("TEMPORAL ANALYSIS")
print(banner)

# Restrict to movies with a known release year, then count movies per year.
year_is_known = movies['year'].notna()
movies_with_year = movies[year_is_known]
movies_by_year = movies_with_year.groupby('year').size()

print(f"\nYear range: {movies_with_year['year'].min():.0f} - {movies_with_year['year'].max():.0f}")
print(f"\nTop 10 years by movie count:")
print(movies_by_year.nlargest(10))

In [None]:
# Temporal trends in a 2x2 grid; unpack the grid row by row into named axes.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
ax_per_year, ax_per_decade, ax_decade_rating, ax_era = axes.ravel()

# Top-left: number of movies per release year (line with shaded area)
ax_per_year.plot(movies_by_year.index, movies_by_year.values, color='darkblue', linewidth=2)
ax_per_year.fill_between(movies_by_year.index, movies_by_year.values, alpha=0.3)
ax_per_year.set_xlabel('Year')
ax_per_year.set_ylabel('Number of Movies')
ax_per_year.set_title('Movies Released Per Year')
ax_per_year.grid(True, alpha=0.3)

# Top-right: number of movies per decade (bars 8 wide on the decade axis)
movies_by_decade = movies_with_year.groupby('decade').size()
ax_per_decade.bar(movies_by_decade.index, movies_by_decade.values,
                  color='orange', edgecolor='black', width=8)
ax_per_decade.set_xlabel('Decade')
ax_per_decade.set_ylabel('Number of Movies')
ax_per_decade.set_title('Movies by Decade')
ax_per_decade.grid(True, alpha=0.3, axis='y')

# Bottom-left: mean of the per-movie average rating within each decade
rating_by_decade = movies_with_year.groupby('decade')['avg_rating'].mean()
ax_decade_rating.plot(rating_by_decade.index, rating_by_decade.values,
                      marker='o', color='green', linewidth=2, markersize=8)
ax_decade_rating.set_xlabel('Decade')
ax_decade_rating.set_ylabel('Average Rating')
ax_decade_rating.set_title('Average Rating by Decade')
ax_decade_rating.grid(True, alpha=0.3)

# Bottom-right: share of movies per era, slices ordered by era label
movies_by_era = movies_with_year['era'].value_counts().sort_index()
ax_era.pie(movies_by_era.values, labels=movies_by_era.index, autopct='%1.1f%%', startangle=90)
ax_era.set_title('Movies by Era')

plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
banner = "=" * 70
print(banner)
print("CORRELATION ANALYSIS")
print(banner)

# Numeric movie features to correlate with each other
numeric_cols = ['year', 'avg_rating', 'std_rating', 'num_ratings', 'popularity', 'movie_age', 'genres_count']

# Use only rows where every selected feature is present
corr_data = movies.loc[:, numeric_cols].dropna()

# Pairwise correlation between the selected features (DataFrame.corr defaults)
corr_matrix = corr_data.corr()

print("\nCorrelation Matrix:")
print(corr_matrix)

In [None]:
# Draw the correlation matrix twice: in full, and as a lower triangle only.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
full_ax, lower_ax = axes

# Cell annotation and layout settings shared by both heatmaps
cell_style = dict(annot=True, fmt=".2f", square=True, linewidths=1)

# 1. Full matrix on a diverging palette centred at zero
sns.heatmap(corr_matrix, cmap='coolwarm', center=0,
            cbar_kws={"shrink": 0.8}, ax=full_ax, **cell_style)
full_ax.set_title('Correlation Heatmap')

# 2. Lower triangle only: mask the diagonal and everything above it so each
#    feature pair is shown once (this is a masked heatmap, not a clustermap)
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, cmap='viridis',
            cbar_kws={"shrink": 0.8}, ax=lower_ax, **cell_style)
lower_ax.set_title('Correlation Heatmap (Lower Triangle)')

plt.tight_layout()
plt.show()

## 8. Scatter Plot Analysis

In [None]:
# Scatter plots for four feature pairs, laid out row by row in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# One entry per panel: (x column, y column, colour, x label, y label, title)
scatter_panels = [
    ('num_ratings', 'avg_rating', 'blue',
     'Number of Ratings', 'Average Rating', 'Average Rating vs Number of Ratings'),
    ('year', 'avg_rating', 'green',
     'Year', 'Average Rating', 'Average Rating vs Year'),
    ('movie_age', 'num_ratings', 'red',
     'Movie Age (years)', 'Number of Ratings', 'Number of Ratings vs Movie Age'),
    ('genres_count', 'avg_rating', 'purple',
     'Number of Genres', 'Average Rating', 'Average Rating vs Number of Genres'),
]

# axes.flat walks the grid in row-major order, matching the list above.
for panel_ax, (x_col, y_col, dot_color, x_label, y_label, panel_title) in zip(axes.flat, scatter_panels):
    panel_ax.scatter(movies[x_col], movies[y_col], alpha=0.5, s=20, c=dot_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Word Cloud Analysis

In [None]:
print("=" * 70)
print("WORD CLOUD GENERATION")
print("=" * 70)

# Two word clouds side by side: words in movie titles, and genre labels.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# 1. Word cloud from titles
# Free text: let WordCloud tokenize the concatenated titles itself.
text_titles = ' '.join(movies['title_clean'].dropna().values)
wordcloud_titles = WordCloud(width=800, height=400, background_color='white',
                             colormap='viridis', max_words=100).generate(text_titles)
axes[0].imshow(wordcloud_titles, interpolation='bilinear')
axes[0].axis('off')
axes[0].set_title('Word Cloud from Movie Titles', fontsize=16)

# 2. Word cloud from genres
# Genres are labels, not free text. Feeding them through generate() would
# tokenize them with WordCloud's word regex - splitting hyphenated labels into
# separate words - and its collocation step would count adjacent genres from
# different movies as two-word phrases. Counting the labels directly and
# passing the frequencies keeps every genre intact and sized by its true count.
genre_frequencies = (
    movies['genres'].dropna().str.split('|').explode().value_counts().to_dict()
)
wordcloud_genres = WordCloud(width=800, height=400, background_color='white',
                             colormap='plasma', max_words=50
                             ).generate_from_frequencies(genre_frequencies)
axes[1].imshow(wordcloud_genres, interpolation='bilinear')
axes[1].axis('off')
axes[1].set_title('Word Cloud from Genres', fontsize=16)

plt.tight_layout()
plt.show()

print("\nWord clouds generated successfully!")

## 10. Genre-specific Analysis

In [None]:
print("=" * 70)
print("GENRE-SPECIFIC ANALYSIS")
print("=" * 70)

# Binary indicator columns are recognised by their 'is_' prefix
genre_cols = [col for col in movies.columns if col.startswith('is_')]
print(f"\nFound {len(genre_cols)} binary genre features")

# Per-genre aggregates: mean of the movies' average rating, number of movies,
# and total number of ratings received by those movies.
genre_ratings = {}
for col in genre_cols:
    # Strip only the leading 'is_' (str.replace would also remove any later
    # 'is_' inside the name), then build a display label.
    # NOTE(review): the previous code called .replace('-', '-'), a no-op;
    # mapping '_' to '-' is the presumed intent - confirm against the actual
    # indicator column names.
    genre_name = col[len('is_'):].replace('_', '-').title()
    genre_movies = movies[movies[col] == 1]
    if genre_movies.empty:
        continue  # indicator never set: nothing to aggregate
    genre_ratings[genre_name] = {
        'avg_rating': genre_movies['avg_rating'].mean(),
        'count': len(genre_movies),
        'total_ratings': genre_movies['num_ratings'].sum()
    }

# Build one row per genre. Naming the columns explicitly keeps each column's
# own dtype (counts stay integers) and yields a well-formed empty frame when
# no indicator column exists, so the sort below cannot raise KeyError.
genre_df = pd.DataFrame.from_dict(
    genre_ratings, orient='index', columns=['avg_rating', 'count', 'total_ratings']
).sort_values('avg_rating', ascending=False)
print("\nAverage Rating by Genre:")
print(genre_df)

In [None]:
# Per-genre bar charts: mean rating (left) and number of movies (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (metric column, bar colour, x label, title) for the left and right panel
genre_panels = (
    ('avg_rating', 'steelblue', 'Average Rating', 'Average Rating by Genre'),
    ('count', 'coral', 'Number of Movies', 'Movie Count by Genre'),
)

for panel_ax, (metric, bar_color, x_label, panel_title) in zip(axes, genre_panels):
    # Ascending sort: barh draws the first row at the bottom, so the largest
    # value ends up at the top of the chart.
    ordered = genre_df.sort_values(metric, ascending=True)
    bar_positions = range(len(ordered))
    panel_ax.barh(bar_positions, ordered[metric].values, color=bar_color)
    panel_ax.set_yticks(bar_positions)
    panel_ax.set_yticklabels(ordered.index)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 11. Summary & Insights

In [None]:
# Final recap. Relies on objects computed by the earlier cells: genre_counts,
# genre_per_movie, genre_df, movies_with_year, movies_by_year,
# movies_by_decade and corr_matrix.
print("=" * 70)
print("EDA SUMMARY & KEY INSIGHTS")
print("=" * 70)

print("\n1. DATASET OVERVIEW:")
print(f"   - Total movies: {len(movies):,}")
print(f"   - Total ratings: {len(ratings):,}")
print(f"   - Unique genres: {len(genre_counts)}")
print(f"   - Year range: {movies_with_year['year'].min():.0f} - {movies_with_year['year'].max():.0f}")

print("\n2. RATING INSIGHTS:")
print(f"   - Average rating: {ratings['rating'].mean():.2f}")
print(f"   - Median rating: {ratings['rating'].median():.2f}")
print(f"   - Most common rating: {ratings['rating'].mode()[0]:.1f}")
print(f"   - Rating std dev: {ratings['rating'].std():.2f}")

print("\n3. GENRE INSIGHTS:")
# idxmax() selects by value, so these lines do not depend on the order in
# which genre_counts / genre_df happen to be sorted.
print(f"   - Most common genre: {genre_counts.idxmax()}")
print(f"   - Avg genres per movie: {genre_per_movie.mean():.2f}")
print(f"   - Highest rated genre: {genre_df['avg_rating'].idxmax()}")

print("\n4. TEMPORAL INSIGHTS:")
print(f"   - Most productive decade: {movies_by_decade.idxmax():.0f}s")
print(f"   - Peak year: {movies_by_year.idxmax():.0f}")

print("\n5. KEY CORRELATIONS:")
# Correlation of every other feature with avg_rating. Undefined (NaN)
# coefficients - e.g. from a constant column - are dropped first; otherwise
# sorting places NaN last and it would be reported as the "strongest negative".
rating_corr = corr_matrix['avg_rating'].drop('avg_rating').dropna()
if rating_corr.empty:
    print("   No valid correlations with rating available")
else:
    # Largest and smallest coefficient; the smallest is only actually
    # negative if at least one coefficient is below zero.
    most_positive = rating_corr.idxmax()
    most_negative = rating_corr.idxmin()
    print(f"   Strongest positive correlation with rating: {most_positive} ({rating_corr[most_positive]:.3f})")
    print(f"   Strongest negative correlation with rating: {most_negative} ({rating_corr[most_negative]:.3f})")

print("\n" + "=" * 70)
print("EDA COMPLETED SUCCESSFULLY!")
print("=" * 70)