# 03 - Exploratory Data Analysis & Visualization

Notebook này thực hiện phân tích khám phá dữ liệu (EDA) và trực quan hóa.

## Mục Tiêu
- Phân tích phân bố ratings
- Phân tích tần suất genres
- Phân tích top movies
- Tạo heatmaps và correlation matrices
- Tạo word clouds từ titles
- Phân tích temporal trends

## 1. Import Libraries

In [None]:
import sys
import os

# Add src to path
sys.path.append(os.path.abspath('../src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# pandas display: show every column and at most 100 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot defaults: seaborn grid theme first, then figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6), 'font.size': 10})

print("Libraries imported successfully")

## 2. Load Processed Data

In [None]:
# Read the two processed CSV files used throughout this notebook
data_dir = '../data/processed'
movies = pd.read_csv(f'{data_dir}/movies_enriched.csv')
ratings = pd.read_csv(f'{data_dir}/ratings.csv')

n_movies, n_ratings = len(movies), len(ratings)
print(f"Loaded {n_movies} movies, {n_ratings} ratings")
print(f"\nMovies shape: {movies.shape}")
print(f"Ratings shape: {ratings.shape}")

# One row of three panels: table sizes, column dtypes, missing values
fig, (ax_sizes, ax_dtypes, ax_missing) = plt.subplots(1, 3, figsize=(16, 5))

# Panel 1: row count of each table on a log-scaled y axis
table_sizes = {'Movies': n_movies, 'Ratings': n_ratings}
size_bars = ax_sizes.bar(list(table_sizes), list(table_sizes.values()),
                         color=['#3498db', '#e74c3c'], edgecolor='black')
ax_sizes.set_ylabel('Count')
ax_sizes.set_title('Dataset Size Overview')
ax_sizes.set_yscale('log')
for rect, n_rows in zip(size_bars, table_sizes.values()):
    # Put the exact count directly on top of its bar
    ax_sizes.text(rect.get_x() + rect.get_width()/2, rect.get_height(), f'{n_rows:,}',
                  ha='center', va='bottom', fontsize=12, fontweight='bold')

# Panel 2: share of movie columns per dtype
dtype_counts = movies.dtypes.value_counts()
ax_dtypes.pie(dtype_counts.values, labels=dtype_counts.index.astype(str), autopct='%1.1f%%',
              startangle=90, colors=plt.cm.Set3.colors)
ax_dtypes.set_title('Movies Data Types Distribution')

# Panel 3: columns of `movies` that contain at least one missing value
missing_counts = movies.isnull().sum()
missing_cols = missing_counts[missing_counts > 0].sort_values(ascending=True)
if missing_cols.empty:
    # Nothing to plot: show a message in a blank panel instead
    ax_missing.text(0.5, 0.5, 'No Missing Values!', ha='center', va='center', fontsize=16, fontweight='bold')
    ax_missing.set_title('Missing Values Check')
    ax_missing.axis('off')
else:
    ax_missing.barh(missing_cols.index, missing_cols.values, color='#f39c12')
    ax_missing.set_xlabel('Missing Values Count')
    ax_missing.set_title('Missing Values in Movies Dataset')

plt.tight_layout()
plt.show()

print(f"\nMovies columns ({len(movies.columns)}): {list(movies.columns)}")

## 3. Rating Analysis

In [None]:
for banner_line in ("=" * 70, "RATING DISTRIBUTION ANALYSIS", "=" * 70):
    print(banner_line)

# Three panels: summary statistics, per-value counts, empirical CDF
fig, (ax_stats, ax_counts, ax_cdf) = plt.subplots(1, 3, figsize=(16, 5))

# Panel 1: selected entries of describe() drawn as bars
stats = ratings['rating'].describe()
stat_keys = {'Mean': 'mean', 'Std': 'std', 'Min': 'min', '25%': '25%',
             '50%': '50%', '75%': '75%', 'Max': 'max'}
stat_names = list(stat_keys)
stat_values = [stats[key] for key in stat_keys.values()]
stat_colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(stat_names)))
stat_bars = ax_stats.bar(stat_names, stat_values, color=stat_colors, edgecolor='black')
ax_stats.set_ylabel('Value')
ax_stats.set_title('Rating Statistics Summary')
ax_stats.set_ylim(0, 6)
for rect, stat_value in zip(stat_bars, stat_values):
    # Value label slightly above each bar
    ax_stats.text(rect.get_x() + rect.get_width()/2, rect.get_height() + 0.1, f'{stat_value:.2f}',
                  ha='center', va='bottom', fontsize=10)

# Panel 2: how often each rating value occurs (lollipop / stem plot)
rating_counts = ratings['rating'].value_counts().sort_index()
ax_counts.stem(rating_counts.index, rating_counts.values,
               linefmt='steelblue', markerfmt='o', basefmt=' ')
ax_counts.set_xlabel('Rating')
ax_counts.set_ylabel('Count')
ax_counts.set_title('Rating Value Distribution')
ax_counts.grid(True, alpha=0.3, axis='y')

# Panel 3: empirical cumulative distribution over the sorted ratings
rating_sorted = np.sort(ratings['rating'])
n_sorted = len(rating_sorted)
cumulative = np.arange(1, n_sorted + 1) / n_sorted
ax_cdf.plot(rating_sorted, cumulative, color='#e74c3c', linewidth=2)
ax_cdf.fill_between(rating_sorted, cumulative, alpha=0.3, color='#e74c3c')
ax_cdf.set_xlabel('Rating')
ax_cdf.set_ylabel('Cumulative Proportion')
ax_cdf.set_title('Cumulative Distribution of Ratings')
ax_cdf.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nTotal ratings: {len(ratings):,}")
print(f"Average rating: {ratings['rating'].mean():.2f}")
print(f"Most common rating: {ratings['rating'].mode()[0]}")

In [None]:
# Visualize the rating distribution in four views:
# histogram, per-value bar chart, boxplot and a pie chart of rating ranges.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# 1. Histogram
axes[0, 0].hist(ratings['rating'], bins=10, edgecolor='black', color='skyblue')
axes[0, 0].set_xlabel('Rating')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Rating Distribution (Histogram)')
axes[0, 0].grid(True, alpha=0.3)

# 2. Bar chart: one bar per distinct rating value
rating_counts = ratings['rating'].value_counts().sort_index()
axes[0, 1].bar(rating_counts.index, rating_counts.values, color='coral', edgecolor='black')
axes[0, 1].set_xlabel('Rating')
axes[0, 1].set_ylabel('Count')
axes[0, 1].set_title('Rating Distribution (Bar Chart)')
axes[0, 1].grid(True, alpha=0.3, axis='y')

# 3. Boxplot
axes[1, 0].boxplot(ratings['rating'], vert=False)
axes[1, 0].set_xlabel('Rating')
axes[1, 0].set_title('Rating Distribution (Boxplot)')
axes[1, 0].grid(True, alpha=0.3)

# 4. Pie chart (rating ranges)
# Labels and colours are parallel lists: the i-th colour belongs to the i-th range.
range_labels = ['Low (0-2)', 'Medium (2-3)', 'Good (3-4)', 'Excellent (4-5)']
range_colors = ['red', 'yellow', 'lightgreen', 'green']
rating_ranges = pd.cut(ratings['rating'], bins=[0, 2, 3, 4, 5], labels=range_labels)
# value_counts() sorts by frequency by default, which would hand the colours
# to the wrong wedges (e.g. red to the most frequent range). sort=False keeps
# the categories in their defined Low -> Excellent order.
range_counts = rating_ranges.value_counts(sort=False)
axes[1, 1].pie(range_counts.values, labels=range_counts.index, autopct='%1.1f%%',
               startangle=90, colors=range_colors)
axes[1, 1].set_title('Rating Ranges Distribution')

plt.tight_layout()
plt.show()

## 4. Genre Analysis

In [None]:
for banner_line in ("=" * 70, "GENRE ANALYSIS", "=" * 70):
    print(banner_line)

# Split the pipe-separated genre strings into one row per (movie, genre)
all_genres = movies['genres'].str.split('|').explode()
genre_counts = all_genres.value_counts()

# Two panels: ranked bars for the top 10 and a donut for top 5 + the rest
fig, (ax_top, ax_donut) = plt.subplots(1, 2, figsize=(16, 6))

# Panel 1: top 10 genres, most frequent at the top, with count labels
top_10_genres = genre_counts.head(10)
row_positions = range(len(top_10_genres))
bar_colors = plt.cm.RdYlBu(np.linspace(0.2, 0.8, len(top_10_genres)))
genre_bars = ax_top.barh(row_positions, top_10_genres.values, color=bar_colors, edgecolor='black')
ax_top.set_yticks(row_positions)
ax_top.set_yticklabels(top_10_genres.index)
ax_top.set_xlabel('Number of Movies')
ax_top.set_title(f'Top 10 Genres (Total: {len(genre_counts)} unique genres)')
ax_top.invert_yaxis()
for rect, n_movies_in_genre in zip(genre_bars, top_10_genres.values):
    # Count label just right of the bar end
    ax_top.text(rect.get_width() + 10, rect.get_y() + rect.get_height()/2,
                f'{n_movies_in_genre:,}', va='center', fontsize=10)

# Panel 2: donut chart of the five largest genres plus an 'Others' bucket
genre_donut = pd.concat([
    genre_counts.head(5),
    pd.Series({'Others': genre_counts.iloc[5:].sum()}),
])
ax_donut.pie(genre_donut.values, labels=genre_donut.index,
             autopct='%1.1f%%', startangle=90,
             colors=plt.cm.Set2.colors, pctdistance=0.75)
# A white disc over the pie centre turns it into a donut
ax_donut.add_patch(plt.Circle((0, 0), 0.50, fc='white'))
ax_donut.set_title('Genre Proportion (Top 5 + Others)')

plt.tight_layout()
plt.show()

print(f"\nTotal unique genres: {len(genre_counts)}")
print(f"Most popular genre: {genre_counts.index[0]} ({genre_counts.values[0]:,} movies)")

In [None]:
# Visualize genre distribution: top-15 frequency bars and the number of
# genres attached to each movie.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# 1. Bar chart - Top genres (most frequent at the top)
top_genres = genre_counts.head(15)
axes[0].barh(range(len(top_genres)), top_genres.values, color='steelblue')
axes[0].set_yticks(range(len(top_genres)))
axes[0].set_yticklabels(top_genres.index)
axes[0].set_xlabel('Number of Movies')
axes[0].set_title(f'Top {len(top_genres)} Genres by Frequency')
axes[0].invert_yaxis()
axes[0].grid(True, alpha=0.3, axis='x')

# 2. Genre count distribution
# Drop movies without a genres value first: str.split leaves NaN in place and
# len(NaN) would raise TypeError. This matches the dropna() used for the
# genre word cloud later in the notebook.
genre_per_movie = movies['genres'].dropna().str.split('|').apply(len)
# One integer-wide bin per genre count, from 1 up to the observed maximum
axes[1].hist(genre_per_movie, bins=range(1, genre_per_movie.max()+2), edgecolor='black', color='teal')
axes[1].set_xlabel('Number of Genres per Movie')
axes[1].set_ylabel('Number of Movies')
axes[1].set_title('Distribution of Genre Count per Movie')
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

print(f"\nAverage genres per movie: {genre_per_movie.mean():.2f}")
print(f"Max genres per movie: {genre_per_movie.max()}")

## 5. Top Movies Analysis

In [None]:
print("=" * 70)
print("TOP MOVIES ANALYSIS")
print("=" * 70)

# Tunable settings for this cell, kept in one place so the filter, the panel
# sizes and the titles cannot drift apart.
min_ratings = 50   # minimum number of ratings for a movie to be ranked
top_n = 5          # number of movies requested per ranking panel
title_limit = 25   # maximum characters of a title shown on the y axis


def _shorten_title(title, limit=title_limit):
    """Return the title cut to `limit` characters, with '...' appended if it was longer."""
    title = str(title)  # tolerate non-string values such as NaN
    return f"{title[:limit]}..." if len(title) > limit else title


# Filter movies with at least `min_ratings` ratings
popular_movies = movies[movies['num_ratings'] >= min_ratings].copy()
print(f"\nMovies with ≥{min_ratings} ratings: {len(popular_movies)}")

# Visualize top movies overview
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# 1. Distribution of movies by rating count threshold
thresholds = [10, 20, 50, 100, 200, 500]
counts = [len(movies[movies['num_ratings'] >= t]) for t in thresholds]
axes[0].plot(thresholds, counts, marker='o', linewidth=2, markersize=10, color='#3498db')
axes[0].fill_between(thresholds, counts, alpha=0.3, color='#3498db')
for t, c in zip(thresholds, counts):
    axes[0].annotate(f'{c:,}', (t, c), textcoords="offset points", xytext=(0,10), ha='center')
axes[0].set_xlabel('Minimum Ratings Threshold')
axes[0].set_ylabel('Number of Movies')
axes[0].set_title('Movies by Rating Count Threshold')
axes[0].grid(True, alpha=0.3)

# 2. Highest rated movies (horizontal bar)
# nlargest may return fewer than `top_n` rows, so every size below is taken
# from the actual result instead of a hard-coded 5 (which made barh fail
# with a shape mismatch on small datasets).
top_5_rated = popular_movies.nlargest(top_n, 'avg_rating')
n_rated = len(top_5_rated)
colors = plt.cm.YlOrRd(np.linspace(0.4, 0.9, n_rated))
bars = axes[1].barh(range(n_rated), top_5_rated['avg_rating'].values, color=colors, edgecolor='black')
axes[1].set_yticks(range(n_rated))
axes[1].set_yticklabels([_shorten_title(t) for t in top_5_rated['title_clean'].values])
axes[1].set_xlabel('Average Rating')
axes[1].set_title(f'Top {n_rated} Highest Rated Movies (≥{min_ratings} ratings)')
axes[1].set_xlim(3.5, 5)
axes[1].invert_yaxis()
for bar, rating in zip(bars, top_5_rated['avg_rating'].values):
    axes[1].text(bar.get_width() + 0.02, bar.get_y() + bar.get_height()/2,
                 f'{rating:.2f}', va='center', fontsize=10, fontweight='bold')

# 3. Most rated movies (horizontal bar)
top_5_popular = popular_movies.nlargest(top_n, 'num_ratings')
n_popular = len(top_5_popular)
colors = plt.cm.Blues(np.linspace(0.4, 0.9, n_popular))
bars = axes[2].barh(range(n_popular), top_5_popular['num_ratings'].values, color=colors, edgecolor='black')
axes[2].set_yticks(range(n_popular))
axes[2].set_yticklabels([_shorten_title(t) for t in top_5_popular['title_clean'].values])
axes[2].set_xlabel('Number of Ratings')
axes[2].set_title(f'Top {n_popular} Most Rated Movies')
axes[2].invert_yaxis()
for bar, count in zip(bars, top_5_popular['num_ratings'].values):
    axes[2].text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
                 f'{count:,}', va='center', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Two stacked ranking charts built from `popular_movies`:
# the 10 best-rated and the 10 most-rated movies.
fig, (ax_rated, ax_popular) = plt.subplots(2, 1, figsize=(14, 10))

top_10_rated = popular_movies.nlargest(10, 'avg_rating')
top_10_popular = popular_movies.nlargest(10, 'num_ratings')

# (axes, data, value column, bar colour, x label, title) for each panel
ranking_panels = [
    (ax_rated, top_10_rated, 'avg_rating', 'gold',
     'Average Rating', 'Top 10 Highest Rated Movies (≥50 ratings)'),
    (ax_popular, top_10_popular, 'num_ratings', 'purple',
     'Number of Ratings', 'Top 10 Most Rated Movies'),
]
for ax, ranked, value_col, bar_color, x_label, heading in ranking_panels:
    row_positions = range(len(ranked))
    ax.barh(row_positions, ranked[value_col].values, color=bar_color)
    ax.set_yticks(row_positions)
    ax.set_yticklabels(ranked['title_clean'].values)
    ax.set_xlabel(x_label)
    ax.set_title(heading)
    ax.invert_yaxis()  # rank 1 at the top
    ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 6. Temporal Analysis

In [None]:
print("=" * 70)
print("TEMPORAL ANALYSIS")
print("=" * 70)

# Movies by year (rows without a year are excluded from all temporal plots)
movies_with_year = movies.dropna(subset=['year'])
movies_by_year = movies_with_year.groupby('year').size()

# Visualize temporal overview
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# 1. Year range visualization as timeline
year_min = int(movies_with_year['year'].min())
year_max = int(movies_with_year['year'].max())
year_span = year_max - year_min
axes[0].axhline(y=0, color='gray', linewidth=2)
axes[0].scatter([year_min, year_max], [0, 0], s=200, c=['#e74c3c', '#27ae60'], zorder=5, edgecolor='black')
axes[0].annotate(f'First: {year_min}', (year_min, 0), xytext=(year_min, 0.3), ha='center', fontsize=12, fontweight='bold',
                 arrowprops=dict(arrowstyle='->', color='#e74c3c'))
axes[0].annotate(f'Latest: {year_max}', (year_max, 0), xytext=(year_max, 0.3), ha='center', fontsize=12, fontweight='bold',
                 arrowprops=dict(arrowstyle='->', color='#27ae60'))
axes[0].set_xlim(year_min - 10, year_max + 10)
axes[0].set_ylim(-0.5, 0.8)
axes[0].set_title(f'Year Range: {year_span} years of movies')
axes[0].axis('off')  # the timeline is purely decorative, so hide ticks and spines
axes[0].text((year_min + year_max)/2, -0.3, f'Span: {year_span} years', ha='center', fontsize=14, style='italic')

# 2. Busiest years as horizontal bar
# nlargest returns fewer than 10 rows when the data covers fewer than 10
# distinct years, so sizes come from the result rather than a literal 10
# (the literal made barh fail with a shape mismatch).
top_10_years = movies_by_year.nlargest(10)
n_top_years = len(top_10_years)
colors = plt.cm.plasma(np.linspace(0.2, 0.8, n_top_years))
bars = axes[1].barh(range(n_top_years), top_10_years.values, color=colors, edgecolor='black')
axes[1].set_yticks(range(n_top_years))
axes[1].set_yticklabels([f'{int(y)}' for y in top_10_years.index])
axes[1].set_xlabel('Number of Movies')
axes[1].set_title(f'Top {n_top_years} Years by Movie Count')
axes[1].invert_yaxis()
for bar, count in zip(bars, top_10_years.values):
    axes[1].text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2,
                 f'{count}', va='center', fontsize=10)

# 3. Movies by decade as a bar chart (one 8-year-wide bar per decade value)
movies_by_decade = movies_with_year.groupby('decade').size()
decades = movies_by_decade.index.astype(int)
colors = plt.cm.Spectral(np.linspace(0.1, 0.9, len(decades)))
bars = axes[2].bar(decades, movies_by_decade.values, width=8, color=colors, edgecolor='black')
axes[2].set_xlabel('Decade')
axes[2].set_ylabel('Number of Movies')
axes[2].set_title('Movies Released by Decade')
axes[2].grid(True, alpha=0.3, axis='y')
for bar, count in zip(bars, movies_by_decade.values):
    # Only label decades with more than 50 movies to avoid clutter on tiny bars
    if count > 50:
        axes[2].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 5,
                     f'{count}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

print(f"\nYear range: {year_min} - {year_max}")
print(f"Peak year: {movies_by_year.idxmax():.0f} ({movies_by_year.max()} movies)")

In [None]:
# 2x2 grid of temporal views: per year, per decade, rating per decade, per era
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
ax_year, ax_decade, ax_rating, ax_era = axes.flat

# Top-left: releases per year as a filled line
ax_year.plot(movies_by_year.index, movies_by_year.values, color='darkblue', linewidth=2)
ax_year.fill_between(movies_by_year.index, movies_by_year.values, alpha=0.3)
ax_year.set_xlabel('Year')
ax_year.set_ylabel('Number of Movies')
ax_year.set_title('Movies Released Per Year')
ax_year.grid(True, alpha=0.3)

# Top-right: releases per decade
movies_by_decade = movies_with_year.groupby('decade').size()
ax_decade.bar(movies_by_decade.index, movies_by_decade.values,
              color='orange', edgecolor='black', width=8)
ax_decade.set_xlabel('Decade')
ax_decade.set_ylabel('Number of Movies')
ax_decade.set_title('Movies by Decade')
ax_decade.grid(True, alpha=0.3, axis='y')

# Bottom-left: mean of the per-movie average rating within each decade
rating_by_decade = movies_with_year.groupby('decade')['avg_rating'].mean()
ax_rating.plot(rating_by_decade.index, rating_by_decade.values,
               marker='o', color='green', linewidth=2, markersize=8)
ax_rating.set_xlabel('Decade')
ax_rating.set_ylabel('Average Rating')
ax_rating.set_title('Average Rating by Decade')
ax_rating.grid(True, alpha=0.3)

# Bottom-right: share of movies per era, eras ordered by label
movies_by_era = movies_with_year['era'].value_counts().sort_index()
ax_era.pie(movies_by_era.values, labels=movies_by_era.index, autopct='%1.1f%%', startangle=90)
ax_era.set_title('Movies by Era')

plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
for banner_line in ("=" * 70, "CORRELATION ANALYSIS", "=" * 70):
    print(banner_line)

# Correlate the numeric movie features, using only rows complete in all of them
numeric_cols = ['year', 'avg_rating', 'std_rating', 'num_ratings', 'popularity', 'movie_age', 'genres_count']
corr_data = movies[numeric_cols].dropna()
corr_matrix = corr_data.corr()

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Panels 1 and 2: every other feature's correlation with one target column.
# Negative bars are red; positive bars use the panel's own colour.
rating_corr = corr_matrix['avg_rating'].drop('avg_rating').sort_values()
pop_corr = corr_matrix['num_ratings'].drop('num_ratings').sort_values()
target_panels = [
    (axes[0], rating_corr, '#27ae60', 'Correlation with Average Rating'),
    (axes[1], pop_corr, '#3498db', 'Correlation with Number of Ratings'),
]
for ax, target_corr, positive_color, heading in target_panels:
    colors = ['#e74c3c' if r < 0 else positive_color for r in target_corr.values]
    bars = ax.barh(target_corr.index, target_corr.values, color=colors, edgecolor='black')
    ax.axvline(x=0, color='black', linewidth=1)
    ax.set_xlabel('Correlation Coefficient')
    ax.set_title(heading)
    ax.set_xlim(-1, 1)
    for rect, r in zip(bars, target_corr.values):
        # Place the label just beyond the bar tip, on the side the bar points to
        shift, align = (0.05, 'left') if r >= 0 else (-0.05, 'right')
        ax.text(r + shift, rect.get_y() + rect.get_height()/2,
                f'{r:.3f}', va='center', ha=align, fontsize=10)

# Panel 3: the five feature pairs with the largest absolute correlation.
# Only the strict upper triangle is visited so each pair appears once.
feature_names = corr_matrix.columns
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
corr_df = pd.DataFrame([
    {'pair': f"{feature_names[i]}\nvs\n{feature_names[j]}", 'corr': corr_matrix.iloc[i, j]}
    for i, j in zip(upper_rows, upper_cols)
])
corr_df['abs_corr'] = corr_df['corr'].abs()
top_corr = corr_df.nlargest(5, 'abs_corr')
colors = ['#e74c3c' if r < 0 else '#27ae60' for r in top_corr['corr'].values]
bars = axes[2].bar(range(5), top_corr['corr'].values, color=colors, edgecolor='black')
axes[2].set_xticks(range(5))
axes[2].set_xticklabels(top_corr['pair'].values, fontsize=8)
axes[2].axhline(y=0, color='black', linewidth=1)
axes[2].set_ylabel('Correlation Coefficient')
axes[2].set_title('Top 5 Strongest Correlations')
axes[2].set_ylim(-1, 1)

plt.tight_layout()
plt.show()

# rating_corr is sorted ascending: last entry is the largest, first the smallest
print("\nKey findings:")
print(f"  - Strongest positive: {rating_corr.index[-1]} ({rating_corr.values[-1]:.3f})")
print(f"  - Strongest negative: {rating_corr.index[0]} ({rating_corr.values[0]:.3f})")

In [None]:
# Draw the correlation matrix twice: in full, and as a lower triangle only
fig, (ax_full, ax_lower) = plt.subplots(1, 2, figsize=(16, 6))

# Options common to both heatmaps
annotation_opts = dict(annot=True, fmt=".2f", square=True, linewidths=1)

# Left: full matrix on a diverging palette centred at zero
sns.heatmap(corr_matrix, cmap='coolwarm', center=0,
            cbar_kws={"shrink": 0.8}, ax=ax_full, **annotation_opts)
ax_full.set_title('Correlation Heatmap')

# Right: hide the upper triangle and the diagonal so each pair is shown once
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, cmap='viridis',
            cbar_kws={"shrink": 0.8}, ax=ax_lower, **annotation_opts)
ax_lower.set_title('Correlation Heatmap (Lower Triangle)')

plt.tight_layout()
plt.show()

## 8. Scatter Plot Analysis

In [None]:
# Four scatter plots of per-movie features, one per cell of a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# (x column, y column, colour, x label, y label, title), in row-major grid order
scatter_specs = [
    ('num_ratings', 'avg_rating', 'blue',
     'Number of Ratings', 'Average Rating', 'Average Rating vs Number of Ratings'),
    ('year', 'avg_rating', 'green',
     'Year', 'Average Rating', 'Average Rating vs Year'),
    ('movie_age', 'num_ratings', 'red',
     'Movie Age (years)', 'Number of Ratings', 'Number of Ratings vs Movie Age'),
    ('genres_count', 'avg_rating', 'purple',
     'Number of Genres', 'Average Rating', 'Average Rating vs Number of Genres'),
]
for ax, (x_col, y_col, dot_color, x_label, y_label, heading) in zip(axes.flat, scatter_specs):
    ax.scatter(movies[x_col], movies[y_col], alpha=0.5, s=20, c=dot_color)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(heading)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Word Cloud Analysis

In [None]:
print("=" * 70)
print("WORD CLOUD GENERATION")
print("=" * 70)

# Generate word clouds: free-text titles on the left, genres on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# 1. Word cloud from titles (WordCloud tokenizes the joined text itself)
text_titles = ' '.join(movies['title_clean'].dropna().values)
wordcloud_titles = WordCloud(width=800, height=400, background_color='white',
                              colormap='viridis', max_words=100).generate(text_titles)
axes[0].imshow(wordcloud_titles, interpolation='bilinear')
axes[0].axis('off')
axes[0].set_title('Word Cloud from Movie Titles', fontsize=16)

# 2. Word cloud from genres
# Genres are a closed set of labels, not free text. Passing them through
# WordCloud.generate() would re-tokenize the joined string, which splits
# hyphenated genres such as "Sci-Fi" into separate words and invents bigrams
# from genres that merely sit next to each other. Counting the labels here
# and passing exact frequencies keeps every genre intact and sized by its
# true count.
genre_frequencies = (
    movies['genres']
    .dropna()
    .str.split('|')
    .explode()
    .value_counts()
    .to_dict()
)
wordcloud_genres = WordCloud(width=800, height=400, background_color='white',
                              colormap='plasma', max_words=50
                              ).generate_from_frequencies(genre_frequencies)
axes[1].imshow(wordcloud_genres, interpolation='bilinear')
axes[1].axis('off')
axes[1].set_title('Word Cloud from Genres', fontsize=16)

plt.tight_layout()
plt.show()

print("\nWord clouds generated successfully!")

## 10. Genre-specific Analysis

In [None]:
print("=" * 70)
print("GENRE-SPECIFIC ANALYSIS")
print("=" * 70)

# Get binary genre columns (every column whose name starts with 'is_')
genre_prefix = 'is_'
genre_cols = [col for col in movies.columns if col.startswith(genre_prefix)]
print(f"\nFound {len(genre_cols)} binary genre features")

# Average rating, movie count and total ratings per genre
genre_ratings = {}
for col in genre_cols:
    # Strip only the leading prefix; str.replace would also remove any later
    # 'is_' inside the name. The former `.replace('-', '-')` was a no-op and
    # has been dropped.
    # NOTE(review): it may have been meant to turn '_' into '-' or ' ';
    # confirm against the actual column names before adding that.
    genre_name = col[len(genre_prefix):].title()
    genre_movies = movies[movies[col] == 1]
    if len(genre_movies) > 0:
        genre_ratings[genre_name] = {
            'avg_rating': genre_movies['avg_rating'].mean(),
            'count': len(genre_movies),
            'total_ratings': genre_movies['num_ratings'].sum()
        }

# One row per genre, best average rating first
genre_df = pd.DataFrame(genre_ratings).T.sort_values('avg_rating', ascending=False)

# Visualize genre-specific analysis
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# 1. Average rating by genre (top and bottom 5)
# Sizes are taken from the data so the panel still works with fewer than ten
# genres; the bottom group is drawn from the rows after the top group so no
# genre appears twice.
n_top = min(5, len(genre_df))
top_5 = genre_df.head(n_top)
bottom_5 = genre_df.iloc[n_top:].tail(5)
combined = pd.concat([top_5, bottom_5])
colors = ['#27ae60'] * len(top_5) + ['#e74c3c'] * len(bottom_5)
bars = axes[0].barh(range(len(combined)), combined['avg_rating'].values, color=colors, edgecolor='black')
axes[0].set_yticks(range(len(combined)))
axes[0].set_yticklabels(combined.index)
# Dashed separator between the last "top" bar and the first "bottom" bar
axes[0].axhline(y=len(top_5) - 0.5, color='gray', linestyle='--', alpha=0.5)
axes[0].set_xlabel('Average Rating')
axes[0].set_title('Top 5 & Bottom 5 Genres by Rating')
axes[0].set_xlim(2.5, 4.5)
for bar, rating in zip(bars, combined['avg_rating'].values):
    axes[0].text(bar.get_width() + 0.02, bar.get_y() + bar.get_height()/2,
                 f'{rating:.2f}', va='center', fontsize=9)

# 2. Genre rating vs count scatter (bubble area scales with total ratings)
axes[1].scatter(genre_df['count'], genre_df['avg_rating'],
                s=genre_df['total_ratings']/1000, alpha=0.7, c=genre_df['avg_rating'],
                cmap='RdYlGn', edgecolors='black')
for idx, row in genre_df.iterrows():
    axes[1].annotate(idx, (row['count'], row['avg_rating']), fontsize=8, alpha=0.8,
                     xytext=(5, 5), textcoords='offset points')
axes[1].set_xlabel('Number of Movies')
axes[1].set_ylabel('Average Rating')
axes[1].set_title('Genre: Rating vs Movie Count\n(bubble size = total ratings)')
axes[1].grid(True, alpha=0.3)

# 3. Genre engagement (total ratings) - up to 10 genres
top_engagement = genre_df.nlargest(10, 'total_ratings')
n_engaged = len(top_engagement)
colors = plt.cm.Blues(np.linspace(0.3, 0.9, n_engaged))
bars = axes[2].barh(range(n_engaged), top_engagement['total_ratings'].values, color=colors, edgecolor='black')
axes[2].set_yticks(range(n_engaged))
axes[2].set_yticklabels(top_engagement.index)
axes[2].set_xlabel('Total Ratings Received')
axes[2].set_title(f'Top {n_engaged} Most Engaged Genres')
axes[2].invert_yaxis()
for bar, total in zip(bars, top_engagement['total_ratings'].values):
    # genre_df holds floats after the transpose, so cast before formatting
    axes[2].text(bar.get_width() + 100, bar.get_y() + bar.get_height()/2,
                 f'{int(total):,}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

print(f"\nHighest rated genre: {genre_df.index[0]} ({genre_df['avg_rating'].iloc[0]:.2f})")
print(f"Most movies genre: {genre_df['count'].idxmax()} ({int(genre_df['count'].max())} movies)")

In [None]:
# Side-by-side bar charts over all genres in `genre_df`:
# average rating on the left, number of movies on the right.
fig, (ax_rating, ax_count) = plt.subplots(1, 2, figsize=(16, 6))

sorted_genres = genre_df.sort_values('avg_rating', ascending=True)
sorted_by_count = genre_df.sort_values('count', ascending=True)

# (axes, sorted data, value column, bar colour, x label, title) per panel
genre_panels = [
    (ax_rating, sorted_genres, 'avg_rating', 'steelblue',
     'Average Rating', 'Average Rating by Genre'),
    (ax_count, sorted_by_count, 'count', 'coral',
     'Number of Movies', 'Movie Count by Genre'),
]
for ax, ordered, value_col, bar_color, x_label, heading in genre_panels:
    row_positions = range(len(ordered))
    # Ascending sort plus an un-inverted y axis puts the largest value at the top
    ax.barh(row_positions, ordered[value_col].values, color=bar_color)
    ax.set_yticks(row_positions)
    ax.set_yticklabels(ordered.index)
    ax.set_xlabel(x_label)
    ax.set_title(heading)
    ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 11. Summary & Insights

In [None]:
print("=" * 70)
print("EDA SUMMARY & KEY INSIGHTS")
print("=" * 70)

# This cell reuses results computed in earlier cells: genre_counts,
# genre_per_movie, movies_with_year, movies_by_year, movies_by_decade,
# corr_matrix and genre_df. Run those cells first.

# Create summary dashboard (2 rows x 3 columns)
fig = plt.figure(figsize=(18, 12))

# 1. Dataset Overview (top left)
ax1 = fig.add_subplot(2, 3, 1)
overview_data = ['Movies', 'Ratings', 'Genres', 'Years']
overview_values = [len(movies), len(ratings), len(genre_counts), int(movies_with_year['year'].max() - movies_with_year['year'].min())]
colors = ['#3498db', '#e74c3c', '#27ae60', '#f39c12']
bars = ax1.bar(overview_data, overview_values, color=colors, edgecolor='black')
ax1.set_yscale('log')  # the four quantities differ by orders of magnitude
ax1.set_title('Dataset Overview', fontsize=12, fontweight='bold')
ax1.set_ylabel('Count (log scale)')
for bar, val in zip(bars, overview_values):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height(), f'{val:,}',
             ha='center', va='bottom', fontsize=10, fontweight='bold')

# 2. Rating Distribution Summary (top center)
ax2 = fig.add_subplot(2, 3, 2)
rating_stats = [ratings['rating'].mean(), ratings['rating'].median(), ratings['rating'].mode()[0], ratings['rating'].std()]
stat_labels = ['Mean', 'Median', 'Mode', 'Std Dev']
colors = plt.cm.coolwarm(np.linspace(0.2, 0.8, 4))
bars = ax2.bar(stat_labels, rating_stats, color=colors, edgecolor='black')
ax2.set_title('Rating Statistics', fontsize=12, fontweight='bold')
ax2.set_ylabel('Value')
ax2.set_ylim(0, 5)
for bar, val in zip(bars, rating_stats):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1, f'{val:.2f}',
             ha='center', va='bottom', fontsize=11, fontweight='bold')

# 3. Top 5 Genres (top right); percentages are relative to these five only
ax3 = fig.add_subplot(2, 3, 3)
top_5_genres = genre_counts.head(5)
wedges, texts, autotexts = ax3.pie(top_5_genres.values, labels=top_5_genres.index,
                                    autopct='%1.1f%%', startangle=90,
                                    colors=plt.cm.Set2.colors[:5])
ax3.set_title('Top 5 Genres Distribution', fontsize=12, fontweight='bold')

# 4. Temporal Insights (bottom left)
ax4 = fig.add_subplot(2, 3, 4)
ax4.plot(movies_by_year.index, movies_by_year.values, color='#3498db', linewidth=2)
ax4.fill_between(movies_by_year.index, movies_by_year.values, alpha=0.3, color='#3498db')
peak_year = movies_by_year.idxmax()
ax4.axvline(x=peak_year, color='#e74c3c', linestyle='--', linewidth=2, label=f'Peak: {int(peak_year)}')
ax4.set_xlabel('Year')
ax4.set_ylabel('Number of Movies')
ax4.set_title('Movies Over Time', fontsize=12, fontweight='bold')
ax4.legend()
ax4.grid(True, alpha=0.3)

# 5. Rating vs Popularity (bottom center)
ax5 = fig.add_subplot(2, 3, 5)
ax5.scatter(movies['num_ratings'], movies['avg_rating'], alpha=0.4, s=15, c='#9b59b6')
ax5.set_xlabel('Number of Ratings')
ax5.set_ylabel('Average Rating')
ax5.set_title('Rating vs Popularity', fontsize=12, fontweight='bold')
ax5.grid(True, alpha=0.3)

# 6. Key Correlations (bottom right)
ax6 = fig.add_subplot(2, 3, 6)
rating_corr = corr_matrix['avg_rating'].drop('avg_rating').sort_values()
colors = ['#e74c3c' if x < 0 else '#27ae60' for x in rating_corr.values]
bars = ax6.barh(rating_corr.index, rating_corr.values, color=colors, edgecolor='black')
ax6.axvline(x=0, color='black', linewidth=1)
ax6.set_xlabel('Correlation with Rating')
ax6.set_title('Feature Correlations', fontsize=12, fontweight='bold')
# Keep the compact +/-0.5 view when it fits, but widen it (up to the valid
# +/-1 range) so a stronger correlation is never silently clipped.
corr_limit = min(1.0, max(0.5, float(rating_corr.abs().max()) + 0.05))
ax6.set_xlim(-corr_limit, corr_limit)

plt.suptitle('EDA SUMMARY DASHBOARD', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Describe the rating skewness from the data instead of asserting a fixed
# direction that may not hold for another dataset.
rating_skew = ratings['rating'].skew()
if pd.isna(rating_skew):
    skew_text = "has undetermined skewness"
elif abs(rating_skew) < 0.1:
    skew_text = f"is approximately symmetric (skewness: {rating_skew:.2f})"
else:
    skew_strength = 'slightly' if abs(rating_skew) < 1 else 'strongly'
    skew_direction = 'left' if rating_skew < 0 else 'right'
    skew_text = f"is {skew_strength} {skew_direction}-skewed (skewness: {rating_skew:.2f})"

# Print text summary
print("\n" + "=" * 70)
print("KEY INSIGHTS SUMMARY")
print("=" * 70)

print(f"""
DATASET OVERVIEW:
   • Total movies: {len(movies):,}
   • Total ratings: {len(ratings):,}
   • Unique genres: {len(genre_counts)}
   • Year range: {int(movies_with_year['year'].min())} - {int(movies_with_year['year'].max())}

RATING INSIGHTS:
   • Average rating: {ratings['rating'].mean():.2f}
   • Most common rating: {ratings['rating'].mode()[0]:.1f}
   • Rating distribution {skew_text}

GENRE INSIGHTS:
   • Most common: {genre_counts.index[0]} ({genre_counts.values[0]:,} movies)
   • Highest rated: {genre_df.index[0]} ({genre_df['avg_rating'].iloc[0]:.2f} avg)
   • Avg genres per movie: {genre_per_movie.mean():.2f}

TEMPORAL INSIGHTS:
   • Peak production year: {int(movies_by_year.idxmax())}
   • Most productive decade: {int(movies_by_decade.idxmax())}s

KEY CORRELATIONS:
   • Positive with rating: {rating_corr.index[-1]} ({rating_corr.values[-1]:.3f})
   • Negative with rating: {rating_corr.index[0]} ({rating_corr.values[0]:.3f})
""")

print("=" * 70)
print("EDA COMPLETED SUCCESSFULLY!")
print("=" * 70)