# 🎬 Movie Recommendation System - Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
# Silence every warning so library notices do not clutter the cell output.
warnings.filterwarnings(action='ignore')

# Plot defaults shared by all later cells: seaborn's dark grid theme and a
# default figure size of (12, 6).
sns.set_style('darkgrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

In [None]:
# Load the two input tables from the local data directory.
movies_path = './data/movies.csv'
tags_path = './data/tags.csv'
movies = pd.read_csv(movies_path)
tags = pd.read_csv(tags_path)

# Report the dimensions of each table, then (after a blank line) their columns.
print(
    f"Movies shape: {movies.shape}",
    f"Tags shape: {tags.shape}",
    f"\nMovies columns: {movies.columns.tolist()}",
    f"Tags columns: {tags.columns.tolist()}",
    sep="\n",
)

## 2. Basic Dataset Info

In [None]:
# Structural overview of the movies table: schema, per-column null counts,
# number of fully duplicated rows, and a preview of the first rows.
print("=== MOVIES DATASET ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
movies.info()
print(f"\nMissing values:\n{movies.isnull().sum()}")
print(f"\nDuplicates: {movies.duplicated().sum()}")
print(f"\n{movies.head()}")

In [None]:
# Structural overview of the tags table: schema, per-column null counts,
# number of fully duplicated rows, and a preview of the first rows.
print("=== TAGS DATASET ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
tags.info()
print(f"\nMissing values:\n{tags.isnull().sum()}")
print(f"\nDuplicates: {tags.duplicated().sum()}")
print(f"\n{tags.head()}")

## 3. Genre Analysis

In [None]:
# Break each pipe-delimited 'genres' string into one row per (movie, genre)
# pair, then tally how many movies carry each genre.
genres_split = movies['genres'].str.split('|').explode()
genre_counts = genres_split.value_counts()

# Bar chart of the per-genre totals, drawn on an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(14, 6))
genre_counts.plot.bar(ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Movie Count by Genre', fontsize=16, fontweight='bold')
ax.set_xlabel('Genre', fontsize=12)
ax.set_ylabel('Number of Movies', fontsize=12)
# Slant the genre names so neighbouring labels do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
fig.savefig('genre_distribution.png', dpi=100, bbox_inches='tight')
plt.show()

top_ten_genres = genre_counts.head(10)
print(f"\nTop 10 Genres:\n{top_ten_genres}")

In [None]:
# Number of genres attached to each movie, stored as a new 'genre_count'
# column on the movies table (later cells read this column).
genre_lists = movies['genres'].str.split('|')
movies['genre_count'] = genre_lists.str.len()

# How many movies have 1, 2, 3, ... genres, ordered by genre count.
genre_count_distribution = movies['genre_count'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 6))
genre_count_distribution.plot.bar(ax=ax, color='coral', edgecolor='black')
ax.set_title('Distribution of Genres per Movie', fontsize=16, fontweight='bold')
ax.set_xlabel('Number of Genres', fontsize=12)
ax.set_ylabel('Number of Movies', fontsize=12)
fig.tight_layout()
fig.savefig('genres_per_movie.png', dpi=100, bbox_inches='tight')
plt.show()

mean_genre_count = movies['genre_count'].mean()
print(f"\nAverage genres per movie: {mean_genre_count:.2f}")

## 4. Tag Analysis

In [None]:
# The 20 most frequent tag strings, most frequent first.
# `tag_counts` is read again by the summary cell, so the name is kept.
tag_frequencies = tags['tag'].value_counts()
tag_counts = tag_frequencies.head(20)

# Horizontal bar chart of the top tags.
fig, ax = plt.subplots(figsize=(14, 6))
tag_counts.plot.barh(ax=ax, color='lightgreen', edgecolor='black')
ax.set_title('Top 20 Most Common Tags', fontsize=16, fontweight='bold')
ax.set_xlabel('Frequency', fontsize=12)
ax.set_ylabel('Tag', fontsize=12)
# barh draws the first entry at the bottom; flip the axis so the most
# frequent tag is at the top.
ax.invert_yaxis()
fig.tight_layout()
fig.savefig('top_tags.png', dpi=100, bbox_inches='tight')
plt.show()

In [None]:
# Number of tag rows per movieId. Only movies that appear in the tags table
# get an entry, so the statistics below cover tagged movies only.
tags_per_movie = tags.groupby('movieId').size()

# Histogram of how many tags each tagged movie received.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(tags_per_movie, bins=50, color='purple', edgecolor='black', alpha=0.7)
ax.set_title('Distribution of Tags per Movie', fontsize=16, fontweight='bold')
ax.set_xlabel('Number of Tags', fontsize=12)
ax.set_ylabel('Number of Movies', fontsize=12)
fig.tight_layout()
fig.savefig('tags_per_movie.png', dpi=100, bbox_inches='tight')
plt.show()

# Coverage figures: the untagged count is the total number of movies minus
# the number of distinct movieIds seen in the tags table.
tagged_total = len(tags_per_movie)
catalog_total = len(movies)
print(
    f"\nMovies with tags: {tagged_total}",
    f"Total movies: {catalog_total}",
    f"Movies without tags: {catalog_total - tagged_total}",
    f"Average tags per movie: {tags_per_movie.mean():.2f}",
    f"Max tags on a movie: {tags_per_movie.max()}",
    sep="\n",
)

In [None]:
# Tag word cloud: join every tag into one space-separated text and let
# WordCloud size each word by how often it occurs.
# Missing tags are dropped first: astype(str) would turn NaN into the literal
# word "nan", which would then appear in the cloud as if it were a real tag.
all_tags = ' '.join(tags['tag'].dropna().astype(str))
wordcloud = WordCloud(width=1200, height=600, background_color='white', colormap='viridis').generate(all_tags)

plt.figure(figsize=(14, 7))
plt.imshow(wordcloud, interpolation='bilinear')
# The cloud is an image, so axis ticks carry no meaning here.
plt.axis('off')
plt.title('Tag Word Cloud', fontsize=18, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('tag_wordcloud.png', dpi=100, bbox_inches='tight')
plt.show()

## 5. Genre-Tag Relationship

In [None]:
# Attach a per-movie tag count to the movies table. The left join keeps every
# movie, and movies with no rows in the tags table get a count of 0.
tag_counts_per_movie = tags.groupby('movieId').size().reset_index(name='tag_count')
movies_with_tags = movies.merge(tag_counts_per_movie, on='movieId', how='left')
movies_with_tags['tag_count'] = movies_with_tags['tag_count'].fillna(0)

# Tags by genre: one row per (movie, genre) pair carrying that movie's tag
# count. Built with a vectorised split + explode instead of a Python-level
# iterrows() loop; this is much faster on large tables and does not raise on
# a missing 'genres' value (such rows get a NaN genre, which the groupby
# below ignores).
genre_tag_df = (
    movies_with_tags[['genres', 'tag_count']]
    .assign(genre=lambda frame: frame['genres'].str.split('|'))
    .explode('genre')
    .loc[:, ['genre', 'tag_count']]
    .reset_index(drop=True)
)
# Mean tag count over the movies of each genre, highest first. A movie with
# several genres contributes to each of them.
genre_tag_avg = genre_tag_df.groupby('genre')['tag_count'].mean().sort_values(ascending=False)

plt.figure(figsize=(14, 6))
genre_tag_avg.plot(kind='bar', color='teal', edgecolor='black')
plt.title('Average Tags per Genre', fontsize=16, fontweight='bold')
plt.xlabel('Genre', fontsize=12)
plt.ylabel('Average Number of Tags', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('tags_by_genre.png', dpi=100, bbox_inches='tight')
plt.show()

## 6. Summary Statistics

In [None]:
# Recap of the headline numbers computed by the earlier cells. Relies on
# `movies`, `tags`, `genre_counts`, `tag_counts` and `tags_per_movie` already
# being defined in the notebook session.
n_movies = len(movies)

print("=== SUMMARY STATISTICS ===")
print(f"\nTotal Movies: {n_movies}")
print(f"Total Unique Genres: {len(genre_counts)}")
print(f"Total Tags: {len(tags)}")
print(f"Total Unique Tags: {tags['tag'].nunique()}")

# Tag coverage, as absolute counts and as a share of all movies.
n_tagged = len(tags_per_movie)
pct_tagged = n_tagged / n_movies * 100
print(f"Movies with Tags: {n_tagged} ({pct_tagged:.1f}%)")
n_untagged = n_movies - n_tagged
pct_untagged = n_untagged / n_movies * 100
print(f"Movies without Tags: {n_untagged} ({pct_untagged:.1f}%)")

# Both count series are sorted most-frequent-first, so position 0 is the top entry.
top_genre_name = genre_counts.index[0]
top_genre_total = genre_counts.iloc[0]
print(f"\nMost Popular Genre: {top_genre_name} ({top_genre_total} movies)")
top_tag_name = tag_counts.index[0]
top_tag_total = tag_counts.iloc[0]
print(f"Most Common Tag: '{top_tag_name}' ({top_tag_total} times)")

print(f"\nAverage Genres per Movie: {movies['genre_count'].mean():.2f}")
print(f"Average Tags per Movie (with tags): {tags_per_movie.mean():.2f}")