# Project 2: In-Depth Exploratory Data Analysis (EDA)

Netflix Content Analysis 🎬

Project Objective: To perform an in-depth exploratory data analysis of the Netflix dataset. We will explore trends in content production, identify popular genres, analyze content ratings, and understand the distribution of movies and TV shows on the platform. This project builds on foundational EDA by introducing time-series analysis and more complex data cleaning and transformation techniques.

Step 1: Setup - Importing Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Use seaborn's 'darkgrid' theme for the plots drawn after this point.
sns.set_style(style='darkgrid')

Step 2: Data Loading and Initial Inspection

In [None]:
# IPython shell escape: clone the datasets repository. The read_csv calls in
# this notebook expect the result at /content/Datasets (assumes the working
# directory is /content - TODO confirm for non-Colab environments).
!git clone "https://github.com/HarshvardhanSingh-13/Datasets"

In [None]:
# Load the raw Netflix titles CSV and preview its first five rows.
netflix_df = pd.read_csv(
    filepath_or_buffer='/content/Datasets/Netflix_Titles Dataset/netflix_titles.csv',
)
netflix_df.head(n=5)

In [None]:
# Print the column list with non-null counts and dtypes for the loaded frame.
netflix_df.info()

Step 3: Data Cleaning and Transformation

In [None]:
# Replace missing director / cast entries with the placeholder 'Unknown'.
netflix_df.fillna({'director': 'Unknown', 'cast': 'Unknown'}, inplace=True)

In [None]:
# Most frequent value of the country column (first one if several tie).
mode_country = netflix_df['country'].mode()[0]
# Keep existing countries; substitute the mode only where the value is missing.
netflix_df['country'] = netflix_df['country'].where(
    netflix_df['country'].notna(),
    mode_country,
)

In [None]:
# Drop, in place, every row that is missing either date_added or rating.
netflix_df.dropna(
    axis=0,
    how='any',
    subset=['date_added', 'rating'],
    inplace=True,
)

In [None]:
# Convert date_added to datetime. format='mixed' lets pandas infer the format
# of each value individually; dayfirst=False reads ambiguous dates month-first.
netflix_df['date_added'] = pd.to_datetime(
    netflix_df['date_added'],
    dayfirst=False,
    format='mixed',
)

In [None]:
# Derive the calendar year and month in which each title was added.
netflix_df['year_added'], netflix_df['month_added'] = (
    netflix_df['date_added'].dt.year,
    netflix_df['date_added'].dt.month,
)

In [None]:
# Report remaining nulls per column, then the dtype of every column.
print("Missing values after cleaning:", netflix_df.isna().sum(), sep="\n")
print("\nData types after transformation:", netflix_df.dtypes, sep="\n")

Step 4: Exploratory Data Analysis & Visualization

In [None]:
# Share of each content type (value of the 'type' column) as a pie chart.
type_counts = netflix_df['type'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
ax.pie(
    type_counts,
    labels=type_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['#e60023', '#221f1f'],
)
ax.set_title('Proportion of Movies vs. TV Shows')
ax.set_ylabel('')
plt.show()

In [None]:
# Count titles per (year_added, type) and pivot the types into columns.
# fill_value=0 fills year/type combinations with no titles and keeps the
# counts as integers (fillna(0) after unstack would turn them into floats).
content_over_time = (
    netflix_df.groupby(['year_added', 'type']).size().unstack(fill_value=0)
)

# Create the figure once and hand its axes to pandas. Calling plt.figure()
# and then DataFrame.plot() without ax= makes pandas open a second figure,
# which leaves the first one blank in the notebook output.
fig, ax = plt.subplots(figsize=(14, 8))
content_over_time.plot(kind='line', marker='o', ax=ax)
ax.set_title('Content Added to Netflix Over the Years (by Type)')
ax.set_xlabel('Year Added')
ax.set_ylabel('Number of Titles Added')
ax.legend(title='Content Type')
ax.grid(True)
plt.show()

In [None]:
# Show the first two rows of the cleaned frame before the genre analysis.
netflix_df.head(2)

In [None]:
# Build a long-form copy with one row per (title, genre): split the
# comma-separated listed_in string into a list, then explode that list.
genres = netflix_df.copy()
genres['genre'] = genres['listed_in'].str.split(', ')
genres = genres.explode('genre')

In [None]:
# Frequency table of genres as a two-column frame: genre, count.
top_genres_counts = (
    genres['genre']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)
# value_counts sorts descending, so the first 15 rows are the 15 most common.
top_genres_counts_plot = top_genres_counts.iloc[:15]

plt.figure(figsize=(12, 8))
genre_ax = sns.barplot(
    data=top_genres_counts_plot,
    x='count',
    y='genre',
    hue='genre',
    palette='mako',
    legend=False,
)
genre_ax.set(title='Top 15 Genres on Netflix', xlabel='Count', ylabel='Genre')
plt.show()

In [None]:
# Independent copies of the rows for each content type, so that columns can
# be added to them without touching netflix_df.
movies_df = netflix_df.loc[netflix_df['type'].eq('Movie')].copy()
tv_shows_df = netflix_df.loc[netflix_df['type'].eq('TV Show')].copy()

In [None]:
# Rows with no duration are removed first: a NaN cannot be cast by
# astype(int), and the earlier cleaning only dropped rows missing
# date_added or rating, so missing durations can still reach this point.
movies_df.dropna(subset=['duration'], inplace=True)
tv_shows_df.dropna(subset=['duration'], inplace=True)

# Take the leading run of digits from strings such as "90 min".
movies_df['duration_min'] = (
    movies_df['duration'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Same extraction covers both "1 Season" and "2 Seasons".
tv_shows_df['seasons'] = (
    tv_shows_df['duration'].str.extract(r'(\d+)', expand=False).astype(int)
)

In [None]:
# Side-by-side panels: movie runtimes (left) and TV season counts (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 7))
duration_ax, seasons_ax = axes

sns.histplot(
    data=movies_df,
    x='duration_min',
    bins=50,
    kde=True,
    color='skyblue',
    ax=duration_ax,
)
duration_ax.set_title('Movie Duration Distribution (minutes)')

# Bars are ordered by how often each season count occurs, most common first.
sns.countplot(
    data=tv_shows_df,
    x='seasons',
    hue='seasons',
    order=tv_shows_df['seasons'].value_counts().index,
    palette='rocket',
    legend=False,
    ax=seasons_ax,
)
seasons_ax.set_title('TV Show Season Distribution')

plt.show()

In [None]:
# Long-form copy with one row per (title, country): the comma-separated
# country string is split into a list and exploded.
countries = netflix_df.copy()
countries['country'] = countries['country'].str.split(', ')
countries = countries.explode('country')

In [None]:
# Frequency table of countries as a two-column frame: country, count.
top_countries_counts = (
    countries['country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)

In [None]:
# The table is already sorted by count, so its first 15 rows are the top 15.
top_countries_counts_plot = top_countries_counts.iloc[:15]

plt.figure(figsize=(12, 10))
country_ax = sns.barplot(
    data=top_countries_counts_plot,
    x='count',
    y='country',
    hue='country',
    palette='viridis',
    legend=False,
)
country_ax.set(
    title='Top 15 Content Producing Countries on Netflix',
    xlabel='Number of Titles',
    ylabel='Country',
)
plt.show()

In [None]:
# Show the first two rows again before plotting the rating distribution.
netflix_df.head(2)

In [None]:
# Count of titles per rating, bars ordered from most to least frequent.
plt.figure(figsize=(12, 8))
rating_ax = sns.countplot(
    data=netflix_df,
    x='rating',
    hue='rating',
    order=netflix_df['rating'].value_counts().index,
    palette='crest',
    legend=False,
)
rating_ax.set(
    title='Distribution of Content Ratings on Netflix',
    xlabel='Rating',
    ylabel='Count',
)
plt.xticks(rotation=45)
plt.show()

Step 5: Feature Engineering - Content Freshness

In [None]:
# Years between a title's release and the year it was added.
netflix_df['age_on_netflix'] = netflix_df['year_added'].sub(netflix_df['release_year'])

# Keep only rows where the title was added in or after its release year.
content_age = netflix_df.loc[netflix_df['age_on_netflix'].ge(0)]

plt.figure(figsize=(14, 7))
age_ax = sns.histplot(data=content_age, x='age_on_netflix', bins=50, kde=True)
age_ax.set(
    title='Distribution of Content Age When Added to Netflix',
    xlabel='Content Age (Years)',
    ylabel='Number of Titles',
)
plt.show()

Step 6: Deeper Multivariate Analysis

In [None]:
# Five most frequent genres across the exploded (title, genre) table.
top_genres = genres['genre'].value_counts().index[:5]

# Movie rows belonging to one of those genres. Rows with no duration are
# dropped because a NaN cannot be cast by astype(int) below.
is_top_genre_movie = (genres['type'] == 'Movie') & genres['genre'].isin(top_genres)
genres_movies = genres[is_top_genre_movie].dropna(subset=['duration']).copy()

# Take the leading run of digits from strings such as "90 min".
genres_movies['duration_min'] = (
    genres_movies['duration'].str.extract(r'(\d+)', expand=False).astype(int)
)

plt.figure(figsize=(15, 8))
sns.boxplot(data=genres_movies, x='genre', y='duration_min', palette='pastel', hue='genre', legend=False)
plt.title('Movie Duration by Top Genres')
plt.xlabel('Genre')
plt.ylabel('Duration (minutes)')
plt.xticks(rotation=45)
plt.show()

Step 7: Word Cloud from Content Descriptions

In [None]:
# Concatenate every description into one space-separated string.
# Series.str.cat omits missing values, whereas ' '.join(series) raises
# TypeError as soon as one description is NaN.
text = netflix_df['description'].str.cat(sep=' ')

wordcloud = WordCloud(width=800, height=400, background_color='black').generate(text)

plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the image has no meaningful axes
plt.title('Most Common Words in Netflix Content Descriptions', fontsize=20)
plt.show()

In [None]:
import pandas as pd

# Reload the raw data so this cell is independent of the cleaning above.
df = pd.read_csv('/content/Datasets/Netflix_Titles Dataset/netflix_titles.csv')
df['director'] = df['director'].fillna('Unknown')

# Ignore placeholder rows, split co-directed titles into individual names,
# and count how many titles each name appears on.
known_directors = df.loc[df['director'].ne('Unknown'), 'director']
top_directors = (
    known_directors
    .str.split(', ')
    .explode()
    .value_counts()
    .head(10)
)

print(top_directors)

In [None]:
import pandas as pd
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus, then keep the English list as a set for
# fast membership tests in get_ngrams.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Fresh copy of the raw data for the description n-gram analysis.
df = pd.read_csv(
    filepath_or_buffer='/content/Datasets/Netflix_Titles Dataset/netflix_titles.csv',
)

def get_ngrams(text, n, stopword_set=None):
    """Return the word n-grams of *text* after normalising and filtering it.

    The value is converted with ``str()``, lower-cased, stripped of every
    character that is not an ASCII letter, digit or whitespace (punctuation
    is deleted, not replaced by a space), split on whitespace and filtered
    against a stop-word collection. N-grams are built from the tokens that
    remain, so a pair can join two words that were separated by a removed
    stop word.

    Args:
        text: Value to tokenise; non-strings go through ``str()`` first.
        n: Number of consecutive tokens per n-gram.
        stopword_set: Optional collection of lower-case words to discard.
            When omitted, the module-level ``stop_words`` is used, which
            matches the previous behaviour.

    Returns:
        A list of space-joined n-grams in order of appearance. For ``n >= 1``
        the list is empty when fewer than *n* tokens survive filtering.
    """
    excluded = stop_words if stopword_set is None else stopword_set
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', str(text).lower())
    tokens = [word for word in cleaned.split() if word not in excluded]
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

# Single words: flatten the per-description unigram lists and count them.
all_words = [gram for text in df['description'] for gram in get_ngrams(text, 1)]
top_words = Counter(all_words).most_common(10)

# Adjacent word pairs.
all_pairs = [gram for text in df['description'] for gram in get_ngrams(text, 2)]
top_pairs = Counter(all_pairs).most_common(10)

# Three-word phrases.
all_phrases = [gram for text in df['description'] for gram in get_ngrams(text, 3)]
top_phrases = Counter(all_phrases).most_common(10)

print(top_words, top_pairs, top_phrases, sep="\n")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the raw data so this cell is independent of earlier cells.
df = pd.read_csv('/content/Datasets/Netflix_Titles Dataset/netflix_titles.csv')

# Strip surrounding whitespace before parsing, then keep only the year.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip())
df['year_added'] = df['date_added'].dt.year

# Titles per release year and titles per year added, as (key, count) frames.
release_trend = (
    df.groupby('release_year', as_index=False).size().rename(columns={'size': 'count'})
)
added_trend = (
    df.groupby('year_added', as_index=False).size().rename(columns={'size': 'count'})
)

# Only release years after 2000 are drawn for the release-year line.
recent_releases = release_trend[release_trend['release_year'] > 2000]

plt.figure(figsize=(12, 6))
sns.lineplot(data=recent_releases, x='release_year', y='count', label='Release Year')
trend_ax = sns.lineplot(data=added_trend, x='year_added', y='count', label='Year Added to Netflix')
trend_ax.set(
    title='Content Production: Release Year vs. Year Added',
    xlabel='Year',
    ylabel='Number of Titles',
)
plt.legend()
plt.savefig('production_trend.png')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the raw data so this cell is independent of earlier cells.
df = pd.read_csv('/content/Datasets/Netflix_Titles Dataset/netflix_titles.csv')

# Compare the spread of release years between the content types.
plt.figure(figsize=(10, 6))
box_ax = sns.boxplot(data=df, x='type', y='release_year')
box_ax.set(
    title='Distribution of Release Year by Content Type',
    xlabel='Type',
    ylabel='Release Year',
)
plt.savefig('age_relationship.png')

# Numeric summary (count, mean, std, quartiles, ...) of release_year per type.
stats = df.groupby('type')['release_year'].describe()
print(stats)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Reload and re-clean the data so this cell runs on its own.
netflix_df = pd.read_csv('/content/Datasets/Netflix_Titles Dataset/netflix_titles.csv')

netflix_df.dropna(subset=['date_added', 'rating'], inplace=True)
netflix_df['date_added'] = pd.to_datetime(
    netflix_df['date_added'].str.strip(),
    format='mixed',
)
netflix_df['year_added'] = netflix_df['date_added'].dt.year

# Restrict the analysis to the five most frequent ratings.
top_5_ratings = netflix_df['rating'].value_counts().nlargest(5).index
df_top_ratings = netflix_df[netflix_df['rating'].isin(top_5_ratings)]

# Rows: year added; columns: rating; cells: number of titles.
rating_evolution = pd.crosstab(
    index=df_top_ratings['year_added'],
    columns=df_top_ratings['rating'],
)

plt.figure(figsize=(14, 8))
evolution_ax = sns.lineplot(data=rating_evolution, dashes=False, marker='o')
evolution_ax.set_title('Change in Distribution of Top 5 Content Ratings Over Time', fontsize=16)
evolution_ax.set_xlabel('Year Added to Netflix', fontsize=12)
evolution_ax.set_ylabel('Number of Titles Added', fontsize=12)
# Place the legend outside the axes, to the right of the plot.
plt.legend(title='Rating', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

print(rating_evolution.tail())