# In [None]:
# ==========================================================
# Netflix Titles Dataset - Exploratory Data Analysis (EDA)
# Author: Samad Mehboob
# ==========================================================

# 1️⃣ Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Set plot style
plt.style.use('seaborn-v0_8')

# 2️⃣ Load Dataset
df = pd.read_csv("netflix_titles.csv")
print("✅ Dataset Loaded Successfully")
print("Shape of dataset:", df.shape)
# NOTE(review): `display` is not imported in the visible code — presumably the
# IPython/Jupyter builtin; confirm before running this as a plain script.
display(df.head())

# 3️⃣ Data Cleaning
print("\nMissing values before cleaning:\n", df.isnull().sum())

# Remove duplicates
df.drop_duplicates(inplace=True)

# Fill missing text values
fill_cols = ['country', 'director', 'cast', 'listed_in', 'rating', 'duration']
for col in fill_cols:
    if col in df.columns:
        # Assign the filled column back instead of calling
        # `df[col].fillna(..., inplace=True)`: the inplace form acts on a
        # temporary Series selected from the frame, which pandas warns about
        # and which leaves `df` untouched once Copy-on-Write is enabled.
        df[col] = df[col].fillna('Unknown')

# Convert release_year to numeric; unparseable entries become NaN
df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')

# Parse numeric duration
def parse_duration(x):
    """Extract the integer part of a duration label.

    Parameters
    ----------
    x : object
        A single duration value, e.g. ``"90 min"`` or ``"2 Seasons"``.
        Missing values (anything ``pd.isna`` reports as missing) are accepted.

    Returns
    -------
    int or float
        The integer found in a label containing ``"min"`` (the text left after
        removing ``"min"``) or ``"Season"`` (the first whitespace-separated
        token). ``np.nan`` when the value is missing, contains neither
        keyword, or its numeric part cannot be converted to an integer.
    """
    if pd.isna(x):
        return np.nan

    label = str(x)
    if 'min' in label:
        digits = label.replace('min', '').strip()
    elif 'Season' in label:
        digits = label.split()[0]
    else:
        return np.nan

    try:
        return int(digits)
    except ValueError:
        # A malformed label (e.g. "min", "1h 30 min", "Season 2") is treated
        # like any other unparseable value rather than aborting the caller.
        return np.nan

# Numeric part of each duration label, as extracted by parse_duration
# (NaN where no number could be parsed).
df['duration_num'] = df['duration'].apply(parse_duration)

print("\n✅ Cleaning complete.")
print("Shape after cleaning:", df.shape)
print("\nColumn Data Types:\n", df.dtypes)

# 4️⃣ Basic Exploration
print("\n📋 Data Info:")
df.info()

print("\n📊 Summary Statistics:")
# Transposed so each column of `df` becomes a row; only the first 10 are shown.
display(df.describe(include='all').T.head(10))

print("\n🎬 Type Counts:\n", df['type'].value_counts())

# 5️⃣ Helper Function for Split Columns
def split_and_count(series, top_n=10):
    """Rank the comma-separated tokens found in a Series by frequency.

    Every non-missing cell is converted to ``str`` and split on commas; each
    piece is stripped of surrounding whitespace and empty pieces are skipped.

    Parameters
    ----------
    series : pandas.Series
        Cells holding comma-separated values.
    top_n : int, default 10
        Number of leading entries to keep.

    Returns
    -------
    pandas.Series
        Token -> occurrence count, sorted in descending order of count and
        truncated to the first ``top_n`` rows.
    """
    stripped_pieces = (
        piece.strip()
        for cell in series.dropna()
        for piece in str(cell).split(',')
    )
    tally = Counter(token for token in stripped_pieces if token)
    ranked = pd.Series(tally).sort_values(ascending=False)
    return ranked.head(top_n)

# 'Unknown' is the placeholder the cleaning step writes into missing cells.
# Drop those rows before counting so the placeholder is not ranked as if it
# were a real country, genre or director in the top-10 charts and summary.
top_countries = split_and_count(df.loc[df['country'] != 'Unknown', 'country'], 10)
top_genres = split_and_count(df.loc[df['listed_in'] != 'Unknown', 'listed_in'], 10)
top_directors = split_and_count(df.loc[df['director'] != 'Unknown', 'director'], 10)

# 6️⃣ Visual Exploratory Data Analysis

# --- 1. Movies vs TV Shows ---
# Bar chart of the number of rows per value of the 'type' column.
type_counts = df['type'].value_counts()
fig, ax = plt.subplots(figsize=(6, 4))
type_counts.plot(kind='bar', ax=ax, color=['#E50914', '#221f1f'])
ax.set_title("Count by Type (Movies vs TV Shows)")
ax.set_xlabel("Type")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# --- 2. Distribution of Release Year ---
# Histogram over the non-missing release years, cast to int.
release_years = df['release_year'].dropna().astype(int)
fig, ax = plt.subplots(figsize=(8, 4))
release_years.plot(kind='hist', ax=ax, bins=30, color='#FF6347')
ax.set_title("Distribution of Release Year")
ax.set_xlabel("Release Year")
ax.set_ylabel("Frequency")
fig.tight_layout()
plt.show()

# --- 3. Top 10 Countries / 4. Top 10 Genres / 5. Top 10 Directors ---
# The three ranking charts differ only in data, title and colour, so they are
# drawn from one table. Each ranking is reversed before plotting so that the
# largest bar ends up at the top of the horizontal bar chart.
top10_charts = [
    (top_countries, "Top 10 Countries by Number of Titles", '#008080'),
    (top_genres, "Top 10 Genres / Categories", '#9932CC'),
    (top_directors, "Top 10 Directors", '#FF8C00'),
]
for ranking, chart_title, bar_color in top10_charts:
    fig, ax = plt.subplots(figsize=(8, 5))
    ranking.iloc[::-1].plot(kind='barh', ax=ax, color=bar_color)
    ax.set_title(chart_title)
    ax.set_xlabel("Count")
    fig.tight_layout()
    plt.show()

# Row mask shared by the two movie-only charts below.
is_movie = df['type'] == 'Movie'

# --- 6. Duration Distribution (Movies) ---
# Histogram of the parsed duration for rows whose type is 'Movie'.
movie_minutes = df.loc[is_movie, 'duration_num'].dropna()
fig, ax = plt.subplots(figsize=(8, 4))
movie_minutes.plot(kind='hist', ax=ax, bins=20, color='#2E8B57')
ax.set_title("Distribution of Movie Durations")
ax.set_xlabel("Duration (minutes)")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# --- 7. Ratings Breakdown ---
# Pie chart of the 8 most frequent values in the 'rating' column.
top_ratings = df['rating'].value_counts().head(8)
fig, ax = plt.subplots(figsize=(6, 6))
top_ratings.plot(
    kind='pie',
    ax=ax,
    autopct='%1.1f%%',
    startangle=90,
    colors=plt.cm.tab10.colors,
)
ax.set_title("Top Ratings Breakdown")
ax.set_ylabel('')  # clear the y-axis label on the pie chart
fig.tight_layout()
plt.show()

# --- 8. Duration vs Release Year (Scatter) ---
# Only movies that have both a release year and a parsed duration are plotted.
movies = df[is_movie].dropna(subset=['release_year', 'duration_num'])
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(movies['release_year'], movies['duration_num'], alpha=0.5, color='#DC143C')
ax.set_title("Movie Duration vs Release Year")
ax.set_xlabel("Release Year")
ax.set_ylabel("Duration (minutes)")
fig.tight_layout()
plt.show()

# 7️⃣ Summary Insights
# Headline figures taken from the counts computed above; index[0] is the
# most frequent entry because the rankings are sorted in descending order.
print("📊 Key Insights:")
print("- Most content on Netflix is:", df['type'].value_counts().idxmax())
print("- Most popular country:", top_countries.index[0])
print("- Most frequent genre:", top_genres.index[0])
print("- Most common rating:", df['rating'].value_counts().idxmax())
print("- Total titles after cleaning:", len(df))
