In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
%matplotlib inline

# Set style for better visualizations
plt.style.use('seaborn')
sns.set_palette("deep")
plt.rcParams['figure.figsize'] = [12, 7]

# Load the data
df = pd.read_csv('netflix_titles.csv')

# Initial data cleaning
df['date_added'] = pd.to_datetime(df['date_added'])
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month
df['country'] = df['country'].fillna('Unknown')
df['duration'] = df['duration'].fillna('No Data')
df['cast'] = df['cast'].fillna('No Cast Listed')
df['director'] = df['director'].fillna('No Director Listed')

# Create primary country (first listed country for each show)
df['primary_country'] = df['country'].str.split(',').str[0].str.strip()

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

In [4]:
# Content Type Analysis
plt.figure(figsize=(10, 6))
content_dist = df['type'].value_counts()
plt.pie(content_dist, labels=content_dist.index, autopct='%1.1f%%', colors=['#E50914', '#221F1F'])
plt.title('Netflix Content Distribution: Movies vs TV Shows')
plt.show()

# Print summary
print("\nContent Distribution Summary:")
print("-" * 30)
for content_type, count in content_dist.items():
    print(f"{content_type}: {count:,} ({count/len(df)*100:.1f}%)")

NameError: name 'df' is not defined

<Figure size 1000x600 with 0 Axes>

In [6]:
# Yearly content addition analysis
yearly_content = df.groupby(['year_added', 'type']).size().unstack()
plt.figure(figsize=(12, 6))
yearly_content.plot(kind='bar', stacked=True)
plt.title('Netflix Content Addition Over Years')
plt.xlabel('Year')
plt.ylabel('Number of Titles Added')
plt.legend(title='Content Type')
plt.xticks(rotation=45)
plt.show()

# Print growth stats
print("\nContent Growth Summary:")
print("-" * 30)
print(f"Peak Year: {yearly_content.sum(axis=1).idxmax()} with {yearly_content.sum(axis=1).max():,} titles")

NameError: name 'df' is not defined

In [8]:
# Top 10 countries by content production
plt.figure(figsize=(12, 6))
top_countries = df['primary_country'].value_counts().head(10)
sns.barplot(x=top_countries.values, y=top_countries.index)
plt.title('Top 10 Countries by Content Production')
plt.xlabel('Number of Titles')
plt.show()

# Print country stats
print("\nGeographic Distribution Summary:")
print("-" * 30)
print("Top 5 Content Producing Countries:")
for country, count in top_countries.head().items():
    print(f"{country}: {count:,} titles")

NameError: name 'df' is not defined

<Figure size 1200x600 with 0 Axes>

In [10]:
# Split genres (listed_in) and get distribution
genres = df['listed_in'].str.split(',', expand=True).stack().str.strip()
top_genres = genres.value_counts().head(10)

plt.figure(figsize=(12, 6))
sns.barplot(x=top_genres.values, y=top_genres.index, palette='Reds_r')
plt.title('Top 10 Netflix Genres')
plt.xlabel('Number of Titles')
plt.show()

# Print genre insights
print("\nGenre Distribution Summary:")
print("-" * 30)
print("Top 5 Genres:")
for genre, count in top_genres.head().items():
    print(f"{genre}: {count:,} titles")

NameError: name 'df' is not defined

In [12]:
# Separate movies and TV shows for duration analysis
# Movies (convert to numeric)
movies_duration = df[df['type'] == 'Movie']['duration'].str.extract('(\d+)').astype(float)
tv_duration = df[df['type'] == 'TV Show']['duration'].str.extract('(\d+)').astype(float)

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.hist(movies_duration, bins=30, color='#E50914', alpha=0.7)
plt.title('Movie Duration Distribution (minutes)')
plt.xlabel('Duration (minutes)')

plt.subplot(1, 2, 2)
plt.hist(tv_duration, bins=20, color='#221F1F', alpha=0.7)
plt.title('TV Show Duration Distribution (seasons)')
plt.xlabel('Number of Seasons')
plt.tight_layout()
plt.show()

# Print duration insights
print("\nDuration Analysis:")
print("-" * 30)
print(f"Average Movie Duration: {movies_duration.mean():.1f} minutes")
print(f"Average TV Show Duration: {tv_duration.mean():.1f} seasons")

NameError: name 'df' is not defined

In [14]:
# Monthly release patterns
plt.figure(figsize=(12, 6))
monthly_releases = df.groupby('month_added')['type'].count()
plt.plot(monthly_releases.index, monthly_releases.values, marker='o', linewidth=2, color='#E50914')
plt.title('Content Release Pattern by Month')
plt.xlabel('Month')
plt.ylabel('Number of Releases')
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.grid(True, alpha=0.3)
plt.show()

# Print seasonal insights
print("\nRelease Pattern Analysis:")
print("-" * 30)
peak_month = monthly_releases.idxmax()
print(f"Peak Release Month: {pd.datetime(2020, peak_month, 1).strftime('%B')} with {monthly_releases.max():,} releases")

NameError: name 'df' is not defined

<Figure size 1200x600 with 0 Axes>

In [16]:
# Rating analysis with content type breakdown
plt.figure(figsize=(12, 6))
rating_by_type = pd.crosstab(df['rating'], df['type'])
rating_by_type.plot(kind='bar', stacked=True)
plt.title('Content Rating Distribution by Type')
plt.xlabel('Rating')
plt.ylabel('Number of Titles')
plt.legend(title='Content Type')
plt.xticks(rotation=45)
plt.show()

# Print rating insights
print("\nRating Distribution Summary:")
print("-" * 30)
top_ratings = df['rating'].value_counts().head(3)
print("Top 3 Content Ratings:")
for rating, count in top_ratings.items():
    print(f"{rating}: {count:,} titles ({count/len(df)*100:.1f}%)")

NameError: name 'df' is not defined

<Figure size 1200x600 with 0 Axes>

In [18]:
print("\nNetflix Content Analysis Summary")
print("=" * 40)
print(f"Total Titles: {len(df):,}")
print(f"Time Period: {df['year_added'].min()} to {df['year_added'].max()}")
print(f"Number of Countries: {df['primary_country'].nunique():,}")
print(f"Number of Genres: {genres.nunique():,}")
print(f"Content Added in Last Year: {len(df[df['year_added'] == df['year_added'].max()]):,}")


Netflix Content Analysis Summary


NameError: name 'df' is not defined

In [20]:
# Netflix brand colors and custom style setup
NETFLIX_COLORS = {
    'red': '#E50914',
    'black': '#221F1F',
    'grey': '#F5F5F1',
    'dark_grey': '#4A4A4A'
}

# Custom style function
def netflix_style():
    plt.style.use('dark_background')
    plt.rcParams['figure.facecolor'] = NETFLIX_COLORS['black']
    plt.rcParams['axes.facecolor'] = NETFLIX_COLORS['black']
    plt.rcParams['text.color'] = NETFLIX_COLORS['grey']
    plt.rcParams['axes.labelcolor'] = NETFLIX_COLORS['grey']
    plt.rcParams['xtick.color'] = NETFLIX_COLORS['grey']
    plt.rcParams['ytick.color'] = NETFLIX_COLORS['grey']

In [22]:
def create_netflix_dashboard():
    netflix_style()
    fig = plt.figure(figsize=(20, 12))
    
    # Content Growth Over Time with Monthly Pattern
    ax1 = plt.subplot2grid((2, 3), (0, 0), colspan=2)
    yearly_data = df.groupby(['year_added', 'type']).size().unstack()
    yearly_data.plot(kind='area', stacked=True, ax=ax1, 
                    color=[NETFLIX_COLORS['red'], NETFLIX_COLORS['grey']], alpha=0.7)
    ax1.set_title('Netflix Content Growth', fontsize=14, pad=20)
    ax1.grid(color=NETFLIX_COLORS['dark_grey'], linestyle='--', alpha=0.2)
    
    # Genre Distribution (Circular)
    ax2 = plt.subplot2grid((2, 3), (0, 2))
    genres = df['listed_in'].str.split(',', expand=True).stack().str.strip()
    genre_counts = genres.value_counts().head(5)
    ax2.pie(genre_counts, labels=genre_counts.index, autopct='%1.1f%%', 
            colors=sns.color_palette("Reds", n_colors=5))
    ax2.set_title('Top 5 Genres', fontsize=14, pad=20)
    
    # Content Duration Analysis
    ax3 = plt.subplot2grid((2, 3), (1, 0))
    movies = df[df['type'] == 'Movie']
    movie_duration = movies['duration'].str.extract('(\d+)').astype(float)
    ax3.hist(movie_duration, bins=30, color=NETFLIX_COLORS['red'], alpha=0.7)
    ax3.set_title('Movie Duration Distribution', fontsize=14, pad=20)
    
    # Rating Distribution
    ax4 = plt.subplot2grid((2, 3), (1, 1))
    rating_counts = df['rating'].value_counts().head(6)
    ax4.bar(rating_counts.index, rating_counts.values, color=NETFLIX_COLORS['red'])
    ax4.set_title('Content Rating Distribution', fontsize=14, pad=20)
    plt.xticks(rotation=45)
    
    # Geographic Heat Map
    ax5 = plt.subp

In [24]:
def content_release_strategy():
    netflix_style()
    fig, ax = plt.subplots(figsize=(15, 8))
    
    # Create monthly release patterns by type
    monthly_by_type = df.pivot_table(
        index='month_added',
        columns='type',
        values='show_id',
        aggfunc='count'
    ).fillna(0)
    
    # Create radar chart
    angles = np.linspace(0, 2*np.pi, 12, endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))  # complete the circle
    
    # Close the plot
    monthly_by_type_closed = monthly_by_type.copy()
    monthly_by_type_closed.loc[13] = monthly_by_type_closed.loc[1]
    
    # Plot
    ax = plt.subplot(111, projection='polar')
    ax.plot(angles, monthly_by_type_closed['Movie'], 'o-', 
            linewidth=2, label='Movies', color=NETFLIX_COLORS['red'])
    ax.fill(angles, monthly_by_type_closed['Movie'], 
            alpha=0.25, color=NETFLIX_COLORS['red'])
    ax.plot(angles, monthly_by_type_closed['TV Show'], 'o-', 
            linewidth=2, label='TV Shows', color=NETFLIX_COLORS['grey'])
    ax.fill(angles, monthly_by_type_closed['TV Show'], 
            alpha=0.25, color=NETFLIX_COLORS['grey'])
    
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    ax.set_title('Content Release Strategy Throughout the Year', 
                 pad=20, color=NETFLIX_COLORS['red'])
    ax.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    
    return fig

In [26]:
def analyze_content_trends():
    """Analyze content trends over time with unique metrics"""
    # Calculate content diversity score
    df['release_recency'] = df['release_year'].max() - df['release_year']
    df['content_score'] = df.groupby('type')['release_recency'].transform(
        lambda x: 1 - (x - x.min()) / (x.max() - x.min())
    )
    
    print("\nNETFLIX CONTENT STRATEGY INSIGHTS")
    print("=" * 40)
    print(f"\n1. Content Balance")
    print(f"   Movies to TV Shows Ratio: {len(df[df['type']=='Movie'])/len(df[df['type']=='TV Show']):.2f}")
    
    print(f"\n2. Geographic Diversity")
    print(f"   Number of Countries: {df['primary_country'].nunique()}")
    print(f"   International Content: {len(df[df['primary_country']!='United States'])/len(df)*100:.1f}%")
    
    print(f"\n3. Content Freshness")
    print(f"   Average Content Age: {df['release_recency'].mean():.1f} years")
    print(f"   New Releases (Last 2 Years): {len(df[df['release_recency'] <= 2])/len(df)*100:.1f}%")
    
    return None