In [None]:
import numpy as np  
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Netflix catalogue from the CSV file located next to the notebook.
netflix_data = pd.read_csv("netflix_titles.csv")
# Preview only the first rows instead of rendering the entire frame.
netflix_data.head()




In [None]:
# 1. What is the total number of movies vs TV shows?

# Overall catalogue size (used for the annotation) and titles per content type.
total = len(netflix_data)
type_counts = netflix_data['type'].value_counts()

fig, ax = plt.subplots()
ax.pie(
    type_counts,
    labels=type_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
# Small note with the absolute number of titles behind the percentages.
ax.text(-1.2, 1.1, f'Total Titles: {total}', ha='left', va='top', fontsize=8)

ax.set_title('Distribution of Movies vs TV Shows')
plt.show()

In [None]:
# 2. What are the top 10 countries producing content for Netflix?
# NOTE(review): assumes `country` may hold several comma-separated countries
# for co-productions (the same layout the notebook relies on for `listed_in`
# and `cast`) - confirm against the CSV. Counting the raw strings would treat
# every distinct combination as its own "country", so each entry is split and
# every country is counted individually. If the column is single-valued the
# result is the same as a plain value_counts().
top_contries = (
    netflix_data["country"]
    .dropna()
    .str.split(",")
    .explode()
    .str.strip()
    .loc[lambda names: names != ""]  # ignore blanks left by stray commas
    .value_counts()
)
top_contries.head(10)

In [None]:
# 3. Which directors have the most titles on Netflix?
# Occurrences of each distinct `director` value, most frequent first.
top_directors = netflix_data.loc[:, "director"].value_counts()
top_directors.iloc[:10]


In [None]:
# Time-Based Analysis:
# 1. How has Netflix's content grown over the years (e.g., content added per year)?
# 2. Which year had the most releases added to Netflix?

content_added_date = netflix_data["date_added"]
# Drop missing dates and strip surrounding whitespace before parsing.
cleand_added_date = pd.to_datetime(content_added_date.dropna().str.strip())
# Number of titles added per calendar year, in chronological order.
year_added = cleand_added_date.dt.year.value_counts().sort_index()

# Question 1: growth of the catalogue over time.
fig, ax = plt.subplots()
year_added.plot(
    kind="line",
    color="red",
    marker="o",
    title="Content Added Per Year",
    xlabel="YEAR",
    ylabel="Content Added",
    ax=ax,
)
ax.tick_params(axis="x", labelrotation=45)
# Enable the grid explicitly; a bare plt.grid() toggles its current state.
ax.grid(True)
fig.tight_layout()
plt.show()

# Question 2: the year in which the most titles were added.
if not year_added.empty:
    busiest_year = year_added.idxmax()
    print(f"Year with the most titles added: {busiest_year} ({year_added.max()} titles)")


In [None]:
# Age Rating Insights:
# 1. What are the most common ratings given to shows?
# 2. How many shows are rated "TV-MA" vs "PG-13"?
# The frequency table below answers both: it is sorted from the most to the
# least common rating and lists every rating with its count.
ratings_netflix = netflix_data.loc[:, "rating"]
ratings_count = ratings_netflix.value_counts(sort=True, ascending=False)
ratings_count

In [None]:
# Country-Specific Analysis:
# 1. What are the top genres in the USA vs India vs the UK?
# 2. How many titles were added in 2020 from each country?
from collections import Counter

# Only the two columns needed for the per-country genre ranking.
country_genre = netflix_data.loc[:, ["country", "listed_in"]]


def genre_counts_by_country(df, country_name, top_n=5):
    """Return the most frequent genres among the titles of one country.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'country' column and a 'listed_in' column whose values
        are ', '-separated genre names.
    country_name : str
        Value that 'country' must equal exactly for a row to be included.
    top_n : int or None, default 5
        Number of (genre, count) pairs to return; None returns all of them.

    Returns
    -------
    list of (str, int)
        Genres with their number of occurrences, most frequent first.
        Empty when no row matches or no matching row has a genre listing.
    """
    # Keep only the rows recorded for exactly this country.
    country_rows = df[df['country'] == country_name]

    # Split every listing on its own rather than joining all listings into one
    # string first: joining an empty selection yields '' and ''.split(', ')
    # is [''], which used to be reported as a bogus ('', 1) genre.
    genre_counter = Counter(
        genre
        for listing in country_rows['listed_in'].dropna()
        for genre in listing.split(', ')
    )

    return genre_counter.most_common(top_n)

# Rank the genres once per market of interest.
usa_top_genres, uk_top_genres, india_top_genres = (
    genre_counts_by_country(country_genre, market)
    for market in ('United States', 'United Kingdom', 'India')
)

# Report the rankings in a fixed order: USA, UK, India.
for label, top_genres in (
    ("🇺🇸 United States:", usa_top_genres),
    ("🇬🇧 United Kingdom:", uk_top_genres),
    ("🇮🇳 India:", india_top_genres),
):
    print(label, top_genres)


In [None]:

# How many titles were added in 2020 from each country?
# Work on an explicit copy so the parsed dates are written to an independent
# frame rather than to a selection of `netflix_data`
# (avoids pandas' SettingWithCopyWarning).
new_df = netflix_data[["country", "date_added"]].copy()
# Strip surrounding whitespace before parsing, as the per-year analysis does
# for the same column; with errors='coerce' any value that fails to parse is
# silently turned into NaT and would drop out of the 2020 filter below.
new_df['date_added'] = pd.to_datetime(new_df['date_added'].str.strip(), errors='coerce')
df_2020 = new_df[new_df['date_added'].dt.year == 2020]
# dropna=False keeps titles without a recorded country as their own bucket.
country_counts_2020 = df_2020['country'].value_counts(dropna=False)

print(country_counts_2020)

In [None]:
# Advanced Filtering:
# 1. List all movies with a duration greater than 2 hours.

LONG_MOVIE_MINUTES = 120  # 2 hours

video_duration = netflix_data[["title", "duration", "type"]]
# Copy the filtered rows so the new column is added to an independent frame
# (avoids pandas' SettingWithCopyWarning).
movie_duration = video_duration[video_duration["type"] == "Movie"].copy()
# Take the first run of digits in `duration` as the running time in minutes;
# expand=False yields a Series instead of a one-column DataFrame, and float
# keeps rows without any digits as NaN.
movie_duration['duration_minutes'] = (
    movie_duration['duration'].str.extract(r'(\d+)', expand=False).astype(float)
)
long_movies = movie_duration[movie_duration['duration_minutes'] > LONG_MOVIE_MINUTES]
print(long_movies[['title', 'duration']])


In [None]:
# 2. Find all shows with the keyword "crime" in their description.
# The search runs on the description text, as the question asks; previously
# the genre column (`listed_in`) was searched instead.
# NOTE(review): assumes the CSV provides a `description` column - confirm
# with netflix_data.columns.
description_filter = netflix_data[["title", "listed_in", "description"]]
# Case-insensitive match; rows without a description count as non-matching.
crime_shows = description_filter[
    description_filter['description'].str.contains('crime', case=False, na=False)
]
crime_shows[['title', 'description']]


In [None]:
# 3. Which actors appear most frequently in the dataset?
from collections import Counter

casts = netflix_data["cast"]
# Titles without cast information are skipped; every remaining entry is
# split on ', ' into a list of names.
df_cast_clean = casts.dropna()
df_cast_clean = df_cast_clean.str.split(', ')

# Flatten the per-title name lists into one long list of appearances.
all_actors = []
for cast in df_cast_clean:
    all_actors.extend(cast)

actor_counts = Counter(all_actors)
# Single (name, appearances) pair for the most frequent name.
most_common_actor = actor_counts.most_common(1)
print(most_common_actor)

In [None]:
# Bar plot of content count by year.
# Titles per release year, in chronological order (rows without a year are dropped).
df_cleaned = netflix_data.dropna(subset=['release_year'])
year_counts = df_cleaned['release_year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(year_counts.index, year_counts.values, color='skyblue', edgecolor='black')
ax.set_xlabel('Release Year')
ax.set_ylabel('Number of Content')
ax.set_title('Number of Content Releases by Year')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Heatmap of content release by year and country.

# Keep rows that have both a release year and a country value.
df_cleaned = netflix_data.dropna(subset=['release_year', 'country'])

# Restrict to the ten most frequent `country` values.
top_countries = df_cleaned['country'].value_counts().head(10).index
in_top_ten = df_cleaned['country'].isin(top_countries)
df_top = df_cleaned[in_top_ten]

# Year x country table of title counts; combinations that never occur are 0.
pair_counts = df_top.groupby(['release_year', 'country']).size()
heatmap_data = pair_counts.unstack(fill_value=0)

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(heatmap_data, cmap='YlOrBr', linewidths=0.5, ax=ax)

ax.set_title('Content Releases by Year (Top 10 Countries)', fontsize=16)
ax.set_xlabel('Country', fontsize=12)
ax.set_ylabel('Release Year', fontsize=12)
# Rotate the existing x tick labels (the country names).
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Scatter plot of movie duration against release year.

# Movies that have both a duration and a release year; the explicit copy makes
# sure the new column below is added to an independent frame.
df_movies = (
    netflix_data[netflix_data['type'] == 'Movie']
    .dropna(subset=['duration', 'release_year'])
    .copy()
)
# Raw string for the pattern: '\d' inside a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions. expand=False
# returns a Series rather than a one-column DataFrame.
df_movies['duration_min'] = (
    df_movies['duration'].str.extract(r'(\d+)', expand=False).astype(float)
)

fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(df_movies['release_year'], df_movies['duration_min'], alpha=0.5, color='teal')
ax.set_title('Movie Duration vs. Release Year')
ax.set_xlabel('Release Year')
ax.set_ylabel('Duration (minutes)')
ax.grid(True)
fig.tight_layout()
plt.show()