In [None]:
# Open Colab's file-upload widget; whatever files.upload() returns is kept in `uploaded`.
# NOTE(review): presumably used to provide netflix_titles.csv for the load cell — confirm.
from google.colab import files
uploaded = files.upload()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from collections import Counter
import os

In [None]:
# Read the Netflix titles CSV from the working directory and preview the first rows.
df=pd.read_csv('netflix_titles.csv')
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Count of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows flagged as duplicates of another row.
df.duplicated().sum()

In [None]:
# Data type of each column.
df.dtypes

# The Netflix dataset has 8807 rows and 12 columns in total.
# Several columns contain null values: director 2634, cast 825, country 831, date_added 10, rating 4, duration 3.
# The dataset contains no duplicated rows.
#  Handle missing values
# Fill missing director, cast, country, rating, and duration with "Unknown".
#Drop rows with missing date_added, release_year, or title.
#Parse date_added into a datetime object.
#Convert release_year to integer.
#Split listed_in into individual genres for analysis.
#Standardize type (e.g., "Movie" or "TV Show") and clean duration.



In [None]:
# Data cleaning
# Fill the text columns with a sentinel through one frame-level fillna call.
# Calling fillna(..., inplace=True) on a selected column (df[col]) is chained
# assignment: it emits a FutureWarning on pandas 2.x and no longer updates
# `df` once Copy-on-Write is in effect.
df = df.fillna({
    'director': 'Unknown',
    'cast': 'Unknown',
    'country': 'Unknown',
    'rating': 'Unknown',
    'duration': 'Unknown',
})
# Rows missing any of these key fields are removed before further processing.
df = df.dropna(subset=['date_added', 'release_year', 'title'])

In [None]:
# Convert data types
# A new frame is built with assign() instead of writing columns into the
# result of an earlier dropna(), which can raise SettingWithCopyWarning.
# The date text is stripped of surrounding whitespace before parsing: with
# errors='coerce', a padded string that does not match the parsed format
# becomes NaT and its row would then be dropped silently below.
# astype(str) keeps the cell re-runnable once the column is already datetime.
df = df.assign(
    release_year=df['release_year'].astype(int),
    date_added=pd.to_datetime(
        df['date_added'].astype(str).str.strip(), errors='coerce'
    ),
)
# Drop rows with invalid dates
df = df.dropna(subset=['date_added'])

In [None]:
# Clean and split genres
def _split_genres(raw):
    """Split a comma-separated genre string into a list of trimmed labels."""
    return [label.strip() for label in raw.split(',')]


# Replace each 'listed_in' string with its list of individual genre labels.
df['listed_in'] = df['listed_in'].apply(_split_genres)

In [None]:
# Remove any rows with invalid types
# .copy() detaches the filtered result from the frame it was selected from,
# so the later column assignment (df['year_added'] = ...) writes to an
# independent DataFrame and cannot raise SettingWithCopyWarning.
df = df[df['type'].isin(['Movie', 'TV Show'])].copy()

In [None]:
# Summary Statistics
print("\n=== Summary Statistics ===")

# Headline counts: overall size plus the Movie / TV Show split.
is_movie = df['type'] == 'Movie'
is_tv_show = df['type'] == 'TV Show'
total_titles = len(df)
movies_count = int(is_movie.sum())
tv_shows_count = int(is_tv_show.sum())

# Span of release years present in the frame.
release_years = df['release_year']
earliest_year = release_years.min()
latest_year = release_years.max()

In [None]:
print(f"Total Titles: {total_titles}")

# Share of all titles taken by each content type, in percent.
movies_pct = movies_count / total_titles * 100
print(f"Movies: {movies_count} ({movies_pct:.1f}%)")
tv_shows_pct = tv_shows_count / total_titles * 100
print(f"TV Shows: {tv_shows_count} ({tv_shows_pct:.1f}%)")

print(f"Release Year Range: {earliest_year} - {latest_year}")

In [None]:
# Top 5 Countries
# A title may list several comma-separated countries; each one is counted.
# Tokens that are empty after trimming (left by a leading, trailing or
# doubled comma) are skipped so a blank name is never tallied as a country.
# The 'Unknown' placeholder used for missing values is excluded as well.
country_counts = Counter(
    name
    for countries in df['country']
    for name in (part.strip() for part in countries.split(','))
    if name and name != 'Unknown'
)
top_countries = country_counts.most_common(5)
print("\nTop 5 Countries by Content:")
for country, count in top_countries:
    print(f"{country}: {count} titles")

In [None]:
# Top 5 Genres
# Flatten every row's collection of genre labels into one tally.
genre_counts = Counter(
    label
    for row_labels in df['listed_in']
    for label in row_labels
)
top_genres = genre_counts.most_common(5)
print("\nTop 5 Genres:")
for genre, count in top_genres:
    print(f"{genre}: {count} titles")

In [None]:
# Content Type Distribution (Bar Chart)
# One bar per value of 'type', drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='type', palette='viridis', ax=ax)
ax.set_title('Content Type Distribution')
ax.set_xlabel('Type')
ax.set_ylabel('Count')
plt.show()


In [None]:
# 2. Content Added Over Time (Line Chart)
# Calendar year of each title's 'date_added' value.
df['year_added'] = df['date_added'].dt.year
# Titles per year, sorted by year so the line runs left to right in time.
content_by_year = df['year_added'].value_counts().sort_index()
plt.figure(figsize=(10, 6))
plt.plot(content_by_year.index, content_by_year.values, marker='o', color='blue')
plt.title('Content Added Over Time')
plt.xlabel('Year Added')
plt.ylabel('Number of Titles')
plt.grid(True)
# Render explicitly, as the other chart cells do, so the figure is also shown
# when the code runs outside an inline notebook backend.
plt.show()

In [None]:
# Top 5 Countries (Pie Chart)
top_countries_dict = dict(top_countries)
# Every country outside the top five is folded into a single 'Other' slice.
other_count = sum(
    n for name, n in country_counts.items() if name not in top_countries_dict
)
top_countries_dict['Other'] = other_count
# Materialise the dict views as lists: a dict_values view is not a sequence,
# so it cannot be relied on to convert into the numeric array pie() builds.
labels = list(top_countries_dict.keys())
sizes = list(top_countries_dict.values())
plt.figure(figsize=(8, 8))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=sns.color_palette('pastel'))
plt.title('Top 5 Countries by Content')
# Render explicitly so the cell shows the chart rather than echoing the
# Text object returned by plt.title().
plt.show()


In [None]:
# Top 10 Genres (Bar Chart)
top_10_genres = genre_counts.most_common(10)
# Unzip the (genre, count) pairs into two parallel tuples.
genres, counts = zip(*top_10_genres)
# Horizontal bars: counts along x, genre labels along y.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=list(counts), y=list(genres), palette='magma', ax=ax)
ax.set_title('Top 10 Genres')
ax.set_xlabel('Count')
ax.set_ylabel('Genre')
plt.show()

In [None]:
#  Rating Distribution (Bar Chart)
# value_counts() orders ratings from most to least frequent.
rating_counts = df['rating'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=rating_counts.values,
    y=rating_counts.index,
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Rating Distribution')
ax.set_xlabel('Count')
ax.set_ylabel('Rating')
plt.show()

In [None]:
# Interesting Fact
# NOTE(review): this paragraph is a fixed string; none of its years or claims
# are computed from `df`, so it must be re-checked by hand if the data changes.
print("\n=== Interesting Fact ===")
print("The dataset shows a significant surge in content added between 2016 and 2019 with a peak around 2018. This aligns with Netflix global expansion and increased investment in original content marking a shift from licensing older movies to producing new TV shows.")

# Conclusion
# Only the title count and the movie percentage are interpolated from the
# summary variables; the remaining statements are hard-coded text.
print("\n=== Conclusion ===")
print(f"The Netflix Titles dataset, with {total_titles} titles, reveals a library dominated by movies ({movies_count/total_titles*100:.1f}%) over TV shows. The United States leads in content production, followed by countries like India and the UK. Genres like Dramas and International Movies are prevalent, reflecting Netflix's global appeal. The rating distribution shows a focus on mature audiences (e.g., TV-MA). The surge in content additions during 2016–2019 highlights Netflix's strategic pivot toward original content creation. These insights underscore Netflix's evolution into a global streaming giant catering to diverse viewer preferences.")


Loading Netflix Titles dataset

Cleaning data...

---Summary Statistics
Total Titles: 8807
Movies: 6131 (69.6%)
TV Shows: 2676 (30.4%)
Release Year Range: 1925 - 2021

---Top 5 Countries by Content:
United States: 2818 titles
India: 972 titles
United Kingdom: 419 titles
Japan: 245 titles
South Korea: 199 titles

---Top 5 Genres:
Dramas: 2427 titles
Comedies: 1674 titles
International Movies: 1351 titles
Documentaries: 869 titles
Action & Adventure: 859 titles

--- Interesting Fact ---
The dataset shows a significant surge in content added between 2016 and 2019, with a peak around 2018. This aligns with Netflix's global expansion and increased investment in original content, marking a shift from licensing older movies to producing new TV shows.

--- Conclusion ---
The Netflix Titles dataset, with 8807 titles, reveals a library dominated by movies (69.6%) over TV shows. The United States leads in content production, followed by countries like India and the UK. Genres like Dramas and International Movies are prevalent, reflecting Netflix's global appeal. The rating distribution shows a focus on mature audiences (e.g., TV-MA). The surge in content additions during 2016–2019 highlights Netflix's strategic pivot toward original content creation. These insights underscore Netflix's evolution into a global streaming giant catering to diverse viewer preferences.

In [None]:
# Preview the first rows of the frame in its current state.
df.head()