# Netflix Movie & TV Shows Data Processing

In [1]:
from pandas_datareader import data, wb
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import kagglehub
import matplotlib.pyplot as plt
import plotly
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

### Data Cleaning

In [2]:
path = kagglehub.dataset_download("anandshaw2001/netflix-movies-and-tv-shows")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\lixin\.cache\kagglehub\datasets\anandshaw2001\netflix-movies-and-tv-shows\versions\1


In [3]:
import os

dataset_path = r"C:\Users\lixin\.cache\kagglehub\datasets\anandshaw2001\netflix-movies-and-tv-shows\versions\1"

files = os.listdir(dataset_path)
print(files)

['netflix_titles.csv']


In [4]:
file_path = os.path.join(dataset_path, "netflix_titles.csv")
df = pd.read_csv(file_path,encoding="ISO-8859-1")

In [5]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [7]:
df.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [8]:
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [9]:
df.update(df[['director', 'cast', 'country', 'rating']].fillna('Unknown'))
df.update(df[['date_added']].fillna(df['release_year']))
df.update(df[['duration']].fillna(df['duration'].mode()[0]))

In [10]:
df['date_added'] = df['date_added'].fillna(df['release_year'].astype(str))
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
df['year_month'] = df['date_added'].dt.to_period('M')

In [11]:
df['date_added'] = pd.to_datetime(df['date_added'])
df['year_month'] = df['date_added'].dt.to_period('M')

In [12]:
df_movies = df[df['type'] == 'Movie']
df_shows = df[df['type'] == 'TV Show'].copy()

df_shows["duration"] = df_shows["duration"].astype(str)

df_shows.loc[:, "seasons"] = df_shows["duration"].str.extract(r'(\d+)').astype(float)

df.loc[df["type"] == "Movie", "duration"] = df_movies["duration"]
df.loc[df["type"] == "TV Show", "duration"] = df_shows["seasons"]

df["duration"] = pd.to_numeric(df["duration"], errors="coerce")

df.loc[df["type"] == "Movie", "duration_type"] = "Minutes"
df.loc[df["type"] == "TV Show", "duration_type"] = "Seasons"

df["duration_type"] = df["duration_type"].astype(str)

In [13]:
df = df[['show_id', 'type', 'title', 'director', 'cast', 'country',
         'date_added', 'release_year', 'rating', 'duration', 'duration_type',
         'listed_in', 'description', 'year_month']]

In [14]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,duration_type,listed_in,description,year_month
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,2021-09-25,2020,PG-13,,Minutes,Documentaries,"As her father nears the end of his life, filmm...",2021-09
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2.0,Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021-09
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,2021-09-24,2021,TV-MA,1.0,Seasons,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021-09
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,2021-09-24,2021,TV-MA,1.0,Seasons,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021-09
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2.0,Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021-09


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   show_id        8807 non-null   object        
 1   type           8807 non-null   object        
 2   title          8807 non-null   object        
 3   director       8807 non-null   object        
 4   cast           8807 non-null   object        
 5   country        8807 non-null   object        
 6   date_added     8709 non-null   datetime64[ns]
 7   release_year   8807 non-null   int64         
 8   rating         8807 non-null   object        
 9   duration       2676 non-null   float64       
 10  duration_type  8807 non-null   object        
 11  listed_in      8807 non-null   object        
 12  description    8807 non-null   object        
 13  year_month     8709 non-null   period[M]     
dtypes: datetime64[ns](1), float64(1), int64(1), object(10), period[M](1)
mem

In [16]:
df.isnull().sum()

show_id             0
type                0
title               0
director            0
cast                0
country             0
date_added         98
release_year        0
rating              0
duration         6131
duration_type       0
listed_in           0
description         0
year_month         98
dtype: int64

In [17]:
df['date_added'] = df['date_added'].fillna(df['release_year'].astype(str) + "-01-01")
df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')

In [18]:
movie_duration_mean = df[df['type'] == 'Movie']['duration'].mean()
tv_show_seasons_mode = df[df['type'] == 'TV Show']['duration'].mode()[0]

df.loc[df['type'] == 'Movie', 'duration'] = df.loc[df['type'] == 'Movie', 'duration'].fillna(movie_duration_mean)
df.loc[df['type'] == 'TV Show', 'duration'] = df.loc[df['type'] == 'TV Show', 'duration'].fillna(tv_show_seasons_mode)

In [19]:
df.to_csv("cleaned_Netflix_Shows.csv", index=False)

### EDA

In [20]:
fig_release_year = px.histogram(df, x='release_year', nbins=30, title="Number of Netflix Titles Released Each Year", labels={'release_year': "Release Year"})
fig_release_year.show()

In [21]:
df["year_month"] = df["year_month"].astype(str)

fig_date_added = px.line(df.groupby("year_month")["show_id"].count().reset_index(), x="year_month", y="show_id", title="Netflix Titles Added Over Time", labels={"year_month": "Year-Month", "show_id": "Number of Titles"},markers=True)

fig_date_added.update_xaxes(tickangle=45)
fig_date_added.show()

In [22]:
fig = px.pie(df, names='type', title="Proportion of Movies vs TV Shows on Netflix", hole=0.3, color_discrete_sequence=px.colors.qualitative.Set2)

fig.update_traces(textinfo='percent+label')
fig.show()

In [42]:
from collections import Counter

all_genres = ",".join(df['listed_in'].dropna()).split(", ")
genre_counts = Counter(all_genres)
genre_df = pd.DataFrame(genre_counts.items(), columns=['Genre', 'Count']).sort_values(by='Count', ascending=False)

fig = px.bar(genre_df.head(10), x='Count', y='Genre', title="Top 10 Most Common Netflix Content Categories", labels={'Count': "Number of Titles", 'Genre': "Category"}, orientation='h')
fig.show()

In [43]:
country_count = df['country'].value_counts().reset_index().head(10)
country_count.columns = ['Country', 'Count']

fig = px.bar(country_count, x='Count', y='Country', title="Top 10 Countries with Most Netflix Titles", labels={'Count': "Number of Titles", 'Country': "Country"}, orientation='h')

fig.show()

In [44]:
fig = px.histogram(df[df['type'] == 'Movie'], x='duration', title="Distribution of Movie Durations on Netflix", labels={'duration': "Minutes"}, nbins=30)
fig.update_traces(marker=dict(line=dict(color="white", width=1)))
fig.show()

In [45]:
fig = px.histogram(df[df['type'] == 'TV Show'], x='duration', title="Distribution of TV Show Durations on Netflix", labels={'duration': ""}, nbins=30)
fig.update_traces(marker=dict(line=dict(color="white", width=1)))
fig.show()