# Data Cleaning
Data Source: https://www.kaggle.com/datasets/shivamb/netflix-shows?resource=download


In [24]:
import pandas as pd

In [25]:
# Read the raw Netflix titles CSV into a DataFrame
CSV_PATH = "/content/netflix_titles.csv"
df = pd.read_csv(CSV_PATH)

In [26]:
# Preview the top of the frame for a first look (5 rows is also head()'s default)
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [27]:
# Dimensions of the dataset as a (row count, column count) tuple
df.shape

(8807, 12)

In [28]:
# Count rows that exactly repeat an earlier row; every column takes part in the comparison.
# NOTE(review): show_id looks like a per-row identifier - if it is unique, a full-row
# check cannot flag anything; consider duplicated(subset=...) on the content columns.
df.duplicated(keep='first').sum()

np.int64(0)

In [29]:
# Per column: does it contain at least one missing (NaN/None) value?
df.isna().any()

Unnamed: 0,0
show_id,False
type,False
title,False
director,True
cast,True
country,True
date_added,True
release_year,False
rating,True
duration,True


In [30]:
# Per column: how many entries are missing
df.isna().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,2634
cast,825
country,831
date_added,10
release_year,0
rating,4
duration,3


In [31]:
# Drop rows that lack any of the critical fields (date_added, rating, duration).
# The result is rebound to `df` rather than using inplace=True, so the step does not
# mutate the frame object in place and stays chainable; the surviving rows keep
# their original index labels.
df = df.dropna(subset=['date_added', 'rating', 'duration'])

In [32]:
# Recount missing entries per column now that the incomplete rows are gone
df.isna().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,2621
cast,825
country,829
date_added,0
release_year,0
rating,0
duration,0


In [33]:
# Replace missing director / cast / country entries with the placeholder text "Unknown".
# A per-column mapping limits the fill to exactly these columns.
columns_to_fill = ['director', 'cast', 'country']
df = df.fillna(value=dict.fromkeys(columns_to_fill, "Unknown"))

In [34]:
# Last null count: every column should report zero missing entries at this point
df.isna().sum()

Unnamed: 0,0
show_id,0
type,0
title,0
director,0
cast,0
country,0
date_added,0
release_year,0
rating,0
duration,0


In [35]:
# Dimensions (rows, columns) of the dataset after the rows with missing values were dropped
df.shape

(8790, 12)

In [36]:
# Break each 'duration' string apart at the space character; expand=True returns the
# pieces as separate columns (0 = leading value, 1 = trailing unit such as min/Season)
duration_split = df['duration'].str.split(pat=' ', expand=True)

In [37]:
# Attach the two pieces of the split to the main frame as new trailing columns
df = df.assign(
    duration_value=duration_split[0],
    duration_unit=duration_split[1],
)

In [38]:
# Show the first rows to confirm duration_value and duration_unit were added
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_value,duration_unit
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",90,min
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2,Seasons
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,1,Season
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",1,Season
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2,Seasons


In [39]:
# The raw 'duration' text is now covered by duration_value / duration_unit, so discard it.
# `columns=` already selects the column axis, so no separate axis argument is needed.
df = df.drop(columns='duration')

In [40]:
# Inspect the dtype of every column (duration_value has not been cast yet at this point)
df.dtypes

Unnamed: 0,0
show_id,object
type,object
title,object
director,object
cast,object
country,object
date_added,object
release_year,int64
rating,object
listed_in,object


In [41]:
# Cast the string-typed 'duration_value' column to integers so it can be used numerically;
# the column keeps its position and all other columns are untouched
df = df.assign(duration_value=lambda frame: frame['duration_value'].astype('int'))

In [42]:
# Print a technical summary of the DataFrame: index range, per-column non-null counts,
# dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8790 entries, 0 to 8806
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   show_id         8790 non-null   object
 1   type            8790 non-null   object
 2   title           8790 non-null   object
 3   director        8790 non-null   object
 4   cast            8790 non-null   object
 5   country         8790 non-null   object
 6   date_added      8790 non-null   object
 7   release_year    8790 non-null   int64 
 8   rating          8790 non-null   object
 9   listed_in       8790 non-null   object
 10  description     8790 non-null   object
 11  duration_value  8790 non-null   int64 
 12  duration_unit   8790 non-null   object
dtypes: int64(2), object(11)
memory usage: 961.4+ KB


In [43]:
# Preview the first rows of the cleaned frame before exporting
df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,listed_in,description,duration_value,duration_unit
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",2020,PG-13,Documentaries,"As her father nears the end of his life, filmm...",90,min
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2,Seasons
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unknown,"September 24, 2021",2021,TV-MA,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,1,Season
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,Unknown,"September 24, 2021",2021,TV-MA,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",1,Season
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2,Seasons


In [44]:
# Write the cleaned data to an Excel workbook for reporting or further analysis;
# index=False keeps the row index out of the sheet
OUTPUT_FILE = "Netflix_Titles_Cleaned_Data.xlsx"
df.to_excel(OUTPUT_FILE, index=False)