# Netflix Dataset Cleaning

In [1]:
# import libraries
from pathlib import Path

import numpy as np
import pandas as pd

# Read the raw Netflix titles CSV into a DataFrame
# (the path is relative, so it is resolved against the current working directory)
df_netflix = pd.read_csv('data/netflix_titles.csv')

In [2]:
# Preview the first five rows to sanity-check the load
df_netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Check for Duplicate / NULL Values

In [3]:
# Count rows that exactly repeat an earlier row (all columns are compared)
df_netflix.duplicated().sum()

0

In [4]:
# Number of missing values in each column
df_netflix.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [5]:
# Column dtypes and non-null counts before any cleaning
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


## Clean Data

In [6]:
# Rows missing either date_added or rating are few, so drop them outright
df_netflix.dropna(subset=['date_added', 'rating'], inplace=True)
# An unknown director or cast is represented by an empty string
text_columns = ['director', 'cast']
df_netflix[text_columns] = df_netflix[text_columns].fillna('')
# Re-check the non-null counts after the drops and fills
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8793 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8793 non-null   object
 1   type          8793 non-null   object
 2   title         8793 non-null   object
 3   director      8793 non-null   object
 4   cast          8793 non-null   object
 5   country       7964 non-null   object
 6   date_added    8793 non-null   object
 7   release_year  8793 non-null   int64 
 8   rating        8793 non-null   object
 9   duration      8790 non-null   object
 10  listed_in     8793 non-null   object
 11  description   8793 non-null   object
dtypes: int64(1), object(11)
memory usage: 893.0+ KB


In [7]:
# Fill the nulls that are still left after the drops above.
# country: impute with the most frequent value
df_netflix['country'] = df_netflix['country'].fillna(df_netflix['country'].mode()[0])
# date_added: rows with a null date_added were already dropped earlier in this notebook,
# so this fill is a no-op that is kept only as a safeguard
df_netflix['date_added'] = df_netflix['date_added'].fillna(df_netflix['date_added'].mode()[0])
# duration: some rows carry a runtime such as '74 min' in the rating column.
# NOTE(review): presumably these are the same rows whose duration is null - confirm against the raw data.
# Where duration is null and rating looks like a runtime, recover the real duration
# from rating instead of inventing one; rating itself is left untouched here.
runtime_in_rating = (
    df_netflix['duration'].isna()
    & df_netflix['rating'].str.contains(r'^\d+ min$', na=False)
)
df_netflix.loc[runtime_in_rating, 'duration'] = df_netflix.loc[runtime_in_rating, 'rating']
# Any duration that is still missing falls back to the most frequent value
df_netflix['duration'] = df_netflix['duration'].fillna(df_netflix['duration'].mode()[0])

In [8]:
# Re-inspect the non-null counts after the fills
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8793 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8793 non-null   object
 1   type          8793 non-null   object
 2   title         8793 non-null   object
 3   director      8793 non-null   object
 4   cast          8793 non-null   object
 5   country       8793 non-null   object
 6   date_added    8793 non-null   object
 7   release_year  8793 non-null   int64 
 8   rating        8793 non-null   object
 9   duration      8793 non-null   object
 10  listed_in     8793 non-null   object
 11  description   8793 non-null   object
dtypes: int64(1), object(11)
memory usage: 893.0+ KB


In [9]:
# List the distinct rating values to spot entries that are not real ratings
df_netflix['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR',
       'TV-Y7-FV', 'UR'], dtype=object)

## Remove bad values (runtimes such as '74 min') from the rating column

In [10]:
# define function to clean rating col
def clean_rating(string):
    """Blank out runtime values that ended up in the rating column.

    Parameters
    ----------
    string : object
        A single value taken from the ``rating`` column.

    Returns
    -------
    object
        ``''`` when the value is text containing ``'min'`` (e.g. ``'74 min'``);
        otherwise the value itself, unchanged. Non-string values such as
        ``None`` or ``NaN`` are passed through instead of raising ``TypeError``.
    """
    # The substring test below only works on text; a missing value (None / float NaN)
    # would raise TypeError, so hand anything that is not a str straight back.
    if not isinstance(string, str):
        return string
    return '' if 'min' in string else string

In [11]:
# Run every rating through clean_rating, then list the distinct values that remain
df_netflix['rating'] = df_netflix['rating'].map(clean_rating)
df_netflix['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '', 'NR', 'TV-Y7-FV', 'UR'], dtype=object)

In [12]:
# Persist the cleaned frame. to_csv does not create missing folders,
# so make sure the output directory exists before writing.
output_path = Path('cleaned_data/netflix_data_clean.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
# NOTE(review): the DataFrame index is written as an unnamed first column;
# pass index=False if downstream readers do not need it.
df_netflix.to_csv(output_path)