In [41]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


#### Load dataset

In [42]:
df=pd.read_csv('netflix_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


#### BASIC OVERVIEW

In [43]:
def basic_overview(df):
    print('\nTop 10 rows:\n', df.head(5))
    print('\nLast 10 rows:\n', df.tail(5))
    print('\nSample 10 rows:\n', df.sample(5))
    print('\Dataset size=',df.shape)
    print('\nStatiscal Summary (describe):\n',df.describe())
    print('info Data')
    df.info()
basic_overview(df)


Top 10 rows:
   show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 2

#### CHECK MISSING VALUE

In [44]:
print("Check Missing Value")
for col in df.columns:
    missing_value=df[col].isna().sum()
    percentage=missing_value*100/len(df)
    print("%s: %.2f%% (%d)" % (col, percentage, missing_value))

Check Missing Value
show_id: 0.00% (0)
type: 0.00% (0)
title: 0.00% (0)
director: 29.91% (2634)
cast: 9.37% (825)
country: 9.44% (831)
date_added: 0.11% (10)
release_year: 0.00% (0)
rating: 0.05% (4)
duration: 0.03% (3)
listed_in: 0.00% (0)
description: 0.00% (0)


#### HANDLE THE MISSING VALUE

In [45]:
#Check how many missing (NaN) values are there in selected columns
print(df[['director','country','cast']].isna().sum())
#Replace all missing values in these columns with 'unknown'
print("Replace all missing values in these columns with 'unknown'")
df[['director','country','cast']]=df[['director','country','cast']].fillna('unkwon')
df[['director','country','cast']].isna().sum()

director    2634
country      831
cast         825
dtype: int64
Replace all missing values in these columns with 'unknown'


director    0
country     0
cast        0
dtype: int64

In [46]:
#Check how many missing (NaN) values are there in selected columns
print(df[['date_added','rating','duration']].isna().sum())
#Drop rows that have missing values in any of the selected columns
print('Drop rows that have missing values in any of the selected columns')
df=df.dropna(subset=['date_added','rating','duration'])
print("\nMissing values after dropping rows:")
df.isna().sum()

date_added    10
rating         4
duration       3
dtype: int64
Drop rows that have missing values in any of the selected columns

Missing values after dropping rows:


show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [47]:
df.shape


(8790, 12)

In [48]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

#### Remove duplicate

In [49]:
# Remove duplicate rows
df.drop_duplicates(inplace=True)

#### Standardize columns name

In [50]:
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

In [51]:
df['country'] = df['country'].str.strip().str.title()
df['country']

0       United States
1        South Africa
2              Unkwon
3              Unkwon
4               India
            ...      
8802    United States
8803           Unkwon
8804    United States
8805    United States
8806            India
Name: country, Length: 8790, dtype: object

#### convert data time

In [52]:
df['date_added'] = pd.to_datetime(df['date_added'], errors="coerce")
df['date_added'].isna().sum()

np.int64(88)

#### Check and fix data types

In [53]:
df.dtypes

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

In [54]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,unkwon,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,unkwon,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",Unkwon,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,unkwon,unkwon,Unkwon,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,unkwon,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [55]:
df[['duration_value', 'duration_type']] = df['duration'].str.extract(r'(\d+)\s*(\w+)')
df['duration_value'] = pd.to_numeric(df['duration_value'], errors="coerce")
df[['duration_value', 'duration_type']].dtypes


duration_value     int64
duration_type     object
dtype: object

In [56]:
#final check
print("\nShape after cleaning:", df.shape)
print("\nMissing values after cleaning:")
print(df.isnull().sum())


Shape after cleaning: (8790, 14)

Missing values after cleaning:
show_id            0
type               0
title              0
director           0
cast               0
country            0
date_added        88
release_year       0
rating             0
duration           0
listed_in          0
description        0
duration_value     0
duration_type      0
dtype: int64


In [59]:
df.dropna(subset=['date_added'], inplace=True)

In [60]:
df.to_csv("netflix_titles_cleaned.csv", index=False)