In [75]:
# Data cleaning project
# Got the data from Kaggle neflix1.csv
# Original data was uploaded

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('netflix1.csv')

In [4]:
df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8790 non-null   object
 1   type          8790 non-null   object
 2   title         8790 non-null   object
 3   director      8790 non-null   object
 4   country       8790 non-null   object
 5   date_added    8790 non-null   object
 6   release_year  8790 non-null   int64 
 7   rating        8790 non-null   object
 8   duration      8790 non-null   object
 9   listed_in     8790 non-null   object
dtypes: int64(1), object(9)
memory usage: 686.8+ KB


In [6]:
# Missing data in percentage
for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print ('{}-{} %'.format(col, round(pct_missing*100)))

show_id-0 %
type-0 %
title-0 %
director-0 %
country-0 %
date_added-0 %
release_year-0 %
rating-0 %
duration-0 %
listed_in-0 %


In [10]:
# No missing data in the DataFrame df

In [11]:
# Irregular Data (Outliers)

In [8]:
df['show_id'].describe()

count     8790
unique    8790
top         s1
freq         1
Name: show_id, dtype: object

In [9]:
df['type'].describe()

count      8790
unique        2
top       Movie
freq       6126
Name: type, dtype: object

In [10]:
df['title'].describe()

count      8790
unique     8787
top       9-Feb
freq          2
Name: title, dtype: object

In [22]:
# Unnecessary data 1.

In [23]:
#  checking for repetitive/Uninformative data

In [11]:
num_rows = len(df.index)
low_information_cols =[] 

for col in df.columns:
    cnts = df[col].value_counts(dropna=False)
    top_pct = (cnts/num_rows).iloc[0]
    
if top_pct > 0.95:
    low_information_cols.append(col)
    print('{0}: {1:.5f}%'.format(col, top_pct*100))
    print(cnts)
    print()

In [29]:
#Looks like there is no repetitive/Uninformative data

In [31]:
# 2 Checking for Irrelevant data
# This project is only for data cleaning so i have decide not to delete anything


In [12]:
# 3 Checking for duplicates
df_dedupped = df.drop('show_id', axis=1).drop_duplicates()

print(df.shape)
print(df_dedupped.shape)

(8790, 10)
(8787, 9)


In [33]:
# 3 rows were complete duplicates and I've dropped them already

In [38]:
# 3b Checking for duplicates using key features
# chances that the show/movie has the same title was directed by the same director, in the same country, 
# added & released the same year and run the same duration time  is close to zero

In [13]:
key = ['title', 'director', 'country', 'release_year', 'duration']
df.fillna(-999).groupby(key)['show_id'].count().sort_values(ascending=False).head(10)

title                     director                                          country         release_year  duration 
22-Jul                    Paul Greengrass                                   Norway          2018          144 min      2
9-Feb                     Not Given                                         Pakistan        2018          1 Season     2
15-Aug                    Swapnaneel Jayakar                                India           2019          124 min      2
#Alive                    Cho Il                                            South Korea     2020          99 min       1
Rising Phoenix            Not Given                                         United Kingdom  2020          107 min      1
Ripper Street             Not Given                                         United Kingdom  2017          5 Seasons    1
Rise of Empires: Ottoman  Not Given                                         Pakistan        2020          1 Season     1
Rise of the Zombie        Devaki Sing

In [43]:
# I can't see any duplicates but if there is any i will 
# drop them based on subnet of variables (key features)

In [14]:
key = ['title', 'director', 'country', 'release_year', 'duration']
df_dedupped2 = df.drop_duplicates(key)

print(df.shape)
print(df_dedupped2.shape)

(8790, 10)
(8787, 10)


In [47]:
# I have dropped 3 duplicates within the new dataset named df_dedupped2

In [None]:
# Now checking inconsistent data

In [51]:
# 1 Capitalization

In [15]:
df['country_uppercase'] = df['country'].str.upper()
df['country_uppercase'] = df['country'].value_counts(dropna=False)

In [16]:
df['country_uppercase'].value_counts(dropna=False)

NaN    8790
Name: country_uppercase, dtype: int64

In [65]:
df['director_uppercase'] = df['director'].str.upper()
df['director_uppercase'] = df['director'].value_counts(dropna=False)

In [67]:
df['director_uppercase'].value_counts(dropna=False)

NaN    8790
Name: director_uppercase, dtype: int64

In [None]:
#clearly there is no inconsistency on either country and director names

In [17]:
df

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in,country_uppercase
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries,
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies",
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies",
...,...,...,...,...,...,...,...,...,...,...,...
8785,s8797,TV Show,Yunus Emre,Not Given,Turkey,1/17/2017,2016,TV-PG,2 Seasons,"International TV Shows, TV Dramas",
8786,s8798,TV Show,Zak Storm,Not Given,United States,9/13/2018,2016,TV-Y7,3 Seasons,Kids' TV,
8787,s8801,TV Show,Zindagi Gulzar Hai,Not Given,Pakistan,12/15/2016,2012,TV-PG,1 Season,"International TV Shows, Romantic TV Shows, TV ...",
8788,s8784,TV Show,Yoko,Not Given,Pakistan,6/23/2018,2016,TV-Y,1 Season,Kids' TV,


In [18]:
df_dedupped

Unnamed: 0,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"
...,...,...,...,...,...,...,...,...,...
8785,TV Show,Yunus Emre,Not Given,Turkey,1/17/2017,2016,TV-PG,2 Seasons,"International TV Shows, TV Dramas"
8786,TV Show,Zak Storm,Not Given,United States,9/13/2018,2016,TV-Y7,3 Seasons,Kids' TV
8787,TV Show,Zindagi Gulzar Hai,Not Given,Pakistan,12/15/2016,2012,TV-PG,1 Season,"International TV Shows, Romantic TV Shows, TV ..."
8788,TV Show,Yoko,Not Given,Pakistan,6/23/2018,2016,TV-Y,1 Season,Kids' TV


In [19]:
df_dedupped2

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"
...,...,...,...,...,...,...,...,...,...,...
8785,s8797,TV Show,Yunus Emre,Not Given,Turkey,1/17/2017,2016,TV-PG,2 Seasons,"International TV Shows, TV Dramas"
8786,s8798,TV Show,Zak Storm,Not Given,United States,9/13/2018,2016,TV-Y7,3 Seasons,Kids' TV
8787,s8801,TV Show,Zindagi Gulzar Hai,Not Given,Pakistan,12/15/2016,2012,TV-PG,1 Season,"International TV Shows, Romantic TV Shows, TV ..."
8788,s8784,TV Show,Yoko,Not Given,Pakistan,6/23/2018,2016,TV-Y,1 Season,Kids' TV
