In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pycountry_convert as pc
from datetime import datetime

In [58]:
# Load the dataset
# put your file path here
file_path = 'netflix_titles.csv'
df = pd.read_csv(file_path)

In [60]:
df.head(10)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [40]:
# check release year
df['country'].value_counts()

country
United States                             2818
India                                      972
United Kingdom                             419
Japan                                      245
South Korea                                199
                                          ... 
Romania, Bulgaria, Hungary                   1
Uruguay, Guatemala                           1
France, Senegal, Belgium                     1
Mexico, United States, Spain, Colombia       1
United Arab Emirates, Jordan                 1
Name: count, Length: 748, dtype: int64

In [41]:
df.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [42]:
# copy of filther df from 2010 to 2021
df_cleaned = df[(df['release_year'] >= 2010) & (df['release_year'] <= 2021)].copy()

In [43]:
# Drop columns
df_cleaned.drop(['show_id', 'description'], axis=1, inplace=True)

In [44]:
# keep first country in country column
# Ex. "United States, Ghana, Burkina Faso, United Kingdom" -> keep only first country == United States
df_cleaned['country'] = df_cleaned['country'].apply(lambda x: x.split(',')[0] if pd.notna(x) else x)

# map country to continent
# https://stackoverflow.com/questions/55910004/get-continent-name-from-country-using-pycountry
def country_to_continent(country_name):
    # we have missing data points so we need to use try and return None
    try:
        country = pc.country_name_to_country_alpha2(country_name.strip())
        country_continent_code = pc.country_alpha2_to_continent_code(country)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
        return country_continent_name
    except:
        return None

# create 'continent'
df_cleaned['continent'] = df_cleaned['country'].apply(country_to_continent)

In [45]:
# remove space from date_added
df_cleaned['date_added'] = df_cleaned['date_added'].str.strip()

# convert 'date_added' to datetime type
df_cleaned['date_added'] = pd.to_datetime(df_cleaned['date_added'], format='%B %d, %Y')

# extract 'month', 'day', and 'year' from 'date_added'
df_cleaned['month'] = df_cleaned['date_added'].dt.month
df_cleaned['day'] = df_cleaned['date_added'].dt.day
df_cleaned['year'] = df_cleaned['date_added'].dt.year

# calculate the difference in months
# https://stackoverflow.com/questions/4039879/best-way-to-find-the-months-between-two-dates
def diff_month(d1, d2):
    return (d1.year - d2.year) * 12 + d1.month - d2.month

# create 'difference' in months
df_cleaned['difference'] = df_cleaned.apply(lambda row: diff_month(row['date_added'], datetime(row['release_year'], 1, 1)), axis=1)

In [48]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7472 entries, 0 to 8806
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   type          7472 non-null   object        
 1   title         7472 non-null   object        
 2   director      5021 non-null   object        
 3   cast          6679 non-null   object        
 4   country       6688 non-null   object        
 5   date_added    7465 non-null   datetime64[ns]
 6   release_year  7472 non-null   int64         
 7   rating        7468 non-null   object        
 8   duration      7469 non-null   object        
 9   listed_in     7472 non-null   object        
 10  continent     6686 non-null   object        
 11  month         7465 non-null   float64       
 12  day           7465 non-null   float64       
 13  year          7465 non-null   float64       
 14  difference    7465 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int64(1),

In [47]:
df_cleaned.head()

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,continent,month,day,year,difference
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,2021-09-25,2020,PG-13,90 min,Documentaries,North America,9.0,25.0,2021.0,20.0
1,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries",Africa,9.0,24.0,2021.0,8.0
2,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,2021-09-24,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",,9.0,24.0,2021.0,8.0
3,TV Show,Jailbirds New Orleans,,,,2021-09-24,2021,TV-MA,1 Season,"Docuseries, Reality TV",,9.0,24.0,2021.0,8.0
4,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",Asia,9.0,24.0,2021.0,8.0


In [56]:
df_cleaned.isnull().any()

type            False
title           False
director         True
cast             True
country          True
date_added       True
release_year    False
rating           True
duration         True
listed_in       False
continent        True
month            True
day              True
year             True
difference       True
dtype: bool

In [None]:
# next task clean where GOOD LUCK... it's a lot: 
# director         True
# cast             True
# country          True
# date_added       True
# rating           True
# duration         True

# after filling out the missing data. Put code on top or get a new data set if cleaned from excel...

In [57]:
df_cleaned.loc[df_cleaned['director'].isnull()]
df_cleaned.loc[df_cleaned['cast'].isnull()]
df_cleaned.loc[df_cleaned['country'].isnull()]
df_cleaned.loc[df_cleaned['date_added'].isnull()]
df_cleaned.loc[df_cleaned['rating'].isnull()]
df_cleaned.loc[df_cleaned['duration'].isnull()]

Unnamed: 0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,continent,month,day,year,difference
5541,Movie,Louis C.K. 2017,Louis C.K.,Louis C.K.,United States,2017-04-04,2017,74 min,,Movies,North America,4.0,4.0,2017.0,3.0
5794,Movie,Louis C.K.: Hilarious,Louis C.K.,Louis C.K.,United States,2016-09-16,2010,84 min,,Movies,North America,9.0,16.0,2016.0,80.0
5813,Movie,Louis C.K.: Live at the Comedy Store,Louis C.K.,Louis C.K.,United States,2016-08-15,2015,66 min,,Movies,North America,8.0,15.0,2016.0,19.0
