In [1]:
# import library
import pandas as pd

In [2]:
# Load the titles table from the CSV file (path is relative to the working directory)
netflix = pd.read_csv(filepath_or_buffer="netflix_titles.csv")

# Print the (rows, columns) tuple, then preview the first five rows
print(netflix.shape)
netflix.head(n=5)

(8807, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Python Tip 1
###### Using pandas, we can find out how much of our data is missing

In [3]:
# Share of missing values per column, as a fraction between 0 and 1
# (multiply by 100 for a percentage). The mean of the boolean null mask
# is the null count divided by the row count. Largest share first.
netflix.isnull().mean().sort_values(ascending=False)

director        0.299080
country         0.094357
cast            0.093675
date_added      0.001135
rating          0.000454
duration        0.000341
show_id         0.000000
type            0.000000
title           0.000000
release_year    0.000000
listed_in       0.000000
description     0.000000
dtype: float64

In [4]:
# Keep only the "type" and "director" columns and drop every row
# where either of them is missing
netflix_v1 = netflix.loc[:, ["type", "director"]].dropna(how="any")

# Report the (rows, columns) that remain after dropping
print(netflix_v1.shape)

(6173, 2)


## Python Tip 2
###### We can also group data together and count the values within each group

In [5]:
# For each director, count how many of their titles fall under each content type.
# The result is a Series indexed by (director, type).
(
    netflix_v1
    .groupby("director")["type"]
    .value_counts()
)

director             type 
A. L. Vijay          Movie    2
A. Raajdheep         Movie    1
A. Salaam            Movie    1
A.R. Murugadoss      Movie    2
Aadish Keluskar      Movie    1
                             ..
Çagan Irmak          Movie    1
Ísold Uggadóttir     Movie    1
Óskar Thór Axelsson  Movie    1
Ömer Faruk Sorak     Movie    2
Şenol Sönmez         Movie    2
Name: type, Length: 4576, dtype: int64

In [6]:
# Same per-director counts as above, reshaped into a table:
# one row per director, one column per content type.
(
    netflix_v1
    .groupby("director")["type"]
    .value_counts()
    .unstack()   # move the "type" index level into columns
    .fillna(0)   # a director with no titles of a given type gets 0 instead of NaN
)

type,Movie,TV Show
director,Unnamed: 1_level_1,Unnamed: 2_level_1
A. L. Vijay,2.0,0.0
A. Raajdheep,1.0,0.0
A. Salaam,1.0,0.0
A.R. Murugadoss,2.0,0.0
Aadish Keluskar,1.0,0.0
...,...,...
Çagan Irmak,1.0,0.0
Ísold Uggadóttir,1.0,0.0
Óskar Thór Axelsson,1.0,0.0
Ömer Faruk Sorak,2.0,0.0


## Python Tip 3
###### Converting dates to datetime objects

In [7]:
# Show the date_added column as it currently stands, before any conversion
netflix.loc[:, "date_added"]

0       September 25, 2021
1       September 24, 2021
2       September 24, 2021
3       September 24, 2021
4       September 24, 2021
               ...        
8802     November 20, 2019
8803          July 1, 2019
8804      November 1, 2019
8805      January 11, 2020
8806         March 2, 2019
Name: date_added, Length: 8807, dtype: object

In [9]:
# Parse date_added into datetime values.
# NOTE: this overwrites the column inside `netflix` itself, so the
# preview shown by the earlier cell no longer reflects the frame's contents.
netflix["date_added"] = pd.to_datetime(netflix["date_added"])

# Keep only rating and date_added, dropping rows where either is missing
netflix_date = netflix.loc[:, ["rating", "date_added"]].dropna(how="any")
netflix_date

Unnamed: 0,rating,date_added
0,PG-13,2021-09-25
1,TV-MA,2021-09-24
2,TV-MA,2021-09-24
3,TV-MA,2021-09-24
4,TV-MA,2021-09-24
...,...,...
8802,R,2019-11-20
8803,TV-Y7,2019-07-01
8804,R,2019-11-01
8805,PG,2020-01-11
