In [1]:
import pandas as pd
import plotly.express as plt

pd.options.mode.chained_assignment = None

##### **Basic Data Cleaning**

In [2]:
#importing the dataset and parsing the dates
df = pd.read_csv('netflix_titles.csv', parse_dates=['date_added'])

handle: netflix_titles.csv
netflix_titles.csv


In [3]:
df.tail()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,2019-11-20,2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,2019-07-01,2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,2019-11-01,2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,2020-01-11,2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,2019-03-02,2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...


In [4]:
#checking the data
df.dtypes

show_id                 object
type                    object
title                   object
director                object
cast                    object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
description             object
dtype: object

In [5]:
#checking if null values appear in the dataset
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        84
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

#### **Year from which NETFLIX contains the most shows**

In [6]:
df['release_year'].value_counts().sort_index(ascending=False)

2021     592
2020     953
2019    1030
2018    1147
2017    1032
        ... 
1945       4
1944       3
1943       3
1942       2
1925       1
Name: release_year, Length: 74, dtype: int64

In [7]:
plt.histogram(df, x='release_year')

###### *INSIGHT: Movies that were created in 2018 are most present on NETFLIX*

##### **The oldest show available on NETFLIX**

In [8]:
pd.set_option('display.max_colwidth', None) #default is 50
df.loc[df.release_year == df['release_year'].min()][['title', 'release_year', 'date_added', 'description']]

Unnamed: 0,title,release_year,date_added,description
4250,Pioneers: First Women Filmmakers*,1925,2018-12-30,"This collection restores films from women who tackled hot-button issues, transcended norms, and left a mark on the history of cinema and the industry."


###### *The oldest show available on NETFLIX is "Pioneers: First Women Filmmakers" released first in 1925*

##### **Checking when NETFLIX uploaded most of its shows**

In [9]:
#since date_added contains NULL values we need to drop the rows
df_date_added = df.dropna(subset=['date_added'])


##### *1) Yearly Analysis*

In [10]:
#checking which year how many shows had been added
print(df_date_added['date_added'].dt.year.value_counts().sort_index(ascending=False))

2021    1498
2020    1878
2019    1999
2018    1628
2017    1166
2016     422
2015      77
2014      23
2013      11
2012       3
2011      13
2010       1
2009       2
2008       2
Name: date_added, dtype: int64


In [128]:
#plotting a graph on basis of which year it was added in
df_date_added['date_added_year'] = pd.to_datetime(df_date_added['date_added']).dt.year 
plt.histogram(df_date_added, x='date_added_year', title='Graph of when shows were added (Yearly)', nbins=28, color='type', labels={"date_added_year":"Year"})

###### *NETFLIX added most shows for the viewers in the year 2019 followed closely by 2020 beating it by a margin of 121 shows. NETFLIX prefers to upload movies to TV-shows as 2/3rd of yearly additions to the site are Movies*

##### *2) Monthly Analysis*

In [12]:
#plotting a graph on basis of which month it was added in
df_date_added['date_added_month'] = pd.to_datetime(df_date_added['date_added']).dt.month 
plt.histogram(df_date_added, x='date_added_month', title='Graph of when shows were added (Month-Wise)', nbins=23,color='date_added_month', labels={"date_added_month":"Month"})

###### *There is no particualr month in which NETFLIX prefers to upload most of its shows*

##### *1) Day-Wise Analysis*

In [13]:
#plotting a graph on basis of which day it was added in
df_date_added['date_added_day'] = pd.to_datetime(df_date_added['date_added']).dt.day 
plt.histogram(df_date_added, x='date_added_day', title='Graph of when shows were added (Day-Wise)', color='date_added_day', labels={"date_added_day":"day"})

###### *NETFLIX prefers to upload most of their shows on the 1st of every-month; significantly more than any-other day of the month. Second most uploaded day being the 15th of every-month which has  1500 less uploads overall*

##### **Analysing the Directors**

In [46]:
df_directors = df.dropna(subset=['director'])
df_directors = pd.concat([df_directors, df_directors['director'].str.split(",", expand=True)], axis=1)

In [49]:
df_directors = df_directors.melt(id_vars=["type", "title"], value_vars=range(12), value_name="directors")
df_directors = df_directors[df_directors["directors"].notna()]

In [98]:

df_director_count = df_directors["directors"].value_counts().reset_index()
df_director_count.set_index("index")

Unnamed: 0_level_0,directors
index,Unnamed: 1_level_1
Rajiv Chilaka,22
Jan Suter,18
Raúl Campos,18
Suhas Kadav,16
Marcus Raboy,16
...,...
Peter Howitt,1
Chiwetel Ejiofor,1
Rodrigo Salomón,1
Miguel Ángel Vivas,1


In [126]:
plt.histogram(df_director_count[:41], x="index", y="directors", labels={"index": "Name Of Directors", "directors": "directions"}, title="Directors with the most shows")

###### *The director which has directed the most movies is Rajiv Chakla with sum of 22 movies on NETFLIX*

In [123]:
df_directors_10 = df_directors[df_directors["directors"].isin(df_director_count["index"][:41])]

In [124]:
df_directors_10

Unnamed: 0,type,title,variable,directors
24,Movie,Jaws,0,Steven Spielberg
32,Movie,InuYasha the Movie 2: The Castle Beyond the Looking Glass,0,Toshiya Shinohara
33,Movie,InuYasha the Movie 3: Swords of an Honorable Ruler,0,Toshiya Shinohara
34,Movie,InuYasha the Movie 4: Fire on the Mystic Island,0,Toshiya Shinohara
35,Movie,InuYasha the Movie: Affections Touching Across Time,0,Toshiya Shinohara
...,...,...,...,...
9656,Movie,Alan Saldaña: Mi vida de pobre,1,Jan Suter
9841,Movie,Daniel Sosa: Sosafado,1,Jan Suter
9888,Movie,Ricardo O'Farrill: Abrazo navideño,1,Jan Suter
9986,Movie,Sofía Niño de Rivera: Exposed,1,Jan Suter


In [127]:
plt.histogram(df_directors_10, "directors", color="type", labels={"type": "Type of Direction"}, title="Type of Directions")

###### *As we can see directors with the most shows on NETFLIX have majority of their movies uploaded to the site*

##### **Analysing the genres**

In [139]:
df_genres = df.dropna(subset=['listed_in'])
df_genres = pd.concat([df_genres, df_genres['listed_in'].str.split(",", expand=True)], axis=1)

In [140]:
df_genres = df_genres.melt(id_vars=["type", "title"], value_vars=range(3), value_name="genres")
df_genres = df_genres[df_genres["genres"].notna()]

In [142]:
df_genres["genres"].value_counts()

 International Movies        2624
Dramas                       1600
Comedies                     1210
Action & Adventure            859
Documentaries                 829
                             ... 
Romantic Movies                 3
Spanish-Language TV Shows       2
LGBTQ Movies                    1
TV Sci-Fi & Fantasy             1
Sports Movies                   1
Name: genres, Length: 73, dtype: int64

In [151]:
plt.histogram(df_genres, "genres", title="Distribution of shows depending on Genres")

###### *The overall genre with the most shows in "International Movies" with 2624 movies* 

In [147]:
df_genres["type"].value_counts()

Movie      13190
TV Show     6133
Name: type, dtype: int64

In [152]:
plt.histogram(df_genres, "genres", title="Distribution of shows depending on Genres and Type", color="type")

###### *The movie genre with the most movies is "International Movies" with 2624 movies and TV-Show genre with the most shows is "International TV-shows" with 774 shows* 

##### **Analysis based on Show Ratings**

In [162]:

df_rating = df[["rating", "title", "type"]]
df_rating = df_rating.dropna()
df_rating["rating"].value_counts()

TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: rating, dtype: int64

In [161]:
plt.histogram(df_rating, "rating", color="type")

###### *Most of the shows on NETFLIX have a "TV-MA" rating meaning they are specifically designed to be viewed by adults and therefore may be unsuitable for children under 17*

##### Which director has the most TV-MA shows ?

In [211]:
df_rating = df[["rating", "title", "type", "director"]]
df_rating = df_rating.dropna()
df_rating = df_rating[df_rating["rating"] == "TV-MA"]
df_rating.shape

(2117, 4)

In [212]:
df_rating = pd.concat([df_rating, df_rating['director'].str.split(",", expand=True)], axis=1)
df_rating = df_rating.melt(id_vars=["type", "title"], value_vars=range(11), value_name="directors")
df_rating = df_rating[df_rating["directors"].notna()]

In [213]:
df_rating["directors"].value_counts()

 Jan Suter              17
Raúl Campos             17
Marcus Raboy            13
Jay Karas               11
Jay Chapman             10
                        ..
Shanawaz Nellikunnil     1
Daniel Růžička           1
Gangadhar Salimath       1
Prasanth Varma           1
 Suparn Verma            1
Name: directors, Length: 2057, dtype: int64

In [214]:
df_rating = df_rating[df_rating["directors"].isin(df_rating["directors"].value_counts().reset_index().set_index("index").index[:25])]

In [215]:
plt.histogram(df_rating, "directors", color='type')

###### *"Jan Suter" and "Raúl Campos" have directed the most TV-MA rated shows on NETFLIX at the moment with a tie at 17 movie-shows each*

##### **Analysis of Movie Duration**

In [216]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [222]:
df_TV = df[df["type"] == "Movie"]

In [225]:
df_TV["duration"]

0        90 min
6        91 min
7       125 min
9       104 min
12      127 min
         ...   
8801     96 min
8802    158 min
8804     88 min
8805     88 min
8806    111 min
Name: duration, Length: 6131, dtype: object