# Exploratory Data Analysis of Netflix Dataset :
- Saikrishna Reddy kotha
- Vamshi Jaligama
- Abhishek Raj Sampath

In [1]:
!pip install pandas-profiling
#!pip install plotly

Collecting pandas-profiling
  Using cached pandas_profiling-3.1.0-py2.py3-none-any.whl (261 kB)
Collecting visions[type_image_path]==0.7.4
  Using cached visions-0.7.4-py3-none-any.whl (102 kB)
Collecting pydantic>=1.8.1
  Using cached pydantic-1.8.2-cp36-cp36m-manylinux2014_x86_64.whl (10.2 MB)
Collecting phik>=0.11.1
  Using cached phik-0.12.0-cp36-cp36m-manylinux2010_x86_64.whl (675 kB)
Collecting htmlmin>=0.1.12
  Using cached htmlmin-0.1.12-py3-none-any.whl
Collecting missingno>=0.4.2
  Using cached missingno-0.5.0-py3-none-any.whl (8.8 kB)
Collecting multimethod>=1.4
  Using cached multimethod-1.5-py3-none-any.whl (7.7 kB)
Collecting tangled-up-in-unicode==0.1.0
  Using cached tangled_up_in_unicode-0.1.0-py3-none-any.whl (3.1 MB)
Collecting markupsafe~=2.0.1
  Using cached MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (30 kB)
Collecting imagehash
  Using cached ImageHash-4.2.1.tar.gz (812 kB)
  Preparing metadata

#### Importing the Required Libraries

In [2]:
# Importing the required libraries
import numpy as np
import pandas as pd
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots 
import seaborn as sns
from pandas_profiling import ProfileReport

#### Reading the CSV File


In [3]:
# Read the Netflix titles CSV from the working directory and preview the first five rows.
file = pd.read_csv('netflix_titles.csv')
file.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


#### Info about the Dataset


In [4]:
# Print column names, non-null counts and dtypes of the loaded frame.
file.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


#### Descriptive statistics of the dataset

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
file.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


#### Missing values in the dataset


In [6]:
# Number of missing values per column.
file.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

**Missing values in the columns that are useful for EDA:**
- Replacing the missing country values with the mode.
- Removing the rows with a missing date_added value.
- Replacing the missing rating values with the mode.
- Removing the rows with a missing duration, as only very few values are missing.


**Columns with missing values that are unnecessary for EDA and are therefore ignored:**
- Missing director values are not helpful for EDA.
- Cast column's missing values are also not helpful for EDA.


#### Replacing the missing country with mode values


In [7]:
# Fill missing countries with the most frequent value of the column.
country_mode = file['country'].mode().iloc[0]
file['country'] = file['country'].fillna(country_mode)

#### Removing the rows with a missing date_added value


In [8]:
# Keep only rows that have a date_added value. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells do not operate on a slice
# of the original frame (which would trigger SettingWithCopyWarning).
file = file[file['date_added'].notna()].copy()

#### Replacing the missing rating with mode values


In [9]:
# Fill missing ratings with the most frequent value of the column.
rating_mode = file['rating'].mode().iloc[0]
file['rating'] = file['rating'].fillna(rating_mode)

#### Removing the rows with a missing duration


In [10]:
# Keep only rows that have a duration value. The explicit .copy() makes the result an
# independent frame, so the new columns added in later cells are not written to a slice
# of the previous frame (which would trigger SettingWithCopyWarning).
file = file[file['duration'].notna()].copy()

#### Checking for the Missing values in dataset

In [11]:
# Re-count missing values per column after the cleaning steps.
file.isnull().sum()

show_id            0
type               0
title              0
director        2624
cast             825
country            0
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

### Checking for Duplicate values in the Dataset

In [12]:
# Boolean mask of fully duplicated rows, then report how many there are.
duplicate_values = file.duplicated()
duplicate_count = duplicate_values.sum()
print("Number of duplicates in the dataset is :", duplicate_count)

Number of duplicates in the dataset is : 0


#### Adding a column with the year each movie/TV show was added to Netflix, derived from the existing date_added column


In [13]:
# The year is the last space-separated token of the date_added string.
file['available_year'] = file['date_added'].str.split(" ").str[-1]
file.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,available_year
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",2021
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",2021
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",United States,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,2021
3,s4,TV Show,Jailbirds New Orleans,,,United States,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",2021
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,2021


In [14]:
# Convert the extracted year strings to a numeric dtype.
file['available_year'] = file['available_year'].pipe(pd.to_numeric)

#### Only keeping the first country in the country column and removing the extra data


In [15]:
# The primary country is the first entry of the comma-separated country string.
file['primary_country'] = file['country'].str.split(",").str[0]
file.tail(10)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,available_year,primary_country
8797,s8798,TV Show,Zak Storm,,"Michael Johnston, Jessica Gee-George, Christin...","United States, France, South Korea, Indonesia","September 13, 2018",2016,TV-Y7,3 Seasons,Kids' TV,Teen surfer Zak Storm is mysteriously transpor...,2018,United States
8798,s8799,Movie,Zed Plus,Chandra Prakash Dwivedi,"Adil Hussain, Mona Singh, K.K. Raina, Sanjay M...",India,"December 31, 2019",2014,TV-MA,131 min,"Comedies, Dramas, International Movies",A philandering small-town mechanic's political...,2019,India
8799,s8800,Movie,Zenda,Avadhoot Gupte,"Santosh Juvekar, Siddharth Chandekar, Sachit P...",India,"February 15, 2018",2009,TV-14,120 min,"Dramas, International Movies",A change in the leadership of a political part...,2018,India
8800,s8801,TV Show,Zindagi Gulzar Hai,,"Sanam Saeed, Fawad Khan, Ayesha Omer, Mehreen ...",Pakistan,"December 15, 2016",2012,TV-PG,1 Season,"International TV Shows, Romantic TV Shows, TV ...","Strong-willed, middle-class Kashaf and carefre...",2016,Pakistan
8801,s8802,Movie,Zinzana,Majid Al Ansari,"Ali Suliman, Saleh Bakri, Yasa, Ali Al-Jabri, ...","United Arab Emirates, Jordan","March 9, 2016",2015,TV-MA,96 min,"Dramas, International Movies, Thrillers",Recovering alcoholic Talal wakes up inside a s...,2016,United Arab Emirates
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a...",2019,United States
8803,s8804,TV Show,Zombie Dumb,,,United States,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g...",2019,United States
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...,2019,United States
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero...",2020,United States
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,"March 2, 2019",2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...,2019,India


#### unique director


In [16]:
# Number of distinct director values (missing values are not counted).
file['director'].dropna().nunique()

4527

#### Unique title


In [17]:
# Number of distinct titles.
file['title'].nunique()

8794

#### Unique cast


In [18]:
# Number of distinct cast strings (each string is the full comma-separated cast list).
file['cast'].nunique()

7681

#### First 5 values of the director column


In [19]:
# Preview the first five entries of the director column.
file['director'].head()

0    Kirsten Johnson
1                NaN
2    Julien Leclercq
3                NaN
4                NaN
Name: director, dtype: object

#### Duration of shows


In [20]:
# Preview the first twenty duration values (minutes for movies, seasons for TV shows).
file['duration'].head(20)

0        90 min
1     2 Seasons
2      1 Season
3      1 Season
4     2 Seasons
5      1 Season
6        91 min
7       125 min
8     9 Seasons
9       104 min
10     1 Season
11     1 Season
12      127 min
13       91 min
14     1 Season
15    4 Seasons
16       67 min
17    2 Seasons
18       94 min
19     1 Season
Name: duration, dtype: object

#### Duration value counts (descending)

In [21]:
# Frequency of each duration value, most common first (value_counts sorts descending by default).
file['duration'].value_counts()

1 Season     1793
2 Seasons     421
3 Seasons     198
90 min        152
94 min        146
             ... 
273 min         1
194 min         1
8 min           1
186 min         1
193 min         1
Name: duration, Length: 220, dtype: int64

#### Unique genre values


In [22]:
# Frequency of each listed_in genre combination, most common first.
file['listed_in'].value_counts()

Dramas, International Movies                               362
Documentaries                                              359
Stand-Up Comedy                                            334
Comedies, Dramas, International Movies                     274
Dramas, Independent Movies, International Movies           252
                                                          ... 
Anime Features                                               1
Reality TV, TV Action & Adventure, TV Mysteries              1
Kids' TV, Reality TV, Science & Nature TV                    1
British TV Shows, Docuseries, TV Comedies                    1
Romantic TV Shows, Spanish-Language TV Shows, TV Dramas      1
Name: listed_in, Length: 513, dtype: int64

#### Top 10 most frequently cast artists

In [23]:
# Split the comma-separated cast string into one column per cast member.
cast_columns = file['cast'].str.split(",", expand=True)

# Only type/title are needed as identifiers; melt turns the per-member columns into rows.
# The value columns are taken from what the split actually produced rather than a
# hard-coded range(44), so no cast member is silently dropped and no KeyError is raised
# when the longest cast list has a different length.
max_cast = pd.concat([file[['type', 'title']], cast_columns], axis=1)
max_cast = max_cast.melt(id_vars=["type", "title"],
                         value_vars=list(cast_columns.columns),
                         value_name="Cast_name")

# Drop the padding produced for shorter cast lists and trim the whitespace left by the
# ", " separators; assign() avoids writing into a filtered slice.
max_cast = (
    max_cast
    .dropna(subset=["Cast_name"])
    .assign(Cast_name=lambda frame: frame["Cast_name"].str.strip())
)

# Ten most frequently appearing cast members.
max_cast["Cast_name"].value_counts().head(10)

Anupam Kher         43
Shah Rukh Khan      35
Julie Tejwani       33
Takahiro Sakurai    32
Naseeruddin Shah    32
Rupa Bhimani        31
Om Puri             30
Akshay Kumar        30
Yuki Kaji           29
Paresh Rawal        28
Name: Cast_name, dtype: int64

#### Top 10 oldest TV shows on Netflix


In [24]:
# Sort every title by release year, then keep only the TV shows.
tv_by_year = file.sort_values("release_year", ascending=True)
tv_by_year = tv_by_year[tv_by_year['type'] == "TV Show"]

# Select and rename the display columns without in-place mutation of a filtered slice,
# and keep the ten oldest entries.
old_tvshow = (
    tv_by_year[['release_year', 'title', 'type']]
    .rename(columns={'title': 'Title', 'release_year': 'Release Year', 'type': 'Type'})
    .head(10)
)

# NOTE(review): Styler.hide_index is deprecated in newer pandas releases in favour of
# Styler.hide(axis="index") - confirm the installed version before switching.
old_tvshow.style.hide_index()

Release Year,Title,Type
1925,Pioneers: First Women Filmmakers*,TV Show
1945,Five Came Back: The Reference Films,TV Show
1946,Pioneers of African-American Cinema,TV Show
1963,The Twilight Zone (Original Series),TV Show
1967,The Andy Griffith Show,TV Show
1972,Monty Python's Fliegender Zirkus,TV Show
1974,Monty Python's Flying Circus,TV Show
1977,Dad's Army,TV Show
1979,El Chavo,TV Show
1981,Ninja Hattori,TV Show


#### Top 10 oldest movies on netflix


In [25]:
# Sort every title by release year, then keep only the movies.
movies_by_year = file.sort_values("release_year", ascending=True)
movies_by_year = movies_by_year[movies_by_year['type'] == "Movie"]

# Select and rename the display columns without in-place mutation of a filtered slice,
# and keep the ten oldest entries.
old_movie = (
    movies_by_year[['release_year', 'title', 'type']]
    .rename(columns={'title': 'Title', 'release_year': 'Release Year', 'type': 'Type'})
    .head(10)
)

# NOTE(review): Styler.hide_index is deprecated in newer pandas releases in favour of
# Styler.hide(axis="index") - confirm the installed version before switching.
old_movie.style.hide_index()

Release Year,Title,Type
1942,The Battle of Midway,Movie
1942,Prelude to War,Movie
1943,Undercover: How to Operate Behind Enemy Lines,Movie
1943,Why We Fight: The Battle of Russia,Movie
1943,WWII: Report from the Aleutians,Movie
1944,The Memphis Belle: A Story of a Flying Fortress,Movie
1944,The Negro Soldier,Movie
1944,Tunisian Victory,Movie
1945,Know Your Enemy - Japan,Movie
1945,San Pietro,Movie


#### Movies and TV Shows added recently


In [26]:
# Parse date_added into real datetimes so the sort below is chronological.
file['date_added'] = pd.to_datetime(file['date_added'])

# Human-readable column names used for the displayed table.
display_names = {
    'title': 'Title',
    'date_added': 'Date Added',
    'type': 'Type',
    'release_year': 'Release Year',
    'description': 'Description',
    'country': 'Country',
}
newly_added = (
    file.sort_values(by='date_added', ascending=False)
        .rename(columns=display_names)
)
newly_added = newly_added[['Date Added', "Title", 'Type', 'Release Year', 'Description']]

# Show the five most recently added titles without the index column.
newly_added.head().style.hide_index()


Date Added,Title,Type,Release Year,Description
2021-09-25 00:00:00,Dick Johnson Is Dead,Movie,2020,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
2021-09-24 00:00:00,My Little Pony: A New Generation,Movie,2021,"Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it."
2021-09-24 00:00:00,"Vendetta: Truth, Lies and The Mafia",TV Show,2021,"Sicily boasts a bold ""Anti-Mafia"" coalition. But what happens when those trying to bring down organized crime are accused of being criminals themselves?"
2021-09-24 00:00:00,The Starling,Movie,2021,A woman adjusting to life after a loss contends with a feisty bird that's taken over her garden — and a husband who's struggling to find a way forward.
2021-09-24 00:00:00,The Great British Baking Show,TV Show,2021,"A talented batch of amateur bakers face off in a 10-week competition, whipping up their best dishes in the hopes of being named the U.K.'s best."


#### Netflix content country wise


In [40]:
# Ten most frequent country values. value_counts already sorts descending, so no extra
# sort is needed. The output columns are named explicitly ('index' for the country,
# 'country' for its count) because the default names produced by
# value_counts().reset_index() differ between pandas 1.x and 2.x.
country_wise = (
    file['country']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='country')
    .head(10)
)

fig = px.pie(country_wise, values = 'country', names = 'index', 
             color_discrete_sequence = px.colors.sequential.Blues,
             labels = {"index":"Country", "country":"Count"})
fig.update_layout(title="Netflix content country wise" , title_x =0.5)
fig.show()

#### contents made with the 'TV-MA' rating


In [41]:
# Share of titles per rating value.
rate = file['rating'].value_counts()
fig = px.pie(
    names=rate.index,
    values=rate.values,
    color_discrete_sequence=px.colors.sequential.Blues,
)
fig.update_layout(title="contents made with the 'TV-MA' rating", title_x=0.5)
fig.show()

#### Distribution of content in netflix

In [43]:
# Count of titles per content type. Column names are set explicitly ('index' for the
# type, 'type' for its count) because the defaults of value_counts().reset_index()
# differ between pandas 1.x and 2.x.
type_counts = file['type'].value_counts().rename_axis('index').reset_index(name='type')

figure_pie = px.pie(type_counts, values='type', names='index')
figure_pie.update_layout(title="Distribution of content in netflix" , title_x =0.5)
figure_pie.show()

In [30]:
# Inspect the column dtypes of the frame at this point in the notebook.
file.dtypes

show_id                    object
type                       object
title                      object
director                   object
cast                       object
country                    object
date_added         datetime64[ns]
release_year                int64
rating                     object
duration                   object
listed_in                  object
description                object
available_year              int64
primary_country            object
dtype: object

#### Released Year of movies and tv shows to the available year in the netflix 

In [44]:
# Titles per (release year, type) and per (year added to Netflix, type), after 2010 only.
release = (
    file[file['release_year'] > 2010]
    .groupby(['release_year', 'type'])
    .agg({'show_id': 'count'})
    .reset_index()
)
added = (
    file[file['available_year'] > 2010]
    .groupby(['available_year', 'type'])
    .agg({'show_id': 'count'})
    .reset_index()
)

# One line per (source table, content type); the order defines the legend order.
trace_specs = [
    (release, 'release_year', 'Movie', 'Movie: Released Year', 'green'),
    (release, 'release_year', 'TV Show', 'TV Show: Released Year', 'yellow'),
    (added, 'available_year', 'TV Show', 'TV Show: available Year', 'red'),
    (added, 'available_year', 'Movie', 'Movie: Available Year', 'blue'),
]

figure = go.Figure()
for counts, year_column, content_type, trace_name, colour in trace_specs:
    subset = counts[counts['type'] == content_type]
    figure.add_trace(go.Scatter(x=subset[year_column],
                                y=subset['show_id'],
                                mode='lines+markers',
                                name=trace_name,
                                marker_color=colour))

figure.update_layout(title="Released Year of movies and tv shows to the available year in the netflix ",
                     title_x=0.5)
figure.show()

#### Top 20 Movie Genre


In [32]:
# Twenty most frequent listed_in genre combinations among movies. Column names are set
# explicitly ('index' for the genre, 'listed_in' for its count) because the defaults of
# value_counts().reset_index() differ between pandas 1.x and 2.x.
genre = file[file['type']=='Movie']
genre_list = (
    genre['listed_in']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='listed_in')
    .head(20)
)

figure = px.bar(genre_list, x = 'index', y = 'listed_in',color = 'listed_in',color_continuous_scale = 'blues',
             labels = {"index":"Movie","listed_in":"Count"})

figure.update_layout(title = 'Top 20 Movie Genre',
                  title_x = 0.5,
                  title_font = dict(size = 16, color = 'Green'),
                  xaxis = dict(tickangle = 45))


figure.show()

#### Top 20 TV Show Genre


In [33]:
# Twenty most frequent listed_in genre combinations among TV shows. Column names are set
# explicitly ('index' for the genre, 'listed_in' for its count) because the defaults of
# value_counts().reset_index() differ between pandas 1.x and 2.x.
genre_tv = file[file['type']=='TV Show']
genre_listshow = (
    genre_tv['listed_in']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='listed_in')
    .head(20)
)

# The x-axis label is "TV Show" here; the original reused the "Movie" label from the
# movie-genre chart.
figure = px.bar(genre_listshow, x = 'index', y = 'listed_in',color = 'listed_in',color_continuous_scale = 'Blues',
             labels = {"index":"TV Show","listed_in":"Count"})

figure.update_layout(title = 'Top 20 Tv Show Genre',
                  title_x = 0.5,
                  title_font = dict(size = 16, color = 'Green'),
                  xaxis = dict(tickangle = 45))

figure.show()

#### Tv show's duration distribution 


In [34]:
# Number of TV shows per duration value (season count). Column names are set explicitly
# ('index' for the duration, 'duration' for its count) because the defaults of
# value_counts().reset_index() differ between pandas 1.x and 2.x.
dur = file[file['type']=='TV Show']
duration = dur['duration'].value_counts().rename_axis('index').reset_index(name='duration')

figure = px.bar(duration, x = 'index',y = 'duration', color = 'duration',color_continuous_scale = 'Blues',
             labels = {"index":"Duration","duration":"Count"}
            
            )

figure.update_layout(title="TV Show duration",
                  title_x = 0.5,
                  title_font = dict(size = 16, color = 'Green'))

figure.show()

#### highest ratings of tv shows and movies

In [35]:
copy = file.copy()

# Titles per rating for TV shows. Output columns are named explicitly ('tv_show' for the
# rating label, 'rating' for its count) instead of relying on the default names of
# value_counts().reset_index(), which differ between pandas 1.x and 2.x.
rating_show = (
    copy.loc[copy['type'] == 'TV Show', 'rating']
    .value_counts()
    .rename_axis('tv_show')
    .reset_index(name='rating')
)
# Keep the positive counts for the bar text, then negate the plotted values so the
# TV-show bars extend to the left of the shared axis.
rating_show['rating_final'] = rating_show['rating']
rating_show['rating'] *= -1

# Titles per rating for movies ('movie' is the rating label, 'rating' its count).
rating_movies = (
    copy.loc[copy['type'] == 'Movie', 'rating']
    .value_counts()
    .rename_axis('movie')
    .reset_index(name='rating')
)

# Two panels sharing the y axis with no gap between them: TV shows left, movies right.
figure = make_subplots(rows=1, cols=2, specs=[[{}, {}]], shared_yaxes=True, horizontal_spacing=0)
# bar plot for tv shows
figure.add_trace(go.Bar(x=rating_show.rating, y=rating_show.tv_show, orientation='h', showlegend=True,
                        text=rating_show.rating_final, name='TV Show', marker_color='#221f1f'),
                 row=1, col=1)
# bar plot for movies
figure.add_trace(go.Bar(x=rating_movies.rating, y=rating_movies.movie, orientation='h', showlegend=True,
                        text=rating_movies.rating, name='Movie', marker_color='#b20710'),
                 row=1, col=2)
figure.update_xaxes(showgrid=False)
figure.update_yaxes(showgrid=False, categoryorder='total ascending', ticksuffix=' ', showline=False)
figure.update_traces(hovertemplate=None, marker=dict(line=dict(width=0)))
figure.update_layout(title="Highest rating for the Movies and Tv Shows",
                  margin=dict(t=80, b=0, l=70, r=40),
                  hovermode="y unified", 
                  xaxis_title=' ', yaxis_title=" ",
                  plot_bgcolor='grey', paper_bgcolor='grey',
                  title_font=dict(size=25, color='orange', family="Bazooka"),
                  font=dict(color='darkblue'),
                  legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5),
                  hoverlabel=dict(bgcolor="white", font_size=12, font_family="Tisa"))
# Render explicitly, as the other plotting cells do.
figure.show()

#### countries with count of movies and tv shows


In [36]:
# Title count per primary country, bars ordered from most to fewest titles.
figure = (
    px.histogram(file, x='primary_country', title='Countries with their movies and tvshows count')
    .update_xaxes(categoryorder='total descending')
)
figure.show()

#### Count of movies and TV shows by genre

In [37]:
# Title count per listed_in genre combination, ordered from most to fewest titles,
# drawn on a fixed 1000x1000 canvas.
figure = (
    px.histogram(file, x='listed_in', title='count of Movies and Tv show genre ')
    .update_xaxes(categoryorder='total descending')
    .update_layout(autosize=False, width=1000, height=1000)
)
figure.show()