In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Load the Netflix titles dataset that every later cell works from.
data = pd.read_csv('../input/netflix-shows/netflix_titles.csv')
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Feature Engineering - derive calendar columns from the date a title was added.
# Strip surrounding whitespace before parsing: a padded value such as
# " August 4, 2017" does not match the format pandas infers from the other rows
# and can make the whole conversion fail.
# The dtype guard keeps this cell safe to re-run, since `.str` is not available
# once the column has already been converted to datetime.
if not pd.api.types.is_datetime64_any_dtype(data['date_added']):
    data['date_added'] = pd.to_datetime(data['date_added'].str.strip())
data['month'] = data['date_added'].dt.month  # month number (NaN where the date is missing)
data['year_added'] = data['date_added'].dt.year  # calendar year (NaN where the date is missing)

# <font color="#FF8C00"><b>1. Introduction</b></font>
* Netflix, Inc. is an American technology and media services provider and production company headquartered in Los Gatos, California. Netflix was founded in 1997 by Reed Hastings and Marc Randolph in Scotts Valley, California. 
* The company's primary business is its subscription-based streaming service which offers online streaming of a library of films and television series, including those produced in-house.
* Here is a dataset from Netflix which includes data of Movies and TV Shows. It includes data points like title, director, cast, country, rating etc
* We will perform EDA to find out some insights :)

# <font color="#FF8C00">2. Content by Country</font>

In [None]:
from collections import Counter  # kept in this cell: the director-count cell also uses Counter

# Keep only rows that have both a parsed month and a country.
data_na = data.dropna(subset=['month', 'country']).reset_index(drop=True)
country_list = data_na['country']

# A title can list several comma-separated countries; count it once per country.
# Blank tokens (left behind by a leading or trailing comma) are not countries, so skip them.
newlist = [
    name.strip()
    for countries in country_list
    for name in countries.split(",")
    if name.strip()
]
c = Counter(newlist)

# One row per country with its number of appearances, largest first.
df = (
    pd.DataFrame(list(c.items()), columns=['Country', 'Occurences'])
    .sort_values(by='Occurences', ascending=False)
)
top_10 = df.head(10)

# Creating palette: one shade per bar. `rank` is each bar's position when the
# counts are ordered ascending (double argsort), used to pick a shade by magnitude.
pal = sns.color_palette("Blues_d", len(top_10['Occurences']))
rank = top_10["Occurences"].argsort().argsort()

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names; use whichever name this installation provides.
pastel_style = "seaborn-v0_8-pastel" if "seaborn-v0_8-pastel" in plt.style.available else "seaborn-pastel"
plt.style.use(pastel_style)
plt.figure(figsize=(20, 10))
# Bars are coloured by magnitude: the reversed palette is indexed by each bar's rank.
sns.barplot(x='Country', y='Occurences', data=top_10, palette=np.array(pal[::-1])[rank])
plt.xlabel("")
plt.ylabel("")
plt.title("Top 10 Countries by Content Count", fontsize=15)
plt.yticks(fontsize=15)
plt.xticks(fontsize=15)
plt.show()

<font color="#8A2BE2"><b>Observation</b></font>
* The United States leads in the number of titles by a very wide margin: it has roughly three times as many titles as India.
* The bar chart above counts a title once for every country it is listed under, so titles released in multiple countries are counted more than once. Below, we also look at records that list just one country.


# <font color="#FF8C00">3. TV Show or Movie?</font>

In [None]:
# Number of titles per content type, plus each type's share of all titles.
pie_chart = data.groupby('type')['show_id'].count().reset_index()
total = len(data)
percent_l = [type_count / total for type_count in pie_chart['show_id']]
pie_chart['Percentage'] = percent_l

In [None]:
# Pie chart of the share of each content type.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(7, 7))
plt.rcParams.update({'font.size': 18})  # global setting: also affects plots drawn later
ax.pie(
    pie_chart['Percentage'],
    labels=pie_chart['type'],
    shadow=True,
    startangle=90,
    autopct='%1.1f%%',
    wedgeprops={'edgecolor': 'black'},
)
ax.set_title("Content Type")
fig.tight_layout()
plt.show()

<font color="#8A2BE2"><b>Observation</b></font>
* More than 68% of the content on Netflix is of type Movie.
* This makes sense, because there are far fewer TV series than movies.
* TV series are now becoming more popular, and people are watching more of them thanks to low data rates (especially in India).

# <font color="#FF8C00">4. What's your Country's preference?</font>


In [None]:
# Top 5 countries by number of titles (the raw `country` value is used as-is,
# so a multi-country string counts as its own "country").
data_country = (
    data.groupby('country')['show_id']
    .count()
    .to_frame()
    .reset_index()
    .sort_values('show_id', ascending=False)
    .head(5)
    .reset_index(drop=True)
)
list_c = data_country['country']

# Titles belonging to those countries (named top_8, but it holds the top 5).
top_8_data = data[data['country'].isin(list_c)]

# Title count for every (country, type) pair, largest first.
year_with_show = (
    top_8_data.groupby(['country', 'type'])['show_id']
    .count()
    .to_frame()
    .reset_index()
    .sort_values(by='show_id', ascending=False)
)

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names; use whichever name this installation provides.
pastel_style = "seaborn-v0_8-pastel" if "seaborn-v0_8-pastel" in plt.style.available else "seaborn-pastel"
plt.style.use(pastel_style)
plt.figure(figsize=(20, 10))
# Grouped bars: one group per country, one bar per content type.
sns.barplot(x="country", y="show_id", hue="type", data=year_with_show)
plt.xlabel("")
plt.title('Shows by Country')
plt.ylabel("")
plt.show()

<font color="#8A2BE2"><b>Observation</b></font>
* We can see that the number of Indian TV shows on Netflix is still much lower than the number of movies, but it is increasing day by day. There are also other streaming platforms, such as Amazon Prime and Hotstar, which have more Indian content.
* We can see that the United Kingdom and Japan are the two countries (in the top 5) where the count of TV shows is higher than the count of movies. The demand there is probably more toward TV shows than movies.

# <font color="#FF8C00">5. Which Content type is preferred more over the years?</font>

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names; use whichever name this installation provides.
pastel_style = "seaborn-v0_8-pastel" if "seaborn-v0_8-pastel" in plt.style.available else "seaborn-pastel"
plt.style.use(pastel_style)
plt.figure(figsize=(20, 10))
# Titles added per (year, content type).
a = data.groupby(['year_added', 'type']).count()['show_id'].to_frame().reset_index()
# 2020 is left out of the trend line.
# NOTE(review): presumably because the dataset covers only part of 2020 - confirm.
a = a[a["year_added"] != 2020]
sns.lineplot(x="year_added", y="show_id", hue="type", data=a)
plt.xlabel("Year")
plt.ylabel("Number of contents")
plt.show()

<font color="#8A2BE2"><b>Observation</b></font>
* We can see that Netflix started adding content at a very high rate after 2014.
* The amount of content added is increasing year by year, although fewer TV shows than movies are added.

# <font color="#FF8C00">6. Best Month to Add Content</font>

In [None]:
months_l = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep' ,'Oct','Nov','Dec']

# Titles added per calendar month. The counts are re-indexed to months 1-12 so a
# month with no titles shows up as 0 instead of shifting every later count onto
# the wrong label (the labels are attached by position).
month_counts = data.groupby('month')['show_id'].count()
month_counts.index = month_counts.index.astype(int)  # month numbers are floats when dates are missing
month_counts = month_counts.reindex(range(1, 13), fill_value=0)

months_v = month_counts.to_frame().reset_index(drop=True)
months = pd.DataFrame({'Month': months_l, 'Values': month_counts.to_numpy()})

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names; use whichever name this installation provides.
pastel_style = "seaborn-v0_8-pastel" if "seaborn-v0_8-pastel" in plt.style.available else "seaborn-pastel"
plt.style.use(pastel_style)
plt.figure(figsize=(20, 10))
# One bar per calendar month.
sns.barplot(x=months['Month'], y=months['Values'])
plt.xlabel("")
plt.ylabel("")
plt.title("Content Count by Month")
plt.show()

<font color="#8A2BE2"><b>Observation</b></font>
* We can see that most content was added in December, probably because it is a festive month.
* The least new content was added in February. If Netflix wants to release new content with less competition, it could target February: users would get something new to watch, which could increase the popularity of the show.

# <font color="#FF8C00">7. Content by Month by Top Countries</font>

In [None]:
# Title count per country over the whole dataset.
month_wise = data.groupby('country')['show_id'].count().to_frame().reset_index()

# Restrict to the top countries selected earlier (`list_c`).
top_countries = data[data['country'].isin(list_c)]

# Monthly title counts per country, overall and split by content type.
temp_1 = (
    top_countries.groupby(['country', 'month'])['show_id']
    .count()
    .to_frame()
    .reset_index()
)
temp_2 = (
    top_countries.groupby(['country', 'month', 'type'])['show_id']
    .count()
    .to_frame()
    .reset_index()
)

In [None]:
# Plot exactly the countries that `top_countries` was filtered to (most titles
# first) rather than a hand-typed list that can drift away from the data.
list_temp_1 = list(list_c)
month_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep' ,'Oct','Nov','Dec']

for country_name in list_temp_1:
    a = temp_1[temp_1["country"] == country_name].copy()
    # Label each row with its month name and fix the category order, so a month
    # with no titles leaves an empty slot instead of shifting the tick labels.
    a["month_name"] = a["month"].astype(int).map(lambda m: month_names[m - 1])
    plot = sns.catplot(data=a, x="month_name", y='show_id', kind='bar', row='country',
                       order=month_names, height=4, aspect=4, linewidth=2.5)
    plot.set_axis_labels("", "")

plt.show()

<font color="#8A2BE2"><b>Observation</b></font>
* We can see that most of these countries had their maximum amount of content added to Netflix in December.
* It looks like more content is added in the second half of the year than in the first half.

# <font color="#FF8C00">8. Content by Month by Country by Type</font>

In [None]:
month_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep' ,'Oct','Nov','Dec']

# One chart per country: titles added per month, split by content type.
for country_name in list_temp_1:
    a = temp_2[temp_2["country"] == country_name].copy()
    if a.empty:
        # Nothing to draw for a country that is absent from the filtered data.
        continue
    # Label each row with its month name and fix the category order, so a month
    # with no titles leaves an empty slot instead of shifting the tick labels.
    a["month_name"] = a["month"].astype(int).map(lambda m: month_names[m - 1])
    ax5 = sns.catplot(data=a, x="month_name", y='show_id', hue="type", kind='bar',
                      row='country', order=month_names, height=4, aspect=4, linewidth=2.5)
    ax5.set_axis_labels("", "Content Count")

plt.show()

<font color="#8A2BE2"><b>Observation</b></font>
> United States
* More TV show type content added in the second half of the year than the first half
* Most number of TV Show contents were added in September

> India
* Maximum number of movies added were in the month of December
* Maximum number of TV Show content were added in April
* The amount of TV Shows added in Jan, Feb, Sept, Nov is almost negligible. If more new contents are added in these months then Indian audiences will have much more choices.

> United Kingdom
* Maximum number of TV Show contents were added in March
* In the month of March, in United Kingdom, around 60% of added content was TV Show
* The count of TV Shows were more than Movies in the months of March, July, Aug, Sept, Oct, Nov

> Japan
* In comparison to Movies, Japan has very high amount of TV shows
* Almost every month except Sept, we can see that the amount of TV shows added is much more than Movies

> Canada
* Maximum amount of movies were added in the month of March
* Count of movies increases in the last few months

# <font color="#FF8C00">9. Which Ratings are preferred more?</font>

In [None]:
# Number of titles per rating, most common rating first.
rating_data = (
    data.groupby("rating")['show_id']
    .count()
    .to_frame()
    .reset_index()
    .sort_values(by='show_id', ascending=False)
)

In [None]:
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names; use whichever name this installation provides.
pastel_style = "seaborn-v0_8-pastel" if "seaborn-v0_8-pastel" in plt.style.available else "seaborn-pastel"
plt.style.use(pastel_style)
plt.figure(figsize=(20, 20))
# Horizontal bars: one per rating, length = number of titles.
sns.barplot(x="show_id", y="rating", data=rating_data)
plt.xlabel("")
plt.title("Count by Ratings")
plt.ylabel("")
plt.show()

<font color="#8A2BE2"><b>Observation</b></font>
* Most of the content belongs to the TV-MA rating, which marks content unsuitable for children under 17. In second position is TV-14, which marks programs unsuitable for children under 14 years of age. This makes sense because the majority of Netflix users are adults and this content is targeted at them.
* In third and fourth place we have TV-PG and R, which are either restricted for children or suitable for children only under parental guidance. So these categories are also largely aimed at adults.
* The categories with the lowest counts, like TV-G, TV-Y7, TV-Y and G, are for general audiences and children. This also makes sense because Netflix's child audience is much smaller than its adult audience.

# <font color="#FF8C00">10. Which Ratings are preferred in your Country</font>

In [None]:
# Title count for every (country, rating) pair among the top countries.
temp_3 = (
    top_countries.groupby(['country', 'rating'])['show_id']
    .count()
    .to_frame()
    .reset_index()
)

# One bar chart per country, ratings ordered from most to least frequent.
for country_name in list_temp_1:
    per_country = temp_3[temp_3["country"] == country_name]
    a = per_country.sort_values(by='show_id', ascending=False)
    ax1 = sns.catplot(
        data=a, x="rating", y="show_id",
        kind='bar', row='country',
        height=4, aspect=4, linewidth=2.5,
    )
    ax1.set_axis_labels("", "")

<font color="#8A2BE2"><b>Observation</b></font>
* We can see that the top 5 countries have most of their content in TV-MA and TV-14, which mark content unsuitable for people under 17 and under 14 respectively.
* The ratings with the lowest counts are the categories for general audiences or children.

# <font color="#FF8C00">11. Most Common words in Content Description</font>

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Descriptions of every title that has one, re-indexed from 0.
description_list = data['description'].dropna().reset_index(drop=True)

# Break each description at its commas and gather all fragments into one Series.
fragments = [piece for text in description_list for piece in text.split(",")]
list_temp = pd.Series(fragments)

In [None]:
# Default stop words plus a few frequent filler words that should not appear in the cloud.
extra_stopwords = {"turn", "one", "two", "become", "three", "take", "new", "four",
                   "must", "takes", "make", "find", "finds"}
stopwords = set(STOPWORDS) | extra_stopwords

# Build the word cloud from all description fragments joined into one text.
cloud_text = ','.join(list_temp)
wordcloud = WordCloud(stopwords=stopwords, background_color="black").generate(cloud_text)

# Render the cloud as an image without axes.
plt.figure(figsize=(20, 20))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

<font color="#8A2BE2"><b>Observation</b></font>
* In most of the descriptions we can see words like life, family, friend, world, death and crime

# <font color="#FF8C00">12. Is there any relation between Rating and Time Duration?</font>

In [None]:
# Movies that have a rating. `.copy()` makes this an independent frame, so the
# new column below is not written onto a slice of `data`.
data_movie = data.loc[data['type'] == "Movie"].dropna(subset=['rating']).copy()

# Running time as a number: take the leading digits of `duration`
# (presumably strings like "90 min" - confirm against the dataset).
# Extracting the digits does not depend on every value splitting into exactly
# two space-separated tokens; rows without a number become NaN.
data_movie['min'] = data_movie['duration'].str.extract(r'(\d+)', expand=False).astype(float)

In [None]:
# Distribution of movie running time per rating: box plot with the individual
# movies overlaid as semi-transparent points.
fig, ax = plt.subplots(figsize=(20, 20))
sns.boxplot(x='rating', y='min', data=data_movie, ax=ax)
sns.swarmplot(x='rating', y='min', data=data_movie, size=8, alpha=0.2, color=".2", ax=ax)

In [None]:
# Median running time of movies for each rating.
data_movie.groupby('rating')['min'].median()

<font color="#8A2BE2"><b>Observation</b></font>
* TV-14 category movies are usually 50 to 180 min long with a median of 105 mins
* R category movies are usually 80 to 125 min long with a median of 102 mins
* Categories like TV-Y, TV-Y7, TV-Y7-FV are usually short in length and their medians are 46, 62 and 62 minutes respectively.

# <font color="#FF8C00">13. Top Directors</font>

In [None]:
# Titles that name at least one director.
data_new = data[data['director'].notnull()]
d_list = data_new['director']

# A title can credit several comma-separated directors; count it once per director.
# Names are stripped so "A, B" yields "B" rather than " B" - otherwise the same
# person is counted under two different spellings. Blank tokens are skipped.
list_director = [
    name.strip()
    for directors in d_list
    for name in directors.split(",")
    if name.strip()
]

# `Counter` is imported in the country-count cell earlier in the notebook.
f = Counter(list_director)

# One row per director with the number of titles, largest first.
df_d = (
    pd.DataFrame(list(f.items()), columns=['Director', 'Occurences'])
    .sort_values(by='Occurences', ascending=False)
)
top_10_d = df_d.head(10)

In [None]:
# Horizontal bar chart of the ten directors with the most titles.
plt.figure(figsize=(20, 10))
sns.barplot(x="Occurences", y="Director", data=top_10_d)
plt.title("Top Directors with Content Count")
plt.xlabel("")
plt.ylabel("")
# Upper limit follows the data (largest count + 1) so no bar is clipped if the
# counts change; the axis still starts at 1 as before.
plt.xlim(1, top_10_d["Occurences"].max() + 1)
plt.show()

<font color="#8A2BE2"><b>Observation</b></font>
* The most prolific director (in terms of content present on Netflix) is Raúl Campos, with a total of 18 titles
* We can also see one Indian Director in the Top 10, i.e. David Dhawan

# <font color="#8B008B">Please Upvote if you like:)</font>
# <font color="#8B008B">More updates coming soon:)</font>

![](https://i.pinimg.com/originals/09/6f/e3/096fe3fa342dc57282992b1998370020.png)