# Netflix Visualizations 🎬🎦🍿

<img src="https://images.unsplash.com/photo-1574375927938-d5a98e8ffe85?ixid=MXwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHw%3D&ixlib=rb-1.2.1&auto=format&fit=crop&w=1049&q=80" width="400px">

**Kindly Upvote if you like the notebook and share possible improvements in the comments.**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# importing all the necessary stuff
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
!pip install chart-studio
import chart_studio.plotly as py
import cufflinks as cf
import plotly.graph_objs as go
%matplotlib inline

# Plotly offline helpers; `iplot` is what the chart cells below use to render figures inline.
from plotly.offline import download_plotlyjs, plot, init_notebook_mode, iplot
init_notebook_mode(connected=True)# initiate notebook for offline plot
# NOTE(review): presumably switches cufflinks to offline plotting as well — confirm against cufflinks docs.
cf.go_offline()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of each file it contains.
for folder, _subfolders, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#  Importing the Dataset

In [None]:
# Read the Netflix titles CSV from the Kaggle input folder, then preview the first rows.
dataset_path = "../input/netflix-shows/netflix_titles.csv"
netflix_df = pd.read_csv(dataset_path)
netflix_df.head()

Shape of the Dataset

In [None]:
# (rows, columns) of the loaded dataset.
netflix_df.shape

Let's look at the null and missing values

In [None]:
# Count the missing (null) values in each column.
netflix_df.isnull().sum()

This shows that the `director` column has the most null values, followed by `cast`, `country` and `date_added`.

# Comparison of TV Shows and Movies

In [None]:
# Split the catalogue into one frame per content type.
# Despite the "_count" suffix, both names hold filtered DataFrames, not numbers.
movies_count = netflix_df.loc[netflix_df["type"] == "Movie"]
tvshows_count = netflix_df.loc[netflix_df["type"] == "TV Show"]

In [None]:
# Bar chart of how many titles fall under each content type.
plt.figure(figsize=(10, 8))
sns.set_style("white")
sns.countplot(data=netflix_df, x="type", palette="viridis", ax=plt.gca())

**From this we can clearly see that there are far more Movies than TV Shows.**

# Movies and TV Shows Content Comparison

In [None]:
# Pie chart of the Movie vs TV Show split.
colors = ["#809fff", "#66ff66"]
slice_outline = dict(color="#2d2d2d", width=2)
type_totals = [movies_count["type"].count(), tvshows_count["type"].count()]

trace = go.Pie(
    labels=["Movies", "Tv Show"],
    values=type_totals,
    hoverinfo="label+percent",
    textinfo="label+percent",
    marker=dict(colors=colors, line=slice_outline),
)

fig = go.Figure(data=[trace])
iplot(fig)

# Content In Different Countries

In [None]:
# Number of titles for each distinct value of the `country` column.
netflix_df_counts = netflix_df["country"].value_counts()
netflix_df_counts

In [None]:
# Bar chart of the countries with the most titles.
# Slice the counts ONCE so x, y and the colour scale always have the same
# length (previously x was cut to 15 entries while y carried the whole series).
top_countries = netflix_df_counts.head(15)

trace = go.Bar(
    x=top_countries.index,
    y=top_countries,
    marker=dict(
        opacity=0.8,
        # One colour step per bar; stays correct even if fewer than 15 countries exist.
        color=np.arange(len(top_countries)),
    ),
)
fig = go.Figure(data=[trace])

fig.update_layout(title="Top 15 Countries by Content")
fig.update_xaxes(title="Country")
fig.update_yaxes(title="Count")

iplot(fig)

**We can observe that Netflix produces most of its content in the USA, followed by India and the UK.**

# Movies released by directors

We have to drop the rows with null values in the `director` column, as they might affect the pie plot.

In [None]:
# Keep only the titles that have a director listed.
netflix_df_director = netflix_df[netflix_df["director"].notna()]

In [None]:
# Number of titles for each distinct value of the `director` column.
netflix_director_counts = netflix_df_director["director"].value_counts()
netflix_director_counts

In [None]:
# Pie chart of the ten directors with the most titles.
# Slice the counts ONCE so labels and values line up one-to-one (previously
# labels were cut to 10 entries while values carried the whole series).
# The percentages shown are therefore shares among these ten directors only.
top_directors = netflix_director_counts.head(10)

trace = go.Pie(
    labels=top_directors.index,
    values=top_directors,
    hoverinfo="label+percent",
    textinfo="label",
)

fig = go.Figure(data=[trace])
fig.update_layout(title="Director by % Content")

iplot(fig)

# Year wise content released on netflix

In [None]:
# The 20 release years with the most titles, in descending order of title count.
year_wise_content = netflix_df["release_year"].value_counts().head(20).index
year_wise_content

In [None]:
# Horizontal bars: number of titles per release year, restricted to and ordered by `year_wise_content`.
plt.figure(figsize=(12, 10))
sns.set_style("dark")
sns.countplot(
    y="release_year",
    data=netflix_df,
    order=year_wise_content,
    palette="viridis",
    ax=plt.gca(),
)

# Average Movie Duration

First convert all the duration string values to integer

In [None]:
# Convert the "<N> min" duration strings into numbers.
# Work on an explicit copy: `movies_count` was produced by filtering `netflix_df`,
# and assigning into such a slice triggers pandas' SettingWithCopyWarning.
movies_count = movies_count.copy()

# `astype(str)` first makes the cell safe to re-run after the column is already numeric.
duration_text = movies_count['duration'].astype(str).str.replace(' min', '', regex=False)

# `errors="coerce"` turns missing or malformed durations into NaN instead of raising;
# the column stays integer-typed whenever every value parses cleanly.
movies_count['duration'] = pd.to_numeric(duration_text, errors='coerce')
movies_count['duration']

In [None]:
# movie duration distribution
plt.figure(figsize=(10,8))
sns.set_style("darkgrid")
sns.kdeplot(data=movies_count["duration"], shade=True)

**We can observe that most movies on Netflix have a duration of 80-100 minutes.**

# Content Produced On The Basis Of Ratings

In [None]:
# Bar chart of the ten most common ratings.
# Compute the counts once and slice them once, so x, y and the colour scale
# always have the same length (previously x was cut to 10 entries while y
# carried the whole series, and value_counts() was evaluated twice).
top_ratings = netflix_df["rating"].value_counts().head(10)

trace = go.Bar(
    x=top_ratings.index,
    y=top_ratings,
    marker=dict(
        opacity=0.8,
        # One colour step per bar; stays correct even if fewer than 10 ratings exist.
        color=np.arange(len(top_ratings)),
    ),
)
fig = go.Figure(data=[trace])

fig.update_layout(title="Maximum Content By Ratings")
fig.update_xaxes(title="Ratings")
fig.update_yaxes(title="Content Count")

iplot(fig)

**We can observe that the most common rating is TV-MA, i.e. *mature content* intended for adult audiences (17+).**

# Indian Content On Netflix

First filter the country - India

In [None]:
# Rows whose `country` value is exactly "India", followed by a preview of the first rows.
netflix_india = netflix_df.loc[netflix_df["country"] == "India"]
netflix_india.head()

**We can observe the shift in the content produced by Netflix in India from 2000 to 2020.**

In [None]:
# Horizontal bars: Indian titles per release year, limited to the 20 years with the most titles.
plt.figure(figsize=(12, 10))
sns.set_style("dark")
busiest_years_india = netflix_india["release_year"].value_counts().head(20).index
sns.countplot(
    y="release_year",
    data=netflix_india,
    order=busiest_years_india,
    palette="viridis",
    ax=plt.gca(),
)