In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names_in_folder)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Netflix Data Analysis & Visualization

![](http://assets3.thrillist.com/v1/image/2872430/1200x630/flatten;crop_down;jpeg_quality=70)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.text import Text
from matplotlib.lines import Line2D
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.patches import Rectangle, Polygon
import seaborn as sns

import warnings
# Suppress every Python warning for the rest of the session. This keeps cell
# output clean, but it also hides deprecation and data-handling warnings
# raised by the libraries used in the cells that follow.
warnings.filterwarnings("ignore")

### Load the Data

**The dataset has 7787 rows and 12 columns:**
* show_id: unique id of each show (not much of a use for us in this notebook)
* type: The category of a show, can be either a Movie or a TV Show
* title: Name of the show
* director: Name of the director(s) of the show
* cast: Name of actors and other cast of the show
* country: Country (or countries) where the show was produced
* date_added: Date when the show was added on Netflix
* release_year: Release year of the show
* rating: Show rating on netflix
* duration: Time duration of the show
* listed_in: Genre of the show
* description: Some text describing the show

In [None]:
# Load the Netflix titles CSV into the notebook-wide frame `df`,
# then preview the first rows as the cell output.
netflix_csv_path = '../input/netflix-shows/netflix_titles.csv'
df = pd.read_csv(netflix_csv_path)
df.head(5)

## Number of Movies vs. TV Shows on Netflix

In [None]:
# Bar chart with one bar per value of the `type` column.
plt.figure(figsize=(6, 6))
# The style is set after the figure exists but before the axes are created,
# so this statement order is kept on purpose.
sns.set(style="white")
ax = sns.countplot(
    data=df,
    x="type",
    palette="Set2",
)

In [None]:
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot

# Split the catalogue by the `type` column; despite their names these two
# variables hold the filtered frames, not the counts.
is_movie = df["type"] == "Movie"
is_tvshow = df["type"] == "TV Show"
movies_count = df[is_movie]
tvshows_count = df[is_tvshow]

# Slice sizes, in the same order as the labels passed to the pie below.
slice_sizes = [
    movies_count["type"].count(),
    tvshows_count["type"].count(),
]
slice_style = dict(
    colors=["#6ad49b", "#a678de"],
    line=dict(color="#2d2d2d", width=2),
)

trace = go.Pie(
    labels=["Movies", "Tv Show"],
    values=slice_sizes,
    textinfo="label+percent",
    marker=slice_style,
)

fig = go.Figure(data=[trace])
iplot(fig)

## Content Comparison by Country

In [None]:
# Number of titles per distinct value of the `country` column, most frequent first.
country_counts = df["country"].value_counts()

# Take labels and values from the SAME 15-row slice so the two arrays always
# line up one-to-one (labels alone were truncated before, values were not).
top_countries = country_counts.head(15)

pie1 = go.Pie(labels=top_countries.index, values=top_countries.values,
              hoverinfo="label+percent", textinfo="label")

fig = go.Figure(data=[pie1])
# Percentages on hover are computed from the 15 values passed above,
# i.e. they are shares within the top 15, not of the whole catalogue.
fig.update_layout(title="Top 15 Country by % Content")

iplot(fig)

## Director comparison by the number of movies

In [None]:
# Titles per director, most frequent first.
director_counts = df["director"].value_counts()
top_directors = director_counts.iloc[:15]

plt.figure(figsize=(12, 6))
plt.title("Top 15 director with the most films", fontsize=18)
# Horizontal bars: counts along x, director names along y.
sns.barplot(x=top_directors, y=top_directors.index)

# Axis labels (x = count, y = director)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Director", fontsize=14);

## Content comparison by release year

In [None]:
# The 20 release years with the most titles, ordered by descending count.
titles_per_year = df["release_year"].value_counts()
netflix_year = titles_per_year.index[:20]

plt.figure(figsize=(12, 6))
plt.title("Number of relased movie/show by years", fontsize=18)
# One horizontal bar per year, restricted to and ordered by `netflix_year`.
sns.countplot(y="release_year", order=netflix_year, data=df)

# Axis labels (x = count, y = year)
plt.xlabel("Count", fontsize=14)
plt.ylabel("Year", fontsize=14);

In [None]:
# Split the catalogue by type; `TVshows` and `Movie` are reused by later cells.
TVshows = df.loc[df['type'] == 'TV Show']
Movie = df.loc[df['type'] == 'Movie']

# Titles per release year, in chronological order.
TVshows_progress = TVshows['release_year'].value_counts().sort_index()
Movie_progress = Movie['release_year'].value_counts().sort_index()

plt.figure(figsize=(12, 6))

# One line per content type.
for yearly_counts, line_label in (
    (TVshows_progress, 'TV shows'),
    (Movie_progress, 'Movie'),
):
    plt.plot(yearly_counts.index, yearly_counts.values, label=line_label)

# Mark the 2019-2021 window: dashed edges plus a shaded band.
for edge_year in (2019, 2021):
    plt.axvline(edge_year, alpha=0.3, linestyle='--', color='r')
plt.axvspan(2019, 2021, alpha=0.2, color='r', label='Coronavirus')

plt.xticks(list(range(1925, 2022, 5)), fontsize=12)
plt.title('Content growth throughout history', fontsize=18)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Amount of content', fontsize=14)
plt.yticks(fontsize=12)
plt.legend()
plt.show()

#Copied this code from "radmirzosimov" 

In [None]:
# Titles released during the three pandemic years 2019, 2020 and 2021.
year_corona = df[df['release_year'].isin([2019, 2020, 2021])]

# Count rows with len(): DataFrame.value_counts() leaves out every row that
# contains a missing value in any column, so summing it undercounts titles
# that lack e.g. a director, cast or country.
year_corona_counts = len(year_corona)
print("Number of movie/show relased on Corona years (2019-2021):", year_corona_counts)

## Content Produced On The Basis Of Ratings

In [None]:
# Show the 14 most frequent ratings, ordered by descending frequency.
rating_order = df['rating'].value_counts().index[0:14]

plt.figure(figsize=(12, 6))
sns.countplot(data=df, x="rating", order=rating_order);
plt.title('Content rating of shows', fontsize=18)
plt.xlabel('Ratings', fontsize=14)
plt.ylabel('Amount of content', fontsize=14);

* TV-MA: This program is intended to be viewed by mature, adult audiences and may be unsuitable for children under 17.
* TV-14: This program may be unsuitable for children under 14 years of age.
* TV-PG: This program contains material that parents may find unsuitable for younger children. Parental guidance is recommended.
* R: May be unsuitable for children under the age of 17 (under 17 requires an accompanying parent or adult guardian).
* TV-Y: This program is aimed at a very young audience, including children from ages 2–6.
* TV-PG: This program contains material that parents may find unsuitable for younger children. Parental guidance is recommended.
* TV-Y7: This program is most appropriate for children age 7 and up.
* TV-G: This program is suitable for all ages.

## Movie Durations

In [None]:
# Convert movie durations such as "90 min" into integer minutes.
# - assign() builds a new frame instead of writing into a filtered slice of
#   `df`, so no chained-assignment warning is triggered.
# - astype(str) comes first so the cell can be re-run safely: once the column
#   is already int, the .str accessor would otherwise raise.
# - regex=False treats ' min' as a literal substring.
Movie = Movie.assign(
    duration=Movie['duration']
    .astype(str)
    .str.replace(' min', '', regex=False)
    .astype(int)
)

In [None]:
# Kernel density estimate of movie running times (the `duration` column of `Movie`).
plt.figure(figsize=(12, 6))
sns.set_style("white")
# `fill=True` is the current spelling of the deprecated `shade=True`.
sns.kdeplot(data=Movie['duration'], fill=True)
plt.title('Avarage Movie Duration', fontsize=18)
plt.xlabel('Duration', fontsize=14)
plt.ylabel('Density', fontsize=14);

In [None]:
# Sort once by running time (ascending) and take both ends of the ranking.
# Both results stay in ascending order of duration.
movies_by_duration = Movie.sort_values('duration')[['title', 'duration']]
movie_shortest = movies_by_duration.head(20)
movie_longest = movies_by_duration.tail(20)

In [None]:
# Horizontal bar chart: one bar per title in `movie_shortest`, bar length = duration.
plt.figure(figsize=(14, 7))
plt.title('Top 20 shortest movies available on Netflix', fontsize=18)
plt.tick_params(labelsize=14)
sns.barplot(data=movie_shortest, x='duration', y='title', alpha=0.6)

plt.show()

In [None]:
# Horizontal bar chart: one bar per title in `movie_longest`, bar length = duration.
plt.figure(figsize=(14, 7))
plt.title('Top 20 longest movies available on Netflix', fontsize=18)
plt.tick_params(labelsize=14)
sns.barplot(data=movie_longest, x='duration', y='title', alpha=0.6)

plt.show()

## TV Show Seasons

In [None]:
# Number of TV shows per `duration` label. sort_index() orders the labels as
# strings, not by numeric season count.
tv_duration = TVshows["duration"].value_counts().sort_index()

# Take labels and values from the same slice (at most the first 20 entries)
# so the two arrays passed to the pie always have matching lengths.
shown_durations = tv_duration.iloc[:20]

pie2 = go.Pie(labels=shown_durations.index, values=shown_durations.values,
              hoverinfo="label+percent", textinfo="label")

fig = go.Figure(data=[pie2])
fig.update_layout(title="TV Show Durations")

iplot(fig)

### TV shows with 10 or more seasons

In [None]:
# Extract the leading number from each duration label (e.g. "12 Seasons" -> 12)
# and compare numerically, so any season count >= 10 is matched rather than
# only a fixed list of labels. Labels with no digits become NaN and are
# excluded by the comparison.
season_counts = TVshows["duration"].str.extract(r"(\d+)", expand=False).astype(float)
more_10 = TVshows[season_counts >= 10]
print("TV shows that last 10 or more than 10 seasons:", len(more_10))

In [None]:
import plotly.graph_objects as go

# Two-column table listing the title and duration of every row in `more_10`.
table_header = dict(values=['Title', 'duration'])
table_cells = dict(
    values=[more_10['title'], more_10['duration']],
    fill_color='lavender',
)

fig = go.Figure(data=[go.Table(header=table_header, cells=table_cells)])
fig.show()

## Turkish Content on Netflix 

A quick look at Turkish content

In [None]:
# Keep only rows whose `country` value is exactly "Turkey"; values that list
# Turkey together with other text do not match this equality test.
is_turkey = df["country"] == "Turkey"
netflix_tr = df[is_turkey]
netflix_tr.head(5)

In [None]:
# Word cloud of the cast names appearing in the Turkish titles
from wordcloud import WordCloud

# Drop missing cast entries first: str.join() raises TypeError if any element
# is NaN instead of a string.
cast_text = " ".join(netflix_tr["cast"].dropna())
wordcloud = WordCloud(background_color="white", width=1730, height=970).generate(cast_text)

plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()