In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 style="font-family:cursive;"> <center>  Comic Style Plots using Matplotlib XKCD 📊</center> </h1>

<p style="font-size:15px; font-family:verdana; line-height: 1.7em">Matplotlib is the excellent workhorse plotting library for Python. As great as Matplotlib is, it could occasionally do with a little stylistic improvement. I recently came across a cool feature in Matplotlib that lets you plot in the style of the popular XKCD comics, with a fun font and a more ‘sketchy’ line style. Using xkcd-style plots is not just for fun: you can use the xkcd style when you want to emphasise the uncertainty in your modelling or analysis. I got to know about it from <a href="https://community.dataquest.io/t/how-to-make-comical-visualizations-explained-using-netflix-movie-and-tv-show-dataset/553826">here</a>.</p><br>

<h1 style="font-family:cursive;"> <center> NETFLIX EDA</center> </h1>

In [None]:

import matplotlib.pyplot as plt
# Raise the default figure resolution to 200 DPI for every plot in the notebook.
plt.rcParams.update({'figure.dpi': 200})

# Read the Netflix titles CSV into `df` and preview the first rows.
netflix_csv = "../input/netflix-shows/netflix_titles.csv"
df = pd.read_csv(netflix_csv)
df.head()

In [None]:
# Summary statistics of df; with its default arguments describe() reports
# only the numeric columns of a mixed-type frame.
df.describe()

**Missing Values**

In [None]:
# Number of missing entries in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# Impute each of these columns with ITS OWN most frequent value.
# The previous version filled missing `rating` entries with the mode of
# `country`, which injected a country name into the rating column.
for column in ('country', 'date_added', 'rating'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [None]:
#df.drop(['cast','director'],axis='columns',inplace=True)

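The missing values in `country`, `date_added` and `rating` have been filled with the most frequent value of each column. `director` and `cast` are kept as they are and still contain missing values; these are replaced with empty strings (`fillna("")`) in the cells that use those columns.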

In [None]:
# --- Date features -----------------------------------------------------------
# Parse `date_added` and derive the year and month in which each title was added.
date_added = df['date_added']
if not pd.api.types.is_datetime64_any_dtype(date_added):
    # Still text: strip stray leading/trailing spaces so padding cannot break
    # date parsing. (Skipped when the column is already datetime, e.g. on re-run.)
    date_added = date_added.str.strip()
df["date_added"] = pd.to_datetime(date_added)
df['year_added'] = df['date_added'].dt.year.astype('Int64')
df['month_added'] = df['date_added'].dt.month

# --- Duration features -------------------------------------------------------
# `duration` holds text whose first space-separated token is a number.
# Rows mentioning "Season" move that number to `season_count` and get an empty
# `duration`; all other rows keep the number in `duration` and get an empty
# `season_count`. Missing durations become "" in both columns instead of
# raising a TypeError as the row-wise apply did.
# NOTE: this step is not idempotent - once `duration` has been reduced to its
# number, re-running the cell finds no "Season" text and blanks `season_count`.
raw_duration = df['duration'].fillna("")
is_seasonal = raw_duration.str.contains("Season", regex=False)
first_token = raw_duration.str.split(" ").str[0]
df['season_count'] = first_token.where(is_seasonal, "")
df['duration'] = first_token.where(~is_seasonal, "")
df.head()

<h1 style="font-family:cursive;"> <center> Exploratory Data Analysis</center> </h1>

To XKCDify plots in Matplotlib, we just need to wrap all our plotting code in the following block:

<code>with plt.xkcd():</code>

**Netflix through the years**


**Movies vs TV Shows**

In [None]:
col = "type"
# Count the titles of each type. The column names are set explicitly so the
# frame always has the columns `type` and `count`, whichever default names
# value_counts().reset_index() produces in the installed pandas version
# (the old rename map relied on an "index" column that newer pandas no
# longer creates, which left no `type` column to plot).
grouped = df[col].value_counts().rename_axis(col).reset_index(name="count")

with plt.xkcd():
    # Pull out only the second slice (the second most frequent type). The list
    # is sized to the number of slices so pie() never receives an `explode`
    # sequence of the wrong length.
    explode = [0.1 if position == 1 else 0 for position in range(len(grouped))]

    fig1, ax1 = plt.subplots(figsize=(5, 5), dpi=100)
    ax1.pie(grouped["count"], explode=explode, labels=grouped[col], autopct='%1.1f%%',
            shadow=True, startangle=90)
    ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

    plt.show()

In [None]:
from collections import Counter
col = "country"

# A title can list several countries separated by ", ": glue every entry into
# one long string and split it again so each country is tallied on its own.
categories = ", ".join(df[col].fillna("")).split(", ")
# Keep the 25 most frequent names, then discard the empty-string bucket (if any).
counter_list = [pair for pair in Counter(categories).most_common(25) if pair[0] != ""]
labels = [name for name, _ in counter_list]
values = [count for _, count in counter_list]


with plt.xkcd():
    fig, ax = plt.subplots(figsize=(10, 10), dpi=100)
    y_pos = np.arange(len(labels))
    ax.barh(y_pos, values, align='center', color='red')
    # Put the country names on the y axis, largest count at the top.
    ax.set_yticks(y_pos)
    ax.set_yticklabels(labels)
    ax.invert_yaxis()
    ax.set(xlabel='Content', title='Countries with most content')

    plt.show()

The vast majority of content on Netflix is from the United States.
Even though Netflix launched quite late in India (in 2016), India is already in second position, right after the US. So, India is a big market for Netflix.

In [None]:
from collections import Counter
from matplotlib.pyplot import figure
import math

# Bar colour and country for each panel; both lists are reused by later cells.
colours = ["orangered", "mediumseagreen", "darkturquoise", "mediumpurple", "deeppink", "indianred"]
countries_list = ["United States", "India", "United Kingdom", "Japan", "France", "Canada"]
col = "director"  # was misspelled `cold` and never used

# One horizontal bar chart per country showing its most frequent directors.
with plt.xkcd():
    fig = plt.figure(figsize=(20, 8))
    # Three panels per row; the row count follows the number of countries.
    n_rows = max(1, math.ceil(len(countries_list) / 3))
    for position, country in enumerate(countries_list, start=1):
        # NOTE(review): exact match on `country`, so titles that list several
        # countries are not counted for any of them here.
        country_df = df[df["country"] == country]
        # A title may credit several directors separated by ", "; count each one.
        categories = ", ".join(country_df[col].fillna("")).split(", ")
        # Top 6 names, minus the empty-string bucket created by missing directors.
        counter_list = [pair for pair in Counter(categories).most_common(6) if pair[0] != ""]
        # Reversed so the most frequent director is drawn at the top of the panel.
        labels = [name for name, _ in counter_list][::-1]
        values = [count for _, count in counter_list][::-1]

        ax = fig.add_subplot(n_rows, 3, position)
        ax.set_title(country)
        if not values:
            # No director data for this country: keep the titled, empty panel
            # instead of crashing on max() of an empty list.
            continue
        # Integer ticks: every unit for small counts, every second unit otherwise.
        tick_step = 1 if max(values) < 10 else 2
        # Cycle through the palette so extra countries cannot overflow it.
        ax.barh(labels, values, color=colours[(position - 1) % len(colours)])
        ax.set_xticks(list(range(0, max(values) + 1, tick_step)))
    fig.suptitle('Popular Directors with the most content')
    fig.tight_layout()
    plt.show()

In [None]:
import seaborn as sns
# Number of titles per release year (index: year, value: count).
release_counts = df.release_year.value_counts()
with plt.xkcd():
    fig, ax = plt.subplots(figsize=(16, 6))
    sns.lineplot(x=release_counts.index, y=release_counts.values, ax=ax)
    # Label the axes after plotting so these labels are the ones displayed,
    # and give the figure a y label and title so it can be read on its own.
    ax.set_xlabel("Year")
    ax.set_ylabel("Number of titles")
    ax.set_title("Titles by release year")
    plt.show()

In [None]:
col = "cast"

# One horizontal bar chart per country showing its most frequent cast members.
with plt.xkcd():
    fig = plt.figure(figsize=(20, 8))
    # Three panels per row; the row count follows the number of countries.
    n_rows = max(1, math.ceil(len(countries_list) / 3))
    for position, country in enumerate(countries_list, start=1):
        # Case-insensitive substring match, so a title listing several countries
        # is counted for each of them. The mask stays local: the old code wrote a
        # scratch `from_country` column into df on every iteration and left it there.
        from_country = df["country"].fillna("").str.contains(country, case=False, regex=False)
        small = df[from_country]
        # A title lists its cast separated by ", "; count each person separately.
        cast = ", ".join(small[col].fillna("")).split(", ")
        # Top 11 names, minus the empty-string bucket created by missing casts.
        tags = [pair for pair in Counter(cast).most_common(11) if pair[0] != ""]
        # Reversed so the most frequent name is drawn at the top; the trailing
        # spaces pad the tick labels away from the axis.
        labels = [name + "  " for name, _ in tags][::-1]
        values = [count for _, count in tags][::-1]

        ax = fig.add_subplot(n_rows, 3, position)
        ax.set_title(country)
        if not values:
            # No cast data for this country: keep the titled, empty panel
            # instead of crashing on max() of an empty list.
            continue
        # Integer ticks whose spacing grows with the largest count.
        largest = max(values)
        if largest < 10:
            tick_step = 1
        elif largest <= 20:
            tick_step = 2
        else:
            tick_step = 5
        # Cycle through the palette so extra countries cannot overflow it.
        ax.barh(labels, values, color=colours[(position - 1) % len(colours)])
        ax.set_xticks(list(range(0, largest + 1, tick_step)))
    fig.suptitle('Popular Actors with the most content')
    fig.tight_layout()
    plt.show()

In [None]:
# Number of titles carrying each rating, most frequent first.
rating_counts = df['rating'].value_counts()
with plt.xkcd():
    fig, ax = plt.subplots(figsize=(16, 10))
    ax.set_xlabel("Rating")
    ax.set_ylabel("Count")
    # One bar per rating, drawn on the axes prepared above.
    sns.barplot(x=rating_counts.index, y=rating_counts.values, ax=ax)