# *Netflix: India🇮🇳  vs The World🌎*



# Important Insights


**1. As expected, India (largely through Bollywood) is a major content producer, with the 2nd-highest number of titles after the USA.**

**2. India makes a lot more movies compared to TV series, while countries like Japan, South Korea, and Taiwan have comparatively more TV series, maybe due to the large production of Anime.**

**3. Overall, the number of TV series is increasing at a faster rate than the number of movies.**

**4. In India the content is dominated by the TV-14 rating, while the rest of the world has a more even spread across ratings (this may point to less diverse content in India compared with the rest of the world).**

**5. From the variance present in release months, we can guess that in India the focus seems to be releasing the content during the festival period.**

**6. Indian movies are considerably longer than movies from the rest of the world.**

**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.subplots import make_subplots

**Capturing Indian Data**

In [None]:
# Load the Netflix catalogue and derive the helper columns used by the sections below.
df = pd.read_csv("../input/netflix-shows/netflix_titles.csv")

# Strip surrounding whitespace defensively so every date string has the same shape
# before parsing; missing dates stay NaT.
df["date_added"] = pd.to_datetime(df["date_added"].str.strip())
df["Month"] = df["date_added"].dt.month

# The raw `duration` text starts with a number. When it mentions "Season" that number
# is a season count; otherwise it is the running time (labelled as minutes by the
# charts further down). Split it into two numeric columns, using NaN - rather than an
# empty string - where a value does not apply or is missing, so both columns stay numeric.
raw_duration = df["duration"]
leading_number = pd.to_numeric(
    raw_duration.str.extract(r"^\s*(\d+)", expand=False), errors="coerce"
)
is_season_count = raw_duration.str.contains("Season", regex=False, na=False)
df["season"] = leading_number.where(is_season_count)
df["duration"] = leading_number.where(~is_season_count)

# Only titles whose `country` field is exactly "India" are labelled "India";
# co-productions and rows without a country fall under "Rest Of the World".
df["Country_type"] = np.where(df["country"] == "India", "India", "Rest Of the World")

In [None]:
# Maturity ratings grouped by the audience age bucket they are mapped to.
_ratings_by_age_group = {
    ' 0+ (ALL)': ['TV-Y', 'TV-G', 'G'],
    ' 7+': ['TV-Y7-FV', 'TV-Y7'],
    '12-14+': ['TV-PG', 'TV-14', 'PG-13', 'PG'],
    '18+': ['TV-MA', 'R', 'NR', 'UR', 'NC-17'],
}

# Flatten to the rating -> age bucket lookup.
age_ratings = {
    rating: age_group
    for age_group, ratings in _ratings_by_age_group.items()
    for rating in ratings
}

# `replace` leaves any rating that is not in the lookup (and missing values) untouched.
df['age_ratings'] = df['rating'].replace(age_ratings)

In [None]:
# Split the catalogue: rows whose `country` is exactly "India" vs. every other row.
is_india_only = df['country'].eq('India')
Ind_data = df.loc[is_india_only]
World = df.loc[~is_india_only]

In [None]:
# Preview the first rows of the India-only subset.
Ind_data.head(n=5)

**Segregating content country wise**

In [None]:
# Number of titles for every (country string, content type) pair.
# A country string may still list several comma-separated countries at this point.
all_countries = (
    df.groupby(['country', 'type'])['show_id']
      .count()
      .reset_index()
)
all_countries.head()

# **1. Country Wise Content Analysis**

In [None]:
# Tally titles per individual country. A `country` cell may list several
# comma-separated countries, so a co-production counts once for each of them.
country_count = {}
for countries, n_titles in zip(all_countries['country'], all_countries['show_id']):
    for name in countries.split(', '):
        # Remove stray punctuation (e.g. a dangling comma) and surrounding blanks.
        name = re.sub('[^A-Za-z0-9 ]+', '', name).strip()
        if not name:
            # An empty fragment is not a country; do not count it as one.
            continue
        country_count[name] = country_count.get(name, 0) + n_titles
country_df = pd.DataFrame(list(country_count.items()), columns=['country', 'count'])

country_ranking = country_df.sort_values(by=['count'], ascending=False)

# Bar chart of the ten countries with the most titles.
d = country_ranking.head(10)
fig = px.bar(d, x='country', y='count')
fig.update_traces(marker_color='#221F1F', marker_line_color='#E50914',
                  marker_line_width=2, opacity=1)
fig.update_layout(title='Content produced country wise')
fig.show()

# Names of the 30 countries with the most titles; reused by later cells.
top_30 = country_ranking['country'].head(30)

In [None]:
sns.set_theme()

# World map coloured by the number of titles attributed to each country.
trace = go.Choropleth(
    locations=list(country_count),
    locationmode='country names',
    z=list(country_count.values()),
    text=country_df['country'],
    reversescale=False,
    zauto=True,
    colorscale='RdBu',
    marker={'line': {'color': 'rgb(0,0,0)', 'width': 0.5}},
    colorbar={'title': 'Total Content', 'tickprefix': ''},
)

data = [trace]
layout = go.Layout(
    title='Total content per country',
    geo={
        'showframe': True,
        'showlakes': False,
        'showcoastlines': True,
    },
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Tally titles per (content type, individual country). As in the overall tally,
# a title listing several countries counts once for each of them.
country_count_t = {'Movie': {}, 'TV Show': {}}
for countries, content_type, n_titles in zip(
        all_countries['country'], all_countries['type'], all_countries['show_id']):
    # setdefault keeps the loop working even if an unexpected content type shows up.
    per_country = country_count_t.setdefault(content_type, {})
    for name in countries.split(', '):
        # Remove stray punctuation (e.g. a dangling comma) and surrounding blanks.
        name = re.sub('[^A-Za-z0-9 ]+', '', name).strip()
        if not name:
            # An empty fragment is not a country; do not count it as one.
            continue
        per_country[name] = per_country.get(name, 0) + n_titles

# Long-format table: one row per (type, country) with its title count.
country_df_t = pd.DataFrame(
    [
        (content_type, name, n_titles)
        for content_type, per_country in country_count_t.items()
        for name, n_titles in per_country.items()
    ],
    columns=['type', 'country', 'count'],
)

In [None]:
# Treemap: outer boxes are content types, inner boxes are countries sized by title count.
fig = px.treemap(
    country_df_t,
    path=['type', 'country'],
    values='count',
    title="Total content per country, grouped by type of content",
)
fig.show()

**As one can see, India makes far more movies than TV series,
while countries like Japan, South Korea, and Taiwan have comparatively more TV series, maybe due to their large production of anime.**

# **2. Content type distribution analysis**

In [None]:
# Side-by-side counts of movies and TV shows for India vs. the rest of the world.
country_palette = {'Rest Of the World': '#221F1F', 'India': '#E50914'}
fig = px.histogram(
    df,
    x='type',
    color='Country_type',
    barmode='group',
    color_discrete_map=country_palette,
)
fig.update_layout(title='Content type distribution India vs World')
fig.show()

In [None]:
# Movie share of the whole catalogue, used as the "World" reference value.
# groupby sorts its keys, so position 0 is taken as the movie count and position 1
# as the TV-show count (assumes 'Movie' and 'TV Show' are the only types).
type_sizes = df.groupby('type').size().tolist()
movie_total, tv_total = type_sizes[0], type_sizes[1]

# Filled by the next cell with the per-country movie share.
movie_tv_ratio = {}

world_d = movie_total / (movie_total + tv_total)
print("World: ")
print("No. of Movie/Total_content ratio = ", world_d)

In [None]:
# Movie share (movies / all titles) for the 30 countries with the most titles.
# Countries are ranked by their combined movie + TV-show count: `country_df_t` has one
# row per (type, country), so taking its first 30 rows would yield fewer than 30 countries.
movie_tv_ratio = {}  # reset so that re-running this cell gives the same result
totals_by_country = country_df_t.groupby('country')['count'].sum()
for country in totals_by_country.nlargest(30).index:
    rows = country_df_t[country_df_t['country'] == country]
    # .sum() yields 0 when the country has no row of that type.
    movies = int(rows.loc[rows['type'] == 'Movie', 'count'].sum())
    shows = int(rows.loc[rows['type'] == 'TV Show', 'count'].sum())
    movie_tv_ratio[country] = movies / (movies + shows)

print("Top 30 countries No. of movie/Total_content ratio")
movie_tv_ratio

In [None]:
# Bar chart of each country's movie share, with the overall ("World") share as a
# divider: countries at or above it are drawn first, countries below it last.
# The split uses the computed `world_d` rather than a hard-coded copy of its value,
# so it stays correct when the data changes and no country is dropped on a tie.
above_world = sorted(
    ((country, share) for country, share in movie_tv_ratio.items() if share >= world_d),
    key=lambda item: item[1], reverse=True,
)
below_world = sorted(
    ((country, share) for country, share in movie_tv_ratio.items() if share < world_d),
    key=lambda item: item[1], reverse=True,
)

# Highlight India among the countries above the world share.
colors = ['firebrick' if country == 'India' else 'grey' for country, _ in above_world]

f, ax = plt.subplots(figsize=(16, 8))
ax.bar([country for country, _ in above_world], [share for _, share in above_world], color=colors)
ax.bar(['World'], [world_d], color='black')
ax.bar([country for country, _ in below_world], [share for _, share in below_world], color='white')
ax.set_xlabel("Countries")
ax.set_ylabel("Movie Percent among total content")
ax.set_title("Percentage of movie for each country")
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Stacked movie / TV-show counts for the 30 countries with the most titles.
in_top_30 = country_df_t['country'].isin(top_30)
p_df = country_df_t[in_top_30]
fig = px.bar(
    p_df,
    x="country",
    y="count",
    color="type",
    hover_data=['count'],
    barmode='stack',
)
fig.show()

In [None]:
# Titles per release year (after 2009) and content type, rest of the world.
recent_world = World.loc[World['release_year'] > 2009]
wd_yr = (
    recent_world.groupby(['release_year', 'type'])['show_id']
                .count()
                .reset_index()
)
fig = px.area(
    wd_yr,
    x="release_year",
    y="show_id",
    color="type",
    color_discrete_sequence=['rgb(0,0,0)', 'rgb(115,115,115)'],
)
layout_labels = dict(
    title="World Movie and TV show count corresponding to year",
    yaxis_title="Total Content",
    xaxis_title="Year Wise",
    legend_title="Type of content",
)
fig.update_layout(**layout_labels)
fig.show()

In [None]:
# Titles per release year (after 2009) and content type, India only.
recent_india = Ind_data.loc[Ind_data['release_year'] > 2009]
ind_yr = (
    recent_india.groupby(['release_year', 'type'])['show_id']
                .count()
                .reset_index()
)
fig = px.area(
    ind_yr,
    x="release_year",
    y="show_id",
    color="type",
    color_discrete_sequence=['rgb(103,0,13)', 'rgb(203,24,29)'],
)
layout_labels = dict(
    title="India Movie and TV show count corresponding to year",
    yaxis_title="Total Content",
    xaxis_title="Year Wise",
    legend_title="Type of content",
)
fig.update_layout(**layout_labels)
fig.show()

In [None]:
# Titles per release year, content type and region, for release years strictly
# between 2009 and 2021; one facet per region.
in_window = df['release_year'].gt(2009) & df['release_year'].lt(2021)
ind_yr = (
    df.loc[in_window]
      .groupby(['release_year', 'type', 'Country_type'])['show_id']
      .count()
      .reset_index()
)
fig = px.area(
    ind_yr,
    x="release_year",
    y="show_id",
    color="type",
    color_discrete_sequence=['rgb(203,24,29)', 'rgb(0,0,0)'],
    facet_col='Country_type',
)
layout_labels = dict(
    title="Movie and TV show count corresponding to year",
    yaxis_title="Total Content",
    xaxis_title="Year Wise",
    legend_title="Type of content",
)
fig.update_layout(**layout_labels)
fig.show()

**TV Show productions are increasing at a quicker rate as compared to Movies**

# **3. Ratings comparison**

In [None]:
# Title counts per (rating, age group), first for India and then for the rest of the
# world. `idd` and `wd` are reused by the comparison chart two cells below.
rating_keys = ['rating', 'age_ratings']

idd = (
    Ind_data.dropna(subset=['age_ratings'])
            .groupby(rating_keys)['show_id']
            .count()
            .reset_index(name='count')
)
india_reds = ['rgb(252,187,161)', 'rgb(251,106,74)', 'rgb(203,24,29)', 'rgb(103,0,13)']
fig = px.bar(
    idd.sort_values('age_ratings'),
    x='rating',
    y='count',
    color='age_ratings',
    color_discrete_sequence=india_reds,
)
fig.update_layout(title="India Ratings distribution")
fig.show()

wd = (
    World.dropna(subset=['age_ratings'])
         .groupby(rating_keys)['show_id']
         .count()
         .reset_index(name='count')
)
world_greys = ['rgb(255,255,255)', 'rgb(217,217,217)', 'rgb(115,115,115)', 'rgb(0,0,0)']
fig = px.bar(
    wd.sort_values('age_ratings'),
    x='rating',
    y='count',
    color='age_ratings',
    color_discrete_sequence=world_greys,
)
fig.update_layout(title="World Ratings distribution")
fig.show()

In [None]:
# Share of each rating: India first (shown explicitly), then the rest of the world
# (displayed as the cell's final expression).
fig = px.pie(
    Ind_data,
    names='rating',
    title='India ratings distribution',
    color_discrete_sequence=px.colors.sequential.Reds,
)
fig.show()
px.pie(
    World,
    names='rating',
    title='World ratings distribution',
    color_discrete_sequence=px.colors.sequential.Greys,
)

In [None]:
# Compare how many titles fall into each age group, India vs. the rest of the world.
# `wd` and `idd` hold one row per (rating, age group), so several rows share the same
# age group; sum them first so that each age group is drawn as a single bar.
world_by_age = wd.groupby('age_ratings', as_index=False)['count'].sum()
india_by_age = idd.groupby('age_ratings', as_index=False)['count'].sum()

fig = go.Figure()
fig.add_trace(go.Bar(x=world_by_age['age_ratings'], y=world_by_age['count'],
                     name='Rest of the world', marker_color='#221f1f'))
fig.add_trace(go.Bar(x=india_by_age['age_ratings'], y=india_by_age['count'],
                     name='India', marker_color='#e50914'))
# The labels describe what is plotted: title counts per age group, not durations.
fig.update_layout(
    barmode='group', xaxis_tickangle=-45,
    title="Age rating distribution comparison",
    yaxis_title="Number of titles",
    xaxis_title="Age ratings",
)

fig.show()

**In India the content is dominated by the TV-14 rating, while the rest of the world has a more even spread across ratings (this may point to less diverse content in India compared with the rest of the world).**

# **4. Month Analysis**

In [None]:
# Titles per month (taken from `date_added`) and content type, India only.
wd_yr = Ind_data.groupby(['Month', 'type'])['show_id'].count().reset_index()
fig = px.area(wd_yr, x="Month", y="show_id", color="type",
              color_discrete_sequence=['rgb(103,0,13)', 'rgb(203,24,29)'])
# x carries the month and y the title count, so the axis titles are assigned accordingly.
fig.update_layout(
    title="India Movie and TV show count corresponding to month",
    xaxis_title="Month Wise",
    yaxis_title="Total Content",
    legend_title="Type of content"
)
fig.show()

In [None]:
# Titles per month (taken from `date_added`) and content type, rest of the world.
wd_yr = World.groupby(['Month', 'type'])['show_id'].count().reset_index()
fig = px.area(wd_yr, x="Month", y="show_id", color="type",
              color_discrete_sequence=['rgb(0,0,0)', 'rgb(115,115,115)'])
# x carries the month and y the title count, so the axis titles are assigned accordingly.
fig.update_layout(
    title="World Movie and TV show count corresponding to month",
    xaxis_title="Month Wise",
    yaxis_title="Total Content",
    legend_title="Type of content"
)
fig.show()

**Here we see a lot of ups and downs in the India content graph; this might be due to a focus on releasing content during festival periods.**

# **5. Content Length Analysis**

In [None]:
# How many TV shows have 1, 2, 3, ... seasons - rest of the world, then India.
wd = World[World['type'] == "TV Show"]['season'].value_counts()
idd = Ind_data[Ind_data['type'] == "TV Show"]['season'].value_counts()

# (season counts, chart title, bar fill colour, bar outline colour)
season_charts = [
    (wd, "World Series season distribution", '#221F1F', '#E50914'),
    (idd, "India Series season distribution", '#E50914', '#221F1F'),
]
for season_counts, chart_title, fill_color, outline_color in season_charts:
    fig = px.bar(season_counts)
    fig.update_layout(
        title=chart_title,
        xaxis_title="Number of Seasons",
        yaxis_title="Count",
    )
    fig.update_traces(marker_color=fill_color, marker_line_color=outline_color,
                      marker_line_width=2, opacity=1)
    fig.show()

In [None]:
# Report the series with the most seasons. The maximum is looked up in the data
# instead of assuming it is exactly 16, which fails when no title has 16 seasons.
season_numbers = pd.to_numeric(World['season'], errors='coerce')  # non-numeric -> NaN
if season_numbers.notna().any():
    longest = season_numbers.idxmax()  # index label of the first row holding the maximum
    print(f"{int(season_numbers.loc[longest])} Seasons Series: ", World.loc[longest, 'title'])
else:
    print("No season information available")

In [None]:
# Distribution of movie running times, rest of the world vs. India.
# Movies without a usable duration are dropped; filling them with 0 would add
# fictitious zero-length movies to the distribution.
wd = pd.to_numeric(World.loc[World['type'] == "Movie", 'duration'],
                   errors='coerce').dropna().astype(float)
idd = pd.to_numeric(Ind_data.loc[Ind_data['type'] == "Movie", 'duration'],
                    errors='coerce').dropna().astype(float)
fig = ff.create_distplot([wd, idd], ['World', 'India'], bin_size=0.65, curve_type='normal',
                         colors=['#221F1F', '#E50914'])
# Both groups are drawn, so the title names both.
fig.update_layout(
    title="Movie duration distribution: World vs India",
    xaxis_title="Number of minutes",
)
fig.show()

In [None]:
# Box plot of movie running time per rating, India vs. the rest of the world.
# Only movies are plotted: for TV shows `duration` does not hold a running time.
movies = df[df['type'] == "Movie"].assign(
    duration=lambda frame: pd.to_numeric(frame['duration'], errors='coerce')
)
fig = px.box(movies, x="rating", y="duration", color='Country_type',
             color_discrete_map={'Rest Of the World': '#221F1F', 'India': '#E50914'})
fig.update_layout(title="Movie duration with respect to ratings")
fig.show()

In [None]:
def _mean_duration_by_rating(titles):
    """Return the average `duration` per `rating` for the movies in `titles`.

    The average is the sum of durations divided by the number of non-null durations.
    """
    per_rating = titles[titles['type'] == "Movie"].groupby('rating')['duration']
    return (per_rating.sum() / per_rating.count()).reset_index()


wd = _mean_duration_by_rating(World)
idd = _mean_duration_by_rating(Ind_data)

# One bar per rating and region, drawn side by side.
fig = go.Figure()
bar_specs = [
    (wd, 'Rest of the world', '#221f1f'),
    (idd, 'India', '#e50914'),
]
for averages, label, bar_color in bar_specs:
    fig.add_trace(go.Bar(x=averages['rating'], y=averages['duration'],
                         name=label, marker_color=bar_color))
fig.update_layout(
    barmode='group',
    xaxis_tickangle=-45,
    title="Movies time distribution comparision",
    yaxis_title="Number of minutes",
    xaxis_title="Ratings",
)

fig.show()

Indian movies are considerably longer than movies from the rest of the world.

# If you have any suggestions for the notebook, please comment; that would be helpful. Also, please upvote if you liked it! Thank you.