In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML

# Inject a CSS rule that stretches elements with class `container` to the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Load the Netflix titles CSV into a DataFrame; every later cell derives from `df`.
df = pd.read_csv('../input/netflix-shows/netflix_titles.csv')

## Clean Data

In [None]:
# Keep only the rows whose `type` is 'Movie'. The explicit `.copy()` makes
# `df_movies` an independent frame, so the column assignments in the following
# cells modify it directly rather than a slice of `df` (assigning into a slice
# is what triggers pandas' SettingWithCopyWarning).
df_movies = df[df['type']=='Movie'].copy()


In [None]:
# Parse the raw "date added" text into datetimes. The text is read from `df`
# (rows are matched to `df_movies` by index label), which keeps this cell
# re-runnable. Surrounding whitespace is stripped first so that padded values
# cannot break pandas' date-format inference.
df_movies['date_added'] = pd.to_datetime(df['date_added'].str.strip())


In [None]:
# Month name taken from the parsed `date_added` datetimes.
df_movies['month_added'] = df_movies['date_added'].dt.month_name()


In [None]:
# Calendar year and weekday name taken from the parsed `date_added` datetimes.
df_movies['year_added'] = df_movies['date_added'].dt.year
df_movies['day_added'] = df_movies['date_added'].dt.day_name()

In [None]:
# Preview the first rows, including the date columns derived above.
df_movies.head()

In [None]:
# Show every movie row that has a missing value in at least one column.
df_movies.loc[df_movies.isnull().any(axis='columns')]

In [None]:
# Replace missing values in these columns with the placeholder 'unknown'.
for text_column in ('director', 'cast', 'country', 'rating'):
    df_movies[text_column] = df_movies[text_column].fillna('unknown')

# Re-check which rows still contain a missing value after the fill.
df_movies.loc[df_movies.isna().any(axis='columns')]

### Convert values to numeric

In [None]:
# Keep the first whitespace-separated token of each `duration` string.
# The vectorised `.str` accessor passes missing values through as NaN, whereas
# calling `.split()` on each value raises AttributeError on a missing entry.
df_movies['duration'] = df_movies['duration'].str.split().str[0]

# Convert the extracted tokens and the release year to numeric dtypes.
df_movies['duration'] = pd.to_numeric(df_movies['duration'])
df_movies['release_year'] = pd.to_numeric(df_movies['release_year'])
df_movies.head()

### Which movie was the longest

In [None]:
# Compute the maximum duration once and reuse it for the filter, so the table
# always shows the movie(s) matching the printed maximum instead of relying on
# a hard-coded value.
longest_duration = df_movies['duration'].max()
print(longest_duration)
df_movies[df_movies['duration']==longest_duration].style.background_gradient(cmap='plasma')

The longest movie is the Black Mirror one. If I remember correctly, this movie lets the viewer choose between many options, so this number must be the total with all the options included; that's why the duration is listed as 5.2 hours.

So which is the second longest one?


In [None]:
# The three largest duration values (the Series index identifies the rows).
df_movies['duration'].nlargest(3)

In [None]:
# Show the movies whose duration is one of the three largest values. The values
# are computed from the data rather than hard-coded, so the table stays correct
# if the dataset changes.
top_durations = df_movies['duration'].nlargest(3)
df_movies[df_movies['duration'].isin(top_durations)].style.background_gradient(cmap='plasma')

The second-longest movie is from Egypt, and it lasts about 4 hours. Now let's look at the mean duration.

### Average movies duration

In [None]:
# Mean of the numeric `duration` column (same unit as the column itself).
df_movies['duration'].mean()

The average movie length is about 1.65 hours, so if we want to make a movie, its length should be around this number.

## Most active month

In [None]:
# Number of movies added in each month name, most frequent first.
df_movies['month_added'].value_counts()

It seems the most active month is January; the least active is the very next one, February.

If we want to publish content, I think it should be around July or August, which are neither the most active nor the least active months. Balance is always good.


## We can visualize this

In [None]:
# Rows with a known `date_added`, plus the year and month name derived from it.
net_date = (
    df_movies[['date_added']]
    .dropna()
    .assign(
        year=lambda added: added['date_added'].dt.year,
        month=lambda added: added['date_added'].dt.month_name(),
    )
)
net_date.head()

In [None]:
# Calendar months in reverse order (December first).
# NOTE(review): presumably reversed so January is drawn at the top of the heatmap — confirm.
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'][::-1]

# Count additions per (year, month), pivot months into columns, then transpose
# so rows are months and columns are years. `reindex` orders the months and
# fills a month that never occurs in the data with 0, where plain column
# selection would raise KeyError.
df_months = (
    net_date.groupby('year')['month']
    .value_counts()
    .unstack()
    .fillna(0)
    .reindex(columns=month_order, fill_value=0)
    .T
)
df_months.head()

In [None]:
# Heatmap of `df_months`: one coloured cell per (row, column) pair.
fig, ax = plt.subplots(figsize=(8, 5), dpi=150)
heatmap = ax.pcolor(df_months, cmap='afmhot_r', edgecolors='white', linewidths=2)

# Place each tick at the centre of its cell and label it with the column / row name.
ax.set_xticks(np.arange(0.5, len(df_months.columns), 1))
ax.set_xticklabels(df_months.columns, fontsize=7, fontfamily='serif')
ax.set_yticks(np.arange(0.5, len(df_months.index), 1))
ax.set_yticklabels(df_months.index, fontsize=7, fontfamily='serif')

ax.set_title('Netflix content Update by Months', fontsize=12, fontfamily='serif', fontweight='bold', position=(0.20, 1.0+0.02))

# Colour scale with small tick labels and minor ticks enabled.
cbar = fig.colorbar(heatmap, ax=ax)
cbar.ax.tick_params(labelsize=8)
cbar.ax.minorticks_on()

plt.show()

Overall the most active month is January.

This is probably because of contracts that start at the beginning of the year, or maybe they figured out that people are hungrier for content in January.


## How activity grew

In [None]:
import matplotlib.patheffects as path_effects
# Movies added per year, ordered by year, keeping index labels up to and including 2020.
# NOTE(review): presumably the cut-off drops a later, incomplete year — confirm.
year_data = df_movies['year_added'].value_counts().sort_index().loc[:2020]
# Preview the earliest years.
year_data.head()

In [None]:
# Line chart of the number of movies added per year.
fig, ax = plt.subplots(figsize=(10, 5))

# Dark panel background, title, and tick-label size on both axes.
ax.set_facecolor('#1A0D0D')
ax.set_title('Content Growth (movies)', fontsize=15, fontweight='bold', position=(0.23, 1.0+0.03))
ax.tick_params(axis='both', labelsize=10)

# x = the years in year_data's index, y = the count for each year;
# the path effects draw a shadow beneath the normally rendered line.
shadowed_line = [path_effects.SimpleLineShadow(), path_effects.Normal()]
ax.plot(year_data.index, year_data, color='red', linewidth=5, label='Total', path_effects=shadowed_line)
plt.show()

We can see that the number of movies added peaked in 2019. That's surprising, because I would have thought that 2020 would have had more content due to the demand for content the pandemic created.

#### But why did content grow so exponentially?

Netflix's popularity has skyrocketed since around 2013. Each day more users subscribed, so the company had more money to acquire the rights to movies or to create its own.

People loved Netflix because it had all the movies in one place, but then more and more streaming services started to show up, and Netflix is not the same anymore.


In [None]:
# Movies whose `director` value is exactly 'Raúl Campos, Jan Suter'.
df_movies[df_movies['director']=='Raúl Campos, Jan Suter']

The directors with the most movies are Spanish speakers who do stand-up comedy; maybe we should look for directors in other genres.

In [None]:
# Twenty most frequent values of the `director` column.
directors = df_movies['director'].value_counts().head(20)

# Display them without the 'unknown' placeholder. errors='ignore' keeps the
# cell from raising KeyError when 'unknown' is not among those twenty values.
directors.drop('unknown', errors='ignore')

In [None]:
# Movies whose `director` value is exactly 'Marcus Raboy'.
df_movies[df_movies['director']=='Marcus Raboy']

It seems stand-up comedy is a constant. After that comes Martin Scorsese, who is a great director.

In [None]:
# Movies whose `director` value is exactly 'Martin Scorsese'.
df_movies[df_movies['director']=='Martin Scorsese']

In [None]:
# Install `countrygroups` with the interpreter that runs this kernel.
# NOTE(review): no visible cell imports `countrygroups` — confirm the install is still needed.
import sys
!{sys.executable} -m pip install countrygroups

In [None]:

# Country names treated as European in this notebook, in alphabetical order.
europe = [
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia',
    'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece',
    'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg',
    'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia',
    'Slovenia', 'Spain', 'Sweden',
]
europe

In [None]:
# Built once at definition time so each lookup is a constant-time set test
# instead of rebuilding and scanning a list on every call.
# NOTE(review): the names look like the EU member states; other European
# countries (e.g. the United Kingdom) are not included — confirm this is intended.
_EUROPEAN_COUNTRIES = frozenset({
    'Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia',
    'Czech Republic',  # alternative English name of Czechia
    'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece',
    'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg',
    'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia',
    'Slovenia', 'Spain', 'Sweden',
})


def get_europe(country):
    """Return 'europe' if `country` is one of the listed European countries.

    Parameters
    ----------
    country : str
        A single country name. Leading and trailing whitespace is ignored.
        A string naming several countries at once (for example a
        comma-separated list) is compared as a whole and therefore does not
        match.

    Returns
    -------
    str or None
        The label 'europe' for a listed country, otherwise None. Non-string
        input (e.g. NaN or None) also yields None.
    """
    if not isinstance(country, str):
        return None
    if country.strip() in _EUROPEAN_COUNTRIES:
        return 'europe'
    return None

In [None]:
# Label each movie via get_europe, applied to its `country` value, and preview the result.
df_movies['continent'] = df_movies['country'].apply(get_europe)
df_movies.head()
                

In [None]:
# Movies labelled 'europe' in the `continent` column.
# Note: from here on the name `europe` refers to this DataFrame.
is_european = df_movies['continent'] == 'europe'
europe = df_movies[is_european]

In [None]:
# Movies per country within the `europe` subset, shown as a colour-graded table.
europe['country'].value_counts().reset_index().style.background_gradient(cmap='plasma') 

Spain has the most movies on Netflix from Europe.

## IMDb Ratings

In [None]:
# Read only the needed columns from the two IMDb files.
imdb_movies = pd.read_csv(
    '../input/imdb-extensive-dataset/IMDb movies.csv',
    usecols=['title','budget','genre'],
)
imdb_ratings = pd.read_csv(
    '../input/imdb-extensive-dataset/IMDb ratings.csv',
    usecols=['weighted_average_vote'],
)

# Put the columns side by side (rows are paired by index label), then drop
# rows that repeat the same title, rating and budget.
# NOTE(review): assumes both files list the same movies in the same row order — TODO confirm.
ratings = pd.DataFrame({
    'Title': imdb_movies.title,
    'Rating': imdb_ratings.weighted_average_vote,
    'Budget': imdb_movies.budget,
    'Genre': imdb_movies.genre,
}).drop_duplicates(subset=['Title','Rating', 'Budget'])

In [None]:
# Inner-join the IMDb ratings to the Netflix movies on the title text,
# then order the matches from highest to lowest rating.
joint = (
    pd.merge(ratings, df_movies, left_on='Title', right_on='title', how='inner')
    .sort_values(by='Rating', ascending=False)
)
joint.head()

In [None]:
import plotly.express as px

# First ten rows of `joint` (already sorted by rating, highest first).
top_rated = joint.iloc[:10]

# Sunburst with the title as inner ring and the country as outer ring;
# both segment size and colour come from the rating.
fig = px.sunburst(top_rated, path=['title','country'], values='Rating', color='Rating')
fig.show()

The best-rated movie seems to be Pulp Fiction, which is a really great movie. The surprising thing is that there are a lot of good Indian movies; it really shows the growth of Bollywood.