In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

from collections import Counter

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px # visualization package 

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



Read data from csv to pandas.DataFrame

In [None]:
# Path of the Netflix titles CSV, relative to the notebook's working directory.
filepath_netflix = "../input/netflix-shows/netflix_titles.csv"

# Load the CSV into a DataFrame, using the "show_id" column as the row index.
netflix_shows = pd.read_csv(
    filepath_netflix,
    index_col="show_id",
)

# Preview the first rows as the cell output.
netflix_shows.head()

In [None]:
# Print a summary of the loaded DataFrame (columns, non-null counts, dtypes, memory usage).
netflix_shows.info()

Create pie chart for 'type' column to see distribution between **Movie** and **TV-Show**

In [None]:
# Pie chart: one slice per distinct value of the 'type' column, sized by row count.
fig1 = px.pie(
    data_frame=netflix_shows,
    names='type',
    title='Overall distribution between Movie and TV-Show',
)
fig1.show()

Bar chart comparing the number of **Movies** and **TV Shows** per release year (release years after 1980)

In [None]:
# Number of titles for every (release_year, type) combination, as a flat table
# with the columns release_year, type and counts.
df = (
    netflix_shows
    .groupby(by=["release_year", "type"])
    .size()
    .reset_index(name="counts")
)

# Only release years after 1980 are plotted; one bar per type, side by side.
counts_after_1980 = df.query("release_year > 1980")
fig2 = px.bar(
    counts_after_1980,
    x="release_year",
    y="counts",
    color='type',
    barmode='group',
    width=800,
    height=700,
    title='Development over time of Movies and TV-Show',
)
fig2.show()

Ratio of TV Shows to Movies per release year. If either of them (Movies, TV Shows) is missing for a year, that year is dropped.

In [None]:
# Ratio of TV Shows to Movies for every release year that has both types.
#
# `df` holds one row per (release_year, type) with the number of titles in "counts".
# Pivoting gives one row per year and one column per type; a year in which a type
# does not occur gets NaN in that column. `df` itself is left unmodified, so the
# cell can be re-run safely.
counts_by_year = (
    df.pivot(index="release_year", columns="type", values="counts")
    # Guarantee both columns exist even if one type is absent from the data.
    .reindex(columns=["Movie", "TV Show"])
)

# Drop every year in which either Movies or TV Shows are missing.
counts_both_types = counts_by_year.dropna(subset=["Movie", "TV Show"])

# Vectorised division replaces the per-year query + int(Series) conversion, which
# is deprecated for single-element Series in recent pandas versions.
dataframe_prop = (
    (counts_both_types["TV Show"] / counts_both_types["Movie"])
    .rename("proportion")
    .reset_index()
)

fig3 = px.scatter(dataframe_prop, x = 'release_year', y = 'proportion',
               title = 'Proportion of TV-Show : Movies (value >= 1 means more TV-Shows than Movies)')

fig3.show()

Most used words in title

In [None]:
# Count how often each word occurs across all titles.
# str.split() without an argument splits on any run of whitespace and never yields
# empty tokens, so the word[0] check below cannot raise IndexError (split(" ")
# produced "" for repeated, leading or trailing spaces). Missing titles are skipped.
titles_counter = Counter(
    word
    for title in netflix_shows['title'].dropna()
    for word in title.split()
)

# Keep the 40 most frequent words that start with an upper-case character.
# most_common() returns (word, count) pairs ordered from most to least frequent.
top_capitalized_words = [
    (word, count)
    for word, count in titles_counter.most_common()
    if word[0].isupper()
][:40]

# Table with the columns 'appearances' and 'title_name', ordered ascending so the
# most frequent words sit at the end (and at the top of a horizontal bar chart).
titles_data = pd.DataFrame({
    'appearances': [count for _word, count in top_capitalized_words],
    'title_name': [word for word, _count in top_capitalized_words],
}).sort_values('appearances', ascending=True)
titles_data.tail(20)

In [None]:
# Horizontal bar chart of the word counts collected in `titles_data`.
fig4 = px.bar(
    data_frame=titles_data,
    x='appearances',
    y='title_name',
    orientation='h',
    hover_data=['title_name', 'appearances'],
    height=1000,
    title='Most used words in title',
)
fig4.show()

Directors with the most movies/tv shows

In [None]:
# Number of titles per value of the 'director' column, ascending by count.
#
# value_counts() skips missing values by default. Naming the index and the value
# column explicitly (rename_axis / reset_index(name=...)) yields the columns
# 'director' and 'counter' on every pandas version; renaming 'index'/'director'
# after a plain reset_index() only works on pandas < 2.0, because newer versions
# already call the columns 'director' and 'count'.
director_unique = (
    netflix_shows['director']
    .value_counts()
    .rename_axis('director')
    .reset_index(name='counter')
    .sort_values('counter', ascending=True)
)

# The frame is sorted ascending, so the last 15 rows are the most frequent directors.
fig5 = px.bar(director_unique.tail(15), x = 'counter', y = 'director',
              title = 'Director with the most movies/tv shows')
fig5.show()

Actors with the most movie/TV show appearances

In [None]:
# Count in how many cast lists each actor name appears.
# Each non-missing 'cast' entry is split on commas. The pieces are stripped because
# splitting "A, B" on "," yields " B" with a leading space, which would otherwise be
# counted separately from "B". Empty pieces are skipped.
actors_counter = Counter(
    actor_name
    for cast_entry in netflix_shows['cast'].dropna()
    for actor_name in (piece.strip() for piece in cast_entry.split(","))
    if actor_name
)

# The 20 most frequent actors as two parallel lists, ordered from most to least frequent.
top_actors = actors_counter.most_common(20)
name_of_most_pop_actor = [actor_name for actor_name, _count in top_actors]
appearances_of_actor = [count for _actor_name, count in top_actors]


In [None]:
# Combine the parallel lists of actor names and appearance counts into one table.
most_pop_actor_dict = dict(
    appearances=appearances_of_actor,
    actor_name=name_of_most_pop_actor,
)
actor_data = pd.DataFrame(most_pop_actor_dict).sort_values(by='appearances', ascending=True)

# Ascending order puts the highest counts last, so tail(15) selects the top 15 rows.
top_actor_rows = actor_data.tail(15)
fig6 = px.bar(
    top_actor_rows,
    x='appearances',
    y='actor_name',
    title='Actors with most movie/tv show appearances',
)
fig6.show()

Distribution of movies/TV shows by recommended audience age

In [None]:
# Descriptive label -> rating codes that are replaced by that label.
rating_groups = {
    'content for mature audiences': ['TV-MA', 'R', 'NC-17'],
    'suitable for all ages': ['TV-G', 'G'],
    'children from ages 2–6': ['TV-Y'],
    'may be unsuitable for children under 14 years of age': ['TV-14'],
    'may be inappropriate for children under 13': ['PG-13'],
    'most appropriate for children age 7 and up': ['TV-Y7', 'TV-Y7-FV'],
    'may not be suitable for children': ['TV-PG', 'PG'],
    'Not rated': ['NR', 'UR', 'nan'],
}
# Flatten to a code -> label lookup so one replace() call handles every group.
rating_labels = {
    code: label
    for label, codes in rating_groups.items()
    for code in codes
}

# Work on a copy so the original frame keeps the raw rating codes.
netflix_rating = netflix_shows.copy()
netflix_rating['rating'] = (
    netflix_rating['rating']
    .replace(rating_labels)   # codes not listed above are left unchanged
    .fillna('Not rated')      # missing ratings count as 'Not rated'
)

fig7 = px.pie(
    netflix_rating,
    names='rating',
    title='Distribution of movies/tv shows in recommending-age',
)
fig7.show()

Distribution of countries where movie/tv show took place

In [None]:
# Count in how many 'country' entries each country name appears.
# Entries are split on commas. The pieces are stripped because splitting "A, B" on
# "," yields " B" with a leading space, which would otherwise be counted separately
# from "B". Empty pieces (e.g. from a trailing comma) are skipped.
country_counter = Counter(
    country_name
    for country_entry in netflix_shows['country'].dropna()
    for country_name in (piece.strip() for piece in country_entry.split(","))
    if country_name
)

# Countries with more than 100 appearances get their own slice, ordered from most
# to least frequent. Everything else is summed into "Others" over ALL remaining
# countries; summing over a top-40 slice would silently drop those ranked lower.
ranked_countries = country_counter.most_common()
name_of_most_pop_country = [name for name, count in ranked_countries if count > 100]
appearances_of_country = [count for _name, count in ranked_countries if count > 100]

others_total = sum(count for _name, count in ranked_countries if count <= 100)
if others_total:
    name_of_most_pop_country.append("Others")
    appearances_of_country.append(others_total)

country_dict = {'appearances': appearances_of_country, 'country': name_of_most_pop_country}
country_data = pd.DataFrame(data = country_dict)

fig8 = px.pie(country_data, names = 'country', values = 'appearances',
             title = 'Distribution of countries where movie/tv show took place')
fig8.show()


In [None]:
# Order the rows ascending by count for the horizontal bar chart below.
country_data = country_data.sort_values(by='appearances', ascending=True)

# Horizontal bar chart of the per-country counts.
fig9 = px.bar(
    data_frame=country_data,
    x='appearances',
    y='country',
    orientation='h',
)
fig9.show()