In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
pd.set_option('display.max_rows', 500)
#plots
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
#nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
#re
import re
#sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

<h1 style="font-family:verdana;"> <center> 🎬 Netflix recommender systems 🍿</center> </h1>
<img src="https://cdn.vox-cdn.com/thumbor/S8y46C9A3UraH2Xx4-0KZ4zpCwg=/0x0:1280x720/620x413/filters:focal(538x258:742x462):gifv():no_upscale()/cdn.vox-cdn.com/uploads/chorus_image/image/62980013/netflix.0.gif">

# Introduction
This dataset consists of tv shows and movies available on Netflix as of 2019. The dataset is collected from Flixable which is a third-party Netflix search engine.

In 2018, they released an interesting report which shows that the number of TV shows on Netflix has nearly tripled since 2010. The streaming service’s number of movies has decreased by more than 2,000 titles since 2010, while its number of TV shows has nearly tripled. It will be interesting to explore what all other insights can be obtained from the same dataset.

Integrating this dataset with other external datasets such as IMDB ratings, rotten tomatoes can also provide many interesting findings.

The dataset has since been updated, so we can now work with data up to 2021.

# About notebook
This notebook was created to expand my knowledge of recommender systems. While working on it, I will try to cover recommender systems in as much depth as possible. I will also conduct an exploratory data analysis using the Plotly library, which I have not used before.

I would like the notebook to be a kind of sandbox that allows me to expand my knowledge and abilities on this subject. In the next stages of the notebook's development I will try to integrate my recommender systems with other external datasets, as suggested in the dataset description.

# EDA with Plotly
During the EDA I try to learn Plotly and prepare all of the plots on my own, rather than in the popular Kaggle manner of Cmd+C, Cmd+V.

In [None]:
# Load the Netflix titles CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

In [None]:
data.head()

In [None]:
data.info()

### Movies vs TV Shows

In [None]:
# Shared colour palette, reused by the later charts in this notebook.
colors = px.colors.qualitative.D3

# Number of titles per content type, drawn as a pie chart with the
# percentage and label written inside each slice.
vs_count = data['type'].value_counts()
fig = px.pie(
    names=vs_count.index,
    values=vs_count.values,
    title='Movies vs TV Shows',
    color_discrete_sequence=colors,
)
fig.update_traces(textinfo='percent+label', textposition='inside')
fig.show()

### Content added over the years

In [None]:
# Extract the year from the `date_added` column.
data['year_added'] = pd.DatetimeIndex(data['date_added']).year

# Number of titles added per year for each content type, ordered by year.
d1 = data.loc[data["type"] == "TV Show", "year_added"].value_counts().sort_index()
d2 = data.loc[data["type"] == "Movie", "year_added"].value_counts().sort_index()

# One line-with-markers trace per content type.
t1 = go.Scatter(x=d1.index, y=d1.values, mode='lines+markers', name='TV Show')
t2 = go.Scatter(x=d2.index, y=d2.values, mode='lines+markers', name='Movie')

layout = go.Layout(
    title='Content added over the years',
    hovermode='closest',
    xaxis={'title': 'Year'},
    yaxis={'title': 'Content added'},
)

fig = go.Figure(data=[t1, t2], layout=layout)
fig.show()

### Content share of countries 
Limited to 25 countries of largest share.

In [None]:
# Count titles per individual country. The `country` field can hold several
# comma-separated countries for one title, so each entry is split and exploded
# first; otherwise every combination of countries would be counted as its own
# "country". A title credited to several countries counts once for each.
country_names = data['country'].dropna().str.split(',').explode().str.strip()
# Discard empty names left behind by stray leading/trailing commas.
country_count = country_names[country_names != ''].value_counts().head(25)
fig = px.pie(values=country_count.values, names=country_count.index, \
             title='Content share of countries', color_discrete_sequence=colors)
fig.show()

### Distribution of movie duration
An attempt to build a Plotly equivalent of `sns.distplot`.

In [None]:
# Movie durations are text ending in " min". Strip the unit with a vectorized
# string operation and convert to float; unlike calling `x.replace` on every
# element, this does not raise on a missing duration. Missing values are
# dropped before plotting.
movie_duration = data.loc[data['type'] == 'Movie', 'duration']
movie_duration = (
    movie_duration.str.replace(' min', '', regex=False).astype(float).dropna()
)
t1 = go.Histogram(
                    x = movie_duration,
                    xbins=dict(size=0.5),
                    marker=dict(color = colors)
                    )

layout = go.Layout(title = 'Distribution of movie duration', xaxis = dict(title = 'Minutes'))
fig = go.Figure(data = [t1], layout = layout)
fig.show()

### Distribution of show duration

In [None]:
# TV show durations are text ending in " Season" or " Seasons". Strip the unit
# with a vectorized regex replace and convert to float; unlike calling `re.sub`
# on every element, this does not raise on a missing duration. Missing values
# are dropped before plotting.
show_duration = data.loc[data['type'] == 'TV Show', 'duration']
show_duration = (
    show_duration.str.replace(r' Seasons?', '', regex=True).astype(float).dropna()
)
t2 = go.Histogram(
                    x = show_duration,
                    xbins=dict(size=0.5),
                    marker=dict(color = colors)
                    )

layout2 = go.Layout(title = 'Distribution of show duration', xaxis = dict(title = 'Seasons'))
fig = go.Figure(data = [t2], layout = layout2)
fig.show()

### EDA to be continued...

# Content-based recommender
Content-based recommenders suggest similar items based on a particular item. This system uses item metadata, such as director, description, cast, etc. for movies, to make these recommendations. The general idea behind these recommender systems is that if a person likes a particular item, he or she will also like an item that is similar to it. And to recommend that, it will make use of the user's past item metadata. A good example could be YouTube, where based on your history, it suggests you new videos that you could potentially watch.

In [None]:
# Re-read the CSV so the recommender starts from the original columns
# (the EDA above added a `year_added` column to `data`).
data = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

### Dropping unnecessary columns

In [None]:
# Remove the columns that are not used as recommender features, then preview
# what is left.
unused_columns = ['show_id', 'type', 'date_added', 'release_year', 'rating', 'duration']
data.drop(columns=unused_columns, inplace=True)
data.head()

### Missing values

In [None]:
data.isna().sum().sort_values(ascending=False)

In [None]:
# Drop, in place, every row that has a missing value in any remaining column.
data.dropna(inplace=True)

In [None]:
# Keep an independent copy of the cleaned rows, renumbered 0..n-1, so it can
# be used later to look titles up and return recommendations.
library = data.copy().reset_index(drop=True)

In [None]:
# English stop words (from NLTK) and the Snowball stemming algorithm used by
# the text preprocessing.
english_stopwords = stopwords.words('english')
stemmer = SnowballStemmer('english')
# Pattern for everything to blank out of the text: @mentions, http(s) links and
# any run of non-alphanumeric characters. Written as a raw string so that `\S`
# reaches the regex engine as-is instead of being an invalid string escape.
regex = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

In [None]:
def preprocess(content, stem=False):
  """Clean free text and return its remaining tokens joined by single spaces.

  The text is lower-cased, everything matched by the module-level ``regex`` is
  replaced with a space, and tokens found in ``english_stopwords`` are removed.

  Parameters
  ----------
  content : object
      Text to clean; it is converted with ``str()`` first.
  stem : bool, default False
      When True, each remaining token is reduced to its stem with the
      module-level ``stemmer``. Previously this flag was ignored and tokens
      were always stemmed; pass ``stem=True`` to get that behaviour.

  Returns
  -------
  str
      The cleaned tokens separated by single spaces ("" if nothing is left).
  """
  content = re.sub(regex, ' ', str(content).lower()).strip()
  tokens = [token for token in content.split() if token not in english_stopwords]
  if stem:
    tokens = [stemmer.stem(token) for token in tokens]
  return " ".join(tokens)

In [None]:
# Free-text columns: clean them with `preprocess` (stemming requested
# explicitly), then split the cleaned string into a list of tokens.
for text_column in ('description', 'listed_in'):
    data[text_column] = data[text_column].apply(
        lambda text: preprocess(text, stem=True).lower().split(" ")
    )

# Comma-separated columns: lower-case the value and split it into a list with
# one entry per comma-separated item.
for list_column in ('director', 'cast', 'country'):
    data[list_column] = data[list_column].apply(
        lambda value: value.lower().split(",")
    )

In [None]:
# Remove the spaces inside every director, cast and country entry
# (presumably so that a multi-word name stays a single word in the text that
# is vectorized later). The cleaned lists are assigned back to the column:
# modifying the `row` yielded by `iterrows()` is not guaranteed to write
# through to the DataFrame.
for column in ['director', 'cast', 'country']:
    data[column] = data[column].apply(
        lambda items: [item.replace(" ", "") for item in items]
    )

In [None]:
# Use the title as the row label (in place), then preview the result.
data.set_index('title', inplace = True)
data.head()

In [None]:
# Feature columns to merge, captured before 'bagofwords' is added so the new
# column is not merged into itself.
columns = data.columns

# Join the token lists of all feature columns into one string per title; each
# column's tokens are space-separated and followed by one trailing space.
# The whole column is assigned at once because writing to the `row` objects
# yielded by `iterrows()` is not guaranteed to update the DataFrame.
data['bagofwords'] = [
    ''.join(' '.join(tokens) + ' ' for tokens in row)
    for row in data[columns].itertuples(index=False, name=None)
]

# Only the combined text is needed from here on.
data.drop(columns=columns, inplace=True)

In [None]:
data.head()

In [None]:
# Turn each title's bag of words into a vector of token counts, then compute
# the cosine similarity between every pair of titles (with a single argument
# the matrix is compared against itself).
count = CountVectorizer()
count_matrix = count.fit_transform(data['bagofwords'])
cosine_sim = cosine_similarity(count_matrix)

In [None]:
def recommender(title, top_n=5):
    """Return the entries of ``library`` most similar to ``title``.

    The title is looked up in ``library['title']`` and every other entry is
    ranked by its score in the matching row of ``cosine_sim``.

    Parameters
    ----------
    title : str
        Exact title to look up (converted with ``str()``). If several entries
        share the title, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``library``, most similar first.

    Raises
    ------
    IndexError
        If ``title`` does not appear in ``library``.
    """
    positions = np.flatnonzero((library['title'] == str(title)).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Title {title!r} not found in the library")
    position = int(positions[0])

    # Scores against every entry, with the queried entry removed explicitly.
    # Skipping "the first result" instead is wrong whenever another entry ties
    # with it at the top, because the queried title may then not sort first.
    # NOTE: assumes row i of `cosine_sim` describes row i of `library`.
    scores = pd.Series(cosine_sim[position]).drop(position)
    # A stable sort keeps tied scores in their original order.
    ranked = scores.sort_values(ascending=False, kind='mergesort')
    top_positions = list(ranked.iloc[:top_n].index)

    recommended_movies = library.iloc[top_positions]

    return recommended_movies

In [None]:
recommender('Bad Boys')

In [None]:
recommender('Indiana Jones and the Kingdom of the Crystal Skull')

In [None]:
recommender('Indiana Jones and the Kingdom of the Crystal Skull')

In [None]:
recommender('Casino Royale')

### More sophisticated recommender systems soon...

# In the end
**<center>This notebook is not yet complete. I will try to improve it regularly with more interesting Plotly plots and more complex recommendation systems. The next recommendation system I would like to build will be based on deep learning.</center>**

**<center> More info soon... </center>**




<h3 style="text-align:center;"><center><font color='blue'>Upvotes and suggestions</font> are welcome</center></h3>