# Content Based Recommendation System


I am going to use the TF-IDF technique to build two different recommendation models — one using a sigmoid kernel function and one using cosine similarity computed with a linear kernel — and then see which one gives better recommendations.
****

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Netflix titles CSV, report its (rows, columns) shape and preview the first five rows.
data = pd.read_csv("/kaggle/input/netflix-shows/netflix_titles.csv")
print(data.shape)
data.head()

In [None]:
# Show each column's dtype and non-null count to spot fields with missing values.
data.info()

**1. Content based recommendation system using sigmoid kernel to find the similarities**

**I am dividing the data into two sets: one for movie recommendations and one for TV shows. I am also using a combined text feature (description, cast and director) to compute the similarity.**

# Movie set

In [None]:
# Keep only the rows whose type is 'Movie', renumber them from 0, and discard
# the columns that the text-based models below do not use.
movies = (
    data.loc[data['type'] == 'Movie']
    .reset_index()  # the previous index becomes an 'index' column, dropped next
    .drop(columns=[
        'duration', 'country', 'date_added', 'release_year',
        'show_id', 'type', 'index', 'listed_in',
    ])
)

In [None]:
# Missing director/cast values become empty strings so the concatenation
# below does not turn the whole row into NaN.
movies['director'] = movies['director'].fillna("")
movies['cast'] = movies['cast'].fillna("")
# Join the fields with a space. Without a separator the last word of one field
# fuses with the first word of the next (e.g. a cast member's surname running
# straight into the director's first name) and is tokenized as one bogus word.
movies['combined'] = movies['description'] + ' ' + movies['cast'] + ' ' + movies['director']
movies.head(5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The token pattern must be a raw-string regex. Writing it as 'r\w{1,}' puts
# the "r" inside the pattern, so the regex only matches text that starts at a
# literal letter "r" (e.g. "story" -> "ry") and words without an "r" are lost.
tfv = TfidfVectorizer(
    min_df=3,               # ignore terms found in fewer than 3 documents
    max_features=None,
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),     # unigrams, bigrams and trigrams
    stop_words='english',
)

In [None]:
# Replace any NaN in the combined text with an empty string before vectorizing
# (a NaN would appear here if one of the concatenated fields were missing).
movies['combined'] = movies['combined'].fillna("")

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# TF-IDF document-term matrix built from each movie's combined text.
tfv_matrix = tfv.fit_transform(movies['combined'])
# Pairwise sigmoid-kernel scores between every pair of movies.
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)

# Title -> row-position lookup. Series.drop_duplicates() compares the *values*
# (the row numbers, which are always unique), so it never removes a repeated
# title. De-duplicate on the index labels instead, keeping the first
# occurrence, so every title maps to exactly one row.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

In [None]:
def recommend(title, sig=sig, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    ``indices`` and ``movies`` are read as notebook globals at call time.

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``.
    sig : array-like
        Square matrix of pairwise similarity scores whose rows line up with
        the rows of ``movies``. The default is the matrix that ``sig`` was
        bound to when this cell was executed.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar; the queried movie itself
        is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions; use the
    # first one so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    scores = np.asarray(sig[idx]).ravel()
    # Stable descending order: tied scores keep their row order, exactly like
    # sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly rather than assuming it sorts first;
    # with tied scores (identical or empty text) it can land anywhere.
    ranked = ranked[ranked != idx][:top_n]
    return movies['title'].iloc[ranked]

In [None]:
# Movies ranked most similar to 'Zulu Man in Japan' by the sigmoid-kernel scores.
recommend('Zulu Man in Japan')

In [None]:
# Movies ranked most similar to 'Zubaan' by the sigmoid-kernel scores.
recommend('Zubaan')

In [None]:
# Movies ranked most similar to 'The Cakemaker' by the sigmoid-kernel scores.
recommend('The Cakemaker')

# TV Shows set

In [None]:
# Keep only the rows whose type is 'TV Show', renumber them from 0, and discard
# the columns that the text-based models below do not use.
shows = (
    data.loc[data['type'] == 'TV Show']
    .reset_index()  # the previous index becomes an 'index' column, dropped next
    .drop(columns=[
        'duration', 'country', 'date_added', 'release_year',
        'show_id', 'type', 'index', 'listed_in',
    ])
)

In [None]:
# Missing director/cast values become empty strings so the concatenation
# below does not turn the whole row into NaN.
shows['director'] = shows['director'].fillna("")
shows['cast'] = shows['cast'].fillna("")
# Join the fields with a space. Without a separator the last word of one field
# fuses with the first word of the next and is tokenized as one bogus word.
shows['combined'] = shows['cast'] + ' ' + shows['director'] + ' ' + shows['description']

In [None]:
# Preview the TV-show frame, including the new 'combined' text column.
shows.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The token pattern must be a raw-string regex. Writing it as 'r\w{1,}' puts
# the "r" inside the pattern, so the regex only matches text that starts at a
# literal letter "r" (e.g. "story" -> "ry") and words without an "r" are lost.
tfv = TfidfVectorizer(
    min_df=3,               # ignore terms found in fewer than 3 documents
    max_features=None,
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),     # unigrams, bigrams and trigrams
    stop_words='english',
)
# Replace any NaN in the combined text with an empty string before vectorizing.
shows['combined'] = shows['combined'].fillna("")

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# TF-IDF document-term matrix built from each show's combined text.
tfv_matrix_shows = tfv.fit_transform(shows['combined'])
# Pairwise sigmoid-kernel scores between every pair of shows.
sig = sigmoid_kernel(tfv_matrix_shows, tfv_matrix_shows)

# Title -> row-position lookup. Series.drop_duplicates() compares the *values*
# (the row numbers, which are always unique), so it never removes a repeated
# title. De-duplicate on the index labels instead, keeping the first
# occurrence, so every title maps to exactly one row.
indices = pd.Series(shows.index, index=shows['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

In [None]:
def recommend(title, sig=sig, top_n=10):
    """Return the titles of the TV shows most similar to ``title``.

    ``indices`` and ``shows`` are read as notebook globals at call time.

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``.
    sig : array-like
        Square matrix of pairwise similarity scores whose rows line up with
        the rows of ``shows``. The default is the matrix that ``sig`` was
        bound to when this cell was executed.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar; the queried show itself
        is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions; use the
    # first one so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    scores = np.asarray(sig[idx]).ravel()
    # Stable descending order: tied scores keep their row order, exactly like
    # sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly rather than assuming it sorts first;
    # with tied scores (identical or empty text) it can land anywhere.
    ranked = ranked[ranked != idx][:top_n]
    return shows['title'].iloc[ranked]

In [None]:
# TV shows ranked most similar to "Friends" by the sigmoid-kernel scores.
recommend("Friends")

In [None]:
# TV shows ranked most similar to "Crash Landing on You" by the sigmoid-kernel scores.
recommend("Crash Landing on You")

**Personally, I don't feel that these recommendations are good enough. So I am repeating the same method, but this time computing cosine similarity with a linear kernel.**

# Movies set

In [None]:
# Re-display the movie frame that the cosine-similarity model is built from.
movies.head(5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with English stop words removed; all other settings are left at the library defaults.
tfidf = TfidfVectorizer(stop_words='english')


# Replace any NaN in the combined text with an empty string, then fit the
# vectorizer on the movies' combined text and transform it in one step.
movies['combined'] = movies['combined'].fillna('')
tfidf_matrix = tfidf.fit_transform(movies['combined'])


# (number of movies, number of distinct terms)
tfidf_matrix.shape

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# list() keeps the plain-list display that the old call produced.
list(tfidf.get_feature_names_out()[5000:5010])

In [None]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise dot products between all movie TF-IDF rows. TfidfVectorizer
# L2-normalises rows by default (norm='l2'), so these dot products are the
# cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Square matrix: one row and one column per movie.
cosine_sim.shape

In [None]:
# Similarity of the movie at row 1 to every movie in the set.
cosine_sim[1]

In [None]:
# Title -> row-position lookup. Series.drop_duplicates() compares the *values*
# (the row numbers, which are always unique), so it never removes a repeated
# title. De-duplicate on the index labels instead, keeping the first
# occurrence, so every title maps to exactly one row.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

In [None]:
def recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    ``indices`` and ``movies`` are read as notebook globals at call time.

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``.
    cosine_sim : array-like
        Square matrix of pairwise similarity scores whose rows line up with
        the rows of ``movies``. The default is the matrix that ``cosine_sim``
        was bound to when this cell was executed.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar; the queried movie itself
        is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions; use the
    # first one so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: tied scores keep their row order, exactly like
    # sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly rather than assuming it sorts first;
    # with tied scores (identical or empty text) it can land anywhere.
    ranked = ranked[ranked != idx][:top_n]
    return movies['title'].iloc[ranked]


In [None]:
# Movies ranked most similar to 'Zulu Man in Japan' by the linear-kernel (cosine) scores.
recommendations('Zulu Man in Japan')

In [None]:
# Movies ranked most similar to "Zubaan" by the linear-kernel (cosine) scores.
recommendations("Zubaan")

In [None]:
# Movies ranked most similar to "Sanju" by the linear-kernel (cosine) scores.
recommendations("Sanju")

**In my experience, this recommendation model works better than the first one.**

# TV Shows set

In [None]:
# Re-display the TV-show frame that the cosine-similarity model is built from.
shows.head(5)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with English stop words removed; all other settings are left at the library defaults.
tfidf = TfidfVectorizer(stop_words='english')


# Replace any NaN in the combined text with an empty string, then fit the
# vectorizer on the shows' combined text and transform it in one step.
shows['combined'] = shows['combined'].fillna('')
tfidf_matrix = tfidf.fit_transform(shows['combined'])


# (number of shows, number of distinct terms)
tfidf_matrix.shape

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# list() keeps the plain-list display that the old call produced.
list(tfidf.get_feature_names_out()[5000:5010])

In [None]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise dot products between all TV-show TF-IDF rows. TfidfVectorizer
# L2-normalises rows by default (norm='l2'), so these dot products are the
# cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# Square matrix: one row and one column per show.
cosine_sim.shape

In [None]:
# Similarity of the show at row 1 to every show in the set.
cosine_sim[1]

In [None]:
# Title -> row-position lookup. Series.drop_duplicates() compares the *values*
# (the row numbers, which are always unique), so it never removes a repeated
# title. De-duplicate on the index labels instead, keeping the first
# occurrence, so every title maps to exactly one row.
indices = pd.Series(shows.index, index=shows['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

In [None]:
def recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the TV shows most similar to ``title``.

    ``indices`` and ``shows`` are read as notebook globals at call time.

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``.
    cosine_sim : array-like
        Square matrix of pairwise similarity scores whose rows line up with
        the rows of ``shows``. The default is the matrix that ``cosine_sim``
        was bound to when this cell was executed.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar; the queried show itself
        is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions; use the
    # first one so the row lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable descending order: tied scores keep their row order, exactly like
    # sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly rather than assuming it sorts first;
    # with tied scores (identical or empty text) it can land anywhere.
    ranked = ranked[ranked != idx][:top_n]
    return shows['title'].iloc[ranked]

In [None]:
# TV shows ranked most similar to "Friends" by the linear-kernel (cosine) scores.
recommendations("Friends")

In [None]:
# TV shows ranked most similar to "Crash Landing on You" by the linear-kernel (cosine) scores.
recommendations("Crash Landing on You")

In [None]:
# TV shows ranked most similar to "It's Okay to Not Be Okay" by the linear-kernel (cosine) scores.
recommendations("It's Okay to Not Be Okay")

**My Kdrama addicted brain is absolutely in favour of these recommendations more than the first one**

**So, overall I would say that the second approach (cosine similarity with a linear kernel) is better. I will also try to improve it further in the future.** Thank you for viewing this notebook😊😊