# A very simple Recommendation system 

Only suggests similar titles based on the genre similarity 

## Imports

In [1]:
import pandas as pd
import numpy as np
import pickle 

import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
lm = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()


import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import nltk                                         #Natural language processing tool-kit

from nltk.corpus import stopwords                   #Stopwords corpus
from nltk.stem import PorterStemmer                 # Stemmer
stop_words = set(stopwords.words('english')) 

import gc

import yake

from sklearn.feature_extraction.text import CountVectorizer

## Loading dataset

Download from [kaggle link](https://www.kaggle.com/snehaanbhawal/netflix-tv-shows-and-movie-list)

In [2]:
df = pd.read_csv("netflix_list.csv")

In [3]:
df.drop(["imdb_id","popular_rank", "runtime", "type","language", "endYear","certificate", "rating", "startYear","episodes","orign_country","isAdult","image_url","numVotes"], axis=1, inplace=True)
df.dropna(inplace = True)
df.reset_index(drop=True, inplace=True)
df.rename(columns={"plot": "story"},inplace=True)

In [4]:
df.head()

Unnamed: 0,title,story,summary,genres,cast
0,Lucifer,Lucifer Morningstar has decided he's had enoug...,"Lucifer Morningstar, bored from his sulking li...","Crime,Drama,Fantasy","['Tom Ellis', 'Lauren German', 'Lesley-Ann Bra..."
1,Army of the Dead,"Following a zombie outbreak in Las Vegas, a gr...","With the abandoned, walled city of Las Vegas o...","Action,Crime,Horror","['Dave Bautista', 'Ella Purnell', 'Ana de la R..."
2,The Kominsky Method,"An aging actor, who long ago enjoyed a brush w...",Michael Douglas plays an actor who made it big...,"Comedy,Drama","['Michael Douglas', 'Sarah Baker', 'Graham Rog..."
3,Friends,Follows the personal and professional lives of...,"Ross Geller, Rachel Green, Monica Geller, Joey...","Comedy,Romance","['Jennifer Aniston', 'Courteney Cox', 'Lisa Ku..."
4,Ragnarok,A small Norwegian town experiencing warm winte...,In the small fictional town of Edda coming of ...,"Action,Drama,Fantasy","['David Stakston', 'Jonas Strand Gravli', 'Her..."


In [5]:
df.isnull().sum()

title      0
story      0
summary    0
genres     0
cast       0
dtype: int64

## Some Processing data

In [6]:
df["tags"] = ""

In [7]:
import yake
kw_extractor = yake.KeywordExtractor()
# text = """spaCy is an open-source software library for advanced natural language processing, written in the programming languages Python and Cython. The library is published under the MIT license and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
language = "en"
max_ngram_size = 1
deduplication_threshold = 0.95
numOfKeywords = 50
custom_kw_extractor = yake.KeywordExtractor(lan=language,  dedupLim=deduplication_threshold, top=numOfKeywords,n=max_ngram_size, features=None)


In [8]:
# keywords = custom_kw_extractor.extract_keywords(df.summary[0])
# keywords

# for i in range(df.shape[0]):
#     keywords = custom_kw_extractor.extract_keywords(df.summary[i])
#     keywords += custom_kw_extractor.extract_keywords(df.summary[i])
#     df["tags"][i] = [x[0] for x in keywords]

In [9]:
df.drop(["story", "summary", "cast"], axis=1, inplace=True)
df.head()

Unnamed: 0,title,genres,tags
0,Lucifer,"Crime,Drama,Fantasy",
1,Army of the Dead,"Action,Crime,Horror",
2,The Kominsky Method,"Comedy,Drama",
3,Friends,"Comedy,Romance",
4,Ragnarok,"Action,Drama,Fantasy",


In [10]:
df.tags = [str(x).replace(","," ").replace("["," ").replace("]"," ").replace("'"," ") for x in df.tags]

In [11]:
df.genres = [x.replace(","," ") for x in df.genres]
df.genres.str.split(expand=True).stack().value_counts()

Drama          2719
Comedy         2055
Action         1180
Documentary     977
Crime           967
Animation       878
Adventure       857
Thriller        659
Romance         655
Mystery         467
Horror          415
Fantasy         372
Family          352
Biography       274
Sci-Fi          234
History         217
Reality-TV      184
Music           160
Sport           128
Short           119
\N               66
Musical          48
War              48
Game-Show        34
Talk-Show        31
Western          25
News             20
Film-Noir        11
dtype: int64

In [14]:
df.head()

Unnamed: 0,title,genres,tags,action,adventure,animation,biography,comedy,crime,documentary,...,romance,sci,short,show,sport,talk,thriller,tv,war,western
0,Lucifer,Crime Drama Fantasy,,0.0,0.0,0.0,0.0,0.0,0.561013,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Army of the Dead,Action Crime Horror,,0.495983,0.0,0.0,0.0,0.0,0.532222,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,The Kominsky Method,Comedy Drama,,0.0,0.0,0.0,0.0,0.754111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Friends,Comedy Romance,,0.0,0.0,0.0,0.0,0.547855,0.0,0.0,...,0.836573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Ragnarok,Action Drama Fantasy,,0.533985,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
v = TfidfVectorizer()
x = v.fit_transform(df["genres"])
df1 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())
df = pd.concat([df, df1], axis=1)
# df.drop(["genres"], axis = 1, inplace=True) 

In [13]:
v = TfidfVectorizer()
x = v.fit_transform(df["tags"])
df1 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())
df1 = pd.concat([df, df1], axis=1)
df1.drop(["tags","title","genres"], axis = 1, inplace=True) 

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
import scipy
mat = scipy.sparse.csr_matrix(df1.values)
from sklearn.metrics.pairwise import sigmoid_kernel
sig1 = sigmoid_kernel(mat,mat)

In [None]:
index = pd.Series(df.index, index = df.title).drop_duplicates()

In [None]:
def recommend(title, sig=sig1):
    idx = index[title]
    scores = list(enumerate(sig1[idx]))
    scores = sorted(scores, key=lambda x:x[1], reverse=True)
    scores= scores[1:11]
    mov_index = [i[0] for i in scores]
    return df.iloc[mov_index][["title","genres"]]

In [None]:
recommend("Ragnarok")

In [None]:
recommend("Breaking Bad")

In [None]:
# df.to_csv("tag.csv", index=False)
# df = pd.read_csv("tag.csv")