In [1]:
# Workflow
# https://techvidvan.com/tutorials/movie-recommendation-system-python-machine-learning/

In [2]:
# imports

import io
import numpy as np
import pandas as pd
import sys
import codecs

import nltk
from gensim.models import Word2Vec, KeyedVectors
from sklearn.manifold import TSNE

%matplotlib inline
import matplotlib.pyplot as plt

import ssl

from ast import literal_eval

In [3]:
# override ssl error from trying to reload nltk data

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [4]:
# Load in data
watch_history = pd.read_csv('NetflixViewingHistoryDan.csv')
watch_history

Unnamed: 0,Title,Date
0,Big Fish,3/28/22
1,Monsters vs. Aliens,3/19/22
2,Brand New Cherry Flavor: Limited Series: Tadpo...,3/10/22
3,Brand New Cherry Flavor: Limited Series: I Exist,3/10/22
4,Starship Troopers,3/9/22
...,...,...
973,Battle for Haditha,3/14/15
974,Blackfish,3/14/15
975,The Immigrant,2/24/15
976,Elsa & Fred,2/23/15


In [5]:
# Netflix database
netflix = pd.read_csv('netflix_titles.csv')
netflix

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [6]:
# Keeping only movie data for watch history
movie_dict = {
    'title': [],
    'director': [],
    'cast': [],
    'list': [],
    'description': []
}

for item in watch_history['Title']:
    match = netflix[netflix['title']== item]
    if match.shape[0] != 0 and 'Movie' in match['type'].tolist():
        movie_dict['title'].append(match['title'].tolist()[0])
        movie_dict['director'].append(match['director'].tolist()[0])
        movie_dict['cast'].append(match['cast'].tolist()[0])
        movie_dict['list'].append(match['listed_in'].tolist()[0])
        movie_dict['description'].append(match['description'].tolist()[0])


In [7]:
# Create a new dataframe of only those titles
movies_df = pd.DataFrame(movie_dict)
movies_df

Unnamed: 0,title,director,cast,list,description
0,Total Recall,Paul Verhoeven,"Arnold Schwarzenegger, Rachel Ticotin, Sharon ...","Action & Adventure, Sci-Fi & Fantasy","After getting a memory implant, working stiff ..."
1,Someone Great,Jennifer Kaytin Robinson,"Gina Rodriguez, Brittany Snow, DeWanda Wise, L...","Comedies, Romantic Movies","On the heels of a blindsiding breakup, music j..."
2,The Guns of Navarone,J. Lee Thompson,"Gregory Peck, David Niven, Anthony Quinn, Stan...","Action & Adventure, Classic Movies","During World War II, British forces launch an ..."
3,Stripes,Ivan Reitman,"Bill Murray, Harold Ramis, Warren Oates, P.J. ...","Classic Movies, Comedies, Cult Movies","After losing everything, an indolent sad sack ..."
4,Klaus,Sergio Pablos,"Jason Schwartzman, J.K. Simmons, Rashida Jones...","Children & Family Movies, Comedies",A selfish postman and a reclusive toymaker for...
...,...,...,...,...,...
162,The November Man,Roger Donaldson,"Pierce Brosnan, Luke Bracey, Olga Kurylenko, E...",Action & Adventure,An ex-CIA agent emerges from retirement to pro...
163,Kevin Hart: Let Me Explain,"Leslie Small, Tim Story",Kevin Hart,Stand-Up Comedy,Philadelphia funnyman Kevin Hart takes the sta...
164,Bill Burr: Let It Go,Shannon Hartman,Bill Burr,Stand-Up Comedy,The musings of comedian Bill Burr are let loos...
165,Blackfish,Gabriela Cowperthwaite,,Documentaries,This fascinating documentary examines the life...


In [8]:
# Having only movies in our main database
netflix = netflix[netflix['type'] == 'Movie']

In [9]:
def literal_return(val):
    try:
        return literal_eval(val)
    except ValueError:
        return (val)
    except SyntaxError:
        return(val)

In [10]:
# Selecting the features we want to use
features = ["director", "cast", "list", "description"]

for feature in features:
    movies_df[feature] = movies_df[feature].apply(literal_return)

movies_df[features].head(10)

Unnamed: 0,director,cast,list,description
0,Paul Verhoeven,"Arnold Schwarzenegger, Rachel Ticotin, Sharon ...","Action & Adventure, Sci-Fi & Fantasy","After getting a memory implant, working stiff ..."
1,Jennifer Kaytin Robinson,"Gina Rodriguez, Brittany Snow, DeWanda Wise, L...","Comedies, Romantic Movies","On the heels of a blindsiding breakup, music j..."
2,J. Lee Thompson,"Gregory Peck, David Niven, Anthony Quinn, Stan...","Action & Adventure, Classic Movies","During World War II, British forces launch an ..."
3,Ivan Reitman,"Bill Murray, Harold Ramis, Warren Oates, P.J. ...","Classic Movies, Comedies, Cult Movies","After losing everything, an indolent sad sack ..."
4,Sergio Pablos,"Jason Schwartzman, J.K. Simmons, Rashida Jones...","Children & Family Movies, Comedies",A selfish postman and a reclusive toymaker for...
5,Paul Verhoeven,"Arnold Schwarzenegger, Rachel Ticotin, Sharon ...","Action & Adventure, Sci-Fi & Fantasy","After getting a memory implant, working stiff ..."
6,Terry Jones,"Graham Chapman, John Cleese, Terry Gilliam, Er...","Classic Movies, Comedies, Cult Movies","Born in a stable in Judea, Brian grows up to j..."
7,David Gordon Green,"Seth Rogen, James Franco, Danny McBride, Kevin...","Action & Adventure, Comedies","After witnessing a murder, a perpetually stone..."
8,"Mike Rianda, Jeff Rowe","Danny McBride, Abbi Jacobson, Maya Rudolph, Mi...","Children & Family Movies, Comedies",A robot apocalypse put the brakes on their cro...
9,Matt Thompson,"Channing Tatum, Jason Mantzoukas, Olivia Munn,...","Action & Adventure, Comedies",A chainsaw-wielding George Washington teams wi...


In [11]:
# Clean data for easier conversion to metadata
def clean_data(row):
    if isinstance(row, list):
        return [str.lower(i.replace(" ", "")) for i in row]
    else:
        if isinstance(row, str):
            return str.lower(row.replace(" ", ""))
        else:
            return ""

for feature in features:
    movies_df[feature] = movies_df[feature].apply(clean_data)

In [12]:
# Metadata creation
def create_soup(features):
    return ' '.join(features['director']) + ' ' + ' '.join(features['cast']) + ' ' + features['list'] + ' ' + ' '.join(features['description'])


movies_df["soup"] = movies_df.apply(create_soup, axis=1)
print(movies_df["soup"].head())

0    p a u l v e r h o e v e n a r n o l d s c h w ...
1    j e n n i f e r k a y t i n r o b i n s o n g ...
2    j . l e e t h o m p s o n g r e g o r y p e c ...
3    i v a n r e i t m a n b i l l m u r r a y , h ...
4    s e r g i o p a b l o s j a s o n s c h w a r ...
Name: soup, dtype: object


In [13]:
# Creating one large metadata entry for all watched movies
# Possible replacement with some dimensionality reduction technique
final_soup = ' '.join(movies_df['soup'].tolist())
final_soup

'p a u l v e r h o e v e n a r n o l d s c h w a r z e n e g g e r , r a c h e l t i c o t i n , s h a r o n s t o n e , m i c h a e l i r o n s i d e , r o n n y c o x , m a r s h a l l b e l l , m i c h a e l c h a m p i o n , m e l j o h n s o n j r . , r o y b r o c k s m i t h , r o s e m a r y d u n s m o r e action&adventure,sci-fi&fantasy a f t e r g e t t i n g a m e m o r y i m p l a n t , w o r k i n g s t i f f d o u g l a s q u a i d d i s c o v e r s h e m i g h t a c t u a l l y b e a s e c r e t a g e n t e m b r o i l e d i n a v i o l e n t i n s u r r e c t i o n o n m a r s . j e n n i f e r k a y t i n r o b i n s o n g i n a r o d r i g u e z , b r i t t a n y s n o w , d e w a n d a w i s e , l a k e i t h s t a n f i e l d , r u p a u l c h a r l e s , p e t e r v a c k , a l e x m o f f a t , r o s a r i o d a w s o n , r e b e c c a n a o m i j o n e s , j a b o u k i e y o u n g - w h i t e , m i c h e l l e b u t e a u , q u e s t l o v e comedies,romanticmo

In [14]:
# Final database for similarity matrix calculation
movie_database = netflix.drop(['show_id', 'type', 'country', 'date_added', 'release_year', 'rating', 'duration'], axis=1)

In [15]:
# Selecting the features we want to use in our final database
features = ["director", "cast", "listed_in", "description"]

for feature in features:
    movie_database[feature] = movie_database[feature].apply(literal_return)

movie_database[features].head(10)

Unnamed: 0,director,cast,listed_in,description
0,Kirsten Johnson,,Documentaries,"As her father nears the end of his life, filmm..."
6,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
7,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
9,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...","Comedies, Dramas",A woman adjusting to life after a loss contend...
12,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Dramas, International Movies",After most of her family is murdered in a terr...
13,Bruno Garotti,"Klara Castanho, Lucca Picon, Júlia Gomes, Marc...","Children & Family Movies, Comedies",When the clever but socially-awkward Tetê join...
16,"Pedro de Echave García, Pablo Azorín Williams",,"Documentaries, International Movies",Declassified documents reveal the post-WWII li...
18,Adam Salky,"Freida Pinto, Logan Marshall-Green, Robert Joh...",Thrillers,After a deadly home invasion at a couple’s new...
22,K.S. Ravikumar,"Kamal Hassan, Meena, Gemini Ganesan, Heera Raj...","Comedies, International Movies",Newly divorced and denied visitation rights wi...
23,"Alex Woo, Stanley Moore","Maisie Benson, Paul Killam, Kerry Gudjohnsen, ...",Children & Family Movies,From arcade games to sled days and hiccup cure...


In [16]:
# Clean data for easier conversion to metadata
def clean_data(row):
    if isinstance(row, list):
        return [str.lower(i.replace(" ", "")) for i in row]
    else:
        if isinstance(row, str):
            return str.lower(row.replace(" ", ""))
        else:
            return ""

for feature in features:
    movie_database[feature] = movie_database[feature].apply(clean_data)

In [17]:
# Metadata creation
def create_soup(features):
    return ' '.join(features['director']) + ' ' + ' '.join(features['cast']) + ' ' + features['listed_in'] + ' ' + ' '.join(features['description'])


movie_database["soup"] = movie_database.apply(create_soup, axis=1)
print(movie_database["soup"].head())

0     k i r s t e n j o h n s o n  documentaries a s...
6     r o b e r t c u l l e n , j o s é l u i s u c ...
7     h a i l e g e r i m a k o f i g h a n a b a , ...
9     t h e o d o r e m e l f i m e l i s s a m c c ...
12    c h r i s t i a n s c h w o c h o w l u n a w ...
Name: soup, dtype: object


In [18]:
movie_database

Unnamed: 0,title,director,cast,listed_in,description,soup
0,Dick Johnson Is Dead,kirstenjohnson,,documentaries,"asherfathernearstheendofhislife,filmmakerkirst...",k i r s t e n j o h n s o n documentaries a s...
6,My Little Pony: A New Generation,"robertcullen,joséluisucha","vanessahudgens,kimikoglenn,jamesmarsden,sofiac...",children&familymovies,equestria'sdivided.butabright-eyedherobelieves...,"r o b e r t c u l l e n , j o s é l u i s u c ..."
7,Sankofa,hailegerima,"kofighanaba,oyafunmikeogunlano,alexandraduah,n...","dramas,independentmovies,internationalmovies","onaphotoshootinghana,anamericanmodelslipsbacki...","h a i l e g e r i m a k o f i g h a n a b a , ..."
9,The Starling,theodoremelfi,"melissamccarthy,chriso'dowd,kevinkline,timothy...","comedies,dramas",awomanadjustingtolifeafteralosscontendswithafe...,t h e o d o r e m e l f i m e l i s s a m c c ...
12,Je Suis Karl,christianschwochow,"lunawedler,jannisniewöhner,milanpeschel,edinha...","dramas,internationalmovies",aftermostofherfamilyismurderedinaterroristbomb...,c h r i s t i a n s c h w o c h o w l u n a w ...
...,...,...,...,...,...,...
8801,Zinzana,majidalansari,"alisuliman,salehbakri,yasa,alial-jabri,mansoor...","dramas,internationalmovies,thrillers",recoveringalcoholictalalwakesupinsideasmall-to...,m a j i d a l a n s a r i a l i s u l i m a n ...
8802,Zodiac,davidfincher,"markruffalo,jakegyllenhaal,robertdowneyjr.,ant...","cultmovies,dramas,thrillers","apoliticalcartoonist,acrimereporterandapairofc...",d a v i d f i n c h e r m a r k r u f f a l o ...
8804,Zombieland,rubenfleischer,"jesseeisenberg,woodyharrelson,emmastone,abigai...","comedies,horrormovies","lookingtosurviveinaworldtakenoverbyzombies,ado...",r u b e n f l e i s c h e r j e s s e e i s e ...
8805,Zoom,peterhewitt,"timallen,courteneycox,chevychase,katemara,ryan...","children&familymovies,comedies","draggedfromcivilianlife,aformersuperheromusttr...","p e t e r h e w i t t t i m a l l e n , c o u ..."


In [19]:
title = ' '.join(movies_df['title'].tolist())
directors= ' '.join(movies_df['title'].tolist())
cast = ' '.join(movies_df['cast'].tolist())
listed_in = ' '.join(movies_df['list'].tolist())
description = ' '.join(movies_df['description'].tolist())

In [20]:
movie_database.loc[8807] = [title, directors, cast, listed_in,description, final_soup]

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

count_vectorizer = CountVectorizer(stop_words="english")
count_matrix = count_vectorizer.fit_transform(movie_database["soup"])

print(count_matrix.shape)

cosine_sim2 = cosine_similarity(count_matrix, count_matrix) 
print(cosine_sim2.shape)

movie_database = movie_database.reset_index()
indices = pd.Series(movie_database.index, index=movie_database['title'])

(6132, 27)
(6132, 6132)


In [22]:
indices = pd.Series(movie_database.index, index=movie_database["title"]).drop_duplicates()

print(indices.head())

title
Dick Johnson Is Dead                0
My Little Pony: A New Generation    1
Sankofa                             2
The Starling                        3
Je Suis Karl                        4
dtype: int64


In [28]:
def get_recommendations(title, cosine_sim=cosine_sim2):
    idx = indices[title]
    similarity_scores = list(enumerate(cosine_sim[idx]))
    similarity_scores= sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores= similarity_scores[1:11]
    # (a, b) where a is id of movie, b is similarity_scores

    movies_indices = [ind[0] for ind in similarity_scores]
    movies = movie_database["title"].iloc[movies_indices]
    return movies

In [38]:
print("################ Content Based System #############")
# print("Recommendations for The Dark Knight Rises")
# print(get_recommendations("The Dark Knight Rises", cosine_sim2))
# print()
print("Recommendations for Zodiac")
print(get_recommendations(indices.index[-1], cosine_sim2))

################ Content Based System #############
Recommendations for Zodiac
48                  Omo Ghetto: the Saga
58                      Angamaly Diaries
633     Shadow and Bone - The Afterparty
862                 Awara Paagal Deewana
1248           Hunt for the Wilderpeople
1467             Bbuddah Hoga Terra Baap
1470                                Boss
1705                           R.K.Nagar
1799                  Muqaddar ka Faisla
1860    Angu Vaikuntapurathu (Malayalam)
Name: title, dtype: object
