# Due 4/26/2022

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

For various attributes, like similarity in description + title, actors, director, etc, depending on the position in each list a movie appears, it will be assigned a certain number of points, and the movie with the most points will be recommended next.

In [234]:
# load in the Netflix library
netflix = pd.read_csv("netflix_titles.csv")

# take out some of the things we care less about
netflix_simple = netflix.drop(['date_added', 'release_year', 'duration'], axis = 1)

In [281]:
# fill in any NaNs with empty strings so the vectorizing doesn't do a stupid

netflix_simple.fillna("", inplace=True)
netflix_simple.head()

# things that don't need extra manipulation
titledesc = netflix_simple['description'] + netflix_simple['title']
listed = netflix_simple['listed_in']

# things that do need some help, since they contain multiword tokens
cast = netflix_simple['cast']
cast = cast.str.rsplit(pat=',', n = 1, expand = True)

director = netflix_simple['director']
director = director.str.split(pat=',')

# all the things we;ll be using for the ranking
metrics = [titledesc, listed, cast, director]

In [282]:
cast = cast.iloc[:,0]

In [283]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english', token_pattern = ',')
tfidf_cast = tf.fit_transform(cast)

In [284]:
# Create tfidf vectorizer:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')

# Use vectorizer to create tfidf matrix.
tfidf_titledesc = tf.fit_transform(titledesc)
tfidf_listed = tf.fit_transform(listed)

In [285]:
# Compute the similarity matrix so we know how similar the given movie is to the others available
sim_titledesc = linear_kernel(tfidf_titledesc, tfidf_titledesc)
sim_cast = linear_kernel(tfidf_cast, tfidf_cast)
sim_listed = linear_kernel(tfidf_listed, tfidf_listed)

In [204]:
# just an example of how the indexing works here; consider deleting

movie_A = 1
movie_B = 5

similarity = sim_titledesc[movie_A, movie_B]
print("Similarity:", similarity)

print("Movie A:", cast.loc[movie_A])

Similarity: 0.009992218981363724
Movie A: Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng


In [286]:
# Reset the numbering of the rows to make sure no numbers are skipped because we removed rows
netflix_simple.reset_index(inplace=True)

# reindex according to title so that the indexing later is easier
indices = pd.Series(netflix_simple.index, index=netflix_simple['title'])

In [287]:
indices['Star Trek: The Next Generation']

4946

In [288]:
# Get the movie's index based on its title:
index = indices["Star Trek: The Next Generation"]

# Use that index to get the similarities matrix row
# that gives a similarity score for this movie
# compared to each other movie:
row = sim_cast[index]

# Print the similarity scores with the first 20 movies:
print(row[:20])

[0.         0.99891542 1.         0.         0.99990288 0.99920635
 0.99994763 0.99990288 0.96963605 0.9998368  0.         0.99850645
 0.9998368  0.99994763 0.         0.99990288 0.         0.99970591
 0.99994763 0.99990288]


In [289]:
# Convert that row to a list of (movie_row, similarity_score) pairs:
sim_scores = list(enumerate(row))

# sim_scores

In [290]:
# Sort the (movie_row, similarity_score) pairs by similarity score:
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

In [291]:
# Get the top 10 (movie_row, similarity_score) pairs:
closest_matches = sim_scores[1:10]

In [292]:
np.set_printoptions(suppress=True)

movie_indices = [i[0] for i in closest_matches]

contenders = np.array(closest_matches)

contenders

array([[ 38.,   1.],
       [ 50.,   1.],
       [ 71.,   1.],
       [100.,   1.],
       [114.,   1.],
       [165.,   1.],
       [169.,   1.],
       [204.,   1.],
       [219.,   1.]])

In [293]:
netflix_simple.iloc[movie_indices]

Unnamed: 0,level_0,index,show_id,type,title,director,cast,country,rating,listed_in,description
38,38,38,s39,Movie,Birth of the Dragon,George Nolfi,"Billy Magnussen, Ron Yuan, Qu Jingjing, Terry ...","China, Canada, United States",PG-13,"Action & Adventure, Dramas",A young Bruce Lee angers kung fu traditionalis...
50,50,50,s51,TV Show,Dharmakshetra,,"Kashmira Irani, Chandan Anand, Dinesh Mehta, A...",India,TV-PG,"International TV Shows, TV Dramas, TV Sci-Fi &...","After the ancient Great War, the god Chitragup..."
71,71,71,s72,Movie,A StoryBots Space Adventure,David A. Vargas,"Evan Spiridellis, Erin Fitzgerald, Jeff Gill, ...",,TV-Y,Children & Family Movies,Join the StoryBots and the space travelers of ...
100,100,100,s101,TV Show,Tobot Galaxy Detectives,,"Austin Abell, Travis Turner, Cole Howard, Anna...",,TV-Y7,Kids' TV,An intergalactic device transforms toy cars in...
114,114,114,s115,Movie,Anjaam,Rahul Rawail,"Madhuri Dixit, Shah Rukh Khan, Tinnu Anand, Jo...",India,TV-14,"Dramas, International Movies, Thrillers",A wealthy industrialist’s dangerous obsession ...
165,165,165,s166,TV Show,Oldsters,,"Patricio Contreras, Alejandro Goic, Sergio Her...",,TV-MA,"Crime TV Shows, International TV Shows, Spanis...",Three friends in their 70s step out of retirem...
169,169,169,s170,Movie,Poseidon,Wolfgang Petersen,"Josh Lucas, Kurt Russell, Jacinda Barrett, Ric...",United States,PG-13,"Action & Adventure, Dramas",A tidal wave spells disaster for a ship of New...
204,204,204,s205,Movie,Kyaa Super Kool Hain Hum,Sachin Yardi,"Tusshar Kapoor, Riteish Deshmukh, Anupam Kher,...",India,TV-MA,"Comedies, International Movies",An aspiring actor and a struggling DJ team up ...
219,219,219,s220,TV Show,EDENS ZERO,,"Takuma Terashima, Mikako Komatsu, Rie Kugimiya...",Japan,TV-14,"Anime Series, International TV Shows","Aboard the Edens Zero, a lonely boy with the a..."


In [104]:
row2 = sim_titledesc[index]

sim_scores = list(enumerate(row2))

sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

closest_matches2 = sim_scores[1:10]
                             
movie_indices = [i[0] for i in closest_matches2]

contenders2 = np.array(closest_matches2)
                             
netflix_simple.iloc[movie_indices]

Unnamed: 0,index,show_id,type,title,director,cast,country,rating,listed_in,description
5244,5244,s5245,TV Show,Star Trek: Enterprise,,"Scott Bakula, John Billingsley, Jolene Blalock...",United States,TV-14,"Classic & Cult TV, TV Action & Adventure, TV S...",Capt. Archer and his crew explore space and di...
5650,5650,s5651,TV Show,Star Trek: Deep Space Nine,,"Avery Brooks, Nana Visitor, Rene Auberjonois, ...",United States,TV-14,"TV Action & Adventure, TV Sci-Fi & Fantasy","In this ""Star Trek"" spin-off, Commander Sisko ..."
594,594,s595,Movie,Star Trek,J.J. Abrams,"Chris Pine, Zachary Quinto, Karl Urban, Zoe Sa...","United States, Germany",PG-13,"Action & Adventure, Sci-Fi & Fantasy",On their first voyage aboard the starship Ente...
5245,5245,s5246,TV Show,Star Trek: Voyager,,"Kate Mulgrew, Robert Beltran, Roxann Dawson, J...",United States,TV-PG,"TV Action & Adventure, TV Sci-Fi & Fantasy","On Voyager's 75-year journey back to Earth, th..."
8586,8586,s8587,Movie,Thumper,Jordan Ross,"Eliza Taylor, Pablo Schreiber, Daniel Webber, ...",United States,TV-MA,"Dramas, Thrillers",After moving to a hardscrabble suburban Califo...
2005,2005,s2006,Movie,Lara and the Beat,Tosin Coker,"Seyi Shay, Somkele Iyamah, Vector, Chioma Chuk...",Nigeria,TV-MA,"Dramas, International Movies, Music & Musicals","When their glamorous, fast-paced lifestyle com..."
956,956,s957,Movie,Zack and Miri Make a Porno,Kevin Smith,"Seth Rogen, Elizabeth Banks, Craig Robinson, J...",United States,R,"Comedies, Independent Movies, Romantic Movies",Zack and Miri make and star in an adult film t...
5693,5693,s5694,Movie,For the Love of Spock,Adam Nimoy,"Leonard Nimoy, William Shatner, George Takei, ...","Canada, United States",TV-14,Documentaries,The son of actor Leonard Nimoy directs this mo...
4652,4652,s4653,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,TV-Y7,Kids' TV,When a prison ship crash unleashes hundreds of...


In [103]:
contenders

array([[ 908.        ,    1.        ],
       [1779.        ,    1.        ],
       [2405.        ,    1.        ],
       [2470.        ,    1.        ],
       [3615.        ,    1.        ],
       [4946.        ,    1.        ],
       [5245.        ,    1.        ],
       [5650.        ,    1.        ],
       [3674.        ,    0.92786103]])

In [105]:
contenders2

array([[5244.        ,    0.20736046],
       [5650.        ,    0.11901829],
       [ 594.        ,    0.10358936],
       [5245.        ,    0.08648629],
       [8586.        ,    0.065304  ],
       [2005.        ,    0.05835767],
       [ 956.        ,    0.05614262],
       [5693.        ,    0.05493923],
       [4652.        ,    0.05028207]])

In [123]:
test = np.concatenate((contenders, contenders2))

test

array([[ 908.        ,    1.        ],
       [1779.        ,    1.        ],
       [2405.        ,    1.        ],
       [2470.        ,    1.        ],
       [3615.        ,    1.        ],
       [4946.        ,    1.        ],
       [5245.        ,    1.        ],
       [5650.        ,    1.        ],
       [3674.        ,    0.92786103],
       [5244.        ,    0.20736046],
       [5650.        ,    0.11901829],
       [ 594.        ,    0.10358936],
       [5245.        ,    0.08648629],
       [8586.        ,    0.065304  ],
       [2005.        ,    0.05835767],
       [ 956.        ,    0.05614262],
       [5693.        ,    0.05493923],
       [4652.        ,    0.05028207]])

In [163]:
net = test[:,0]

finalists = np.zeros((len(test), 2))

for i in range(len(net)):
    ind = np.where(test[:,0] == net[i])
    finalists[i,0] = net[i]
    finalists[i,1] = np.sum(test[ind,1])
    
finalists = np.unique(finalists, axis = 0)

In [164]:
finalists

array([[ 594.        ,    0.10358936],
       [ 908.        ,    1.        ],
       [ 956.        ,    0.05614262],
       [1779.        ,    1.        ],
       [2005.        ,    0.05835767],
       [2405.        ,    1.        ],
       [2470.        ,    1.        ],
       [3615.        ,    1.        ],
       [3674.        ,    0.92786103],
       [4652.        ,    0.05028207],
       [4946.        ,    1.        ],
       [5244.        ,    0.20736046],
       [5245.        ,    1.08648629],
       [5650.        ,    1.11901829],
       [5693.        ,    0.05493923],
       [8586.        ,    0.065304  ]])

In [165]:
print(np.where(finalists[:,1] == max(finalists[:,1])))

(array([13], dtype=int64),)


In [184]:
ind = finalists[np.where(finalists[:,1] == max(finalists[:,1]))]

ind = int(ind[0,0])

ind

5650

In [187]:
netflix_simple.iloc[ind]

index                                                       5650
show_id                                                    s5651
type                                                     TV Show
title                                 Star Trek: Deep Space Nine
director                                                        
cast           Avery Brooks, Nana Visitor, Rene Auberjonois, ...
country                                            United States
rating                                                     TV-14
listed_in             TV Action & Adventure, TV Sci-Fi & Fantasy
description    In this "Star Trek" spin-off, Commander Sisko ...
Name: 5650, dtype: object