# Finding similar movies/TV shows using their descriptions

Created for learning purposes.

In [34]:
import numpy as np # linear algebra
import pandas as pd # data processing (CSV loading, DataFrame handling)
import nltk # language processing tools (tokenizer used on the descriptions)
# Fetch the 'punkt' resource; presumably required by nltk.word_tokenize, which this notebook uses - verify for your NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
# Read the Netflix titles CSV (path is resolved relative to the working directory) into a DataFrame
csv_path = "netflix_titles.csv"
netflix_data = pd.read_csv(csv_path)

In [36]:
#Preview the loaded frame (a bare last expression is rendered by the notebook)
netflix_data

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...
...,...,...,...,...,...,...,...,...,...,...,...,...
7782,s7783,Movie,Zozo,Josef Fares,"Imad Creidi, Antoinette Turk, Elias Gergi, Car...","Sweden, Czech Republic, United Kingdom, Denmar...","October 19, 2020",2005,TV-MA,99 min,"Dramas, International Movies",When Lebanon's Civil War deprives Zozo of his ...
7783,s7784,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,"March 2, 2019",2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...
7784,s7785,Movie,Zulu Man in Japan,,Nasty C,,"September 25, 2020",2019,TV-MA,44 min,"Documentaries, International Movies, Music & M...","In this documentary, South African rapper Nast..."
7785,s7786,TV Show,Zumbo's Just Desserts,,"Adriano Zumbo, Rachel Khoo",Australia,"October 31, 2020",2019,TV-PG,1 Season,"International TV Shows, Reality TV",Dessert wizard Adriano Zumbo looks for the nex...


In [37]:
# Use Netflix's own identifier (show_id) as the row index.
# The guard plus reassignment (instead of inplace=True) keeps this cell re-runnable:
# on a second run show_id is already the index, not a column, and an unguarded
# set_index('show_id') would raise KeyError.
if 'show_id' in netflix_data.columns:
    netflix_data = netflix_data.set_index('show_id')

In [38]:
# Split every description into tokens by applying nltk's word tokenizer to the whole column
descriptions_tokenized = list(map(nltk.word_tokenize, netflix_data['description']))

In [39]:
#Check first description
descriptions_tokenized[0]

# Explanation
# descriptions_tokenized[i][j]
# i - index of the description
# j - index of the token within the chosen description
# Example: descriptions_tokenized[14][5] - sixth token of the fifteenth description (because indices start at 0)

['In',
 'a',
 'future',
 'where',
 'the',
 'elite',
 'inhabit',
 'an',
 'island',
 'paradise',
 'far',
 'from',
 'the',
 'crowded',
 'slums',
 ',',
 'you',
 'get',
 'one',
 'chance',
 'to',
 'join',
 'the',
 '3',
 '%',
 'saved',
 'from',
 'squalor',
 '.']

In [40]:
# Assign every distinct token an integer id so documents can later be expressed as numbers.
# A gensim Dictionary does exactly that: it stores each token once, without repetition.
from gensim import corpora
dictionary = corpora.Dictionary(documents=descriptions_tokenized)
print(dictionary)

Dictionary(21381 unique tokens: ['%', ',', '.', '3', 'In']...)


In [41]:
# The dictionary holds 21381 unique tokens; now count how many tokens there are in total.
# Descriptions differ in length, so add up the length of every tokenized description.
sum(map(len, descriptions_tokenized))

205846

In [42]:
#If the dictionary were huge, we could shrink it by dropping low-frequency tokens (say, those that appear only once)
#Every token now has an integer id; token2id maps a token to its id
dictionary.token2id['island']

15

In [43]:
# Convert every tokenized description into its bag-of-words (bow) form: token ids instead of token strings.
descriptions_bow = list(map(dictionary.doc2bow, descriptions_tokenized))

In [44]:
#Each tuple is (token id, number of times that token occurs) for the description at position 89
descriptions_bow[89]

[(1, 1),
 (2, 1),
 (5, 1),
 (6, 2),
 (23, 2),
 (52, 1),
 (57, 1),
 (90, 1),
 (100, 2),
 (161, 1),
 (174, 1),
 (348, 1),
 (423, 1),
 (431, 1),
 (440, 1),
 (612, 1),
 (964, 1),
 (1135, 1),
 (1136, 1),
 (1137, 1),
 (1138, 1),
 (1139, 1),
 (1140, 1),
 (1141, 1),
 (1142, 1),
 (1143, 1),
 (1144, 1),
 (1145, 1)]

In [45]:
#Build a model that represents documents as weighted vectors, so similarity can be computed numerically.
#TF-IDF derives each token's weight from frequency statistics of the corpus.
from gensim import models

# Train the tfidf model on the bag-of-words corpus
tfidf = models.TfidfModel(descriptions_bow)

# Transform the "shoot enemies" string to test how it works. Each pair is (token id, tf-idf weight).
# The query is tokenized exactly like the corpus (nltk.word_tokenize, case preserved):
# the dictionary is case-sensitive and has punctuation as separate tokens, so
# lower()/split() would silently drop capitalised or punctuated words from a query.
words = nltk.word_tokenize("shoot enemies")
print(tfidf[dictionary.doc2bow(words)])

[(2476, 0.5918280568737562), (18097, 0.8060642350936023)]


In [46]:
#Build a sparse matrix similarity index over the TF-IDF vectors of all descriptions
from gensim import similarities

# num_features is the vector dimensionality, i.e. the vocabulary size len(dictionary).
# dictionary.num_pos is the total number of tokens processed, which only worked because it
# is larger than the vocabulary and made the index needlessly wide.
index = similarities.SparseMatrixSimilarity(tfidf[descriptions_bow], num_features=len(dictionary))

In [47]:
# Time to query the model with a description from the dataset: the TV series La casa de papel.
# Select the rows whose title contains the literal text "La casa" to read off its show_id.
title_matches = netflix_data['title'].str.contains("La casa", regex=False, na=False)
netflix_data[title_matches]

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s3489,TV Show,La casa de papel,,"Úrsula Corberó, Itziar Ituño, Álvaro Morte, Pa...",Spain,"April 3, 2020",2020,TV-MA,4 Seasons,"Crime TV Shows, International TV Shows, Spanis...",Eight thieves take hostages and lock themselve...


In [48]:
#Position 3488 is the row of show_id s3489 (La casa de papel): positions start at 0, show ids at s1
descriptions_tokenized[3488]

['Eight',
 'thieves',
 'take',
 'hostages',
 'and',
 'lock',
 'themselves',
 'in',
 'the',
 'Royal',
 'Mint',
 'of',
 'Spain',
 'as',
 'a',
 'criminal',
 'mastermind',
 'manipulates',
 'the',
 'police',
 'to',
 'carry',
 'out',
 'his',
 'plan',
 '.']

In [49]:
#Bag-of-words (token id, count) pairs for the description at position 3488 (La casa de papel, show_id s3489)
descriptions_bow[3488]

[(2, 1),
 (5, 1),
 (22, 2),
 (23, 1),
 (37, 1),
 (57, 1),
 (73, 1),
 (77, 1),
 (100, 1),
 (610, 1),
 (983, 1),
 (1097, 1),
 (1332, 1),
 (1536, 1),
 (1766, 1),
 (1991, 1),
 (2402, 1),
 (3614, 1),
 (4206, 1),
 (6918, 1),
 (8125, 1),
 (9307, 1),
 (13759, 1),
 (13760, 1),
 (13761, 1)]

In [50]:
#Get the query's bag of words and its tfidf representation.
#The row position is looked up from the show_id (s3489 = La casa de papel) instead of a
#hand-computed literal, so it stays correct if the rows are ever reordered or filtered.
query_position = netflix_data.index.get_loc('s3489')
query_bow = descriptions_bow[query_position]
query_tfidf = tfidf[query_bow]

In [51]:
#Query the similarity index: yields one score per indexed description, in corpus order
sims = index[query_tfidf]

In [52]:
# Rank every description by similarity (highest first) and list the 15 best matches for La casa de papel
sorted_similar = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
top_matches = sorted_similar[:15]
for position, similarity in top_matches:
    matched_row = netflix_data.iloc[position]
    print(position, matched_row.title, similarity)

3488 La casa de papel 1.0
1134 Bullet Head 0.17674914
7381 V.R. Troopers 0.16174424
1453 Coin Heist 0.14132012
6249 The Crew 0.12991296
4870 Pizza, birra, faso 0.12717776
4950 Power Rangers Time Force 0.12639105
6307 The Do-Over 0.12385647
1808 Don 2 0.12192322
4799 Paul Blart: Mall Cop 0.11928776
373 Albert Pinto Ko Gussa Kyun Aata Hai? 0.118616186
3012 Inside Man: Most Wanted 0.114272796
4150 Mission of Honor 0.108038306
4453 New York Minute 0.10540801
5000 Pukar 0.10424699


In [53]:
# Print the descriptions of the first four ranked entries: the query itself (similarity 1.0,
# because it is identical to the query) followed by its three closest matches, to judge by eye
# whether they resemble La casa de papel.
for position, similarity in sorted_similar[0:4]:
    matched_row = netflix_data.iloc[position]
    print(matched_row.title)
    print(matched_row.description + "\n")

La casa de papel
Eight thieves take hostages and lock themselves in the Royal Mint of Spain as a criminal mastermind manipulates the police to carry out his plan.

Bullet Head
After a daring heist, three fugitives lock themselves in a warehouse hoping to evade the police, but find a worse threat as they fight to stay alive.

V.R. Troopers
Three friends who study martial arts find themselves defending humanity from a criminal mastermind and his invading army from another dimension.

Coin Heist
When a crisis threatens to destroy their high school, four teens hatch a daring plan to raise $10 million. Step one? Breaking into the U.S. Mint.



In [112]:
# Doc2Vec model (probably too small a dataset for this one to work well)
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Instantiate through the name imported above rather than `models.Doc2Vec`, so this cell
# does not silently depend on the `from gensim import models` of an earlier cell.
doc_model = Doc2Vec(vector_size=50, min_count=2, epochs=500)

In [113]:
# Tag every tokenized description with its position so results can be mapped back to rows of netflix_data
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(descriptions_tokenized)
]
# Register the vocabulary with the model before training
doc_model.build_vocab(documents)

In [114]:
# Train on the tagged descriptions. The epoch count is read from the model itself so it
# cannot drift from the value given when the model was constructed.
doc_model.train(documents, total_examples=doc_model.corpus_count, epochs=doc_model.epochs)

In [115]:
# Infer a vector for the La casa de papel description and fetch the closest trained document vectors.
# The row position is looked up from the show_id (s3489) instead of a hand-computed literal.
la_papel_position = netflix_data.index.get_loc('s3489')
la_papel_vector = doc_model.infer_vector(descriptions_tokenized[la_papel_position])
# NOTE(review): newer gensim releases expose these vectors as `dv` instead of `docvecs` - confirm against the installed version.
doc_sims = doc_model.docvecs.most_similar([la_papel_vector])

In [116]:
# List position, title and similarity score of every Doc2Vec match
for position, similarity in doc_sims:
    matched_row = netflix_data.iloc[position]
    print(position, matched_row.title, similarity)

3488 La casa de papel 0.9755862951278687
4403 Narcoworld: Dope Stories 0.6559664011001587
3267 K.O.3an Guo 0.638350248336792
4546 NSU German History X 0.6172715425491333
7016 The World Is Yours 0.6070820689201355
1711 Designated Survivor 0.584574818611145
1134 Bullet Head 0.581136167049408
6657 The Memphis Belle: A Story of a
Flying Fortress 0.5721739530563354
654 Baadshaho 0.5694758892059326
1054 Born in Gaza 0.5673355460166931


In [117]:
# Show title and description of the first four Doc2Vec matches for a manual comparison
for position, similarity in doc_sims[0:4]:
    matched_row = netflix_data.iloc[position]
    print(matched_row.title)
    print(matched_row.description + "\n")

La casa de papel
Eight thieves take hostages and lock themselves in the Royal Mint of Spain as a criminal mastermind manipulates the police to carry out his plan.

Narcoworld: Dope Stories
Ride along as police officers and drug smugglers go toe-to-toe, trying to outwit each other in locales around the world.

K.O.3an Guo
In the Silver Dimension, Liu Bei, Guan Yu and Zhang Fei plan to enroll in an elite academy as sworn brothers, but KO One visitors derail the plan.

NSU German History X
After German reunification, the neo-Nazi National Socialist Underground begins a killing spree while cops fight an uphill battle to catch them.

