# Chapter 4 - TF-IDF and similarity scores

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

## Building tf-idf document vectors

## tf-idf vectors for TED talks

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
ted = pd.read_csv("ted.csv")

In [4]:
ted.head()

Unnamed: 0,transcript,url
0,"We're going to talk — my — a new lecture, just...",https://www.ted.com/talks/al_seckel_says_our_b...
1,"This is a representation of your brain, and yo...",https://www.ted.com/talks/aaron_o_connell_maki...
2,It's a great honor today to share with you The...,https://www.ted.com/talks/carter_emmart_demos_...
3,"My passions are music, technology and making t...",https://www.ted.com/talks/jared_ficklin_new_wa...
4,It used to be that if you wanted to get a comp...,https://www.ted.com/talks/jeremy_howard_the_wo...


In [5]:
# Create TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(ted['transcript'])

# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)

(500, 29158)


## Cosine similarity

### Computing dot product

In [6]:
# Initialize numpy vectors
A = np.array([1,3])
B = np.array([-2,2])

# Compute dot product
dot_prod = np.dot(A, B)

# Print dot product
print(dot_prod)

4


### Cosine similarity matrix of a corpus

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
corpus = ted['transcript']

In [11]:
# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_sim)

[[1.         0.47014672 0.43111468 ... 0.46412547 0.45670633 0.49282253]
 [0.47014672 1.         0.44271363 ... 0.46985012 0.44711665 0.50449557]
 [0.43111468 0.44271363 1.         ... 0.46801727 0.39181858 0.46637994]
 ...
 [0.46412547 0.46985012 0.46801727 ... 1.         0.45097242 0.48634418]
 [0.45670633 0.44711665 0.39181858 ... 0.45097242 1.         0.49410619]
 [0.49282253 0.50449557 0.46637994 ... 0.48634418 0.49410619 1.        ]]


## Building a plot line based recommender

In [24]:
#movies = pd.read_csv("movies_genres.csv.bz2", delimiter='\t')
#movies_plot = movies['plot']
#movies_plot.head()

### Comparing linear_kernel and cosine_similarity

In [14]:
import time
from sklearn.metrics.pairwise import linear_kernel

In [13]:
# Record start time
start = time.time()

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Print cosine similarity matrix
print(cosine_sim)

# Print time taken
print("Time taken: %s seconds" %(time.time() - start))

[[1.         0.47014672 0.43111468 ... 0.46412547 0.45670633 0.49282253]
 [0.47014672 1.         0.44271363 ... 0.46985012 0.44711665 0.50449557]
 [0.43111468 0.44271363 1.         ... 0.46801727 0.39181858 0.46637994]
 ...
 [0.46412547 0.46985012 0.46801727 ... 1.         0.45097242 0.48634418]
 [0.45670633 0.44711665 0.39181858 ... 0.45097242 1.         0.49410619]
 [0.49282253 0.50449557 0.46637994 ... 0.48634418 0.49410619 1.        ]]
Time taken: 0.09172368049621582 seconds


In [15]:
# Record start time
start = time.time()

# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Print cosine similarity matrix
print(cosine_sim)

# Print time taken
print("Time taken: %s seconds" %(time.time() - start))

[[1.         0.47014672 0.43111468 ... 0.46412547 0.45670633 0.49282253]
 [0.47014672 1.         0.44271363 ... 0.46985012 0.44711665 0.50449557]
 [0.43111468 0.44271363 1.         ... 0.46801727 0.39181858 0.46637994]
 ...
 [0.46412547 0.46985012 0.46801727 ... 1.         0.45097242 0.48634418]
 [0.45670633 0.44711665 0.39181858 ... 0.45097242 1.         0.49410619]
 [0.49282253 0.50449557 0.46637994 ... 0.48634418 0.49410619 1.        ]]
Time taken: 0.0937492847442627 seconds


### The recommender function

In [43]:
metadata = pd.read_csv("movie_overviews.csv")
metadata.head(10)

Unnamed: 0,id,title,overview,tagline
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...
5,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga
6,11860,Sabrina,An ugly duckling having undergone a remarkable...,You are cordially invited to the most surprisi...
7,45325,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",The Original Bad Boys.
8,9091,Sudden Death,International action superstar Jean Claude Van...,Terror goes into overtime.
9,710,GoldenEye,James Bond must unmask the mysterious head of ...,No limits. No fears. No substitutes.


In [75]:
def get_recommendations(title, cosine_sim, indices):
    # Get index of movie that matches title
    idx = indices[title]
    
    # Sort the movies based on the similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores for 10 most similar movies
    sim_scores = sim_scores[1:11]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

In [39]:
metadata = metadata.dropna()

In [40]:
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()

tfidf_vectorizer = TfidfVectorizer()
corpus = metadata['overview']
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [44]:
get_recommendations('Sudden Death',cosine_sim,indices)

4143                                 Men in Black II
1992                             Simply Irresistible
6713                                            Fido
4147                               Road to Perdition
1993                                        20 Dates
3775                                        Cruising
2867                               Battlefield Earth
1968    Texas Chainsaw Massacre: The Next Generation
1995                                   The Last Days
4144                       The Powerpuff Girls Movie
Name: title, dtype: object

### TED talk recommender

In [89]:
def get_ted_recommendations(title, cosine_sim, indices, df):
    # Get index of movie that matches title
    idx = indices[title]
    
    # Sort the movies based on the similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores for 10 most similar movies
    sim_scores = sim_scores[1:11]
    
    # Get the ted indices
    ted_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return df['title'].iloc[ted_indices]

In [82]:
ted = pd.read_csv("dataset/ted_main.csv")
transcripts = pd.read_csv("dataset/transcripts.csv")

In [83]:
ted['title'].head()

0        Do schools kill creativity?
1        Averting the climate crisis
2                   Simplicity sells
3                Greening the ghetto
4    The best stats you've ever seen
Name: title, dtype: object

In [84]:
transcripts = transcripts['transcript']

In [85]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(transcripts)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [86]:
indices = pd.Series(ted.index, index=ted['title']).drop_duplicates()

In [90]:
print(get_ted_recommendations('Averting the climate crisis', cosine_sim, indices, ted))

364                   Organic algorithms in architecture
197                 A preview of the WorldWide Telescope
40     My wish: Manufactured landscapes and green edu...
219                          Where does creativity hide?
354                            Why we make bad decisions
210             The search for the true face of Leonardo
102                                      "Black Men Ski"
199              On the verge of creating synthetic life
597                     10 young Indian artists to watch
242                         Your genes are not your fate
Name: title, dtype: object


## Beyond n-grams: word embeddings

In [91]:
import spacy

In [93]:
sent = 'I like apples and oranges'
nlp = spacy.load('en_core_web_sm')

In [95]:
# Create the doc object
doc = nlp(sent)

# Compute pairwise similarity scores
for token1 in doc:
    for token2 in doc:
        print(token1.text, token2.text, token1.similarity(token2))

I I 1.0
I like 0.13463897
I apples -0.0361336
I and -0.08523058
I oranges 0.03370864
like I 0.13463897
like like 1.0
like apples 0.0007651841
like and 0.10452179
like oranges -0.045859132
apples I -0.0361336
apples like 0.0007651841
apples apples 1.0
apples and -0.051072996
apples oranges 0.46452007
and I -0.08523058
and like 0.10452179
and apples -0.051072996
and and 1.0
and oranges 0.038236685
oranges I 0.03370864
oranges like -0.045859132
oranges apples 0.46452007
oranges and 0.038236685
oranges oranges 1.0


  import sys


### Computing similarity of Pink Floyd songs

In [97]:
mother = "\nMother do you think they'll drop the bomb?\nMother do you think they'll like this song?\nMother do you think they'll try to break my balls?\nOoh, ah\nMother should I build the wall?\nMother should I run for President?\nMother should I trust the government?\nMother will they put me in the firing mine?\nOoh ah,\nIs it just a waste of time?\nHush now baby, baby, don't you cry.\nMama's gonna make all your nightmares come true.\nMama's gonna put all her fears into you.\nMama's gonna keep you right here under her wing.\nShe won't let you fly, but she might let you sing.\nMama's gonna keep baby cozy and warm.\nOoh baby, ooh baby, ooh baby,\nOf course mama's gonna help build the wall.\nMother do you think she's good enough, for me?\nMother do you think she's dangerous, to me?\nMother will she tear your little boy apart?\nOoh ah,\nMother will she break my heart?\nHush now baby, baby don't you cry.\nMama's gonna check out all your girlfriends for you.\nMama won't let anyone dirty get through.\nMama's gonna wait up until you get in.\nMama will always find out where you've been.\nMama's gonna keep baby healthy and clean.\nOoh baby, ooh baby, ooh baby,\nYou'll always be baby to me.\nMother, did it need to be so high?\n"
hey = "\nHey you, out there in the cold\nGetting lonely, getting old\nCan you feel me?\nHey you, standing in the aisles\nWith itchy feet and fading smiles\nCan you feel me?\nHey you, don't help them to bury the light\nDon't give in without a fight\nHey you out there on your own\nSitting naked by the phone\nWould you touch me?\nHey you with you ear against the wall\nWaiting for someone to call out\nWould you touch me?\nHey you, would you help me to carry the stone?\nOpen your heart, I'm coming home\nBut it was only fantasy\nThe wall was too high\nAs you can see\nNo matter how he tried\nHe could not break free\nAnd the worms ate into his brain\nHey you, out there on the road\nAlways doing what you're told\nCan you help me?\nHey you, out there beyond the wall\nBreaking bottles in the hall\nCan you help me?\nHey you, don't tell me there's no hope at all\nTogether we stand, divided we fall\n"
hopes = "\nBeyond the horizon of the place we lived when we were young\nIn a world of magnets and miracles\nOur thoughts strayed constantly and without boundary\nThe ringing of the division bell had begun\nAlong the Long Road and on down the Causeway\nDo they still meet there by the Cut\nThere was a ragged band that followed in our footsteps\nRunning before times took our dreams away\nLeaving the myriad small creatures trying to tie us to the ground\nTo a life consumed by slow decay\nThe grass was greener\nThe light was brighter\nWhen friends surrounded\nThe nights of wonder\nLooking beyond the embers of bridges glowing behind us\nTo a glimpse of how green it was on the other side\nSteps taken forwards but sleepwalking back again\nDragged by the force of some in a tide\nAt a higher altitude with flag unfurled\nWe reached the dizzy heights of that dreamed of world\nEncumbered forever by desire and ambition\nThere's a hunger still unsatisfied\nOur weary eyes still stray to the horizon\nThough down this road we've been so many times\nThe grass was greener\nThe light was brighter\nThe taste was sweeter\nThe nights of wonder\nWith friends surrounded\nThe dawn mist glowing\nThe water flowing\nThe endless river\nForever and ever\n"

In [101]:
mother_doc = nlp(mother)
hopes_doc = nlp(hopes)
hey_doc = nlp(hey)

In [102]:
print(mother_doc.similarity(hopes_doc))

0.3908603353929541


  """Entry point for launching an IPython kernel.


In [103]:
print(mother_doc.similarity(hey_doc))

0.8043759491830315


  """Entry point for launching an IPython kernel.


In [105]:
50737-51950

-1213