# Vector Similarity

## Cosine Similarity

In [1]:
import numpy as np
from numpy import dot
from numpy.linalg import norm

In [7]:
def cos_sim(A, B):
    """Return the cosine similarity between vectors ``A`` and ``B``.

    The result is ``dot(A, B) / (||A|| * ||B||)``, i.e. the cosine of the
    angle between the two vectors, so only their direction matters and not
    their magnitude.

    Parameters
    ----------
    A, B : array-like
        Vectors of equal length.

    Returns
    -------
    float
        The cosine similarity. If either vector has zero norm the angle is
        undefined; 0.0 is returned in that case instead of dividing by zero
        (which would produce ``nan`` and a RuntimeWarning).
    """
    denominator = norm(A) * norm(B)
    # A zero-norm vector has no direction, so treat it as sharing nothing
    # with any other vector.
    if denominator == 0:
        return 0.0
    return np.dot(A, B) / denominator

# Toy term-count vectors; doc3 is doc2 with every component doubled.
doc1 = np.array([0, 1, 1, 1])
doc2 = np.array([1, 0, 1, 1])
doc3 = np.array([2, 0, 2, 2])

# Print the similarity of each pair: (doc1, doc2), (doc1, doc3), (doc2, doc3).
for left, right in ((doc1, doc2), (doc1, doc3), (doc2, doc3)):
    print(cos_sim(left, right))

# Vectors that point in the same direction (one is a positive multiple of the
# other, like doc2 and doc3) have similarity 1, up to floating-point rounding.
# Cosine similarity compares the direction of vectors, not their magnitude.

0.6666666666666667
0.6666666666666667
1.0000000000000002


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie metadata table. A forward slash is used as the separator:
# it works on every platform, whereas 'data\m...' relied on '\m' being an
# unrecognised (and deprecated) escape sequence and only resolved on Windows.
# low_memory=False makes pandas read the file in one pass.
data = pd.read_csv('data/movies_metadata.csv', low_memory=False)
# Show the first two rows as the cell output.
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [4]:
# Count the rows whose 'overview' value is missing.
print(data['overview'].isnull().sum())

954


In [5]:
# Replace missing overviews with the empty string so every entry in the
# column is a string before it is vectorised.
data['overview'] = data['overview'].fillna('')

In [6]:
# Build TF-IDF vectors from the overview text, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
# One row per movie overview, one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(data['overview'])
print('TF-IDF 행렬의 크기(shape) :',tfidf_matrix.shape)
# In other words, 75827 distinct terms are used to represent the 45466 movies.

TF-IDF 행렬의 크기(shape) : (45466, 75827)


In [None]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
# NOTE(review): the result has one row and one column per movie; if it is
# returned as a dense float64 array of shape (45466, 45466) that is roughly
# 16 GB -- confirm the kernel has enough memory before running this cell.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [9]:
# Check the dimensions of the pairwise similarity matrix.
print(cosine_sim.shape)

(45466, 45466)


In [10]:
# Map each movie title to its index label in `data` so a movie can be looked
# up by name. Duplicate titles collapse to a single key; the last occurrence
# in the frame wins.
title_to_index = dict(zip(data['title'], data.index))

# Example lookup for a single title.
idx = title_to_index['Father of the Bride Part II']
print(idx)

4


In [14]:
def get_recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title; must be a key of ``title_to_index``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the module-level ``cosine_sim`` as it
        was bound when this function was defined.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        The ``data['title']`` entries of the ``top_n`` highest-scoring movies,
        most similar first, excluding the query movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``title_to_index``.
    """
    # NOTE(review): idx is an index *label* of `data` but is used below as a
    # row *position* (cosine_sim[idx], .iloc) -- this assumes `data` has the
    # default 0..n-1 index; confirm if the frame is ever filtered or re-indexed.
    idx = title_to_index[title]

    # Exclude the query movie by index rather than by rank: when another movie
    # ties with it at the top score (e.g. an identical overview), the query is
    # not guaranteed to be the first element after sorting.
    scored = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

    # Stable sort, so movies with equal scores keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [i for i, _ in scored[:top_n]]
    return data['title'].iloc[movie_indices]

In [15]:
# Show the recommendations for one example title as the cell output.
get_recommendations('Inception')

44314                                 III
2039                                House
25299                       Borrowed Time
2114                    The Farmer's Wife
44792                            Altitude
22619                    The Monkey's Paw
3424     What Ever Happened to Baby Jane?
349                                  Cobb
37187                      Straight Story
8988                           Stone Cold
Name: title, dtype: object

## Euclidean Distance

In [17]:
import numpy as np

def dist(x,y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    Computed as the square root of the sum of squared element-wise
    differences. ``x`` and ``y`` must support subtraction with each other
    (e.g. NumPy arrays of the same shape).
    """
    delta = x - y
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

# Three example document vectors and a query vector docQ.
doc1 = np.array((2,3,0,1))
doc2 = np.array((1,2,3,1))
doc3 = np.array((2,1,2,2))
docQ = np.array((1,1,0,1))

# Print the Euclidean distance from each document to the query;
# a smaller distance means the document is closer to docQ.
print('문서1과 문서Q의 거리 :',dist(doc1,docQ))
print('문서2과 문서Q의 거리 :',dist(doc2,docQ))
print('문서3과 문서Q의 거리 :',dist(doc3,docQ))

문서1과 문서Q의 거리 : 2.23606797749979
문서2과 문서Q의 거리 : 3.1622776601683795
문서3과 문서Q의 거리 : 2.449489742783178


## Jaccard Similarity

In [18]:
# Two example documents as whitespace-separated words.
doc1 = "apple banana everyone like likey watch card holder"
doc2 = "apple banana coupon passport love you"


# Tokenise each document by splitting on whitespace.
tokenized_doc1 = doc1.split()
tokenized_doc2 = doc2.split()

print('문서1 :',tokenized_doc1)
print('문서2 :',tokenized_doc2)

문서1 : ['apple', 'banana', 'everyone', 'like', 'likey', 'watch', 'card', 'holder']
문서2 : ['apple', 'banana', 'coupon', 'passport', 'love', 'you']


In [None]:
# Union: every distinct token that appears in either document.
# (Sets are unordered, so the printed element order may vary between runs.)
union = set(tokenized_doc1).union(set(tokenized_doc2))
print('문서1과 문서2의 합집합 :',union)

# Intersection: the distinct tokens that appear in both documents.
intersection = set(tokenized_doc1).intersection(set(tokenized_doc2))
print('문서1과 문서2의 교집합 :',intersection)

In [19]:
def jaccard_similarity(doc1, doc2):
    """Return the Jaccard similarity of two token collections.

    The similarity is ``|A & B| / |A | B|`` where ``A`` and ``B`` are the sets
    of distinct tokens in ``doc1`` and ``doc2``.

    Parameters
    ----------
    doc1, doc2 : iterable of hashable
        Tokenised documents (e.g. lists of words). Duplicates are ignored.

    Returns
    -------
    float
        A value in [0, 1]. When both inputs are empty the union is empty and
        the ratio is undefined; 1.0 is returned by convention (two empty sets
        are identical) instead of raising ZeroDivisionError.
    """
    tokens_a, tokens_b = set(doc1), set(doc2)
    union = tokens_a | tokens_b
    if not union:
        return 1.0
    return len(tokens_a & tokens_b) / len(union)

In [20]:
# Jaccard similarity of the two tokenised example documents (cell output).
jaccard_similarity(tokenized_doc1, tokenized_doc2)

0.16666666666666666