## embedding을 활용한 유사도 측정

### word2vec을 이용한 단어 embedding

In [1]:
from gensim.models import Word2Vec
import re

In [2]:
# GPT야, simpsons 캐릭터 이름이 들어간 랜덤 문장 10개를 생성해줘

sentences = ["Homer Simpson forgot his lunch at home, so he had to buy a burger on his way to work.",
    "Marge was busy knitting a new sweater for Bart's upcoming school play.",
    "Lisa Simpson played a beautiful saxophone solo at the school concert.",
    "Mr. Burns secretly plotted another scheme from his office at the Springfield Nuclear Power Plant.",
    "Ned Flanders offered to help Homer fix the fence between their houses.",
    "Bart Simpson tried a new prank at school, but it didn't go as planned.",
    "Milhouse and Bart spent the afternoon playing video games and forgot to do their homework.",
    "Maggie Simpson's adorable giggle filled the room as she played with her toys.",
    "Apu had a busy day at the Kwik-E-Mart, dealing with a rush of customers.",
    "Krusty the Clown decided to change his show a bit to attract a new audience."]

In [3]:
# preprocessing
# get rid of stopwords, lower case

sentences = [re.sub(r"[.',]", "", sentence).lower().split(" ") for sentence in sentences]

In [4]:
sentences[0]

['homer',
 'simpson',
 'forgot',
 'his',
 'lunch',
 'at',
 'home',
 'so',
 'he',
 'had',
 'to',
 'buy',
 'a',
 'burger',
 'on',
 'his',
 'way',
 'to',
 'work']

In [5]:
# train word2vec

skip_gram = Word2Vec(sentences, vector_size=300, min_count=1, window=5, sg=1)

In [6]:
print("{} 의 vector representation : \n{}".format('homer', skip_gram.wv.get_vector(skip_gram.wv.key_to_index['homer'])))

homer 의 vector representation : 
[-1.66068680e-03 -4.20303462e-04  1.09255337e-03 -2.18484551e-03
 -3.23499087e-03 -3.12648294e-03  3.04949097e-03  1.87829603e-03
 -1.61115616e-03 -2.80928891e-03  4.52324020e-04  9.58630932e-04
 -4.38321120e-04  4.20039461e-04 -1.44900673e-03  1.57583959e-03
  5.10560756e-04  2.96067167e-03 -3.31212161e-03 -1.82445510e-03
 -3.01285670e-03 -1.11729649e-04 -2.61634029e-03  1.71959959e-03
 -2.11100094e-03 -2.03485345e-03  1.69092277e-03 -2.72677210e-03
  4.76851710e-04 -2.43410910e-03  3.28659941e-03  2.89322482e-03
  5.95899648e-04  1.94471062e-03  1.52419391e-03 -1.98301394e-03
  3.28957522e-03 -3.26667982e-03  2.68764189e-03  9.17330268e-04
 -9.88418004e-04 -1.19753159e-03  3.04961996e-03 -1.81994250e-03
  2.79433304e-03 -1.95553945e-03  2.79207341e-03 -1.48160951e-04
  2.66201468e-03 -1.02206389e-03  2.01950571e-03  2.95562134e-03
  8.11300066e-04  4.55215428e-04  1.65783032e-03  2.71425396e-03
  2.85594515e-03  2.84643611e-03  2.36571440e-03  2.69569

In [7]:
skip_gram.wv.most_similar("homer")

[('marge', 0.14081521332263947),
 ('offered', 0.13243569433689117),
 ('games', 0.12250109761953354),
 ('her', 0.11486156284809113),
 ('nuclear', 0.10569246113300323),
 ('do', 0.09913020581007004),
 ('toys', 0.0984482690691948),
 ('office', 0.09244135022163391),
 ('bart', 0.09009940177202225),
 ('way', 0.08802291750907898)]

직접 유사도 구해보기

In [8]:
homer_vector = skip_gram.wv.get_vector(skip_gram.wv.key_to_index['homer'])
marge_vector = skip_gram.wv.get_vector(skip_gram.wv.key_to_index['marge'])

In [9]:
# 유사도 계산하기 from scratch
import numpy as np
from numpy.linalg import norm

def cosine_similarity(vector_a: np.ndarray, vector_b: np.ndarray) -> float:
    """
    두 벡터간 cosine similarity를 계산
    
    Parameters
    ----------
    vector_a : np.ndarray
        The first input vector.
    vector_b : np.ndarray
        The second input vector.

    Returns
    -------
    float
        The cosine similarity between `vector_a` and `vector_b`, which is a value between -1 and 1.

    """

    dot_product = np.dot(vector_a, vector_b)
    norm_a = norm(vector_a)
    norm_b = norm(vector_b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [10]:
cosine_similarity(homer_vector, marge_vector)

0.14081518

### Simpsons dataset을 활용한 Word2Vec

In [11]:
from typing import Optional
import nltk
import re
import pandas as pd
import spacy

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sckim\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sckim\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [14]:
df = pd.read_csv('../data/simpsons_dataset.csv')
df.shape

(158314, 2)

In [15]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [16]:
df.isnull().sum()

raw_character_text    17814
spoken_words          26459
dtype: int64

In [17]:
df.loc[0, 'spoken_words']

"No, actually, it was a little of both. Sometimes when a disease is in all the magazines and all the news shows, it's only natural that you think you have it."

#### 데이터 전처리

In [18]:
# lemmatize and remove the stopwords and non-alphabetic characters for each line of dialogue

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """
    Cleans a spaCy Doc object by lemmatizing its tokens and removing stop words,
    then joins the remaining tokens into a single string if there are more than two tokens left.
    
    Parameters:
    ----------
    doc : spacy.tokens.Doc
        A spaCy Doc object containing the processed text.
    
    Returns:
    ----------
    Optional : str
        A string composed of the lemmatized, non-stop tokens separated by spaces,
        if the resulting list of tokens has more than two elements. Otherwise, returns None.
    """

    txt = [token.lemma_ for token in doc if not token.is_stop]
    if len(txt) > 2:
        return ' '.join(txt)

In [19]:
# only keep alphabets
cleaner = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

In [20]:
txt = [cleaning(doc) for doc in nlp.pipe(cleaner, batch_size=5000)]

In [21]:
txt[0]

'actually little disease magazine news show natural think'

In [22]:
# dataframe에 넣어서 null이 있는 대화는 삭제
# 주로 null은 특정 행동을 했지만 대화가 없었을 때임

df_clean = pd.DataFrame({'clean': txt})
df_clean = df_clean.dropna().drop_duplicates()
df_clean.shape

(85956, 1)

In [24]:
# 하나의 문장을 여러 단위의 단어로 분할
sentences = [s.split(' ') for s in df_clean['clean']]

In [25]:
len(sentences)

85956

In [26]:
sentences[0]

['actually',
 'little',
 'disease',
 'magazine',
 'news',
 'show',
 'natural',
 'think']

#### Word2Vec 모델 훈련

In [27]:
from gensim.models import Word2Vec

In [None]:
# help(Word2Vec)

- `window` : 문장 내에서 현재 단어와 예측 단어 사이의 최대 거리. ex) 타겟 단어의 왼쪽과 오른쪽 n번째 단어
- `vector_size` : 단어 벡터의 차원 수
- `min_count` : 이 값보다 총 절대 빈도수가 낮은 모든 단어를 무시함 - (2, 100)
- `sg` : 1은 skip-gram, 0은 CBOW method를 사용

In [28]:
# 모델 정의 하기
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     vector_size=300,
                     sample=6e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007)

In [29]:
# 문장에 들어있는 각 단어들을 Word2Vec 모델이 인식할 수 있는 형태로 변환
w2v_model.build_vocab(sentences)

In [30]:
# 모델 훈련
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=100)

(19983507, 54001900)

In [None]:
# help(w2v_model.train)

### 단어간 유사도 확인하기

In [None]:
# dir(w2v_model.wv)

- most_similar : 주어진 조건에 가장 적합한 단어 탐색
- similarity : 주어진 단어들의 유사도 계산
- doesnt_match : 주어진 단어들 중 가장 '덜 유사한' 단어

In [None]:
# help(w2v_model.wv.most_similar)

In [None]:
# help(w2v_model.wv.similarity)

In [32]:
w2v_model.wv.most_similar(positive=["homer"])

[('marge', 0.4234519898891449),
 ('bart', 0.3308035433292389),
 ('simpson', 0.31797534227371216),
 ('mr', 0.2797403931617737),
 ('lisa', 0.2631458640098572),
 ('barney', 0.25580036640167236),
 ('son', 0.24946914613246918),
 ('husband', 0.24716949462890625),
 ('wife', 0.24083773791790009),
 ('mrs', 0.23470626771450043)]

In [33]:
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.41932833194732666),
 ('homer', 0.3308035135269165),
 ('dad', 0.32987260818481445),
 ('mom', 0.32922524213790894),
 ('milhouse', 0.3127206861972809),
 ('boy', 0.3014339506626129),
 ('kid', 0.28203287720680237),
 ('son', 0.2761775553226471),
 ('child', 0.27022725343704224),
 ('think', 0.26444777846336365)]

- Woman : homer = ___ : marge

In [34]:
w2v_model.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)

[('man', 0.20725250244140625),
 ('modern', 0.20201577246189117),
 ('see', 0.20097629725933075)]

In [35]:
w2v_model.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.3182581067085266),
 ('maggie', 0.2833455801010132),
 ('mom', 0.2700842022895813)]

In [36]:
w2v_model.wv.doesnt_match(['bart', 'homer', 'marge'])

'bart'

In [37]:
w2v_model.wv.doesnt_match(['bart', 'lisa', 'marge'])

'marge'

### 단어 임베딩의 한계점

In [38]:
bank_vector = w2v_model.wv.get_vector(w2v_model.wv.key_to_index['bank'])

In [None]:
# bank_vector

In [None]:
# bank_vector

- 우리가 사용하는 모든 단어는 context에 따라 의미가 다르다
- 단어 embedding의 경우 이런 유연성을 확보하지 못 함
    - 배를 깎아 먹었다 / 배가 고프다 / 배 멀미를 하다

In [None]:
# bank_vector

### sentence embeddings

In [39]:
from transformers import BertTokenizer, BertModel
import torch

In [40]:
# pre-trained model tokenizer와 and bert model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # smaller & uncased model
model = BertModel.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [41]:
# bank가 들어간 유사한 문장 두 개
sentence1 = "I deposited money at the bank."
sentence2 = "The ducks swam to the river bank."

In [42]:
# 문장을 BERT가 인식할 수 있는 형태로 Tokenize
encoded_input1 = tokenizer(sentence1, return_tensors='pt') # pytorch
encoded_input2 = tokenizer(sentence2, return_tensors='pt')

In [43]:
encoded_input1

{'input_ids': tensor([[  101,  1045, 14140,  2769,  2012,  1996,  2924,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [44]:
encoded_input2

{'input_ids': tensor([[  101,  1996, 14875, 16849,  2000,  1996,  2314,  2924,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

- `input_ids` : 각 단어별로 매핑된 key. 101은 문장의 시작을, 102는 문장의 끝을 의미
- `token_type_ids` : 문장 번호
- `attention_mask` : attention을 가져야 하는 단어는 1, 그렇지 않은 단어는 0. (만약 input이 실제 단어들이라면 1)

In [45]:
# embedding 생성!
with torch.no_grad():
    output1 = model(**encoded_input1)
    output2 = model(**encoded_input2)

In [46]:
# embedding 내에서 bank라는 단어 찾아오기 (문장의 5번째에 있는 단어)
bank_embedding_sentence1 = output1.last_hidden_state[0, 5, :]
bank_embedding_sentence2 = output2.last_hidden_state[0, 5, :]

In [47]:
# cosine similarity 계산

similarity = cosine_similarity(bank_embedding_sentence1, bank_embedding_sentence2)
# print("Embedding for 'bank' in sentence 1:", bank_embedding_sentence1)
# print("Embedding for 'bank' in sentence 2:", bank_embedding_sentence2)
print("Cosine similarity between the two embeddings:", similarity)

Cosine similarity between the two embeddings: 0.592241
