# sentence-BERT를 이용한 문장 유사도 측정

In [None]:
### 필요한 라이브러리 설치
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=6fa3ed0dbfca8c7ba417dfc1e2354d915784cc9ac991ae59574f0031d733fb36
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tr

In [None]:
### 필요한 라이브러리 임포트
from sentence_transformers import SentenceTransformer

## 영어 문장 유사도 측정

In [None]:
### Create the SentenceTransformer (Sentence-BERT) model
# Name of the pretrained checkpoint to load
model_name='sentence-transformers/all-MiniLM-L12-v2'
# Model used for the English sentence-similarity example
model_eng = SentenceTransformer(model_name)

In [None]:
### Create the English text data
# Two English sentences whose similarity is measured below
eng_sentences = ["What should I do to be a great scientist?", "How can I be a good scientist?"]

In [None]:
### Generate the sentence embedding vectors
# encode() returns one embedding row per input sentence
eng_embeddings = model_eng.encode(eng_sentences)

# Inspect the result: overall matrix shape, then each sentence's vector
print(f'영문 텍스트 데이터 전체의 임베딩 행렬의 모양 : {eng_embeddings.shape}')
print('-'*80)
print(f'첫번째 문장의 임베딩 벡터 : \n{eng_embeddings[0, :]}')
print('-'*80)
print(f'두번째 문장의 임베딩 벡터 : \n{eng_embeddings[1, :]}')

영문 텍스트 데이터 전체의 임베딩 행렬의 모양 : (2, 384)
--------------------------------------------------------------------------------
첫번째 문장의 임베딩 벡터 : 
[ 2.22636163e-02  8.50509629e-02  7.91068599e-02 -6.84864074e-03
 -1.46627463e-02 -6.85858354e-02  2.64204349e-02  6.64388910e-02
 -4.24608253e-02 -3.75801958e-02  1.80893745e-02 -9.48859900e-02
 -4.75542396e-02  7.65434429e-02 -3.93009633e-02  2.58018598e-02
 -7.41134211e-02  5.87385967e-02  3.13342884e-02 -7.86113665e-02
 -6.38961121e-02  4.31189574e-02  6.02598302e-03  1.21794816e-03
 -5.90102486e-02  8.42711702e-02 -2.48904601e-02 -6.01979792e-02
  2.75624543e-02 -5.94474003e-02 -8.06285942e-04 -7.62991086e-02
  1.78552195e-02  2.06335429e-02 -3.27370800e-02  4.25934419e-02
  6.07815199e-02 -2.56904643e-02  5.79138026e-02  5.28666861e-02
  9.59739275e-03 -4.70351726e-02  2.61405464e-02 -9.10800532e-04
  8.14124290e-03 -1.69612709e-02  2.88743880e-02 -1.83994416e-03
  3.87182906e-02  3.14612091e-02 -5.87458573e-02 -7.69814849e-02
 -1.39820710e-01 -6

In [None]:
### Measure cosine similarity

# Import the required function
from sklearn.metrics.pairwise import cosine_similarity

# Turn each 1-D sentence vector into a 2-D single-row array.
# -1 lets NumPy infer the embedding size instead of hard-coding 384, so this
# cell keeps working when model_name is switched to a model with a different
# embedding dimension.
eng_embedding1 = eng_embeddings[0, :].reshape((1, -1))
eng_embedding2 = eng_embeddings[1, :].reshape((1, -1))

# Compute the cosine similarity between the two sentences
eng_sim = cosine_similarity(eng_embedding1, eng_embedding2)
print(f'영문 텍스트 데이터의 문장 간 유사도 : {eng_sim}')

영문 텍스트 데이터의 문장 간 유사도 : [[0.8954653]]


## 한글 문장 유사도 측정

In [None]:
### Create a pretrained Korean SentenceTransformer model
# Active checkpoint name; the commented-out lines below are alternatives
# that can be swapped in by uncommenting one of them
model_name='ddobokki/klue-roberta-base-nli-sts'
# model_name='sentence-transformers/all-MiniLM-L12-v2'
# model_name='sentence-transformers/multi-qa-distilbert-cos-v1'
# model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# Model used for the Korean sentence-similarity example
kor_model = SentenceTransformer(model_name)

In [None]:
### Create the Korean text data
# Two Korean sentences whose similarity is measured below
kor_sentences = ["직원이 무단 퇴사를 했는데 손해 배상 청구할 수 있나요?",
                 "무단 퇴사한 직원에 대한 손해 배상 청구가 가능한가요?"]

In [None]:
### Generate the sentence embedding vectors
# encode() returns one embedding row per input sentence
kor_embeddings = kor_model.encode(kor_sentences)

# Inspect the result: overall matrix shape, then each sentence's vector
print(f'한글 텍스트 데이터 전체 문장에 대한 임베딩 행렬의 모양 : {kor_embeddings.shape}')
print('-'*80)
print(f'첫번째 문장에 대한 임베딩 벡터 : \n{kor_embeddings[0, :]}')
print('-'*80)
print(f'두번째 문장에 대한 임베딩 벡터 : \n{kor_embeddings[1, :]}')

한글 텍스트 데이터 전체 문장에 대한 임베딩 행렬의 모양 : (2, 768)
--------------------------------------------------------------------------------
첫번째 문장에 대한 임베딩 벡터 : 
[-4.99942571e-01 -3.59200835e-01 -7.38402531e-02  4.84219283e-01
 -4.96614248e-01  5.82403541e-01 -4.56283726e-02 -3.81823957e-01
 -3.94545138e-01  2.18352810e-01  2.40236789e-01 -6.53119922e-01
  1.17322057e-01 -4.41113472e-01 -8.18792358e-03 -2.16762990e-01
  1.42985918e-02 -9.46455747e-02  1.55718908e-01 -1.03794202e-01
 -5.95203459e-01 -5.41155756e-01 -6.22601569e-01 -1.00601912e+00
 -2.20479086e-01 -1.32809609e-01 -4.36401814e-02 -4.01161388e-02
  5.56914985e-01  7.56343365e-01  6.72926083e-02  8.65490139e-01
  4.12386395e-02  2.79035717e-01 -1.98161840e-01 -5.00128530e-02
  3.10567111e-01 -3.46614331e-01 -6.20579302e-01  6.81772709e-01
  8.81860033e-02  2.54626155e-01 -4.60292697e-01 -1.45168155e-01
 -6.55795122e-03 -2.85244942e-01  6.48579188e-03  3.51094782e-01
  9.55613628e-02 -1.56446829e-01 -1.90832376e-01 -3.73097628e-01
  2.392450

In [None]:
### Measure cosine similarity

# Each sentence vector is 1-D; reshape it into a single-row 2-D array
kor_embedding1 = kor_embeddings[0].reshape(1, -1)
kor_embedding2 = kor_embeddings[1].reshape(1, -1)

# Similarity between the two Korean sentences via cosine_similarity()
kor_sim = cosine_similarity(X=kor_embedding1, Y=kor_embedding2)
print(f'한국어 텍스트 데이터의 두 문장간의 유사도 : {kor_sim}')

한국어 텍스트 데이터의 두 문장간의 유사도 : [[0.96061534]]


## 영어 / 한글 문장 유사도 측정

In [None]:
### Create a multilingual Sentence-BERT model
# Active checkpoint name; the commented-out lines below are alternatives
# that can be swapped in by uncommenting one of them
model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
# model_name='sentence-transformers/all-MiniLM-L12-v2'
# model_name='sentence-transformers/multi-qa-distilbert-cos-v1'
# Model used for the English/Korean cross-lingual example
multi_model = SentenceTransformer(model_name)

In [None]:
### Create the text data
# One English sentence and one Korean sentence to compare across languages
sentences = ['What should I do to be a great scientist?', '훌륭한 과학자가 되려면 어떻게 해야 할까요?']

In [None]:
### Generate the sentence embedding vectors
# encode() returns one embedding row per input sentence
embeddings = multi_model.encode(sentences)

# Inspect the result: overall matrix shape, then each sentence's vector
print(f'실습용 문장 전체의 임베딩 행렬의 모양 : {embeddings.shape}')
print('-'*80)
print(f'첫번째 문장의 임베딩 벡터 : \n{embeddings[0, :]}')
print('-'*80)
print(f'두번째 문장의 임베딩 벡터 : \n{embeddings[1, :]}')

실습용 문장 전체의 임베딩 행렬의 모양 : (2, 384)
--------------------------------------------------------------------------------
첫번째 문장의 임베딩 벡터 : 
[ 5.46488203e-02  4.94343251e-01  4.10707593e-01  1.27668083e-01
 -9.11792293e-02 -5.76901734e-01  1.17372675e-02  1.73615143e-01
 -3.01463693e-01 -1.61016673e-01 -2.14112010e-02 -7.54894078e-01
 -1.01586759e-01  2.17390079e-02 -2.76769996e-01  4.08532843e-03
 -1.66494563e-01 -1.53407618e-01  9.10674408e-03 -2.29964718e-01
 -4.85452652e-01  1.59717634e-01  7.86594078e-02 -1.78895891e-01
 -2.89161235e-01  2.65021116e-01 -1.87272504e-01 -2.74408847e-01
  8.98739174e-02 -1.40582860e-01  1.85040031e-02 -8.13296661e-02
  2.98329979e-01  1.22198788e-02  8.52153525e-02  4.40637112e-01
 -3.57331261e-02 -2.71989942e-01  3.55497718e-01  2.01634899e-01
  3.68284702e-01  8.60847458e-02  6.20844901e-01  1.44933268e-01
 -1.14840753e-01 -4.09185201e-01 -2.46176887e-02 -2.33240739e-01
  7.04460591e-02  2.04685375e-01 -2.15648532e-01 -1.35802731e-01
 -4.60398763e-01 -4.503

In [None]:
### Measure cosine similarity

# Each sentence vector is 1-D; reshape it into a single-row 2-D array
embedding0 = embeddings[0].reshape(1, -1)
embedding1 = embeddings[1].reshape(1, -1)

# Similarity between the English and Korean sentences via cosine_similarity()
sim = cosine_similarity(X=embedding0, Y=embedding1)
print(f'영어/한글 텍스트간의 유사도 : {sim}')

영어/한글 텍스트간의 유사도 : [[0.9493034]]


## 문장 유사도를 이용한 추천 시스템

In [None]:
'''
### 실습 개요
- dataset : 2017년 7월 또는 그 이전에 개봉된 영화 45,000편에 대한 각종 정보가
포함되어 있음
- 영화에 대한 줄거리(텍스트 데이터)를 SentenceBERT를 이용하여 문장 임베딩 행렬로 변환한다.
- 좋아하는 영화의 제목을 입력한다.
- 입력한 영화의 줄거리와 유사한 줄거리를 가지는 영화 제목을 찾아서 추천해준다

# 우리에게 필요한 컬럼 : 제목(title), 줄거리(overview)
'''

# 필요한 라이브러리 / 함수 임포트
import pandas as pd
from sklearn.metrics.pairwise import linear_kernel

# Path to the movie metadata CSV file
file_path='/content/drive/MyDrive/KDT/딥러닝/자연어처리/movies_metadata.csv'

# Load the CSV into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type warning that the
# default chunked parsing emits for this file.
df_movies = pd.read_csv(file_path, low_memory=False)
print(df_movies)


       adult                              belongs_to_collection    budget  \
0      False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1      False                                                NaN  65000000   
2      False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3      False                                                NaN  16000000   
4      False  {'id': 96871, 'name': 'Father of the Bride Col...         0   
...      ...                                                ...       ...   
45461  False                                                NaN         0   
45462  False                                                NaN         0   
45463  False                                                NaN         0   
45464  False                                                NaN         0   
45465  False                                                NaN         0   

                                                  genres  \
0      [{'id': 

  df_movies = pd.read_csv(file_path)


In [None]:
### Handle missing data based on the overview and title columns

# Select only the two needed columns with list-based (fancy) indexing
df = df_movies.loc[:, ['overview','title']]
print(df)

print('-'*80)

# Remove missing data: dropna() drops every row that contains a missing value,
# modifying df in place
df.dropna(inplace=True)
print(df)

                                                overview  \
0      Led by Woody, Andy's toys live happily in his ...   
1      When siblings Judy and Peter discover an encha...   
2      A family wedding reignites the ancient feud be...   
3      Cheated on, mistreated and stepped on, the wom...   
4      Just when George Banks has recovered from his ...   
...                                                  ...   
45461        Rising and falling between a man and woman.   
45462  An artist struggles to finish his work while a...   
45463  When one of her hits goes wrong, a professiona...   
45464  In a small town live two brothers, one a minis...   
45465  50 years after decriminalisation of homosexual...   

                             title  
0                        Toy Story  
1                          Jumanji  
2                 Grumpier Old Men  
3                Waiting to Exhale  
4      Father of the Bride Part II  
...                            ...  
45461               

In [None]:
### Renumber the row index after dropping rows with missing data
# drop=True discards the old index instead of keeping it as a column;
# inplace=True modifies df directly
df.reset_index(drop=True, inplace=True)
print(df)

                                                overview  \
0      Led by Woody, Andy's toys live happily in his ...   
1      When siblings Judy and Peter discover an encha...   
2      A family wedding reignites the ancient feud be...   
3      Cheated on, mistreated and stepped on, the wom...   
4      Just when George Banks has recovered from his ...   
...                                                  ...   
44501        Rising and falling between a man and woman.   
44502  An artist struggles to finish his work while a...   
44503  When one of her hits goes wrong, a professiona...   
44504  In a small town live two brothers, one a minis...   
44505  50 years after decriminalisation of homosexual...   

                             title  
0                        Toy Story  
1                          Jumanji  
2                 Grumpier Old Men  
3                Waiting to Exhale  
4      Father of the Bride Part II  
...                            ...  
44501               

In [None]:
### Reduce the dataset to the first 10,000 rows
df_data = df.head(10000)

# Inspect the result
print(df_data)

                                               overview  \
0     Led by Woody, Andy's toys live happily in his ...   
1     When siblings Judy and Peter discover an encha...   
2     A family wedding reignites the ancient feud be...   
3     Cheated on, mistreated and stepped on, the wom...   
4     Just when George Banks has recovered from his ...   
...                                                 ...   
9995  In the peaceful countryside, Vassily opposes t...   
9996  Francisco is rich, rather strict on principles...   
9997  Cashier Maurice Legrand is married to Adele, a...   
9998  While Erendira, a beautiful teenage girl, has ...   
9999  This period drama frames the tumultuous affair...   

                                         title  
0                                    Toy Story  
1                                      Jumanji  
2                             Grumpier Old Men  
3                            Waiting to Exhale  
4                  Father of the Bride Part II

In [None]:
### Create the SentenceTransformer model
# Name of the pretrained checkpoint to load
model_name='sentence-transformers/all-MiniLM-L12-v2'
# Model used to embed the movie overviews
model = SentenceTransformer(model_name)

In [None]:
### Build the sentence embedding matrix

# Extract the overview column as an array of texts
sentences = df_data['overview'].to_numpy()

# Encode every overview with model.encode()
embeddings = model.encode(sentences)

# Inspect the result: shape of the whole matrix, then of a single row
print(f'생성된 임베딩 행렬의 모양 : {embeddings.shape}')
print('-'*80)
print(f'문장 1개의 임베딩 벡터의 모양 : {embeddings[0, :].shape}')

생성된 임베딩 행렬의 모양 : (10000, 384)
--------------------------------------------------------------------------------
문장 1개의 임베딩 벡터의 모양 : (384,)


In [None]:
### Save the sentence embedding matrix

# Import the required library
import numpy as np

# Write the embedding matrix to a .npy file
np.save('/content/drive/MyDrive/KDT/딥러닝/자연어처리/embeddings.npy', embeddings)

In [None]:
### Load the saved embedding matrix

# Path the matrix was saved to (note: this reassigns file_path)
file_path='/content/drive/MyDrive/KDT/딥러닝/자연어처리/embeddings.npy'

# Read the embedding matrix back from disk
loaded_embeddings = np.load(file_path)

# Inspect the result
print(f'다운로드 받은 문장 임베딩 행렬 전체의 모양 : {loaded_embeddings.shape}')

다운로드 받은 문장 임베딩 행렬 전체의 모양 : (10000, 384)


In [None]:
### Verify that the computed embedding matrix and the loaded one are identical
# Number of matching elements in the first row
result = (embeddings[0, :] == loaded_embeddings[0, :]).sum()
print(result)

# Comparing one row cannot detect a mismatch elsewhere, so also compare the
# two matrices in full (shape and every element)
print(np.array_equal(embeddings, loaded_embeddings))

384


In [None]:
### Pairwise cosine similarity between every pair of movie overviews
# linear_kernel() is a plain dot product, which equals cosine similarity only
# when every embedding row is L2-normalised. cosine_similarity() normalises
# the rows itself, so the result stays a true cosine similarity even if the
# model is swapped for one that does not normalise its output.
# Imported here so this cell does not depend on an earlier section being run.
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(embeddings)

# Inspect the result: full matrix, its shape, and the first movie's row
print(f'유사도 측정의 결과값 확인 : \n{similarity}')
print('-'*80)
print(f'유사도 측정의 결과값의 모양 : {similarity.shape}')
print('-'*80)
print(f'Toy Story에 대한 유사도 측정의 결과값 확인 : \n{similarity[0, :]}')

유사도 측정의 결과값 확인 : 
[[1.0000001  0.33839953 0.12692964 ... 0.12217252 0.17416474 0.06714299]
 [0.33839953 1.0000002  0.2637778  ... 0.1654589  0.3252555  0.16340762]
 [0.12692964 0.2637778  0.9999999  ... 0.24624068 0.2797468  0.14819568]
 ...
 [0.12217252 0.1654589  0.24624068 ... 1.0000001  0.21591775 0.0477701 ]
 [0.17416474 0.3252555  0.2797468  ... 0.21591775 1.         0.18749893]
 [0.06714299 0.16340762 0.14819568 ... 0.0477701  0.18749893 0.9999999 ]]
--------------------------------------------------------------------------------
유사도 측정의 결과값의 모양 : (10000, 10000)
--------------------------------------------------------------------------------
Toy Story에 대한 유사도 측정의 결과값 확인 : 
[1.0000001  0.33839953 0.12692964 ... 0.12217252 0.17416474 0.06714299]


In [None]:
### Wrap the similarity matrix in a DataFrame

# Movie titles are used as both the column labels and the row index
# NOTE(review): nothing here enforces that titles are unique; duplicate labels
# change what label-based lookups on df_sim return — confirm against the data
columns = df_data.loc[:, 'title'].values
index = df_data.loc[:, 'title'].values

# Build the DataFrame
df_sim = pd.DataFrame(data=similarity, columns=columns, index=index)

# Inspect the result
df_sim

Unnamed: 0,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,...,Between Your Legs,Zatôichi Meets the One-Armed Swordsman,The Angry Silence,Isadora,San Francisco,Earth,Él,La Chienne,Eréndira,The Private Lives of Elizabeth and Essex
Toy Story,1.000000,0.338400,0.126930,0.175748,0.137818,0.224320,0.183411,0.304267,0.143974,0.111093,...,0.204337,0.152693,0.093307,0.089167,0.061069,0.035998,0.183392,0.122173,0.174165,0.067143
Jumanji,0.338400,1.000000,0.263778,0.205652,0.144077,0.294304,0.253075,0.397237,0.254173,0.167929,...,0.187563,0.154846,-0.012331,0.034107,0.052756,-0.014982,0.177061,0.165459,0.325256,0.163408
Grumpier Old Men,0.126930,0.263778,1.000000,0.360031,0.175603,0.211339,0.314121,0.291589,0.186269,0.203596,...,0.301054,0.278452,-0.001206,0.061602,0.210004,0.081070,0.420658,0.246241,0.279747,0.148196
Waiting to Exhale,0.175748,0.205652,0.360031,1.000000,0.204694,0.286783,0.332937,0.295825,0.280217,0.182333,...,0.448264,0.279091,0.003809,0.087916,0.077089,0.156300,0.390909,0.197444,0.306730,0.167567
Father of the Bride Part II,0.137818,0.144077,0.175603,0.204694,1.000000,0.089654,0.117296,0.065354,0.216099,0.111157,...,0.225442,0.093269,-0.020158,0.064023,0.096882,0.023103,0.253434,0.131023,0.240259,0.088746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Earth,0.035998,-0.014982,0.081070,0.156300,0.023103,-0.033135,0.104973,0.045792,0.025830,0.041411,...,0.022366,0.119633,0.161267,0.051231,0.043433,1.000000,0.102056,0.001478,0.113179,-0.004952
Él,0.183392,0.177061,0.420658,0.390909,0.253434,0.247657,0.311852,0.195511,0.231736,0.145004,...,0.483851,0.272619,0.078805,0.182014,0.235567,0.102056,1.000000,0.329538,0.309063,0.172004
La Chienne,0.122173,0.165459,0.246241,0.197444,0.131023,0.212082,0.328271,0.201282,0.177863,0.131936,...,0.350755,0.240732,0.101820,0.196731,0.059917,0.001478,0.329538,1.000000,0.215918,0.047770
Eréndira,0.174165,0.325256,0.279747,0.306730,0.240259,0.258495,0.177591,0.255010,0.347747,0.144574,...,0.330732,0.329395,0.094023,0.309069,0.138785,0.113179,0.309063,0.215918,1.000000,0.187499


In [None]:
### 특정 영화 기준 --> 줄거리가 유사한 영화 추천 함수 생성

"""
1. 유사도 크기 순으로 정렬
2. 해당 영화의 유사도 제거
3. 특정 영화와 다른 영화와의 유사도 크기 순으로 n개만 추출
"""

# 추천 함수 정의
def recommend(title, k, sim_df=None):
    """Return the k movies most similar to `title`.

    Args:
        title: Movie title to look up among the columns of the similarity
            matrix. If several columns share this title, the first is used.
        k: Maximum number of recommendations to return. Values <= 0 yield
            an empty Series.
        sim_df: Optional square similarity DataFrame whose rows and columns
            carry the same labels in the same order. Defaults to the
            module-level `df_sim`.

    Returns:
        A pandas Series of similarity scores indexed by movie title, sorted
        from most to least similar, with the queried movie itself removed.

    Raises:
        KeyError: If `title` is not a column of the similarity matrix.
    """
    if sim_df is None:
        sim_df = df_sim

    # Resolve the title to one column position. A label lookup
    # (`.loc[:, title]`) returns a DataFrame when the title is duplicated,
    # and sort_values(ascending=False) then fails on it.
    positions = [i for i, name in enumerate(sim_df.columns) if name == title]
    if not positions:
        raise KeyError(title)
    pos = positions[0]

    scores = sim_df.iloc[:, pos]

    # Remove the queried movie by position rather than assuming it sorts
    # first: another movie can tie with it at the top of the ranking.
    keep = [i for i in range(len(scores)) if i != pos]
    others = scores.iloc[keep]

    # max(k, 0) keeps the original behaviour of returning nothing for k <= 0
    top_k = others.sort_values(ascending=False).iloc[:max(k, 0)]
    return top_k

In [None]:
### Run the recommendation function

# Set the argument values: the query title and how many results to return
title = 'GoldenEye'
k=10

# Call the recommendation function
top10 = recommend(title=title, k=k)

# Inspect the result
print(top10)

Live and Let Die               0.628484
Octopussy                      0.618100
Never Say Never Again          0.605646
The Living Daylights           0.594822
The Man with the Golden Gun    0.591555
Goldfinger                     0.573768
Casino Royale                  0.573743
Licence to Kill                0.571086
Die Another Day                0.567978
Dr. No                         0.553751
Name: GoldenEye, dtype: float32
