# 🔗 문장 임베딩 기반 리뷰 유사도 분석 (SentenceBERT)

In [None]:

# 설치 (최초 1회 필요)
# !pip install sentence-transformers pandas scikit-learn


In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Using cached transformers-4.53.1-py3-none-any.whl.metadata (40 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.7.1-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Using cached huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting filelock (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached tokenizers-0.21.2-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting fsspec>=20

In [None]:

import pandas as pd

# Load the labelled reviews and keep only the review text and its sentiment
# label, dropping rows where either value is missing.
df = pd.read_csv("36000_reviews_label.csv")
df = df[['review_content', 'sentiment_tfidf']].dropna()

# Keep only rows carrying one of the three expected sentiment labels.
df = df[df['sentiment_tfidf'].isin(['긍정', '부정', '중립'])]

# Small practice sample. The index is reset AFTER all filtering so the row
# labels are exactly 0..n-1; anything computed per row (e.g. an embedding
# matrix) can then be matched back to this frame by integer index without
# hitting gaps left by the filters above.
df = df.head(300).reset_index(drop=True)
df.head()


In [None]:

from sentence_transformers import SentenceTransformer

# Multilingual sentence-embedding model (downloaded on first use).
model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v1')

# encode() is documented to take a str or a list of str. Hand it a plain list
# rather than a pandas Series: integer indexing on a Series is label-based, so
# a Series whose index is not exactly 0..n-1 can raise KeyError or return the
# wrong row. A list guarantees embeddings[i] corresponds to the i-th review.
review_texts = df['review_content'].astype(str).tolist()
embeddings = model.encode(review_texts, convert_to_tensor=True)

print("임베딩 완료! 리뷰 개수:", len(embeddings))


In [None]:

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pick one review and find the 5 reviews most similar to it.
target_idx = 0

# scikit-learn operates on NumPy arrays. If the embeddings are a torch tensor
# (possibly living on a GPU), copy them to host memory first; otherwise accept
# whatever array-like was produced.
if hasattr(embeddings, "detach"):
    embedding_matrix = embeddings.detach().cpu().numpy()
else:
    embedding_matrix = np.asarray(embeddings)

# cosine_similarity expects 2-D inputs, so the query row becomes shape (1, dim).
target_vec = embedding_matrix[target_idx].reshape(1, -1)
cos_scores = cosine_similarity(target_vec, embedding_matrix)[0]

# Rank by descending similarity and drop the query row by its index instead of
# assuming it sorts first: an identical (duplicate) review ties with it, and
# then slicing off position 0 could discard the duplicate and keep the query.
ranked = np.argsort(cos_scores)[::-1]
top_indices = ranked[ranked != target_idx][:5]

# Embedding rows are positional, so look the text up with .iloc; plain
# df[col][i] is a label lookup and breaks when the index has gaps.
reviews = df['review_content']

print("🔍 기준 문장:")
print(reviews.iloc[target_idx])
print("\n📍 유사 리뷰 Top 5:")
for i in top_indices:
    print(f"- ({cos_scores[i]:.3f})", reviews.iloc[i])
