In [1]:
!pip install -U sentence-transformers openai

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Using cached torch-2.4.1-cp38-none-macosx_11_0_arm64.whl.metadata (26 kB)
Collecting sympy (from torch>=1.11.0->sentence-transformers)
  Using cached sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers)
  Using cached networkx-3.1-py3-none-any.whl.metadata (5.3 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy->torch>=1.11.0->sentence-transformers)
  Using cached mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
Using cached torch-2.4.1-cp38-none-macosx_11_0_arm64.whl (62.1 MB)
Using cached networkx-3.1-py3-none-any.whl (2.1 MB)
Using cached sympy-1.13.3-py3-none-any.whl (6.2 MB)
Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Installing collected packages: mpmath, sympy, networkx, torch, sentence-transf

## Hugging Face Embedding

In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pandas as pd

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Instantiate the BAAI/bge-m3 sentence-embedding model once at notebook level;
# get_embedding() below reads this global instead of reloading it per call.
model = SentenceTransformer('BAAI/bge-m3')

In [4]:
def get_embedding(text):
    """Embed ``text`` with the notebook-level ``model`` and return it as a list.

    Args:
        text: Input forwarded unchanged to ``model.encode``.

    Returns:
        ``list(...)`` applied to whatever ``model.encode(text)`` returns. For a
        single sentence the recorded run shows a flat list of 1024 floats.
    """
    encoded = model.encode(text)
    return list(encoded)

In [5]:
# Try the helper on one sample sentence (roughly "I am hungry") and print
# the raw vector it returns.
embedding_result = get_embedding("저는 배가 고파요")
print(embedding_result)

[0.021626674, 0.009280821, -0.05210821, -0.019242445, -0.008087506, -0.02791718, 0.032577038, 0.013890103, -0.004750229, -0.023241216, 0.006533629, -0.009793396, -0.0014175926, -0.019782495, 0.006590598, -0.014435603, 0.05338057, 0.0071364376, 0.0014252728, 0.018539065, 0.0036249124, 0.00938739, -0.0070147137, 0.012587266, -0.0029035378, 0.01997257, 0.017060714, -0.0072685825, 0.037819274, -0.0077150557, 0.025726613, -0.051372208, 0.009581594, -0.06156163, -0.009842026, -0.032080255, 0.0036586097, -0.0071939994, -0.01376766, 0.064089485, 0.019516196, -0.009957554, 0.0014377651, -0.016735112, 0.013088274, -0.018781394, -0.027572487, 0.0012611138, -0.016827948, 0.016431851, 0.03218302, -0.045799557, 0.06265151, -0.03241703, 0.016951162, 0.026035601, -0.0053126714, -0.013995235, -0.048088342, -0.030344589, -0.009330456, 0.01301734, 0.0019433332, -0.0019273114, -0.00034924506, 0.14532919, 0.010187955, 0.034748033, -0.03623437, -0.026172886, -0.011427408, 0.00764198, -0.010387914, -0.022206

In [6]:
# Number of components in the sample embedding (1024 in the recorded run).
len(embedding_result)

1024

In [7]:
# Small demo corpus. Several sentences share surface words (배, 허기, 스팀)
# while talking about different things.
data = [
    '저는 배가 고파요',
    '저기 배가 지나가네요',
    '굶어서 허기가 지네요',
    '허기 워기라는 게임이 있는데 즐거워',
    '스팀에서 재밌는 거 해야지',
    '스팀에어프라이어로 연어구이 해먹을거야',
]

# One row per sentence, stored in a single 'text' column.
df = pd.DataFrame({'text': data})
df

Unnamed: 0,text
0,저는 배가 고파요
1,저기 배가 지나가네요
2,굶어서 허기가 지네요
3,허기 워기라는 게임이 있는데 즐거워
4,스팀에서 재밌는 거 해야지
5,스팀에어프라이어로 연어구이 해먹을거야


In [8]:
# Embed every sentence and store each vector next to its text.
df['embedding'] = df['text'].apply(get_embedding)

## Cosine Similarity

In [9]:
def cos_sim(A, B):
    """Cosine similarity between two vectors.

    Args:
        A: First vector (array-like of numbers).
        B: Second vector, same length as ``A``.

    Returns:
        ``dot(A, B) / (norm(A) * norm(B))``. If either vector has zero norm
        the quotient is undefined, so ``0.0`` is returned instead.
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        # Without this guard 0/0 yields nan and a RuntimeWarning.
        return 0.0
    return dot(A, B) / denominator

def return_answer_candidate(df, query, top_k=3):
    """Return the rows of ``df`` whose embeddings are most similar to ``query``.

    The query text is embedded with ``get_embedding`` and compared, via
    ``cos_sim``, with every vector in ``df['embedding']``.

    Args:
        df: DataFrame with an ``embedding`` column holding one vector per row.
            It is left unmodified; scores are attached to a new frame.
        query: Text to embed and search for.
        top_k: Number of best-scoring rows to return. Defaults to 3, the
            value that used to be fixed inside the function.

    Returns:
        A new DataFrame with an added ``similarity`` column, sorted by
        descending similarity with a fresh 0..n-1 index, cut to ``top_k`` rows.
    """
    # Embed the query text and convert it to an array once, not once per row.
    query_vector = np.array(get_embedding(query))

    # Score every stored embedding against the query. assign() builds a new
    # frame, so the caller's DataFrame never picks up a stale 'similarity'
    # column from an earlier query.
    scored = df.assign(
        similarity=df["embedding"].apply(
            lambda row_vector: cos_sim(np.array(row_vector), query_vector)
        )
    )

    # Highest similarity first; keep only the top_k rows.
    ranked = scored.sort_values("similarity", ascending=False, ignore_index=True)
    return ranked.head(top_k)

In [10]:
# Query (roughly "I haven't eaten anything, so my stomach is growling") and
# display the best-matching rows.
sim_result = return_answer_candidate(df, '아무 것도 안 먹었더니 꼬르륵 소리가나네')
sim_result

Unnamed: 0,text,embedding,similarity
0,굶어서 허기가 지네요,"[0.0004956795, 0.027304385, -0.06006089, -0.04...",0.566034
1,저는 배가 고파요,"[0.021626674, 0.009280821, -0.05210821, -0.019...",0.490944
2,허기 워기라는 게임이 있는데 즐거워,"[-0.012998731, 0.019039107, -0.07117989, -0.00...",0.477863


## OpenAI Embedding

In [11]:
from openai import OpenAI
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pandas as pd

In [12]:
from dotenv import load_dotenv
import os
import openai

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# The key is read from the environment so it never appears in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
client = openai.OpenAI(api_key=api_key)

사용 가능한 임베딩 모델: https://platform.openai.com/docs/guides/embeddings#embedding-models

In [13]:
def get_embedding(text, model_name="text-embedding-ada-002"):
    """Embed ``text`` through the OpenAI embeddings endpoint.

    Uses the notebook-level ``client``.

    Args:
        text: Input forwarded as ``input`` to ``client.embeddings.create``.
        model_name: Embedding model identifier. Defaults to
            ``"text-embedding-ada-002"``, the value this helper used to
            hard-code, so existing one-argument calls behave the same.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` (a list of 1536 floats in the recorded run with the default
        model).
    """
    response = client.embeddings.create(input=text, model=model_name)
    return response.data[0].embedding

In [14]:
# Try the OpenAI-backed helper on one sample sentence (roughly "I am a
# student") and print the raw vector it returns.
embedding_result = get_embedding('저는 학생이에요')
print(embedding_result)

[-0.023699026554822922, -0.0165309589356184, -0.016086919233202934, -0.03511718660593033, -0.026185648515820503, 0.016480211168527603, -0.011221514083445072, 0.005899382755160332, -0.012940581887960434, 0.006349765695631504, -0.004507001489400864, -0.0005451378528960049, 0.0015462092123925686, -0.01812950149178505, -0.002726085716858506, -0.028723016381263733, 0.02555130608379841, -0.010149476118385792, -0.0030210549011826515, -0.018116815015673637, 0.010022607631981373, -0.006565442308783531, -0.00028505755471996963, -0.022531837224960327, -0.01000357698649168, 0.009921113029122353, 0.01767277531325817, -0.024371428415179253, 0.020578062161803246, -0.009730810299515724, 0.02454904466867447, -0.002036238554865122, -0.013790600001811981, -0.01852279342710972, -0.0016778352437540889, 0.004040759988129139, 0.008811013773083687, -0.0036442962009459734, -0.0025643284898251295, 0.010859939269721508, 0.006197523791342974, -0.015896616503596306, -0.0006775567890144885, -0.011126362718641758, -

In [15]:
# Number of components in the sample embedding (1536 in the recorded run).
len(embedding_result)

1536

In [16]:
# Same six-sentence demo corpus, rebuilt so this section starts from a frame
# that has only the 'text' column.
data = [
    '저는 배가 고파요',
    '저기 배가 지나가네요',
    '굶어서 허기가 지네요',
    '허기 워기라는 게임이 있는데 즐거워',
    '스팀에서 재밌는 거 해야지',
    '스팀에어프라이어로 연어구이 해먹을거야',
]

# One row per sentence, stored in a single 'text' column.
df = pd.DataFrame({'text': data})
df

Unnamed: 0,text
0,저는 배가 고파요
1,저기 배가 지나가네요
2,굶어서 허기가 지네요
3,허기 워기라는 게임이 있는데 즐거워
4,스팀에서 재밌는 거 해야지
5,스팀에어프라이어로 연어구이 해먹을거야


In [17]:
# Embed every sentence with the current get_embedding() and store each vector
# next to its text.
df['embedding'] = df['text'].apply(get_embedding)

In [18]:
# Show the frame to confirm each sentence now has an embedding next to it.
df

Unnamed: 0,text,embedding
0,저는 배가 고파요,"[-0.01663736067712307, -0.02178889885544777, 0..."
1,저기 배가 지나가네요,"[-0.003291434608399868, -0.02751476690173149, ..."
2,굶어서 허기가 지네요,"[-0.006181030999869108, -0.0069507937878370285..."
3,허기 워기라는 게임이 있는데 즐거워,"[-0.011329255998134613, -0.011715852655470371,..."
4,스팀에서 재밌는 거 해야지,"[-0.016108456999063492, -0.014401600696146488,..."
5,스팀에어프라이어로 연어구이 해먹을거야,"[-0.002138908952474594, -0.030034277588129044,..."


In [19]:
def cos_sim(A, B):
    """Cosine similarity between two vectors.

    Args:
        A: First vector (array-like of numbers).
        B: Second vector, same length as ``A``.

    Returns:
        ``dot(A, B) / (norm(A) * norm(B))``, or ``0.0`` when either vector has
        zero norm and the quotient would be undefined.
    """
    scale = norm(A) * norm(B)
    if scale == 0:
        # Guard against 0/0, which would produce nan and a RuntimeWarning.
        return 0.0
    return dot(A, B) / scale

def return_answer_candidate(df, query, top_k=3):
    """Return the rows of ``df`` whose embeddings are most similar to ``query``.

    ``get_embedding`` is looked up when the function is called, so the query is
    embedded with whichever helper of that name is currently defined in the
    notebook. It must be the same one that filled ``df['embedding']``.

    Args:
        df: DataFrame with an ``embedding`` column holding one vector per row.
            It is left unmodified; scores are attached to a new frame.
        query: Text to embed and search for.
        top_k: Number of best-scoring rows to return. Defaults to 3, the
            value that used to be fixed inside the function.

    Returns:
        A new DataFrame with an added ``similarity`` column, sorted by
        descending similarity with a fresh 0..n-1 index, cut to ``top_k`` rows.
    """
    # Convert the query embedding to an array once rather than on every row.
    query_vector = np.array(get_embedding(query))

    # assign() returns a new frame, keeping the caller's DataFrame free of a
    # 'similarity' column left over from a previous query.
    scored = df.assign(
        similarity=df["embedding"].apply(
            lambda row_vector: cos_sim(np.array(row_vector), query_vector)
        )
    )

    ranked = scored.sort_values("similarity", ascending=False, ignore_index=True)
    return ranked.head(top_k)

In [20]:
# Same query as in the Hugging Face section (roughly "I haven't eaten
# anything, so my stomach is growling"); display the best-matching rows.
sim_result = return_answer_candidate(df, '아무 것도 안 먹었더니 꼬르륵 소리가나네')
sim_result

Unnamed: 0,text,embedding,similarity
0,굶어서 허기가 지네요,"[-0.006181030999869108, -0.0069507937878370285...",0.838333
1,스팀에어프라이어로 연어구이 해먹을거야,"[-0.002138908952474594, -0.030034277588129044,...",0.821603
2,저는 배가 고파요,"[-0.01663736067712307, -0.02178889885544777, 0...",0.814259
