In [14]:
import numpy as np

def cosine_similarity(vec_a:np.ndarray, vec_b:np.ndarray)->float:
    """Return the cosine similarity between two vectors.

    Args:
        vec_a: First vector (a NumPy array or any sequence ``np.dot`` accepts).
        vec_b: Second vector, with the same length as ``vec_a``.

    Returns:
        ``dot(vec_a, vec_b) / (||vec_a|| * ||vec_b||)``. When either vector has
        zero norm the ratio is undefined, so 0.0 is returned instead of
        dividing by zero (which would produce NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    # Guard the division: a zero vector has no direction to compare.
    if norm_product == 0:
        return 0.0
    return np.dot(vec_a, vec_b) / norm_product

# OpenAI embedding

In [3]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment; a later
# cell reads UPSTAGE_API_KEY via os.getenv. Presumably the OpenAI key is
# supplied the same way — confirm.
load_dotenv()

True

In [4]:
from openai import OpenAI
# Notebook-level client that get_openai_embedding calls. No key is passed
# explicitly, so credentials presumably come from the environment — confirm.
openai_client = OpenAI()

In [14]:
def get_openai_embedding(text:str, model='text-embedding-3-small'):
    """Request an embedding for ``text`` through the notebook-level ``openai_client``.

    Args:
        text: Input to embed; forwarded unchanged as the ``input`` argument.
        model: Embedding model name forwarded as the ``model`` argument.

    Returns:
        The ``embedding`` attribute of the first item in the response's ``data``.
    """
    request_kwargs = {'input': text, 'model': model}
    api_result = openai_client.embeddings.create(**request_kwargs)
    # A single input is sent, so only the first data item is read.
    first_item = api_result.data[0]
    return first_item.embedding

In [None]:
# Smoke-test the OpenAI helper on a short Korean greeting.
text_str = '안녕하세요'
emb_vector = get_openai_embedding(text_str)
# NOTE(review): the bare last expression displays the entire vector;
# a slice such as emb_vector[:5] would keep the output short.
emb_vector

[-0.002531265141442418,
 -0.06127675995230675,
 -0.008443817496299744,
 0.031540773808956146,
 0.031089577823877335,
 -0.04993239790201187,
 -0.059171177446842194,
 0.03244316577911377,
 -0.014116000384092331,
 -0.06140567362308502,
 -0.020722804591059685,
 0.006832401733845472,
 0.009195811115205288,
 -0.020067494362592697,
 -0.011484021320939064,
 0.035214800387620926,
 -0.04653768241405487,
 0.004017795901745558,
 -0.0141482288017869,
 0.027394063770771027,
 0.05706559494137764,
 -0.017306603491306305,
 -0.031368888914585114,
 -0.020185666158795357,
 0.04413130134344101,
 0.06239400804042816,
 0.056936681270599365,
 0.0005176672129891813,
 0.017166946083307266,
 -0.05358493700623512,
 0.0314333438873291,
 -0.02758743427693844,
 -0.01743551529943943,
 0.0011414192849770188,
 -0.002343266736716032,
 0.02537442371249199,
 0.02264576032757759,
 -0.04391644522547722,
 -0.011752590537071228,
 -0.034204982221126556,
 -0.013546633534133434,
 -0.018176767975091934,
 -0.00301334704272449,
 0.

In [None]:
from langchain_upstage import UpstageEmbeddings

# Create the Upstage embedding model object; the API key is read from the
# UPSTAGE_API_KEY environment variable.
# NOTE(review): a later cell assigns a different instance to the same name `embeddings`.
embeddings = UpstageEmbeddings(
    api_key = os.getenv('UPSTAGE_API_KEY'),
    model = "solar-embedding-1-large"
)

In [None]:
# Embedding helper backed by the Upstage model
def get_upstage_embedding(text: str, is_query: bool = False) -> np.ndarray:
    """
    Return the Upstage embedding of ``text`` as a NumPy array.

    The notebook-level ``embeddings`` object performs the actual call.

    Args:
        text (str): Sentence to embed.
        is_query (bool): When True, embed with ``embed_query`` (search-query
                         embedding); when False, embed with ``embed_documents``
                         (document embedding).

    Returns:
        np.ndarray: The embedding vector wrapped with ``np.array``.
    """
    if not is_query:
        # embed_documents takes a list, so wrap the text and keep only the first result.
        return np.array(embeddings.embed_documents([text])[0])
    return np.array(embeddings.embed_query(text))

In [10]:
# pip install -qU langchain-core langchain-upstage
 
from langchain_upstage import UpstageEmbeddings
 
# NOTE(review): this rebinds the notebook-level name `embeddings`, which an earlier cell
# created with model "solar-embedding-1-large". get_upstage_embedding reads that global,
# so any cell run after this one uses this instance instead — confirm that is intended.
embeddings = UpstageEmbeddings(
    api_key = os.getenv('UPSTAGE_API_KEY'),
    model = "embedding-query"
)
 
# Embed two sample documents in one call and print the raw result.
doc_result = embeddings.embed_documents(
    ["Sam is a teacher.", "This is another document"]
)
print(doc_result)
 
# Embed a single query string and print the raw result.
query_result = embeddings.embed_query("What does Sam do?")
print(query_result)

[[0.0163726806640625, 0.0171051025390625, -0.007747650146484375, 0.0241241455078125, 0.0033721923828125, -0.00681304931640625, -0.014678955078125, -0.010711669921875, -0.015655517578125, 0.006740570068359375, 0.01837158203125, 0.00504302978515625, 0.00727081298828125, 0.01177978515625, 0.0277099609375, 0.0218658447265625, -0.021484375, -0.0012388229370117188, -0.0029773712158203125, -0.0167083740234375, -0.02410888671875, -0.00911712646484375, -0.0112457275390625, -0.00592041015625, -0.01033782958984375, 0.016265869140625, 0.0027980804443359375, -0.0119476318359375, 0.005157470703125, 0.0214996337890625, 0.00566864013671875, 0.0140838623046875, 0.0020599365234375, -0.00940704345703125, 0.01068115234375, -0.0122528076171875, -0.0036144256591796875, 0.023773193359375, -0.0124053955078125, 0.01280975341796875, -0.0112762451171875, -0.020050048828125, -0.0140533447265625, 8.082389831542969e-05, -0.01654052734375, 0.01288604736328125, 0.00689697265625, -0.0008783340454101562, 0.004817962646

In [11]:
# Number of components in the query embedding returned by embed_query.
len(query_result)

4096

In [1]:
# Words whose embeddings are compared in the following cells:
# three English words plus the Korean word for "king".
texts = ['king', 'queen', 'slave', '왕']

# OpenAI 임베딩 모델로 임베딩

In [None]:
# Map every entry of `texts` to its OpenAI embedding (one helper call per entry).
openai_embeddings = {
    word: get_openai_embedding(word)
    for word in texts
}


{'king': [0.037221893668174744,
  -0.022094957530498505,
  0.05194341763854027,
  0.00014054813073016703,
  -0.013733332976698875,
  -0.023032471537590027,
  -0.02377995103597641,
  0.005580750294029713,
  -0.07190996408462524,
  -0.002796709770336747,
  0.043784502893686295,
  0.002969326451420784,
  -0.016748584806919098,
  -0.012821156531572342,
  0.04624231159687042,
  0.002609840128570795,
  -0.06618351489305496,
  -0.004269495606422424,
  0.01686260849237442,
  0.03144477307796478,
  -0.007702828850597143,
  0.015076261013746262,
  0.09593062847852707,
  0.03739926218986511,
  -0.006853997707366943,
  0.037525951862335205,
  -0.004757257178425789,
  -0.014265436679124832,
  0.07718031853437424,
  -0.041047971695661545,
  0.05315965414047241,
  -0.021284133195877075,
  0.03744993731379509,
  -0.01978917606174946,
  -0.02123345620930195,
  -0.05655498057603836,
  -0.004750922322273254,
  -0.013682656921446323,
  -0.038159407675266266,
  0.04109864681959152,
  -0.01785079948604107,


In [18]:
# Cosine similarity between the OpenAI embeddings of 'queen' and 'king'.
cosine_similarity(openai_embeddings['queen'], openai_embeddings['king'])

np.float64(0.590601530239691)

# 업스테이지 임베딩 모델로 임베딩

In [21]:
# Map every entry of `texts` to its Upstage embedding (is_query left at its default).
# NOTE(review): the variable name is spelled "upsatage"; later cells rely on this spelling.
upsatage_embeddings = {
    word: get_upstage_embedding(word)
    for word in texts
}


In [22]:
# Cosine similarity between the Upstage embeddings of 'queen' and 'king'.
cosine_similarity(upsatage_embeddings['queen'], upsatage_embeddings['king'])

np.float64(0.6445486818426877)

# 왕과 king의 비교

In [23]:
# Cross-lingual check: OpenAI embeddings of the Korean '왕' versus English 'king'.
cosine_similarity(openai_embeddings['왕'], openai_embeddings['king'])

np.float64(0.5040406331683572)

In [24]:
# Cross-lingual check: Upstage embeddings of the Korean '왕' versus English 'king'.
cosine_similarity(upsatage_embeddings['왕'], upsatage_embeddings['king'])

np.float64(0.6964319194913206)

# 올라마 임베딩 > Hugging Face의 임베딩 모델

In [2]:
from langchain_community.chat_models import ChatOllama
# Quick check of a chat model served through Ollama, using the 'gemma2' model name.
# NOTE(review): this cell makes a chat call, not an embedding call. The saved output also
# echoes the constructor line below, which looks like a warning (possibly a deprecation) — confirm.
llm_ollama = ChatOllama(model='gemma2')
# The Korean prompt asks the model to introduce itself in two lines.
response = llm_ollama.invoke('안녕? 네 소개를 2줄로 작성해.')
response.content

  llm_ollama = ChatOllama(model='gemma2')


'안녕하세요! 저는 구글에서 훈련된 대규모 언어 모델입니다.\n\n텍스트 생성, 번역, 요약 등 다양한 작업을 수행할 수 있습니다. 😊'

# 오픈소스(OS) 계열의 임베딩 모델 사용
허깅페이스의 임베딩 모델 사용 - transformers 라이브러리, GPU기반 pytorch<br>
토치쿠다 기반의 가상환경에서 실행

In [18]:
import torch
# Model identifier handed to SentenceTransformer in the next cell.
MODEL = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [21]:
from sentence_transformers import SentenceTransformer
# Load the sentence-transformers model named by MODEL onto the selected device.
model = SentenceTransformer(MODEL, device=device)
# Encode each entry of `texts` separately and key the result by the original string.
# NOTE(review): despite the variable name, these vectors come from `model` (MODEL), not from Gemma.
gemma_embedding = {
    word: model.encode(word)
    for word in texts
}
gemma_embedding

{'king': array([-9.88262426e-03,  6.36291802e-01, -1.05976701e-01, -9.73002985e-02,
        -3.49916190e-01, -2.51597077e-01,  5.45767844e-01,  2.71291286e-01,
         3.66875678e-01,  1.68192849e-01, -1.43975481e-01, -1.69313207e-01,
         1.31551817e-01, -3.81145149e-01, -2.25974515e-01, -1.72765747e-01,
        -3.63810301e-01,  1.78069815e-01, -2.50198901e-01,  3.72574963e-02,
         7.22347558e-01,  3.62045169e-01, -2.91301787e-01, -5.72886467e-02,
        -3.42295557e-01, -4.41963762e-01,  2.13033795e-01,  5.40564172e-02,
         1.64299980e-01, -2.67429203e-01, -1.75393835e-01, -5.77891052e-01,
         6.20606899e-01,  3.14617515e-01, -2.48781011e-01,  1.98306605e-01,
         1.55594066e-01,  3.44438940e-01,  7.89323509e-01, -3.39817017e-01,
         3.32258195e-01, -4.51881699e-02, -4.57230024e-02,  1.97162822e-01,
         1.69300437e-01,  4.11989570e-01, -2.04015031e-01,  2.29335189e-01,
        -2.98015028e-01,  3.81826423e-02,  1.14806294e-01,  4.15260226e-01,
    

In [22]:
# Cross-lingual check with the sentence-transformers vectors: 'king' versus the Korean '왕'.
cosine_similarity(gemma_embedding['king'], gemma_embedding['왕'])

np.float32(0.8712734)