# Embeddings

In [1]:
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')

In [2]:
from openai import OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)
text="안녕하세요"
response = client.embeddings.create(
    model="text-embedding-3-small",
    input=[text]
)

In [3]:
print(response.data[0].embedding)
print(len(response.data[0].embedding))

[-0.002531265141442418, -0.06127675995230675, -0.008443817496299744, 0.031540773808956146, 0.031089577823877335, -0.04993239790201187, -0.059171177446842194, 0.03244316577911377, -0.014116000384092331, -0.06140567362308502, -0.020722804591059685, 0.006832401733845472, 0.009195811115205288, -0.020067494362592697, -0.011484021320939064, 0.035214800387620926, -0.04653768241405487, 0.004017795901745558, -0.0141482288017869, 0.027394063770771027, 0.05706559494137764, -0.017306603491306305, -0.031368888914585114, -0.020185666158795357, 0.04413130134344101, 0.06239400804042816, 0.056936681270599365, 0.0005176672129891813, 0.017166946083307266, -0.05358493700623512, 0.0314333438873291, -0.02758743427693844, -0.01743551529943943, 0.0011414192849770188, -0.002343266736716032, 0.02537442371249199, 0.02264576032757759, -0.04391644522547722, -0.011752590537071228, -0.034204982221126556, -0.013546633534133434, -0.018176767975091934, -0.00301334704272449, 0.04782681167125702, 0.05259660258889198, 0.0

### 음식 리뷰 데이터

corpus -> embedding vector -> 유사도 기반 검

In [4]:
import pandas as pd

df = pd.read_csv('fine_food_reviews_1k.csv')
df

Unnamed: 0.1,Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text
0,0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...
1,1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos..."
2,2,1351123200,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...
3,3,1351123200,B004AHGBX4,A2UY46X0OSNVUQ,3,These also have SALT and it's not sea salt.,I like the fact that you can see what you're g...
4,4,1351123200,B001BORBHO,A1AFOYZ9HSM2CZ,5,Happy with the product,My dog was suffering with itchy skin. He had ...
...,...,...,...,...,...,...,...
995,995,1351209600,B004OQLIHK,AKHQMSUORSA91,5,Delicious!,I have ordered these raisins multiple times. ...
996,996,1351209600,B0006349W6,A21BT40VZCCYT4,5,Good Training Treat,My dog will come in from outside when I am tra...
997,997,1351209600,B00611F084,A6D4ND3C3BCYV,5,Jamica Me Crazy Coffee,Wolfgang Puck's Jamaica Me Crazy is that wonde...
998,998,1351209600,B005QKH5HA,A3LR9HCV3D96I3,5,Party Peanuts,Great product for the price. Mix with the Asia...


In [5]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [6]:
import tiktoken

gpt4o_encoding = tiktoken.encoding_for_model('gpt-4o')

In [7]:
df['n_tokens'] = df['Text'].apply(lambda x: len(gpt4o_encoding.encode(x)))
df[['Text', 'n_tokens']].head()

Unnamed: 0,Text,n_tokens
0,Wanted to save some to bring to my Chicago fam...,33
1,"Not pleased at all. When I opened the box, mos...",26
2,I'm not sure that custard is really custard wi...,242
3,I like the fact that you can see what you're g...,216
4,My dog was suffering with itchy skin. He had ...,76


In [8]:
df['n_tokens'].describe()

Unnamed: 0,n_tokens
count,1000.0
mean,83.818
std,71.905308
min,22.0
25%,38.0
50%,59.0
75%,104.0
max,614.0


In [9]:
def texts_to_embedding(texts):
  # 전처리 - 불용어 제거 등...
  texts = [text.replace('\n', ' ') for text in texts]

  response = client.embeddings.create(
      model='text-embedding-3-small',
      input=texts
  )

  return [data.embedding for data in response.data]

In [10]:
texts_to_embedding(["안녕하세요, 저는 이주원입니다.", "오늘 날씨가 좋네요."])

[[0.0148796197026968,
  -0.015200103633105755,
  -0.028458990156650543,
  0.025254148989915848,
  0.028477301821112633,
  -0.0357477143406868,
  -0.06105680391192436,
  0.05699123069643974,
  -0.030290327966213226,
  -0.015081066638231277,
  -0.0454171784222126,
  0.032194919884204865,
  -0.008414996787905693,
  -0.0062494403682649136,
  -0.00994416419416666,
  0.022030994296073914,
  -0.010475251823663712,
  0.009641993790864944,
  -0.00591064291074872,
  0.03261612728238106,
  0.01551143079996109,
  -0.009733560495078564,
  0.029081644490361214,
  0.02318473719060421,
  0.033898063004016876,
  0.03686482831835747,
  0.02930140495300293,
  0.0036970132496207952,
  0.01402804721146822,
  -0.0880507230758667,
  -0.02164641208946705,
  -0.01928398758172989,
  -0.04732177034020424,
  -0.009788500145077705,
  -0.027781395241618156,
  0.015483961440622807,
  0.01347864605486393,
  -0.029795866459608078,
  -0.011967792175710201,
  -0.04746827483177185,
  0.002294895239174366,
  -0.0276898276

In [11]:
df['embedding'] = texts_to_embedding(df['Text'].tolist())

In [12]:
df['embedding']

Unnamed: 0,embedding
0,"[0.016770517453551292, -0.008579619228839874, ..."
1,"[-0.005216312129050493, 0.040469057857990265, ..."
2,"[0.005564768798649311, -0.012970144860446453, ..."
3,"[-0.016292475163936615, 0.008886804804205894, ..."
4,"[-0.004322985652834177, -0.06378211826086044, ..."
...,...
995,"[0.018859002739191055, -0.03717847540974617, -..."
996,"[-0.037051208317279816, -0.013656404800713062,..."
997,"[-0.04671481251716614, -0.07131096720695496, -..."
998,"[-0.014545817859470844, -0.03374110162258148, ..."


In [13]:
embed_df = df['embedding'].to_frame('embedding')  # 1줄만 있으면 시리즈 형태이기 때문에 데이터프레임 형식으로 변환, to_frame('embedding') 컬럼명 지정.
embed_df.index = df['Text']
embed_df

Unnamed: 0_level_0,embedding
Text,Unnamed: 1_level_1
Wanted to save some to bring to my Chicago family but my North Carolina family ate all 4 boxes before I could pack. These are excellent...could serve to anyone,"[0.016770517453551292, -0.008579619228839874, ..."
"Not pleased at all. When I opened the box, most of the rings were broken in pieces. A total waste of money.","[-0.005216312129050493, 0.040469057857990265, ..."
"I'm not sure that custard is really custard without eggs. But this comes close. I got it for use in a ""Vegan pancake"" recipe. We were having houseguests who were Vegan and I wanted to make some special breakfasts while they were here. One of the cooking/recipe sites had a recipe using this and there were lots of great reviews. I tried the recipe and it turned out like wallpaper paste -- yuck!<br />However, the so-called custard isn't so bad. I think it's probably just cornstarch and annatto (yellow coloring with a slight flavor). It's fun playing with it. You could dress it up with fruit. Seems to come out on the thin side when you make it as directed, so I use less milk because I like my custards to set firm. As a custard sauce it's fine. I would say it tastes something between a pudding and a custard.<br /><br />If you want a really good egg-free ""custard"" get an original recipe for ""blanc mange."" It takes a lot longer to make, but it's certainly worth the difference.","[0.005564768798649311, -0.012970144860446453, ..."
"I like the fact that you can see what you're getting and that there are no bones or dark meat. There are 7 nice big chunks in every jar.<br /><br />These taste like tuna in a can but, because they're preserved in glass, you don't have to worry about either aluminum or BPA; BUT ... they are not just tuna and spring water.<br /><br />There is salt in there, too, and it's not healthy sea salt, it's toxic table salt.<br /><br />I am trying to contact Tonnino to confirm that. I might be wrong because the label states that the ingredients are ""tuna fish"" but the sticker on the top clarifies that it is the smaller (healthier) yellowfin, so the ""salt"" listed in the ingredients might be sea salt but, if it was, why don't they say so?<br /><br />Without confirmation, I will continue to look for a salt-free olive-oil free tuna preserved in glass.<br /><br />If you know of one, please contact me!","[-0.016292475163936615, 0.008886804804205894, ..."
My dog was suffering with itchy skin. He had been eating Natural Choice brand (cheaper) since he was a puppy. I was nervous to change foods. The vet suggested to change foods sand see if the skin issues cleared up. Wellness brand did the job. My dog seems to love the food and the skin issues cleared up within a few weeks.,"[-0.004322985652834177, -0.06378211826086044, ..."
...,...
I have ordered these raisins multiple times. They are always great and arrive timely. I can't go back to store bought chocolate covered raisins now! Love this product.,"[0.018859002739191055, -0.03717847540974617, -..."
My dog will come in from outside when I am training her and look at the cupboard waiting for her treat. When I use the clicker training method she comes because she knows she has something special.,"[-0.037051208317279816, -0.013656404800713062,..."
Wolfgang Puck's Jamaica Me Crazy is that wonderful blend of island flavors in a coffee. Have loved it from the first time tasting. Great product.,"[-0.04671481251716614, -0.07131096720695496, -..."
Great product for the price. Mix with the Asian rice crackers for an excellent snack. Big container lasts a long time. Only lightly slighted. Peanuts are whole and large.,"[-0.014545817859470844, -0.03374110162258148, ..."


In [14]:
import numpy as np

cos_sim = lambda a, b: np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) # 코사인 유사도

def get_similar_texts(query, embed_df, top_n=5):
  query_embed = texts_to_embedding(query)[0]
  embed_df['cos_sim'] = embed_df['embedding'].apply(lambda x:cos_sim(x, query_embed))
  return embed_df.sort_values('cos_sim', ascending=False).head(top_n)

In [15]:
get_similar_texts(['coffee'], embed_df)

Unnamed: 0_level_0,embedding,cos_sim
Text,Unnamed: 1_level_1,Unnamed: 2_level_1
"I I haven't had a bad cup of coffee yet. So far, my favorites are the Decaf, Columbian, and Breakfast blend. I only drink one cup of coffee a day, so this type of coffee maker is perfect for me. Especially since I only like coffee when it's hot and fresh. My guests love it too!","[-0.01845250464975834, -0.05068936571478844, 0...",0.419657
"I have a coffee maker that grinds my coffee beans. It's hard to find whole bean decafinated coffee. When I find it in the brand that I like, I am excited. Seattle's Best is my favorite.","[-0.03934145346283913, -0.03586488589644432, -...",0.406217
So yummy... Drinking it Black coffee or w cream this coffee is delish .... One of my favs... I would recommend this coffee to everyone :),"[0.01336692739278078, -0.04671807214617729, -0...",0.402125
So yummy... Drinking it Black coffee or w cream this coffee is delish .... One of my favs... I would recommend this coffee to everyone :),"[0.01336692739278078, -0.04671807214617729, -0...",0.402125
"So my wife is a latte freak, and nursing, so decaf is the approved type. After the Senseo left the market, I struggled and found the <a href=""http://www.amazon.com/gp/product/B0047BIWSK"">Aerobie AeroPress Coffee and Espresso Maker</a> which is like a French Press for the 21st century. After getting our recipe figured out, my wife, who's been buying Venti Decaf Latte's at $4 a pop almost daily for years now declares that Seattle's best Level 3 Decaf in her home-made Latte is the best coffee she can get. We've tried other bands, and this is her favorite, hands down!","[-0.013580745086073875, -0.04869876429438591, ...",0.40058
