In [1]:
import pandas as pd

In [2]:
# Read the review sample, using the first CSV column as the row index,
# then preview the first five rows.
df = pd.read_csv(
    filepath_or_buffer="fine_food_reviews_1k.csv",
    index_col=0,
)
df.head(n=5)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...
1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos..."
2,1351123200,B000JMBE7M,AQX1N6A51QOKG,4,"It isn't blanc mange, but isn't bad . . .",I'm not sure that custard is really custard wi...
3,1351123200,B004AHGBX4,A2UY46X0OSNVUQ,3,These also have SALT and it's not sea salt.,I like the fact that you can see what you're g...
4,1351123200,B001BORBHO,A1AFOYZ9HSM2CZ,5,Happy with the product,My dog was suffering with itchy skin. He had ...


In [3]:
# Per-column count of missing values.
df.isna().sum()

Time         0
ProductId    0
UserId       0
Score        0
Summary      0
Text         0
dtype: int64

In [4]:
# Build one text per review from the whitespace-stripped title and body.
df["combined"] = (
    "Title: "
    + df["Summary"].str.strip()
    + "; Content: "
    + df["Text"].str.strip()
)

In [5]:
# Show the first two rows, now including the `combined` column.
df.head(n=2)

Unnamed: 0,Time,ProductId,UserId,Score,Summary,Text,combined
0,1351123200,B003XPF9BO,A3R7JR3FMEBXQB,5,where does one start...and stop... with a tre...,Wanted to save some to bring to my Chicago fam...,Title: where does one start...and stop... wit...
1,1351123200,B003JK537S,A3JBPC3WFUT5ZP,1,Arrived in pieces,"Not pleased at all. When I opened the box, mos...",Title: Arrived in pieces; Content: Not pleased...


In [6]:
top_n = 3
# Keep the 2 * top_n rows with the largest Time values, then drop the
# timestamp column, all in one chained expression.
# NOTE(review): several rows share the same Time value; which of the tied
# rows survive depends on the default sort algorithm — confirm this is OK.
df = (
    df.sort_values(by="Time")
    .tail(2 * top_n)
    .drop(columns=["Time"])
)

In [7]:
# Per-review token ceiling; rows whose n_tokens exceed it are filtered out
# in a later cell.
max_tokens = 8000

In [8]:
import tiktoken

# Tokenizer used below to count the tokens of each combined review text.
# NOTE(review): presumably "cl100k_base" matches the embedding model's
# tokenizer — confirm.
encoding = tiktoken.get_encoding("cl100k_base")

In [9]:
# Token count of each combined text under the selected encoding.
df["n_tokens"] = df["combined"].map(
    lambda text: len(encoding.encode(text))
)

In [10]:
# Drop reviews above the token ceiling, then keep the last top_n of the rest.
df = df.loc[df["n_tokens"].le(max_tokens)].tail(top_n)

In [11]:
# Number of reviews remaining after filtering.
df.shape[0]

3

In [12]:
from dotenv import load_dotenv

# Load environment variables from a .env file, with override=True requested.
# NOTE(review): presumably this supplies the API credentials for the OpenAI
# client created in a later cell — confirm.
load_dotenv(override=True)

True

In [13]:
from openai import OpenAI 

# Shared OpenAI client; get_embedding() below reads this module-level name.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment — confirm.
client = OpenAI()

In [14]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding for ``text`` as produced by ``model``.

    Newline characters in ``text`` are replaced with single spaces before the
    request is made. The request goes through the module-level ``client``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [17]:
# Embed every combined review text; get_embedding is called once per row,
# with the model name forwarded as a keyword argument.
df["embedding"] = df["combined"].apply(
    get_embedding, model="text-embedding-ada-002"
)

In [18]:
# Preview the rows with their token counts and embeddings.
df.head(n=5)

Unnamed: 0,ProductId,UserId,Score,Summary,Text,combined,n_tokens,embedding
625,B0009ET7TC,A2FSDQY5AI6TNX,5,My furbabies LOVE these!,Shake the container and they come running. Eve...,Title: My furbabies LOVE these!; Content: Shak...,47,"[-0.009747001342475414, -0.006828183773905039,..."
619,B007PA32L2,A15FF2P7RPKH6G,5,got this for the daughter,all i have heard since she got a kuerig is why...,Title: got this for the daughter; Content: all...,50,"[-0.005203990265727043, 0.0009916637791320682,..."
999,B001EQ5GEO,A3VYU0VO6DYV6I,5,I love Maui Coffee!,My first experience with Maui Coffee was bring...,Title: I love Maui Coffee!; Content: My first ...,118,"[-0.006159266456961632, -0.015069019980728626,..."


In [19]:
# Write the embedded reviews to CSV; index=False leaves the row index out
# of the file.
df.to_csv(
    path_or_buf="fine_food_reviews_with_embeddings.csv",
    index=False,
)