# OpenAI Embeddings

Embedding 1536 examples from the new_annotate_700 dataset costs about $0.005006 USD.

In [None]:
# Ask for the OpenAI API key through a redacted (password-style) input box.
import panel as pn

pn.extension()

password_input = pn.widgets.PasswordInput(
    placeholder='<OpenAI API Key>',
    name='Enter your OpenAI API key then run the next cell:',
)
# Last expression of the cell: renders the widget.
password_input

In [None]:
import os  # os is not imported by any earlier cell shown in this notebook

# Copy the key typed into the widget into the OPENAI_API_KEY environment variable.
os.environ['OPENAI_API_KEY'] = password_input.value

In [None]:
# Validate your API key.
import os  # os is not imported by any earlier cell shown in this notebook
import re

# Raise explicitly rather than assert: assert statements are skipped when
# Python runs with -O, which would silently disable this check.
if len(os.environ['OPENAI_API_KEY']) != 51:
    raise ValueError("OpenAI's API Key are 51 characters.")
# Display only the first three characters; every remaining character is
# replaced by '*' so the key is not shown in the cell output.
os.environ['OPENAI_API_KEY'][:3] + re.sub('.', '*', os.environ['OPENAI_API_KEY'][3:])

# Loop over dataset and save the embeddings

In [None]:
import openai
import srsly

def request_and_persist(sentence: str, dir_: str='.') -> None:
    """Request an embedding for ``sentence`` and persist the raw response as JSON.

    The response of ``openai.Embedding.create`` (model
    ``text-embedding-ada-002``) is written to ``{dir_}/{hash(sentence)}.json``.

    NOTE: the built-in ``hash()`` of a ``str`` is salted per interpreter
    process unless ``PYTHONHASHSEED`` is fixed, so the file name of a given
    sentence is only reproducible within the same process.

    Args:
        sentence: Text to embed.
        dir_: Directory that receives the JSON file; created if missing.

    Raises:
        TypeError: If ``sentence`` is not a ``str``.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    if not isinstance(sentence, str):
        raise TypeError("sentence must be a str.")

    # Create the output directory *before* the paid API call, so a missing
    # directory cannot make the write fail after the request was billed.
    Path(dir_).mkdir(parents=True, exist_ok=True)

    print("requesting for embeddings... ", end='')
    response = openai.Embedding.create(
        input=sentence,
        model="text-embedding-ada-002",
    )
    path = f'{dir_}/{hash(sentence)}.json'
    srsly.write_json(path, dict(response))
    print(f"persisted to {path}.")

In [None]:
import pandas as pd

# Load the annotated dataset from the Excel workbook next to this notebook.
df = pd.read_excel(io='./new_annot_data700.xlsx')

In [None]:
# Take the sentence of the first dataset row as a smoke-test input.
sent = df['sentence'].iloc[0]
sent

In [None]:
# Embed the single sample sentence and store the response under ./embeddings.
request_and_persist(sentence=sent, dir_='embeddings')

In [None]:
# this cell costs money.
# from tqdm.auto import tqdm
# for sent in tqdm(df.sentence):
    # print(sent, type(sent))
    # request_and_persist(sent, dir_='embeddings')

In [None]:
from pathlib import Path

# Collect every persisted response file, then show the file count next to
# the number of dataset rows.
files = [*Path('./embeddings').glob('*.json')]
len(files), len(df)

In [None]:
# Hash each sentence with the built-in hash(), the same function used to name
# the persisted JSON files. NOTE: str hashes are salted per interpreter
# process unless PYTHONHASHSEED is fixed, so these values only match files
# written by the same process.
df['sent_hash'] = df['sentence'].apply(hash)

In [None]:
# Index the dataset by sentence hash so rows can be looked up by file name.
dff = df.set_index(keys='sent_hash')

In [None]:
# Flag repeated hashes (all occurrences after the first) and drop them so
# every index label maps to exactly one row.
duplicate_indices = dff.index.duplicated(keep='first')
dff = dff.loc[~duplicate_indices]

In [None]:
import srsly
import numpy as np

# Pair each persisted embedding with its dataset row. The file stem is the
# sentence hash, which is also the index of dff.
embeddings = []
for f in files:
    data = srsly.read_json(f)
    emb = np.array(data.get('data')[0].get('embedding'))
    # Find the sentence associated with this file.
    try:
        row = dff.loc[int(f.stem)]
    except (KeyError, ValueError):
        # KeyError: no dataset row carries this hash; ValueError: the file
        # stem is not an integer. Such files are skipped on purpose, but any
        # other error (e.g. a missing column) is no longer swallowed.
        continue
    embeddings.append((row.sentence, emb, row.det, row.se, row.nat, row.hom, row.pos))
# A large gap between these two numbers means many files had no matching row.
len(embeddings), len(files)

In [None]:
# One row per matched file: the sentence (as 'query'), its embedding vector
# and the label columns copied from the dataset.
emb_df = pd.DataFrame(
    data=embeddings,
    columns=['query', 'embedding', 'det', 'se', 'nat', 'hom', 'pos'],
)
len(emb_df)

In [None]:
# Keep rows where the XOR of the four flag columns equals 1, or where pos is 0.
# The pos test must read emb_df, not df: df has a different index and length,
# so a mask built from it would be aligned by label and test unrelated rows.
# NOTE(review): assuming 0/1 flags, the XOR is also 1 when three flags are
# set — confirm whether "exactly one flag" was intended.
emb_df = emb_df[(emb_df['det'] ^ emb_df['se'] ^ emb_df['nat'] ^ emb_df['hom'] == 1) | (emb_df['pos'] == 0)]
len(emb_df)

In [None]:
def to_string(row):
    """Return the class name of the first truthy flag on ``row``.

    Flags are checked in the order det, se, nat, hom; later flags are not
    read once an earlier one is truthy. If none is truthy the row is
    labelled "neutral".
    """
    flag_to_label = (
        ("det", "determinism"),
        ("se", "specific_aetiology"),
        ("nat", "naturalness"),
        ("hom", "homogeneity"),
    )
    for flag, label in flag_to_label:
        if getattr(row, flag):
            return label
    return "neutral"
# Label every row with its class name. assign() returns a new frame instead of
# writing a column into emb_df in place: emb_df may be the result of a boolean
# filter, and in-place item assignment on such a frame can trigger pandas'
# SettingWithCopyWarning.
emb_df = emb_df.assign(clazz=emb_df.apply(to_string, axis=1))
emb_df['clazz'].value_counts()

In [None]:
# Drop the rows labelled "neutral"; only the named classes remain.
sub_emb_df = emb_df.loc[emb_df['clazz'].ne('neutral')]

In [None]:
# Stack the per-row embedding vectors into one 2-D array (one row per
# sentence). This rebinds `embeddings`, which previously held the list of
# tuples.
embeddings = np.vstack(sub_emb_df['embedding'].tolist())
embeddings.shape

In [None]:
# Project the stacked embeddings with the project's Sampler.umap helper.
# NOTE(review): presumably this reduces each vector to 2 dimensions (the
# result is later indexed as [:, 0] and [:, 1]) — confirm against Sampler.
from causation.sampler import Sampler

compressed_2d = Sampler.umap(embeddings)
compressed_2d.shape

In [None]:
import plotly.graph_objects as go
from sklearn.preprocessing import LabelEncoder

# Encode the class names as integers so they can drive the marker colours.
clazz_labels = LabelEncoder().fit_transform(sub_emb_df['clazz'])

# One marker per sentence; the sentence text is attached as the point's text.
scatter_clazz = go.Scatter(
    x=compressed_2d[:, 0],
    y=compressed_2d[:, 1],
    mode='markers',
    marker={
        'size': 8,
        'color': clazz_labels,  # one colour value per encoded class label
        'colorscale': 'Viridis',
        'opacity': 0.8,
    },
    text=sub_emb_df['query'].tolist(),
    name='class',
)
fig = go.Figure(data=[scatter_clazz])
fig.update_layout(autosize=False, width=800, height=800)
fig.show()