### Generate Embeddings

In [1]:
!pip install pandas openai python-dotenv

Collecting numpy<2,>=1.22.4 (from pandas)
  Using cached numpy-1.26.2-cp39-cp39-macosx_10_9_x86_64.whl.metadata (61 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 kB)
Collecting anyio<4,>=3.5.0 (from openai)
  Using cached anyio-3.7.1-py3-none-any.whl.metadata (4.7 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.8.0-py3-none-any.whl (20 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.25.2-py3-none-any.whl.metadata (6.9 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Using cached pydantic-2.5.2-py3-none-any.whl.metadata (65 kB)
Collecting tqdm>4 (from openai)
  Using cached tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
Collecting idna>=2.8 (from anyio<4,>=3.5.0->openai)
  Using cached idna-3.6-py3-none-any.whl.metadata (9.9 kB)
Collecting sniffio>=1.1 (from anyio<4,>=3.5.0->openai)
  

In [14]:
import pandas as pd

# Read the review dataset from the local data folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer="data/tr_reviews.csv")

In [3]:
# Percentage of rows carrying each Sentiment label.
df["Sentiment"].value_counts().mul(100).div(len(df))

Sentiment
positive    75.717213
negative    24.282787
Name: count, dtype: float64

In [6]:
from openai import AzureOpenAI
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Both values are None when the corresponding variable is not set.
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
RESOURCE_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")

# Shared client used by the embedding calls further down the notebook.
client = AzureOpenAI(
    azure_endpoint=RESOURCE_ENDPOINT,
    api_key=API_KEY,
    api_version="2023-05-15",
)

In [7]:
# Notebook-wide settings.
# Only `embedding_model` is referenced by the cells visible in this notebook;
# the remaining values are kept as-is (presumably for token counting and
# sub-sampling elsewhere -- TODO confirm or remove).
embedding_model = "text-embedding-ada-002"  # model name passed to the embeddings API
embedding_encoding = "cl100k_base"          # presumably a tokenizer encoding name
max_tokens = 8_000                          # presumably a per-text token cap
top_n = 1_000                               # presumably a row limit

In [10]:
import time

def generate_embeddings(text, model, total=None):
    """Return the embedding for ``text`` and print a progress line every 100 texts.

    Progress state (``counter`` and ``start_time``) is stored as attributes on
    this function object, so it persists across calls until those attributes
    are removed or the function is defined again.

    Args:
        text: The text to embed; it is sent to the API as a one-item list.
        model: Value passed as ``model`` to ``client.embeddings.create``
            (on Azure this is presumably the deployment name -- confirm).
        total: Optional overall number of texts, used only for the
            remaining-time estimate. When omitted, ``len(df["Review"])`` of the
            notebook-level ``df`` is used, as before.

    Returns:
        The ``embedding`` attribute of the first item in the API response.

    Raises:
        Whatever ``client.embeddings.create`` raises; a failed request is not
        counted as a generated embedding.
    """
    if not hasattr(generate_embeddings, "counter"):
        generate_embeddings.counter = 0
        generate_embeddings.start_time = time.time()

    # Issue the request before touching the counter so that a call which
    # raises is not reported as a successfully embedded text.
    embedding = client.embeddings.create(input=[text], model=model).data[0].embedding

    generate_embeddings.counter += 1
    done = generate_embeddings.counter

    if done % 100 == 0:
        expected_total = len(df["Review"]) if total is None else total
        elapsed_time = time.time() - generate_embeddings.start_time
        # The counter can outlive a single run (it lives on the function), so
        # clamp at zero instead of reporting a negative number of texts/seconds.
        texts_left = max(expected_total - done, 0)
        estimated_time_left = (elapsed_time / done) * texts_left
        print(f"Generated embeddings for {done} texts. "
              f"Approximately {estimated_time_left:.2f} seconds remaining for {texts_left} texts.")

    return embedding

In [11]:
# generate_embeddings keeps its progress counter and start time as attributes on
# the function object. Clear them so that re-running this cell starts a fresh
# count instead of continuing from the previous run.
for _attr in ("counter", "start_time"):
    vars(generate_embeddings).pop(_attr, None)

# Time this run locally rather than relying on the function's start_time, which
# is never set when there are no reviews to embed.
run_started = time.time()
df['ada_v2'] = df["Review"].apply(lambda x: generate_embeddings(x, model=embedding_model))
total_time = time.time() - run_started

texts_done = getattr(generate_embeddings, "counter", 0)
print(f"Completed generating embeddings for {texts_done} texts in {total_time:.2f} seconds.")

Generated embeddings for 100 texts. Approximately 156.66 seconds remaining for 876 texts.
Generated embeddings for 200 texts. Approximately 137.60 seconds remaining for 776 texts.
Generated embeddings for 300 texts. Approximately 117.97 seconds remaining for 676 texts.
Generated embeddings for 400 texts. Approximately 100.76 seconds remaining for 576 texts.
Generated embeddings for 500 texts. Approximately 81.20 seconds remaining for 476 texts.
Generated embeddings for 600 texts. Approximately 64.25 seconds remaining for 376 texts.
Generated embeddings for 700 texts. Approximately 46.94 seconds remaining for 276 texts.
Generated embeddings for 800 texts. Approximately 29.92 seconds remaining for 176 texts.
Generated embeddings for 900 texts. Approximately 12.79 seconds remaining for 76 texts.
Completed generating embeddings for 976 texts in 170.77 seconds.


In [13]:
# Write the frame, including the new 'ada_v2' column, to CSV without the index.
# NOTE(review): CSV stores each embedding as text; it presumably needs parsing
# back into a list when the file is reloaded -- confirm in the consumer.
df.to_csv(path_or_buf="data/tr_reviews_with_embeddings.csv", index=False)