# lab09_umap_embeddings.py

- First, copy this lab to your notebooks folder.
- Then modify the parameters with your user name and other values to your liking.
- **Retrieve File ID**: The file ID is located in the URL of your file on Google Drive. Look for the part between `/d/` and `/view`.
- **Replace `your_file_id_here`**: Insert the actual file ID into the script.


In [2]:
# Notebook-level settings: the Google Drive file to read plus the UMAP
# hyperparameters used by the visualization cell.
params = dict(
    file_id='1qX6rlkSRz6dbx73sME94-7YIB47Xi3rB',  # Drive file ID; swap in your own
    n_components=2,   # passed to umap.UMAP(n_components=...)
    n_neighbors=3,    # passed to umap.UMAP(n_neighbors=...)
    min_dist=0.001,   # passed to umap.UMAP(min_dist=...)
)

import io
import pandas as pd
import openai
from dotenv import dotenv_values
from openai import OpenAI
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import logging
from soxm.Paths import Paths
import umap
import plotly.express as px

# Print the installed openai package version so the run records which SDK
# release was used.
print(openai.__version__)

  from .autonotebook import tqdm as notebook_tqdm


1.37.1


In [3]:
# Logging: emit INFO-and-above records and create this module's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Settings are read through dotenv_values(); `config` is a mapping of the
# values it found.
# NOTE(review): dotenv_values() is understood to read the .env file rather than
# the process environment, while the error below says "environment variables"
# - confirm the wording matches where the key is expected to live.
config = dotenv_values()
openai_api_key = config.get('OPENAI_API_KEY')

# Build the OpenAI client only when a non-empty key was found; otherwise stop
# before any API call is attempted.
if openai_api_key:
    client = OpenAI(api_key=openai_api_key)
else:
    raise ValueError("OpenAI API key not found in the environment variables.")

# Function to get embeddings from OpenAI API
def get_embeddings(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: Value sent as ``input`` to ``client.embeddings.create``.
        model: Name of the embedding model to use. Defaults to
            ``"text-embedding-ada-002"``, so calls that pass only ``text``
            use that model.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list. Only the first item is returned, so this helper is
        meant for a single input per call.
    """
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

# Set up Google Drive API
# This section authenticates with a service-account key file, downloads the
# Drive file identified by params['file_id'] into memory, and parses it as CSV.

# Locate the service-account key file and stop early if it does not exist.
# NOTE(review): 'credentials.json' appears both as the Paths.project() argument
# and as the joined path component - confirm this is the intended location.
credentials_path = Paths.project('credentials.json') / 'credentials.json'
if not credentials_path.exists():
    raise ValueError(f"No credentials found. {credentials_path} must exist.")

# Scope requested for the service-account credentials.
# NOTE(review): this section only downloads a file; a narrower read-only scope
# may be sufficient - confirm before changing.
scope = ["https://www.googleapis.com/auth/drive"]

logger.info('Authenticating with Google API using service account...')
credentials = Credentials.from_service_account_file(credentials_path, scopes=scope)
drive_service = build('drive', 'v3', credentials=credentials)

# Shared folder ID, read from the DATA_RAW_FOLDER_ID entry of `config`.
# NOTE(review): the value is required here but is not used by the download
# below in the visible part of this file - confirm it is needed.
parent_folder_id = config.get('DATA_RAW_FOLDER_ID')
if not parent_folder_id:
    raise ValueError("Google Drive folder ID not found in the environment variables.")

# ID of the file to read, taken from the notebook parameters
file_id = params['file_id']  

# Request the file content from Google Drive.
# Alternative for a native Google Sheet (export it as CSV instead of a raw download):
# request = drive_service.files().export_media(fileId=file_id, mimeType='text/csv') # use export_media if reading a google sheet as csv
request = drive_service.files().get_media(fileId=file_id) # use get_media to get a raw csv
# Download into an in-memory buffer, one chunk per loop iteration, logging
# progress after each chunk until the downloader reports completion.
csv_content = io.BytesIO()
downloader = MediaIoBaseDownload(csv_content, request)
done = False
while not done:
    status, done = downloader.next_chunk()
    logger.info(f"Download {int(status.progress() * 100)}% complete.")

csv_content.seek(0)  # Move the cursor to the beginning of the in-memory file

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_content)

# Build one result record per input row: the row's Model/Topic/Prompt/Response
# values plus the embedding of its 'Response' text. `df` is then replaced by a
# DataFrame holding exactly these columns.
result_columns = ['Model', 'Topic', 'Prompt', 'Response', 'Embedding']
results = []

# Iterate through each row in the DataFrame and generate embeddings
for index, row in df.iterrows():
    story = row['Response']
    # A blank CSV cell is read by pandas as NaN, and the embeddings endpoint
    # does not accept empty input. Skip such rows with a warning so one bad
    # cell does not abort the run after earlier rows were already embedded.
    if pd.isna(story) or not str(story).strip():
        logger.warning("Skipping row %s: 'Response' is missing or blank.", index)
        continue
    results.append({
        "Model": row['Model'],
        "Topic": row['Topic'],
        "Prompt": row['Prompt'],
        "Response": story,
        "Embedding": get_embeddings(story),
    })

# Convert the results to a DataFrame. Passing the column list keeps the
# expected columns present even when no row produced a result.
df = pd.DataFrame(results, columns=result_columns)

# Display the first rows of the results DataFrame
print(df.head())

INFO:__main__:Authenticating with Google API using service account...
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0
INFO:__main__:Download 100% complete.
INFO:openai._base_client:Retrying request to /embeddings in 0.756127 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.ope

           Model Topic                       Prompt  \
0  gpt-3.5-turbo  dogs  Tell me a story about dogs.   
1  gpt-3.5-turbo  dogs  Tell me a story about dogs.   
2  gpt-3.5-turbo  dogs  Tell me a story about dogs.   
3  gpt-3.5-turbo  dogs  Tell me a story about dogs.   
4  gpt-3.5-turbo  cats  Tell me a story about cats.   

                                            Response  \
0  Once upon a time, in a small village nestled i...   
1  Once upon a time, in a small village nestled i...   
2  Once upon a time in a small village, there liv...   
3  Once upon a time, in a small town nestled in t...   
4  Once upon a time, in a small village nestled a...   

                                           Embedding  
0  [0.014890466816723347, -0.010746299289166927, ...  
1  [0.002060612430796027, -0.002314301673322916, ...  
2  [0.009732970967888832, -0.005511600524187088, ...  
3  [0.010636508464813232, -0.013543259352445602, ...  
4  [0.004113786853849888, -0.025664234533905983, ...  


## UMAP Visualization

In [6]:
# Reduce the embeddings with UMAP and draw a scatter plot of the first two
# components, coloured by topic.
embedding_list = df['Embedding'].tolist()  # one embedding (list of floats) per row
umap_model = umap.UMAP(
    n_components=params['n_components'],
    n_neighbors=params['n_neighbors'],
    min_dist=params['min_dist'],
    # Optional seed: add 'random_state' to params for a repeatable layout.
    # When the key is absent this passes None, which is UMAP's own default.
    random_state=params.get('random_state'),
)
embedding_2d = umap_model.fit_transform(embedding_list)

# Add the first two UMAP components to the DataFrame (the plot is 2D, so
# params['n_components'] must be at least 2).
df['UMAP1'] = embedding_2d[:, 0]
df['UMAP2'] = embedding_2d[:, 1]

# Prepare hover text: the topic plus a short preview of the response. The
# preview is truncated so the tooltip stays readable for long stories.
max_preview_chars = 200
response_text = df['Response'].astype(str)
response_preview = response_text.str.slice(0, max_preview_chars)
is_truncated = response_text.str.len() > max_preview_chars
response_preview = response_preview.where(~is_truncated, response_preview + '...')
df['hover_text'] = (
    'Topic: ' + df['Topic'].astype(str) + '<br>' + 'Response: ' + response_preview
)

# Plot using Plotly
fig = px.scatter(
    df,
    x='UMAP1',
    y='UMAP2',
    color='Topic',  # Use color based on Topic
    hover_name='hover_text',  # Use custom hover text
    title='UMAP Visualization of Stories',
    width=1500,
    height=350
)

fig.show()