# Lab 1 UMAP with Summary Clustering

Run this lab to cluster on embeddings and include the cluster summaries in the UMAP hover text.
Use the number of clusters you determined in lab 11.

In [None]:
# Notebook-wide settings. Entries marked "<-----" must be edited before running.
params = dict(
    file_id='your_file_id_from_lab08',  # <----- replace with your actual file ID from lab 08
    n_components=2,  # UMAP setting
    n_neighbors=3,  # UMAP setting
    min_dist=0.001,  # UMAP setting
    cluster_count=3,  # <----- KMeans setting: enter your value from lab 11
    model='gpt-3.5-turbo',  # <----- enter the OpenAI model used to summarize clusters
)

from dotenv import dotenv_values
import io
import pandas as pd
import openai
from dotenv import dotenv_values
from openai import OpenAI
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import logging
from soxm.Paths import Paths
import umap
import plotly.express as px
from sklearn.cluster import KMeans
import plotly.graph_objects as go

# Print the version of the installed openai package.
print(openai.__version__)

In [None]:

# Logging: create this notebook's logger and enable INFO-level output.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Settings are loaded with dotenv_values(); the OpenAI key is mandatory.
config = dotenv_values()
openai_api_key = config.get('OPENAI_API_KEY')
if openai_api_key:
    # Client shared by the chat-completion and embedding helpers.
    client = OpenAI(api_key=openai_api_key)
else:
    raise ValueError("OpenAI API key not found in the environment variables.")

def get_openai_response(input_text: str, model: str) -> str:
    """Send ``input_text`` as a single user message and return the reply text.

    Args:
        input_text: Prompt placed in the one user message of the request.
        model: Name of the chat model, passed through to the API unchanged.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the response carries no text.
    """
    chat_completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": input_text}],
    )
    content = chat_completion.choices[0].message.content
    # The message content may be None; coerce it so callers always get a
    # string instead of an AttributeError from calling .strip() on None.
    return (content or "").strip()

def get_embeddings(text, *, model="text-embedding-ada-002"):
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: Value forwarded as the ``input`` of the embeddings request.
        model: Embedding model name. Defaults to the model this notebook
            has always used, so existing calls behave exactly as before.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    response = client.embeddings.create(input=text, model=model)
    return response.data[0].embedding

# Set up Google Drive API access with a service-account key file.
# NOTE(review): 'credentials.json' is passed to Paths.project() and then joined
# again as a path component — confirm this resolves to the intended file.
credentials_path = Paths.project('credentials.json') / 'credentials.json'
if not credentials_path.exists():
    raise ValueError(f"No credentials found. {credentials_path} must exist.")

# OAuth scope requested for the service-account credentials.
scope = ["https://www.googleapis.com/auth/drive"]

logger.info('Authenticating with Google API using service account...')
credentials = Credentials.from_service_account_file(credentials_path, scopes=scope)
# Drive v3 client; used below to download the input file.
drive_service = build('drive', 'v3', credentials=credentials)

# Shared folder ID, read from the DATA_RAW_FOLDER_ID entry of the loaded config.
# Only its presence is validated here; it is not referenced again in the
# visible cells.
parent_folder_id = config.get('DATA_RAW_FOLDER_ID')
if not parent_folder_id:
    raise ValueError("Google Drive folder ID not found in the environment variables.")

# Drive file to read; the ID comes from the notebook parameters.
file_id = params['file_id']

# get_media fetches the stored file as-is (a raw CSV). For a native Google
# Sheet, use files().export_media(fileId=file_id, mimeType='text/csv') instead.
request = drive_service.files().get_media(fileId=file_id)

# Download chunk by chunk into an in-memory buffer, logging progress each time.
csv_content = io.BytesIO()
downloader = MediaIoBaseDownload(csv_content, request)
while True:
    status, done = downloader.next_chunk()
    logger.info(f"Download {int(status.progress() * 100)}% complete.")
    if done:
        break

# Rewind the buffer so pandas parses from the first byte.
csv_content.seek(0)
df = pd.read_csv(csv_content)

# Build one record per input row: the four source columns plus the embedding
# of the row's 'Response' text.
results = []
for _, row in df.iterrows():
    record = {column: row[column] for column in ("Model", "Topic", "Prompt", "Response")}
    record["Embedding"] = get_embeddings(record["Response"])
    results.append(record)

# Replace df with the enriched records.
df = pd.DataFrame(results)

# Show the first rows as a quick check.
print(df.head())

## UMAP Projection

In [None]:
# Project the embedding vectors into params['n_components'] dimensions with UMAP.
embedding_list = df['Embedding'].tolist()  # one embedding per row, as a list
umap_model = umap.UMAP(
    n_components=params['n_components'],
    n_neighbors=params['n_neighbors'],
    min_dist=params['min_dist'],
    # Optional seed: add a 'random_state' entry to params to get the same
    # layout (and therefore the same clusters) on every run. When the key is
    # absent this passes None, which is UMAP's default.
    random_state=params.get('random_state'),
)
embedding_2d = umap_model.fit_transform(embedding_list)

# Store the first two projected coordinates on the DataFrame for plotting.
df['UMAP1'] = embedding_2d[:, 0]
df['UMAP2'] = embedding_2d[:, 1]
embedding_2d

In [None]:
# Group the points into params['cluster_count'] clusters with KMeans.
# The model is fitted on the UMAP coordinates (embedding_2d), not on the
# original embedding vectors.
kmeans = KMeans(n_clusters=params['cluster_count'], random_state=42)
kmeans.fit(embedding_2d)
df['cluster'] = kmeans.labels_

# Show which cluster labels were assigned, then preview the rows.
print(df['cluster'].unique())
df.head(8)

## Add a summary for each cluster by asking OpenAI to summarize its responses

In [None]:
# For every cluster label, join the member responses into one prompt, ask the
# chat model for a short topic, and write it to 'Summary' on the member rows.
print(f"Summarizing {params['cluster_count']} clusters")
for n in range(params['cluster_count']):
    print(f'-------- cluster # {n}')
    members = df['cluster'] == n  # boolean mask, reused for read and write
    responses = ' '.join(df.loc[members, 'Response'])
    prompt = responses + " Please summarize the above text into a main topic in 7 words or less"
    print(prompt)
    topic = get_openai_response(prompt, params['model'])
    print(f'--- summary---\n{topic}')
    df.loc[members, 'Summary'] = topic
df.head(8)

## Visualize the UMAP

In [None]:
# Hover label per point: the row's topic and its cluster summary.
df['hover_text'] = 'Topic: ' + df['Topic'] + '<br>' + 'Summary: ' + df['Summary']

# Scatter plot of the two UMAP coordinates, coloured by Topic.
scatter_options = {
    'x': 'UMAP1',
    'y': 'UMAP2',
    'color': 'Topic',
    'hover_name': 'hover_text',
    'title': 'UMAP Visualization of Stories',
}
fig = px.scatter(df, **scatter_options)

fig.show()