In [12]:

import io  # Importing the io module to enable in-memory file operations
import json
import pandas as pd
from dotenv import dotenv_values
import openai
from openai import OpenAI
from googleapiclient.http import MediaIoBaseUpload  # Importing the correct module for in-memory uploads
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import logging
from soxm.Paths import Paths
from google.auth.transport.requests import Request
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import umap
import plotly.express as px
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import re
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans

In [13]:
params = {
    'file_id':'17dDeqfiU6PjQyhooO7Mp1cqAqCIv7Euc', # Replace with your actual file ID
    'n_components': 2, # UMAP setting
    'n_neighbors': 3, # UMAP setting
    'min_dist': 0.001, # UMAP setting
}


# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Ensure the OpenAI API key exists
config = dotenv_values()
openai_api_key = config.get('OPENAI_API_KEY')
if not openai_api_key:
    raise ValueError("OpenAI API key not found in the environment variables.")

# Initialize OpenAI client
client = OpenAI(api_key=openai_api_key)

# Function to get embeddings from OpenAI API
def get_embeddings(text):
    response = client.embeddings.create(input=text, model="text-embedding-ada-002")
    return response.data[0].embedding

# Set up Google Drive API
credentials_path = Paths.project('credentials.json') / 'credentials.json'
if not credentials_path.exists():
    raise ValueError(f"No credentials found. {credentials_path} must exist.")

scope = ["https://www.googleapis.com/auth/drive"]

logger.info('Authenticating with Google API using service account...')
credentials = Credentials.from_service_account_file(credentials_path, scopes=scope)
drive_service = build('drive', 'v3', credentials=credentials)

# Your shared folder ID (replace with your actual folder ID)
parent_folder_id = config.get('DATA_RAW_FOLDER_ID')
if not parent_folder_id:
    raise ValueError("Google Drive folder ID not found in the environment variables.")

# ID of the file to read (replace with your actual file ID)
file_id = params['file_id']  

# Export the Google Sheet as a CSV file content from Google Drive
# request = drive_service.files().export_media(fileId=file_id, mimeType='text/csv') # use export_media if reading a google sheet as csv
request = drive_service.files().get_media(fileId=file_id) # use get_media to get a raw csv
csv_content = io.BytesIO()
downloader = MediaIoBaseDownload(csv_content, request)
done = False
while not done:
    status, done = downloader.next_chunk()
    logger.info(f"Download {int(status.progress() * 100)}% complete.")

csv_content.seek(0)  # Move the cursor to the beginning of the in-memory file

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_content)

# Create a new DataFrame to store results
results = []

# Iterate through each row in the DataFrame and generate embeddings
for index, row in df.iterrows():
    story = row['GPT Response']
    embedding = get_embeddings(story)
    results.append({
        "Model": row['ChatGPT Version'],
        "Topic": row['Subreddit'],
        "Prompt": row['Prompt'],
        "Response": story,
        "Embedding": embedding
    })

# Convert the results to a DataFrame
df = pd.DataFrame(results)

# Display the results DataFrame
# print(df.head())

embedding_list = df['Embedding'].tolist()  # Directly use the 'Embedding' column as a list
umap_model = umap.UMAP(n_components=params['n_components'], n_neighbors=params['n_neighbors'], min_dist=params['min_dist'])
embedding_2d = umap_model.fit_transform(embedding_list)
print(embedding_2d)

INFO:__main__:Authenticating with Google API using service account...
INFO:googleapiclient.discovery_cache:file_cache is only supported with oauth2client<4.0.0


INFO:__main__:Download 100% complete.
INFO:openai._base_client:Retrying request to /embeddings in 0.882126 seconds
INFO:openai._base_client:Retrying request to /embeddings in 1.510157 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.o

[[ 4.41519356e+00  7.31759357e+00]
 [ 4.32053423e+00  7.37676907e+00]
 [-1.59104705e-01  1.42896013e+01]
 [-9.19601798e-01  1.51574841e+01]
 [ 3.95618892e+00  7.54119873e+00]
 [ 4.04228783e+00  7.53744173e+00]
 [ 6.47576952e+00  4.30484676e+00]
 [ 4.01154900e+00  7.55999088e+00]
 [ 4.22793436e+00  7.43513203e+00]
 [-8.15166175e-01  1.48718119e+01]
 [ 5.55643177e+00  4.93520784e+00]
 [ 4.00337696e+00  7.55487776e+00]
 [-1.66753411e-01  1.40924034e+01]
 [ 3.36185431e+00  7.09499550e+00]
 [-4.30094689e-01  1.41234264e+01]
 [ 3.84743285e+00  1.02405653e+01]
 [ 5.96601915e+00  4.98573351e+00]
 [ 4.37835312e+00  9.67168331e+00]
 [ 4.58379889e+00 -1.29895759e+00]
 [-1.17656529e-01  1.40088243e+01]
 [-9.94698331e-02  1.43538456e+01]
 [ 4.23971748e+00  1.17435312e+01]
 [ 3.95515203e+00  1.01453114e+01]
 [ 4.14183521e+00  1.18938847e+01]
 [ 4.22619820e+00  1.19755087e+01]
 [ 4.33390570e+00  9.66306210e+00]
 [ 4.23136711e+00  1.19882689e+01]
 [-3.97549212e-01  1.45649052e+01]
 [-1.18947536e-01  1

In [14]:
# Function to calculate inertia for different k values
def calculate_inertia(data, max_k):
    inertias = []
    for k in range(1, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(data)
        inertias.append(kmeans.inertia_)
    return inertias

# Calculate inertia for k values from 1 to 10
max_k = 10
inertias = calculate_inertia(embedding_2d, max_k)

# Visualize the Elbow Method
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=list(range(1, max_k + 1)),
    y=inertias,
    mode='lines+markers',
    marker=dict(color='blue'),
    name="Inertia (Within-cluster Sum of Squares)"
))

fig.update_layout(
    title="Elbow Method for Determining Optimal Number of Clusters",
    xaxis_title="Number of Clusters (k)",
    yaxis_title="Inertia",
    showlegend=False
)
fig.show()