***Start of Main Code***

In [108]:
# import required libraries

import os
import supabase
from nomic import atlas
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
import numpy as np
import time
import pandas as pd

In [109]:
# loading environment variables
# The path is relative to the kernel's working directory. load_dotenv's return
# value is the last expression, so it is shown as this cell's output.

env_path = "../.env"
load_dotenv(dotenv_path=env_path)

True

In [110]:
# initialize supabase client
# Credentials come from the environment (populated by the .env load above).

url = os.environ.get("SUPABASE_URL")
key = os.environ.get("SUPABASE_API_KEY")

# Fail fast with a readable message instead of handing None to create_client,
# which would otherwise surface as an obscure error further down.
_missing_supabase_vars = [
    name
    for name, value in (("SUPABASE_URL", url), ("SUPABASE_API_KEY", key))
    if not value
]
if _missing_supabase_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_supabase_vars)
    )

supabase_client = supabase.create_client(url, key)

In [111]:
# querying conversation data from supabase
# Selects every column of "llm-convo-monitor", capped at 2000 rows.
# NOTE(review): the hard-coded .limit(2000) means rows beyond that are not
# fetched — presumably fine at the current table size; confirm or paginate.
# len(data) is the last expression so the fetched row count is displayed.

response = supabase_client.table("llm-convo-monitor").select("*").limit(2000).execute()
data = response.data
len(data)

1416

In [112]:
# convert into dataframe
# One row per record returned by the query; df.head() previews the first rows.

df = pd.DataFrame(data)
df.head()

Unnamed: 0,id,created_at,convo,convo_id,course_name,user_email
0,974,2023-06-27T20:27:40.04635+00:00,"{'id': '6de7f780-2092-440a-92de-5b53fe830178',...",6de7f780-2092-440a-92de-5b53fe830178,gpt4,
1,1136,2023-06-30T19:42:14.162925+00:00,"{'id': '8cf2fbfd-4372-41d7-98c9-529b80355ea8',...",8cf2fbfd-4372-41d7-98c9-529b80355ea8,gpt4,
2,15,2023-06-09T21:22:22.887494+00:00,"{'id': '101c57bf-bdbc-4fb7-ba66-88f2da33a1c1',...",101c57bf-bdbc-4fb7-ba66-88f2da33a1c1,gpt4,
3,232,2023-06-14T22:15:19.908152+00:00,"{'id': '8c26597c-695b-41c5-abdd-6d2db138d3dd',...",8c26597c-695b-41c5-abdd-6d2db138d3dd,gpt4,
4,55,2023-06-10T22:34:35.495748+00:00,"{'id': '4e330d66-c6e4-4a5e-a613-2a049fa00bbe',...",4e330d66-c6e4-4a5e-a613-2a049fa00bbe,gpt4,


In [113]:
# list of course names
# Distinct values of the course_name column. The array includes None for rows
# without a course name, so the displayed count includes that entry.

course_names = df['course_name'].unique()
len(course_names)

123

In [114]:
# display the distinct course names collected above
course_names

array(['gpt4', 'badm_550_ashley', None, 'ece120', 'test-video-ingest',
       'badm-567-v3', 'badm-567', 'new-weather', 'gies-online-mba-v2',
       'frontend', 'test-video-ingest-28', 'ECE220FA23', 'ECE408FA23',
       'pdeploy999', 'badm-350-summer', 'previewtesting1', 'localtest2',
       'your-favorite-url', 'mantine', 'ece408', 'test-video-ingest-2',
       'Snowmass', 'badm_567_v2', 'erpnext', 'mip',
       'farmdoc_test_kastan-v1', 'personalstatement', 'hrc', 'csv',
       'star_nox', 'badm_567', 'SPaRCEd', 'NPRE247', 'localdemo8',
       'badm_567_thumbnails', 'your-awesome-course', 'chatbot', 'erp',
       'extreme', 'rohan_atree', 'zotero-extreme', 'pract',
       'gies-online-mba2', 'gies-online-mba', 'ece120FL22',
       'careerassistant', 'weather', 'lillian-wang-blog', 'local-test5',
       'demo-for-vyriad', 'rtest', 'previewdeploy', 'r2test',
       'Law794-TransactionalDraftingAlam', 'personal-statement',
       'rohan_excel', 'langchain-python', 'langchain', 'ncsa-liv

In [115]:
# initialize langchain OpenAI embeddings model
# Constructed with default arguments.
# NOTE(review): presumably picks up the OpenAI API key from the environment
# loaded earlier — confirm.

embeddings_model = OpenAIEmbeddings()

In [None]:
# main cell to extract course wise data and create individual maps

MIN_QUERIES_FOR_MAP = 20  # Nomic requires at least 20 queries to create a map


def collect_user_queries(convos, course, next_id):
    """Collect the non-empty user messages from one course's conversations.

    Args:
        convos: iterable of conversation dicts, each expected to hold a
            'messages' list of dicts with 'role' and 'content' keys.
            Entries that are not dicts, or that lack 'messages', are skipped.
        course: course name written into every metadata row.
        next_id: id given to the first kept query; ids then increase by one.

    Returns:
        A tuple (queries, rows, next_id): the kept query strings, one metadata
        dict per query in the same order ('course_name', 'query', 'id'), and
        the first id that is still unused.
    """
    queries = []
    rows = []
    for convo in convos:
        if not isinstance(convo, dict):
            continue
        for message in convo.get('messages') or []:
            if message.get('role') != 'user':
                continue
            content = message.get('content')
            # Only non-empty strings can be embedded; None or other non-string
            # content would otherwise break the embedding call.
            if not isinstance(content, str) or content == '':
                continue
            queries.append(content)
            rows.append({'course_name': course, 'query': content, 'id': next_id})
            next_id += 1
    return queries, rows, next_id


# `i` is the running id shared across all courses so ids stay globally unique.
i = 1
skipped_courses = []
for course in course_names:
    # rows without a course name cannot be grouped into a map
    if pd.isna(course):
        continue

    # get all queries for a course and create metadata
    course_df = df[df['course_name'] == course]['convo']
    user_queries, metadata, i = collect_user_queries(course_df, course, i)
    print("course name: ", course)
    print(len(user_queries))
    if len(user_queries) < MIN_QUERIES_FOR_MAP:
        skipped_courses.append(course)
        continue

    # point where one course is done
    # convert queries to embeddings (one vector per query, same order as metadata)
    metadata = pd.DataFrame(metadata)
    embeddings = np.array(embeddings_model.embed_documents(user_queries))
    print(embeddings.shape)

    # create an Atlas project
    # NOTE(review): assumes the nomic client is already authenticated — confirm.
    project_name = "User Query Text Viz for " + course
    index_name = course + "_index"
    project = atlas.map_embeddings(embeddings=embeddings,
                                   data=metadata,
                                   id_field='id',
                                   build_topic_model=True,
                                   topic_label_field='query',
                                   name=project_name,
                                   colorable_fields=['query'])
    print(project.maps)

    # NOTE(review): the map_embeddings log output further down in this notebook
    # reports a create_index call of its own, so this presumably builds a
    # second index named `index_name` — confirm both are wanted.
    project.create_index(index_name, build_topic_model=True)



In [107]:
# skipped courses with < 20 data points/queries
# Prints how many courses the main loop skipped, then their names.
# NOTE(review): this cell's execution count (107) is lower than the cells
# above it, so the saved output presumably comes from an earlier run — re-run
# after the main cell to refresh it.

print(len(skipped_courses))
print(skipped_courses)


101
['test-video-ingest', 'badm-567', 'test-video-ingest-28', 'pdeploy999', 'badm-350-summer', 'previewtesting1', 'localtest2', 'your-favorite-url', 'mantine', 'test-video-ingest-2', 'Snowmass', 'badm_567_v2', 'erpnext', 'mip', 'personalstatement', 'hrc', 'csv', 'star_nox', 'badm_567', 'SPaRCEd', 'localdemo8', 'badm_567_thumbnails', 'chatbot', 'erp', 'extreme', 'rohan_atree', 'zotero-extreme', 'gies-online-mba2', 'gies-online-mba', 'careerassistant', 'weather', 'lillian-wang-blog', 'local-test5', 'demo-for-vyriad', 'rtest', 'previewdeploy', 'r2test', 'personal-statement', 'rohan_excel', 'langchain-python', 'langchain', 'ncsa-live-demo', 'rohan_atree_individual', 'HealthyLivingGuide', 'rohan', 'babreu', 'test-video-ingest-17', 'summary', 'test-video-ingest-3', 'test-video-ingest-27', 'lillian-wang-blog-2', 'python-magic', 'ansible2', 'ece408fa23', 'farmdoc_test_josh_v2', 'local-test3', 'automata', 'SpaceFlorida-GT', 'GBSI-GT', 'newnew_ncsa', 'canvas', 'gbsi-gt', 'meditation-tutorial', '

***End of Main Code***

The cells below are rough, superseded exploratory code kept for reference only. Do not run them.

In [67]:
# Count conversations and user queries for the 'gpt4' course.
# NOTE(review): user_queries is not computed in this cell; it is whatever an
# earlier cell left in the kernel, so the second count only matches 'gpt4' if
# that cell last processed 'gpt4' — confirm before trusting the number.
course_df = df[df['course_name'] == 'gpt4']['convo']
print("total conversations in the course: ", len(course_df))
print("total user queries in the course: ", len(user_queries))


total conversations in the course:  593
total user queries in the course:  1723


In [73]:
# Rebinds `metadata` to a DataFrame built from its current value (set by an earlier cell).
metadata = pd.DataFrame(metadata)

In [71]:
# Re-create the embeddings model and embed `user_queries`
# (a list left in the kernel by an earlier cell).
embeddings_model = OpenAIEmbeddings()
embeddings = embeddings_model.embed_documents(user_queries)

In [72]:
# Convert the embeddings to a NumPy array and display its shape.
embeddings = np.array(embeddings)
embeddings.shape

(1723, 1536)

In [74]:
# upload
# Log in to nomic with the NOMIC_API_KEY environment variable before uploading.
# os.getenv returns None if the variable is not set.
import nomic
from nomic import atlas
nomic.login(os.getenv('NOMIC_API_KEY'))

In [None]:
# Upload the embeddings and their metadata to Atlas as 'User Query Viz for gpt4',
# using the 'id' column as the id field and 'query' for topic labels and colouring.
project = atlas.map_embeddings(embeddings=np.array(embeddings),
                                data=metadata,
                                id_field='id',
                                build_topic_model=True,
                                topic_label_field='query',
                                name='User Query Viz for gpt4',
                                colorable_fields=['query'])
print(project.maps)

# Create an index named 'GPT-4' on the project, with a topic model.
project.create_index('GPT-4', build_topic_model=True)


In [5]:
# Build one message list per record and tag every message with the record's
# course name. The message dicts inside `data` are modified in place.
convos = []

for record in data:
    thread = record['convo']['messages']
    for message in thread:
        message['course_name'] = record['course_name']
    convos.append(thread)

In [6]:
# Content of the first three messages of the first conversation, for a quick embedding test.
temp_data = [convos[0][position]['content'] for position in range(3)]
temp_data

["Help me update this react so that the get_user_permission() function is ONLY called after clerk_user.is_loaded is true. Probably use hooks for this. \n\nimport { useUser } from '@clerk/nextjs'\nimport { NextPage } from 'next'\nimport { useRouter } from 'next/router'\nimport { useEffect, useState } from 'react'\nimport { CanViewOnlyCourse } from '~/components/UIUC-Components/CanViewOnlyCourse'\nimport { CannotEditCourse } from '~/components/UIUC-Components/CannotEditCourse'\nimport { CannotViewCourse } from '~/components/UIUC-Components/CannotViewCourse'\nimport { LoadingSpinner } from '~/components/UIUC-Components/LoadingSpinner'\nimport { MainPageBackground } from '~/components/UIUC-Components/MainPageBackground'\nimport { get_user_permission } from '~/components/UIUC-Components/runAuthCheck'\nimport { CourseMetadata } from '~/types/courseMetadata'\n\nconst NotAuthorizedPage: NextPage = (props) => {\n  const router = useRouter()\n  const clerk_user = useUser()\n\n\n  const getCurren

In [7]:
# Embed the three sample strings in temp_data.
embeddings_model = OpenAIEmbeddings()
embeddings = embeddings_model.embed_documents(temp_data)

In [8]:
# Print the shape of the embeddings as an array.
print(np.array(embeddings).shape)


(3, 1536)


In [9]:
# structuring data and metadata
response_data = []
metadata = []

# one response is one full convo - contains dicts of user and bot responses
# Only the first message of each conversation is used. A metadata row is added
# only when the query itself is kept, so response_data[k] and metadata[k]
# always describe the same query (the embeddings are later paired with
# metadata by position).
for convo in convos:
    if not convo:
        # conversation with no messages: nothing to embed
        continue
    first_message = convo[0]
    if first_message['role'] == 'user' and first_message['content'] != "":
        response_data.append(first_message['content'])
        metadata.append({'course_name': first_message['course_name'],
                         'query': first_message['content']})

assert len(response_data) == len(metadata), "queries and metadata rows must stay aligned"

print(response_data[200])
print(metadata[200])

make some changes here to put Experience in performing predictive analytics in a large scale enterprise :
 Paisabazaar Marketing and Consulting Private Limited, Data Analyst Intern                                          April 2022 – June 2022
•	Built and sustained large-scale databases, utilizing advanced statistical techniques to scrutinize customer and partner data, leading to a 20% surge in customer retention and a 15% uplift in partner satisfaction.
•	Employed python scripts for automating operations to extract insights, performing analysis and making the data compatible with SQL server requirements.
•	Handled Big Data Sets using SQL and generated periodical audit reports for data management using SQL triggers (Toad for Oracle) & conducted Statistical Analysis to identify trends and patterns in data.
•	Crafted 35+ interactive filters, parameters, & calculations for dashboards and worksheets in Tableau.


{'course_name': 'your-awesome-course', 'query': 'make some changes here to p

In [10]:
# Embed every string collected in response_data.
data_embeddings = embeddings_model.embed_documents(response_data)

In [11]:
# Print the shape of data_embeddings as an array.
print(np.array(data_embeddings).shape)


(1361, 1536)


In [12]:
# Print the current Unix timestamp from time.time().
# `time` is also imported in the notebook's first cell; current_time is not
# used by any other cell visible in this notebook.
import time

current_time = time.time()
print(current_time)

1692760073.1680226


In [13]:
# Give every metadata row a sequential 1-based 'id' and store the result as a
# DataFrame. Using assign() on a DataFrame (instead of mutating the row dicts)
# keeps this cell re-runnable after `metadata` has already been converted.
metadata = pd.DataFrame(metadata)

# The upload pairs embeddings with metadata rows by position, so a length
# mismatch would silently attach the wrong query to a vector.
if len(metadata) != len(data_embeddings):
    raise ValueError(
        f"metadata has {len(metadata)} rows but there are "
        f"{len(data_embeddings)} embeddings; they must match one-to-one"
    )

metadata = metadata.assign(id=np.arange(1, len(metadata) + 1))

In [14]:
# Preview the first rows of the metadata DataFrame.
metadata.head()

Unnamed: 0,course_name,query,id
0,gpt4,Help me update this react so that the get_user...,1
1,gpt4,"Update this code to have all code in methods, ...",2
2,gpt4,1. hello,3
3,gpt4,hi,4
4,gpt4,asdgf,5


In [16]:
# Upload all query embeddings with their metadata to Atlas as
# 'User Query Text Viz 3', using 'id' as the id field and allowing the map to
# be coloured by course_name; then print the project's maps.
project = atlas.map_embeddings(embeddings=np.array(data_embeddings),
                                data=metadata,
                                id_field='id',
                                name='User Query Text Viz 3',
                                colorable_fields=['course_name'])
print(project.maps)

2023-08-22 22:08:20.690 | INFO     | nomic.project:_create_project:779 - Creating project `User Query Text Viz 3` in organization `dabholkar.asmita`
2023-08-22 22:08:21.592 | INFO     | nomic.atlas:map_embeddings:107 - Uploading embeddings to Atlas.
2it [00:03,  1.83s/it]                       
2023-08-22 22:08:26.039 | INFO     | nomic.project:_add_data:1401 - Upload succeeded.
2023-08-22 22:08:26.127 | INFO     | nomic.atlas:map_embeddings:126 - Embedding upload succeeded.
2023-08-22 22:08:27.503 | INFO     | nomic.project:create_index:1111 - Created map `User Query Text Viz 3` in project `User Query Text Viz 3`: https://atlas.nomic.ai/map/efb685d0-707b-431a-83b4-f6a0f5998675/81aa5be8-f8c8-4d73-bbc8-def68765559e
2023-08-22 22:08:27.506 | INFO     | nomic.atlas:map_embeddings:139 - User Query Text Viz 3: https://atlas.nomic.ai/map/efb685d0-707b-431a-83b4-f6a0f5998675/81aa5be8-f8c8-4d73-bbc8-def68765559e


[User Query Text Viz 3: https://atlas.nomic.ai/map/efb685d0-707b-431a-83b4-f6a0f5998675/81aa5be8-f8c8-4d73-bbc8-def68765559e]
