In [1]:

import os, sys, re, argparse, json
import numpy as np
import pandas as pd

# Azure OpenAI (new SDK)
from openai import AzureOpenAI
# Chroma
import chromadb
from chromadb.api.models.Collection import Collection

In [2]:
# Model name handed to `client.embeddings.create(model=...)` by `embed_text`.
# NOTE(review): with Azure OpenAI this presumably has to match a deployment name — confirm.
embedding_model_name = "text-embedding-3-small"

In [3]:
# from langchain_openai import AzureOpenAIEmbeddings
# embeddings = AzureOpenAIEmbeddings(model=embedding_model_name,
#                                    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"])

In [4]:
def concat(title: str, desc: str) -> str:
    """Join a course title and its description into a single string.

    The two parts are separated by the literal marker `` :: ``.
    """
    return "{} :: {}".format(title, desc)

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    a, b : np.ndarray
        Vectors to compare; they are combined with ``np.dot``.

    Returns
    -------
    float
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has a
        zero norm (the ratio is undefined there and would otherwise evaluate
        to NaN with a runtime warning).
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero-length vector has no direction; report "no similarity" instead of NaN.
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

def quoted_titles(q: str) -> set:
    """Collect the phrases that a query wraps in double quotes.

    A phrase opens with a straight or curly double quote and runs to the
    nearest straight or curly closing double quote. Single quotes are not
    recognised. Phrases that are empty or whitespace-only are dropped; the
    rest are returned exactly as written, without trimming.
    """
    # Quoted phrases are meant to be treated as exact course titles to exclude.
    phrases = set()
    for phrase in re.findall(r"[\"“](.*?)[\"”]", q):
        if phrase.strip():
            phrases.add(phrase)
    return phrases

In [5]:
# Directory handed to the Chroma persistent client as its storage path.
vector_db_path = "VectorDB_Chroma"
# `exist_ok=True` already makes this a no-op when the directory exists, so no
# separate `os.path.isdir` check is needed. `os` is imported in the first cell.
os.makedirs(vector_db_path, exist_ok=True)


In [6]:
# Open the on-disk Chroma store and get (or create) the "courses" collection.
# NOTE(review): no distance metric is configured here, so Chroma's default
# applies — confirm it suits the cosine scoring done later in the notebook.
chroma_client = chromadb.PersistentClient(path=vector_db_path)
col = chroma_client.get_or_create_collection(name="courses")

In [7]:
def embed_text(texts):
    """Embed one or more texts with the configured embedding model.

    Parameters
    ----------
    texts : str or sequence of str
        Passed unchanged as ``input`` to ``client.embeddings.create``.

    Returns
    -------
    list
        The ``embedding`` of every item in the response's ``data``, in
        response order. An empty list or tuple yields ``[]`` without
        contacting the service.

    Notes
    -----
    Relies on the module-level ``client`` and ``embedding_model_name``; both
    must be defined before this function is first called.
    """
    # No inputs means no outputs: skip the request rather than send an empty batch.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []
    response = client.embeddings.create(model=embedding_model_name, input=texts)
    return [item.embedding for item in response.data]

In [11]:
# Azure OpenAI client used by `embed_text`.
# NOTE(review): constructed with no arguments, so the endpoint, API key and API
# version presumably come from environment variables — confirm they are set
# before running this cell.
client = AzureOpenAI()

In [None]:
# Populate the collection once: skipped whenever it already holds any records.
# NOTE(review): a run that fails part-way leaves a non-empty collection, so the
# remaining rows would not be indexed on the next run — confirm this is acceptable.
if col.count() == 0:
    df = pd.read_csv("assignment2dataset.csv")
    ids = df['course_id'].astype(str).tolist()
    metas = [{"title" : t, "description" : d} for t,d in zip(df['title'], df['description'])]
    texts = [concat(m['title'], m["description"]) for m in metas]

    # Embed and store in batches so each request carries several texts instead
    # of issuing one embedding call and one upsert per course.
    batch_size = 64
    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        col.upsert(
            ids=ids[start:stop],
            embeddings=embed_text(texts[start:stop]),
            metadatas=metas[start:stop],
        )

In [9]:
# Read the free-text search query from the user; execution waits here until input is entered.
query = input("Query")

In [12]:
# Embed the query (embed_text returns a list; take its single entry) and keep
# it as an array for the cosine computation later on.
query_vector = embed_text([query])[0]
q_embed = np.array(query_vector)

# Ask the collection for 5 results, returning their metadata and stored embeddings.
res = col.query(
    query_embeddings=[q_embed.tolist()],
    n_results=5,
    include=["metadatas", "embeddings"],
)

In [13]:
# The query was issued with a single embedding, so each field of the response
# holds one inner list; take that first entry of each.
ids, metas, embs = (res[field][0] for field in ("ids", "metadatas", "embeddings"))

In [14]:
# Score each returned course against the query embedding, then list the
# highest cosine similarity first.
scored = [
    {
        "course_id": course_id,
        "title": meta.get("title", ""),
        "cosine_similarity": round(cosine(q_embed, np.array(vector)), 6),
        "description": meta.get("description", ""),
    }
    for course_id, meta, vector in zip(ids, metas, embs)
]
out = sorted(scored, key=lambda row: row["cosine_similarity"], reverse=True)


In [16]:
# Display the ranked results as the cell output.
out

[{'course_id': 'C016',
  'title': 'Python Programming for Data Science',
  'cosine_similarity': 0.548732,
  'description': 'Learn Python fundamentals for data science: variables, control flow, functions, and object-oriented programming. Advance to data handling with pandas, numerical computing with NumPy, and basic plotting with matplotlib. You’ll build reproducible data workflows, clean and transform datasets, and perform exploratory analysis, laying the groundwork for machine learning and statistical modeling projects.'},
 {'course_id': 'C014',
  'title': 'Data Visualization with Tableau',
  'cosine_similarity': 0.434369,
  'description': 'Transform raw data into compelling visual stories using Tableau. Learn to connect to diverse data sources, create interactive dashboards, and apply best practices in chart selection. Topics include calculated fields, parameters, LOD expressions, and storytelling features. Through real-world case studies, you’ll design user-driven analytics that rev