# Vector embeddings with OpenAI

## Setup OpenAI API

In [None]:
import os

import azure.identity
import dotenv
import openai

# Set up OpenAI client based on environment variables
dotenv.load_dotenv()
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_ADA_DEPLOYMENT = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT")

# Fail fast on missing configuration: os.getenv returns None for unset
# variables, and None would otherwise be formatted into the endpoint URL as
# the literal text "None" (and passed as the model name in later cells).
missing_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_SERVICE", AZURE_OPENAI_SERVICE),
        ("AZURE_OPENAI_ADA_DEPLOYMENT", AZURE_OPENAI_ADA_DEPLOYMENT),
    )
    if not value
]
if missing_settings:
    raise ValueError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# AZURE_TENANT_ID is passed through unchanged, so tenant_id is None when it is unset.
azure_credential = azure.identity.AzureDeveloperCliCredential(tenant_id=os.getenv("AZURE_TENANT_ID"))
# Bearer tokens are requested for the Cognitive Services scope and handed to
# the client through the token provider (no API key is used here).
token_provider = azure.identity.get_bearer_token_provider(azure_credential,
    "https://cognitiveservices.azure.com/.default")
openai_client = openai.AzureOpenAI(
    api_version="2024-06-01",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    azure_ad_token_provider=token_provider)


## Vector representations

In [None]:
# Sentence whose embedding is inspected in the following cells
sentence = "A dog just walked past my house and yipped yipped like a Martian"

# Request the embedding for the sentence from the configured deployment
response = openai_client.embeddings.create(
    input=sentence,
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
)

# Keep the embedding carried by the first entry of the response
first_entry = response.data[0]
vector = first_entry.embedding

In [None]:
# Show the embedding obtained for the sentence (the cell's last expression is displayed)
vector

In [None]:
# Number of values in the embedding vector
len(vector)

In [None]:
import json
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA

# --- 1️⃣ Load JSON from file ---
# Relative path (resolved against the working directory), consistent with the
# 'openai_movies.json' path used by the vector search cell, rather than a
# machine-specific absolute path.
json_file = "openai_movies.json"

# Decode explicitly as UTF-8 instead of relying on the platform default encoding.
with open(json_file, "r", encoding="utf-8") as f:
    embedding_json = json.load(f)

print(f"Loaded {len(embedding_json)} total embeddings from file.")

# --- 2️⃣ Filter only dinosaur-related titles ---
# Keys of embedding_json are the titles; values are the embedding vectors.
dino_titles = [title for title in embedding_json if "dinosaur" in title.lower()]
dino_embeddings = np.array([embedding_json[title] for title in dino_titles])

print(f"Filtered {len(dino_titles)} dinosaur-related embeddings.")

# Check if any dinosaur titles found
if len(dino_titles) == 0:
    raise ValueError("No dinosaur-related titles found in the dataset!")

# A single point has no variance to decompose, so stop with a clear message.
if len(dino_titles) < 2:
    raise ValueError(
        f"PCA needs at least 2 embeddings, but only {len(dino_titles)} "
        "dinosaur-related title(s) matched."
    )

# --- 3️⃣ Dimensionality reduction (PCA) ---
# PCA cannot extract more components than min(n_samples, n_features), so a
# filter matching fewer than three titles would make PCA(n_components=3) raise.
# Reduce to as many components as the data supports and zero-fill the missing
# axes so the 3D plot below always has x, y and z columns.
PLOT_DIMS = 3
n_components = min(PLOT_DIMS, *dino_embeddings.shape)
# Fixed seed keeps the projection reproducible if a randomized solver is selected.
pca = PCA(n_components=n_components, random_state=42)
reduced = pca.fit_transform(dino_embeddings)
reduced_3d = np.pad(reduced, ((0, 0), (0, PLOT_DIMS - n_components)))

# --- 4️⃣ Build DataFrame ---
# One row per title: the three projected coordinates plus the label.
df = pd.DataFrame({
    "x": reduced_3d[:, 0],
    "y": reduced_3d[:, 1],
    "z": reduced_3d[:, 2],
    "title": dino_titles
})

# --- 5️⃣ Interactive 3D plot ---
# Titles are drawn next to the markers (text=) and repeated in the hover box.
fig = px.scatter_3d(
    df, x="x", y="y", z="z", text="title", hover_data=["title"],
    title="Interactive 3D Embedding Visualization (Dinosaurs 🦕)"
)
fig.update_traces(marker=dict(size=5))

fig.show()

Loaded 573 embeddings from file.
Embedding matrix shape: (573, 1536)


### Document similarity modeled as cosine similarity

In [None]:
import numpy as np
import pandas as pd


def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``a`` and ``b`` divided by the product of
    their Euclidean norms. ``a`` and ``b`` may be any array-likes accepted by
    ``np.dot`` and ``np.linalg.norm``.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# Reference sentence, repeated so it lines up index-by-index with each candidate
reference_sentence = 'The new movie is awesome'
sentences1 = [reference_sentence] * 3

# Candidates compared against the reference: an identical sentence,
# a paraphrase, and unrelated text
sentences2 = [
    'The new movie is awesome',
    'This recent movie is so good',
    'some random gibberish',
]

def get_embeddings(sentences):
    """Embed ``sentences`` with a single request to the embeddings deployment.

    Returns a list holding the ``embedding`` of every entry in the response's
    ``data``, in the order the entries appear there.
    """
    api_result = openai_client.embeddings.create(
        input=sentences,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    vectors = []
    for entry in api_result.data:
        vectors.append(entry.embedding)
    return vectors

# Embed both sentence lists
embeddings1 = get_embeddings(sentences1)
embeddings2 = get_embeddings(sentences2)

# Print each index-aligned pair of sentences next to its cosine similarity
for i, left_sentence in enumerate(sentences1):
    right_sentence = sentences2[i]
    score = cosine_similarity(embeddings1[i], embeddings2[i])
    print(f"{left_sentence} \t\t {right_sentence} \t\t Score: {score:.4f}")

### Vector search

In [None]:
import json

# Load in vectors for movie titles.
# The file is decoded explicitly as UTF-8 rather than with the platform default
# encoding, and the handle gets its own name so it does not rebind `json_file`,
# which an earlier cell uses for the path string.
with open('openai_movies.json', encoding='utf-8') as movies_file:
    movie_vectors = json.load(movies_file)

In [None]:
# Compute vector for query
query = "dinasaurs"

embeddings_response = openai_client.embeddings.create(
    input=[query],
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
)
vector = embeddings_response.data[0].embedding

# Compute cosine similarity between query and each movie title
scores = [
    (movie, cosine_similarity(vector, movie_vector))
    for movie, movie_vector in movie_vectors.items()
]

# Display the top 10 results, best match first
df = pd.DataFrame(scores, columns=['Movie', 'Score']).sort_values('Score', ascending=False)
df.head(10)