# Preparing the data

## Setup OpenAI API

In [None]:
import os

import azure.identity
import dotenv
import openai

# Set up OpenAI client based on environment variables
dotenv.load_dotenv()
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_ADA_DEPLOYMENT = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT")
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

azure_credential = azure.identity.AzureDeveloperCliCredential(
    tenant_id=os.getenv("AZURE_TENANT_ID"))
token_provider = azure.identity.get_bearer_token_provider(azure_credential,
                                                          "https://cognitiveservices.azure.com/.default")
openai_client = openai.AzureOpenAI(
    api_version="2024-06-01",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    azure_ad_token_provider=token_provider)

Sure! One fun fact about space is that it is completely silent. Unlike on Earth, where sound travels through air or water, space is a vacuum, meaning there are no molecules to carry sound waves. So, if you were floating in space and someone shouted, you wouldn't be able to hear them at all!


### Chunking and creating Embeddings

In [77]:
import json
import openai
import numpy as np
import faiss

# Load movie data
with open("/workspaces/rag-with-azure-ai-search-notebooks/movies.json", "r") as file:
    data = json.load(file)

# Step 1: Extract movie details
movies = data["movies"]

# Step 2: Create chunks of movie details
chunk_size = 1  # Number of movies per chunk
chunks = []

for i in range(0, len(movies), chunk_size):
    chunk = movies[i:i + chunk_size]
    chunk_text = "\n".join(
        f"Title: {movie['title']}, Plot: {movie['plot']}"
        for movie in chunk
    )
    chunks.append(chunk_text)

# Step 3: Display the chunks
print("Chunked Movie Details:")
for i, chunk in enumerate(chunks):
    print(f"Chunk {i + 1}:\n{chunk}\n")

Chunked Movie Details:
Chunk 1:
Title: The Shawshank Redemption, Plot: Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.

Chunk 2:
Title: Pulp Fiction, Plot: The lives of two mob hit men, a boxer, a gangster's wife, and a pair of diner bandits intertwine in four tales of violence and redemption.

Chunk 3:
Title: Forrest Gump, Plot: Forrest Gump, while not intelligent, has accidentally been present at many historic moments, but his true love, Jenny Curran, eludes him.

Chunk 4:
Title: The Lord of the Rings: The Return of the King, Plot: Gandalf and Aragorn lead the World of Men against Sauron's army to draw his gaze from Frodo and Sam as they approach Mount Doom with the One Ring.

Chunk 5:
Title: The Lord of the Rings: The Fellowship of the Ring, Plot: A meek Hobbit from the Shire and eight companions set out on a journey to destroy the powerful One Ring and save Middle Earth from the Dark Lord Sauron.

Chunk 6:
Title

In [85]:
chunk_embeddings = []
for chunk in chunks:
    # Combine titles in the chunk into a single string
    chunk_text = " ".join(chunk)
    response = openai_client.embeddings.create(
        model=AZURE_OPENAI_ADA_DEPLOYMENT, input=chunk_text)
    embedding = response.data[0].embedding
    chunk_embeddings.append(embedding)

print("Chunk embeddings created", len(chunk_embeddings), "chunks.")
# Print first 5 dimensions of the first chunk embedding
print("First chunk embedding:", chunk_embeddings[0][:10])

Chunk embeddings created 25 chunks.
First chunk embedding: [-0.006426840554922819, -0.025539157912135124, 0.009973167441785336, -0.0159935113042593, 0.010162398219108582, 0.003430685494095087, -0.030921723693609238, -0.01998838596045971, -0.007856585085391998, -0.0034657281357795]


### Why is length of all Embeddings = 1536?

In [86]:
len(chunk_embeddings[2])

1536

# Storing the Data

### Indexing

In [None]:

# Step 4: Index the embeddings using FAISS
dimension = len(chunk_embeddings[0])  # Dimensionality of the embeddings
index = faiss.IndexFlatL2(dimension)  # Create a flat (non-compressed) index

# Convert embeddings to a NumPy array and add to the index
embeddings_array = np.array(chunk_embeddings).astype('float32')
index.add(embeddings_array)

# Save the FAISS index for later use
faiss.write_index(index, "movie_title_embeddings.index")

print("Movie titles have been chunked, embedded, and indexed successfully!")

Movie titles have been chunked, embedded, and indexed successfully!


# Retrieval of Data

### Similarity Search

In [None]:
import faiss
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load FAISS index
index = faiss.read_index("movie_title_embeddings.index")
dimension = index.d
num_vectors = index.ntotal

print(
    f"Loaded FAISS index with {num_vectors} vectors of dimension {dimension}")
assert len(
    movies) == num_vectors, "Number of titles must match number of embeddings in FAISS index."

# Retrieve all stored embeddings
embeddings = np.vstack([index.reconstruct(i) for i in range(num_vectors)])
print(f"Retrieved {embeddings.shape[0]} embeddings from index.")

# Define query
query_text = "soldier"
response = openai_client.embeddings.create(
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
    input=[query_text]
)
query_vector = np.array(response.data[0].embedding).astype(
    'float32').reshape(1, -1)

# Compute cosine similarity
similarities = cosine_similarity(query_vector, embeddings)[0]

# Build DataFrame with titles and similarity scores
df = pd.DataFrame({
    "Title": movies,
    "Similarity": similarities
}).sort_values("Similarity", ascending=False)

print("\nTop 5 similar movies:")
print(df.head(5))

Loaded FAISS index with 25 vectors of dimension 1536
Retrieved 25 embeddings from index.

Top 5 similar movies:
                                                Title  Similarity
23  {'title': 'Iron Man', 'year': '2008', 'genres'...    0.770712
15  {'title': 'Schindler's List', 'year': '1993', ...    0.770570
9   {'title': 'The Matrix', 'year': '1999', 'genre...    0.770208
10  {'title': 'Gladiator', 'year': '2000', 'genres...    0.762998
18  {'title': 'Avatar', 'year': '2009', 'genres': ...    0.760081


### Recommendation System

In [81]:
import json
import openai
import numpy as np
import pandas as pd
import faiss

# 1. Load movie data
with open("movies.json", "r") as f:
    data = json.load(f)

movies = data["movies"]
titles = [movie["title"] for movie in movies]
texts = [f"{movie['title']} {movie['plot']}" for movie in movies]

# 2. Load index
index = faiss.read_index("movie_embeddings.index")

# 3. Query input
query_text = "A space adventure with aliens and distant planets"
query_response = openai_client.embeddings.create(
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
    input=[query_text]
)
query_vector = np.array(query_response.data[0].embedding).astype(
    'float32').reshape(1, -1)

# 4. Similarity search
k = 5
distances, indices = index.search(query_vector, k)
top_movies = []
for idx, distance in zip(indices[0], distances[0]):
    movie = movies[idx]
    top_movies.append(
        f"Title: {movie['title']}\nYear: {movie['year']}\nGenres: {', '.join(movie['genres'])}\nPlot: {movie['plot']}")

movie_descriptions = "\n\n".join(top_movies)

# 5. Build prompt and Pass it to LLM
prompt = f"""
A user is looking for a movie recommendation similar to: "{query_text}".

Here are the top recommended movies:

{movie_descriptions}

Based on these, write a friendly recommendation message in 2-3 sentences, highlighting which movie the user should watch first and why.
"""

# Send chat completion request
response = openai_client.chat.completions.create(
    # this is your deployed model name, like "gpt-4o-mini"
    model=AZURE_OPENAI_DEPLOYMENT_NAME,
    messages=[
        {"role": "system", "content": "You are a helpful movie recommendation assistant."},
        {"role": "user", "content": prompt}
    ],
    temperature=0.7,
    max_tokens=500
)

# Print the assistant's reply
recommendation = response.choices[0].message.content

print("\n🎬 Recommended for you:\n")
print(recommendation)


🎬 Recommended for you:

If you're looking for a thrilling space adventure with aliens and distant planets, I highly recommend starting with "Interstellar." Its stunning visuals and thought-provoking plot about exploring new worlds while ensuring humanity's survival will keep you on the edge of your seat. Plus, the emotional depth and stunning soundtrack make it an unforgettable experience!
