# Movie Lens

Subset of the Movie Lens 25M dataset

# Setup, Vectorize and Load Data

In this tutorial, we'll demonstrate how to leverage a sample dataset stored in Azure Cosmos DB for NoSQL to ground OpenAI models. We'll do this by taking advantage of Azure Cosmos DB for NoSQL's [vector search](https://learn.microsoft.com/azure/cosmos-db/nosql/vector-search) functionality. In the end, we'll create an interactive chat session with the GPT-3.5 completions model to answer questions about movies, informed by our dataset. This process is known as Retrieval Augmented Generation, or RAG.


In [1]:
! /usr/bin/python3 -m pip install openai pymongo python-dotenv urlopen azure-cosmos tenacity aiohttp gradio >> /dev/null

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.


# Load environment values and instantiate clients


In [3]:
# Import the required libraries
import zipfile
from dotenv import load_dotenv
from tenacity import retry, stop_after_attempt, wait_random_exponential
import json
from azure.cosmos.aio import CosmosClient
from azure.cosmos import exceptions, PartitionKey
import azure.identity
import os
import openai
from dotenv import load_dotenv

# Read settings from a local .env file (if present) into the process environment
load_dotenv()

# --- Azure OpenAI settings -------------------------------------------------
# Service name plus the two deployment names are read from the environment.
AZURE_OPENAI_SERVICE = os.environ.get("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_ADA_DEPLOYMENT = os.environ.get("AZURE_OPENAI_ADA_DEPLOYMENT")
AZURE_OPENAI_DEPLOYMENT_NAME = os.environ.get("AZURE_OPENAI_DEPLOYMENT_NAME")

# Authenticate with the Azure Developer CLI credential for the configured tenant,
# and wrap it in a bearer-token provider scoped to Cognitive Services.
azure_credential = azure.identity.AzureDeveloperCliCredential(
    tenant_id=os.environ.get("AZURE_TENANT_ID"),
)
token_provider = azure.identity.get_bearer_token_provider(
    azure_credential,
    "https://cognitiveservices.azure.com/.default",
)

# Azure OpenAI client, authenticated through the token provider rather than an API key
openai_client = openai.AzureOpenAI(
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    api_version="2024-06-01",
    azure_ad_token_provider=token_provider,
)

# --- Azure Cosmos DB for NoSQL settings ------------------------------------
COSMOS_NOSQL_URL = os.environ.get("COSMOS_NOSQL_URL")
COSMOS_NOSQL_KEY = os.environ.get("COSMOS_NOSQL_KEY")
COSMOS_NOSQL_DATABASE_NAME = os.environ.get("COSMOS_NOSQL_DATABASE_NAME")
COSMOS_NOSQL_COLLECTION_NAME = os.environ.get("COSMOS_NOSQL_COLLECTION_NAME")
COSMOS_NOSQL_CACHE_COLLECTION_NAME = os.environ.get(
    "COSMOS_NOSQL_CACHE_COLLECTION_NAME"
)
COSMOS_NOSQL_VECTOR_PROPERTY_NAME = os.environ.get(
    "COSMOS_NOSQL_VECTOR_PROPERTY_NAME"
)

# Async Azure Cosmos DB for NoSQL client, authenticated with the account key
cosmos_client = CosmosClient(credential=COSMOS_NOSQL_KEY, url=COSMOS_NOSQL_URL)

# Create a database and containers with vector policies

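This cell creates the database if it does not already exist, defines the vector embedding policy (the document property that stores the vectors, their data type, the distance function, and the number of dimensions) and the vector index policy, and then creates the data container and the cache container.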


In [4]:
try:
    db = await cosmos_client.create_database_if_not_exists(COSMOS_NOSQL_DATABASE_NAME)
    print(f"Database '{COSMOS_NOSQL_DATABASE_NAME}' created or already exists.")
except Exception as e:
    print(f"Failed to create or access database '{COSMOS_NOSQL_DATABASE_NAME}': {e}")
    raise

# Create the vector embedding policy to specify vector details
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": "/" + COSMOS_NOSQL_VECTOR_PROPERTY_NAME,
            "dataType": "float32",
            "distanceFunction": "dotproduct",
            "dimensions": 1536,
        },
    ]
}

# Create the vector index policy to specify vector details
indexing_policy = {
    "vectorIndexes": [
        {"path": "/" + COSMOS_NOSQL_VECTOR_PROPERTY_NAME, "type": "quantizedFlat"}
    ]
}

# Create the data collection with vector index
# Make sure to enable the vector search capability in your Azure Cosmos DB account
# az cosmosdb update \
#      --resource-group rg-prateek-3601_ai \
#      --name agent-memory-vector \
#      --capabilities EnableNoSQLVectorSearch
try:
    container = await db.create_container_if_not_exists(
        id=COSMOS_NOSQL_COLLECTION_NAME,
        partition_key=PartitionKey(path="/id"),
        vector_embedding_policy=vector_embedding_policy,
        offer_throughput=1000,
    )
    print(f"Container '{COSMOS_NOSQL_COLLECTION_NAME}' created or already exists.")
except exceptions.CosmosHttpResponseError as e:
    print(f"Failed to create container '{COSMOS_NOSQL_COLLECTION_NAME}': {e}")
    raise

# Create the cache collection with vector index
try:
    cache_container = await db.create_container_if_not_exists(
        id=COSMOS_NOSQL_CACHE_COLLECTION_NAME,
        partition_key=PartitionKey(path="/id"),
        indexing_policy=indexing_policy,
        vector_embedding_policy=vector_embedding_policy,
        offer_throughput=1000,
    )
    print(
        f"Cache container '{COSMOS_NOSQL_CACHE_COLLECTION_NAME}' created or already exists."
    )
except exceptions.CosmosHttpResponseError as e:
    print(
        f"Failed to create cache container '{COSMOS_NOSQL_CACHE_COLLECTION_NAME}': {e}"
    )
    raise

Database 'agent-memory-vector' created or already exists.
Container 'vector-collection' created or already exists.
Cache container 'chathistory' created or already exists.


# Generate embeddings from Azure OpenAI

We'll create a helper function to generate embeddings from passed-in text using Azure OpenAI. We'll also add a retry to handle any throttling due to quota limits.


In [5]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(20))
def generate_embeddings(text, dimensions=None):
    """Return the embedding vector for *text* from the Azure OpenAI deployment.

    The call is retried (random exponential wait, up to 20 attempts) so that
    transient failures such as throttling do not abort the notebook.

    Args:
        text: The text to embed.
        dimensions: Optional embedding length to request. It is only sent when
            given, because not every embedding model accepts this option
            (NOTE(review): confirm support for the deployed model).

    Returns:
        The embedding of the first (only) input as a list of floats.
    """
    # Use the embedding deployment configured in the setup cell; the names
    # previously referenced here were never defined in this notebook.
    request = {"input": text, "model": AZURE_OPENAI_ADA_DEPLOYMENT}
    if dimensions is not None:
        request["dimensions"] = dimensions

    response = openai_client.embeddings.create(**request)

    embeddings = response.model_dump()
    return embeddings["data"][0]["embedding"]

# Load the data from the JSON file


In [6]:
# Unzip the data file into the same folder as the archive, so the extracted
# JSON lands at the path opened below (it was previously extracted to "../Data").
# NOTE(review): assumes the JSON sits at the root of the archive — confirm.
# The `with` block closes the archive; no explicit close() is needed.
with zipfile.ZipFile("./Data/MovieLens-4489-256D.zip", "r") as zip_ref:
    zip_ref.extractall("./Data")

# Load the data file (explicit UTF-8 so the result does not depend on the
# platform's default encoding)
with open("./Data/MovieLens-4489-256D.json", "r", encoding="utf-8") as d:
    data = json.load(d)

In [8]:
# Peek at the first document to see which fields each loaded record carries
data[0]

{'adult': 'False',
 'belongs_to_collection': {'id': 10194,
  'name': 'Toy Story Collection',
  'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg',
  'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'},
 'budget': '30000000',
 'genres': [{'id': 16, 'name': 'Animation'},
  {'id': 35, 'name': 'Comedy'},
  {'id': 10751, 'name': 'Family'}],
 'homepage': 'http://toystory.disney.com/toy-story',
 'id': '862',
 'imdb_id': 'tt0114709',
 'original_language': 'en',
 'original_title': 'Toy Story',
 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'popularity': 21.946943,
 'poster_path': '/rhIRbceoE9lR4veEXuwCC2wARtG.jpg',
 'production_companies': [{'name': 'Pixar Animation Studios', 'id': 3}],
 'production_countries': [{'iso_3166_1'

In [9]:
# View the number of documents loaded from the JSON file
# (the recorded output for this cell is 4489)
len(data)

4489

# Store data in Azure Cosmos DB.

Upsert data into Azure Cosmos DB for NoSQL. Optionally, vectorize properties of each document (this has already been done in the sample data).


In [None]:
async def insert_data():
    # stream = urllib.request.urlopen(storage_file_url)
    counter = 0
    list_to_upsert = []
    await cosmos_client.__aenter__()
    for object in data:

        # The following code to create vector embeddings for the data is commented out as the sample data is already vectorized.
        # vectorArray = generate_embeddings("Title:" + data[i]['original_title'] + ", Tagline:" + data[i]['tagline'] + ", Overview:" + data[i]['overview'])
        # object[cosmos_vector_property] = vectorArray
        await container.upsert_item(body=object)

        # print progress every 100 upserts.
        counter += 1
        if counter % 5 == 0:
            print("Inserted {} documents into collection.".format(counter))
    print("Upsert completed!")

# Insert the data asynchronously
await insert_data()