In [115]:
import os
import json
import pandas as pd
import urllib.request
import boto3
import base64
import requests

from requests.auth import HTTPBasicAuth
from botocore.config import Config

In [116]:
# Read every *.json file in movielens/ and concatenate each file's
# data -> searchResults list into one flat list of result records.
movielens_dir = 'movielens/'

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `json_data` (and anything built from it) reproducible across runs and machines.
json_files = sorted(name for name in os.listdir(movielens_dir) if name.endswith('.json'))

json_data = []
for js in json_files:
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(os.path.join(movielens_dir, js), encoding='utf-8') as json_file:
        json_text = json.load(json_file).get('data').get('searchResults')
        json_data.extend(json_text)

In [21]:
# One DataFrame row per search result, taken from each result's 'movie' record.
# Iterating the list directly replaces the index-based range(len(...)) lookup.
df = pd.DataFrame([result['movie'] for result in json_data])

In [26]:

def download_image(url, file_path, file_name):
    """Download `url` and store it at `file_path + file_name`.

    Args:
        url: Location of the resource to fetch.
        file_path: Destination directory prefix; it is concatenated verbatim
            with `file_name`, so it must already end with a path separator.
        file_name: Name of the file to write (may itself begin with "/").

    Returns:
        The path the file was written to.

    Raises:
        urllib.error.URLError: If the resource cannot be retrieved.
    """
    full_path = file_path + file_name

    # urlretrieve() does not create missing directories, so make sure the
    # destination exists instead of failing on the first download.
    target_dir = os.path.dirname(full_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    urllib.request.urlretrieve(url, full_path)
    return full_path

In [27]:
# Fetch the w500 rendition of every entry in df['posterPath'] and store it
# under images/ using the same relative path.
for poster_path in df['posterPath']:
    download_image('https://image.tmdb.org/t/p/w500/' + poster_path, 'images/', poster_path)

In [148]:
# botocore settings shared by both clients below: region us-east-1,
# signature version v4, and up to 10 attempts in 'standard' retry mode.
my_config = Config(
    region_name='us-east-1',
    signature_version='v4',
    retries=dict(max_attempts=10, mode='standard'),
)

# "bedrock-runtime" is the client used for invoke_model calls.
bedrock = boto3.client("bedrock", config=my_config)
bedrock_runtime = boto3.client("bedrock-runtime", config=my_config)

In [119]:
def _invoke_titan_embedding(payload, model_id="amazon.titan-embed-image-v1"):
    """Send `payload` as JSON to `model_id` via bedrock_runtime and return the parsed JSON reply."""
    response = bedrock_runtime.invoke_model(
        body=json.dumps(payload),
        modelId=model_id,
        accept="application/json",
        contentType="application/json",
    )
    return json.loads(response['body'].read().decode('utf8'))


def _read_image_as_base64(image_path):
    """Return the content of the file at `image_path` as a base64 text string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf8')


def _image_name_from_path(image_path):
    """Return the last "/"-separated component of `image_path`, cut at its first "."."""
    return image_path.split("/")[-1].split(".")[0]


def get_embedding_for_poster(image_path):
    """Request an embedding for a poster image only.

    Args:
        image_path: "/"-separated path of the image file to embed.

    Returns:
        A tuple `(vector_json, image_name)`: the parsed model response and the
        image's file name up to its first ".".
    """
    input_image = _read_image_as_base64(image_path)
    vector_json = _invoke_titan_embedding({"inputImage": input_image})
    return vector_json, _image_name_from_path(image_path)


def get_embedding_for_poster_and_title(image_path, title):
    """Request one embedding for a poster image combined with its title.

    Args:
        image_path: "/"-separated path of the image file to embed.
        title: Text sent alongside the image as "inputText".

    Returns:
        A tuple `(vector_json, image_name)`: the parsed model response and the
        image's file name up to its first ".".
    """
    input_image = _read_image_as_base64(image_path)
    vector_json = _invoke_titan_embedding(
        {
            "inputImage": input_image,
            "inputText": title,
        }
    )
    return vector_json, _image_name_from_path(image_path)


def get_embedding_for_text(text):
    """Request an embedding for a piece of text only.

    Args:
        text: Text sent to the model as "inputText".

    Returns:
        A tuple `(vector_json, text)`: the parsed model response and the
        unmodified input text.
    """
    vector_json = _invoke_titan_embedding({"inputText": text})
    return vector_json, text

In [32]:
# Embed every poster image on its own and store the parsed model response as
# embeddings/<image name>.json.

# Create the output directory up front: otherwise a fresh checkout fails on
# the first write, after the embedding request has already been made.
os.makedirs('embeddings', exist_ok=True)

for poster_path in df['posterPath']:
    image_path = 'images/' + poster_path
    vector_json, image_name = get_embedding_for_poster(image_path)
    with open('embeddings/' + image_name + '.json', 'w') as f:
        json.dump(vector_json, f)

In [35]:
# Embed every poster image together with its title and store the parsed model
# response as embeddings/with_title_<image name>.json.

# Create the output directory up front: otherwise a fresh checkout fails on
# the first write, after the embedding request has already been made.
os.makedirs('embeddings', exist_ok=True)

for poster_path, title in zip(df['posterPath'], df['title']):
    image_path = 'images/' + poster_path
    vector_json, image_name = get_embedding_for_poster_and_title(image_path, title)
    with open('embeddings/' + 'with_title_' + image_name + '.json', 'w') as f:
        json.dump(vector_json, f)

In [51]:
# Remove columns that are not needed further down.
# errors='ignore' keeps this cell idempotent: it reassigns `df`, so without it
# a second run raises KeyError because the columns are already gone.
df = df.drop(
    columns=['dvdReleaseDate', 'backdropPaths', 'youtubeTrailerIds', 'numRatings', 'avgRating'],
    errors='ignore',
)

In [76]:
# Connection settings for the search cluster (presumably OpenSearch, given the
# k-NN index settings used further down; confirm).
# Values can be overridden through environment variables so real credentials
# never have to be written into the notebook; the defaults are the previous
# hard-coded values.
base_url = os.environ.get("OPENSEARCH_URL", "https://0.0.0.0:9200")
username = os.environ.get("OPENSEARCH_USERNAME", "admin")
password = os.environ.get("OPENSEARCH_PASSWORD", "admin")

# The request below is sent with verify=False (no TLS certificate check);
# silence the warning urllib3 would otherwise emit for it.
requests.packages.urllib3.disable_warnings()

# Connectivity / credentials check: fail here, with a clear HTTP error, rather
# than in a later cell.
response = requests.get(base_url, auth=HTTPBasicAuth(username, password), verify=False)
response.raise_for_status()

In [93]:
# Delete /multi-modal-embedding-index. The reply is not checked, so a missing
# index is not an error here: whatever the server answers is printed.
response = requests.delete(
    base_url + "/multi-modal-embedding-index",
    verify=False,
    auth=HTTPBasicAuth(username, password),
)
print(response.text)


{"acknowledged":true}


In [94]:
# Request body used to create the index: k-NN enabled in the settings, one
# 1024-dimensional knn_vector field for the embedding, and the movie fields
# stored next to it.
mapping = {
    "settings": {"index.knn": True},
    "mappings": {
        "properties": {
            # Vector field
            "titan_multimodal_embedding": {"type": "knn_vector", "dimension": 1024},
            # Analyzed text fields
            "title": {"type": "text"},
            "plotSummary": {"type": "text"},
            # Identifiers
            "movieId": {"type": "keyword"},
            "imdbMovieId": {"type": "keyword"},
            "posterPath": {"type": "text"},
        },
    },
}

In [95]:
# Create /multi-modal-embedding-index with the body held in `mapping`.
response = requests.put(
    base_url + "/multi-modal-embedding-index",
    auth=HTTPBasicAuth(username, password),
    verify=False,
    json=mapping,
)
# Stop here if the index could not be created; otherwise later inserts would
# proceed against an index that does not have this mapping.
response.raise_for_status()

In [97]:
def create_document_from_row(row):
    """Assemble the index document for one movie row.

    The embedding is read from
    `embeddings/with_title_<name>.json`, where `<name>` is the last
    "/"-separated component of `row['posterPath']` cut at its first ".".

    Args:
        row: Mapping-like object providing 'posterPath', 'title',
            'plotSummary', 'movieId' and 'imdbMovieId'.

    Returns:
        A dict with the stored 'embedding' under "titan_multimodal_embedding"
        followed by the five row fields listed above.
    """
    poster_file = row['posterPath'].rsplit("/", 1)[-1]
    stem = poster_file.partition(".")[0]

    with open('embeddings/with_title_' + stem + '.json') as json_file:
        embedding = json.load(json_file)['embedding']

    document = {"titan_multimodal_embedding": embedding}
    for field in ("title", "plotSummary", "movieId", "imdbMovieId", "posterPath"):
        document[field] = row[field]
    return document

In [98]:
# Index one document per row of df into /multi-modal-embedding-index.
# The endpoint and the auth object do not change between rows, so build them once.
doc_endpoint = base_url + "/multi-modal-embedding-index/_doc"
basic_auth = HTTPBasicAuth(username, password)

for index, row in df.iterrows():
    document = create_document_from_row(row)
    response = requests.post(doc_endpoint, auth=basic_auth, verify=False, json=document)
    # Fail on the first rejected document instead of silently leaving gaps in the index.
    response.raise_for_status()