In [None]:
# Install the Pinecone client, sentence-transformers and the `datasets` loader used by later cells.
%pip install pinecone sentence-transformers datasets

In [None]:
from datasets import load_dataset

# Load the "MongoDB/embedded_movies" dataset through the `datasets` library.
dataset = load_dataset("MongoDB/embedded_movies")

In [None]:
# Display the loaded dataset object as the cell output.
dataset

In [None]:
import pandas as pd

In [None]:
# Materialize the 'train' split as a pandas DataFrame for cleaning and embedding.
dataset_df = pd.DataFrame(dataset['train'])

In [None]:
# Preview the first rows of the frame.
dataset_df.head()

In [None]:
# List the column names available in the frame.
dataset_df.columns

In [None]:
# Count the rows whose 'fullplot' value is missing.
dataset_df["fullplot"].isnull().sum()

In [None]:
# Keep only rows that have a 'fullplot'; that column is the text embedded later in this notebook.
dataset_df = dataset_df.dropna(subset=["fullplot"])

In [None]:
# Remove the 'plot_embedding' column that came with the dataset; a new one is
# computed from 'fullplot' further down. errors="ignore" keeps this cell
# re-runnable: without it a second run raises KeyError once the column is gone.
dataset_df = dataset_df.drop(columns=['plot_embedding'], errors="ignore")

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the "all-MiniLM-L6-v2" sentence-transformers model; get_embedding uses it for every encode call.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
def get_embedding(text):
  """Return the embedding of ``text`` as a plain Python list.

  Args:
    text: The string to encode with the module-level ``embedding_model``.

  Returns:
    list: ``embedding_model.encode(text)`` converted with ``.tolist()``, or
    an empty list when ``text`` is blank or is not a string at all.
  """
  # Non-string values (None, NaN coming out of pandas) have no .strip();
  # treat them like blank text instead of failing with AttributeError.
  if not isinstance(text, str) or not text.strip():
    print("Attempted to get embedding for empty string")
    return []
  embedding = embedding_model.encode(text)
  return embedding.tolist()

In [None]:
# Embed every plot, one row at a time, and store the vectors in a new 'plot_embedding' column.
dataset_df["plot_embedding"] = dataset_df["fullplot"].map(get_embedding)

In [None]:
from google.colab import userdata
# Read the MongoDB connection string from the Colab secret named 'mongo_db_uri'.
mongo_db_uri = userdata.get('mongo_db_uri')

In [None]:
# Install the MongoDB driver imported by the next cell.
%pip install pymongo

In [None]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Connection string comes from the Colab secret loaded earlier.
uri = mongo_db_uri
client = MongoClient(uri, server_api=ServerApi('1'))

# Ping the deployment to check connectivity.
# NOTE(review): a failed ping is only printed, so the cells below still run
# against an unusable client.
try:
    client.admin.command('ping')
except Exception as e:
    print(e)
else:
    print("Pinged your deployment. You successfully connected to MongoDB!")

In [None]:
# Select the "movie_db" database on the connected client.
db = client["movie_db"]

In [None]:
# Select the "collection02" collection that stores the movie documents.
collection = db["collection02"]

In [None]:
# Convert the frame to a list of per-row dicts (one record per movie) for insertion.
document = dataset_df.to_dict("records")

In [None]:
# Bulk-insert all records into the collection; the bare expression displays the call's result.
collection.insert_many(document)

In [None]:
from google.colab import userdata
# Read the Pinecone API key from the Colab secret named 'PINECONE_API_KEY'.
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')

In [None]:
from pinecone import Pinecone

# Create the Pinecone client and open the index named "genai-test-index".
# NOTE(review): no cell in this notebook upserts vectors into this index; it is
# presumably populated elsewhere with ids equal to the MongoDB _id values — confirm.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("genai-test-index")

In [None]:
def get_result(user_prompt, top_k):
  """Query the Pinecone index for the vectors closest to ``user_prompt``.

  Args:
    user_prompt: Text that is embedded with ``get_embedding`` and searched for.
    top_k: Number of matches requested from the index.

  Returns:
    The response object returned by ``index.query``.

  Raises:
    ValueError: If ``get_embedding`` produced an empty vector (blank prompt).
  """
  embedding = get_embedding(user_prompt)
  # get_embedding returns [] for blank input; stop here with a clear message
  # rather than sending an empty vector to the index.
  if not embedding:
    raise ValueError("user_prompt must contain text to embed")
  return index.query(vector=embedding, top_k=top_k)

In [None]:
# Natural-language question used both for the vector search and in the LLM prompt.
query = "Which one is the best movie based on the plot for children?"

In [None]:
# Retrieve the 5 nearest matches for the query from the Pinecone index.
response = get_result(query, 5)

In [None]:
# Display the raw query response.
response

In [None]:
from bson.objectid import ObjectId

In [None]:
# Resolve every Pinecone match to its MongoDB document, using the match id as
# the document's ObjectId.
mylist = []
for match in response["matches"]:
  movie_doc = collection.find_one({"_id": ObjectId(match['id'])})
  # find_one returns None when no document has this _id; skip it so the cells
  # below never index into None.
  if movie_doc is None:
    print(f"No MongoDB document found for match id {match['id']}")
    continue
  mylist.append(movie_doc)

In [None]:
# Display the documents fetched from MongoDB for the matches.
mylist

In [None]:
# Build the context passed to the LLM: one "Title: ..., Plot: ..." line per retrieved movie.
context_lines = []
for movie in mylist:
  plot_text, movie_title = movie['fullplot'], movie['title']
  context_lines.append(f"Title: {movie_title}, Plot: {plot_text}\n")
retrieved_info = "".join(context_lines)

In [None]:
# Install (or upgrade to the latest) LangChain integration for Google's Gemini models.
%pip install -U langchain-google-genai

In [None]:
from google.colab import userdata
import os

# Read the Gemini API key from Colab secrets and export it as an environment variable.
# NOTE(review): confirm the installed langchain-google-genai release reads
# GEMINI_API_KEY; older releases may only look at GOOGLE_API_KEY — TODO verify.
GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
os.environ['GEMINI_API_KEY'] = GEMINI_API_KEY

In [None]:
# Assemble the human-turn prompt: the query followed by the retrieved titles and plots.
prompt = f"Query: {query}\nContinue to answer the query by using the plot only:\n{retrieved_info}."

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model accessed through LangChain.
model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    max_retries=2,
    timeout=None,
    max_tokens=None,
    temperature=1.0,
)

# Conversation as (role, content) pairs: a fixed system instruction followed by
# the prompt assembled in an earlier cell.
system_instruction = "You are a helpful assistant that analyzes the title and plot of movies provided and answer the asked queries."
messages = [("system", system_instruction), ("human", prompt)]

# Send the conversation to the model and keep the reply for display.
ai_msg = model.invoke(messages)

In [None]:
# Print the content of the model's reply.
print(ai_msg.content)