# Evaluation data generation

In [3]:
import pandas as pd
import minsearch
import hashlib
import json
import os

from tqdm.auto import tqdm
tqdm.pandas()


In [None]:
from openai import OpenAI
# NOTE(review): no API key is passed here, so the client presumably reads its
# credentials from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

## Ingestion

In [4]:
# Load the movie dataset from a path relative to this notebook and report its size.
df = pd.read_csv('../data/movie_dataset.csv')
print('Shape:', df.shape)
# Bare last expression so the notebook renders the first two rows as a table.
df.head(2)

Shape: (122397, 5)


Unnamed: 0,title,year,plot,genres,director
0,Patton Oswalt: Annihilation,2017,"Patton Oswald, despite a personal tragedy, pro...",uncategorized,Bobcat Goldthwait
1,New York Doll,2005,A recovering alcoholic and recently converted ...,"documentary, music",Greg Whiteley


In [8]:
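# Disabled preprocessing step, kept for reference. When enabled it derives an `id`
# for every movie (first 8 hex characters of the MD5 of "<title>-<first 10 plot chars>"),
# moves that column to the front and overwrites the CSV file in place.
# def generate_document_id(doc):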
#     combined = f"{doc['title']}-{doc['plot'][:10]}"
#     hash_object = hashlib.md5(combined.encode())
#     hash_hex = hash_object.hexdigest()
#     document_id = hash_hex[:8]
#     return document_id


# df = pd.read_csv('../data/movie_dataset.csv')

# df['id'] = df.progress_apply(lambda row: generate_document_id(row.to_dict()), axis = 1)
# cols = df.columns.tolist() 
# df = df[[cols[-1]] + cols[:-1]]

# df.to_csv('../data/movie_dataset.csv', index=False)

In [34]:
# One dict per DataFrame row (column name -> value); this list is what the
# prompt template is formatted with and what the search index is fitted on.
documents = df.to_dict(orient='records')

In [69]:
# Prompt used to generate evaluation questions for a single movie record.
# The {title}/{plot}/{genres}/{director}/{year} placeholders are filled with
# str.format from a record dict, and the model is asked to answer with a bare
# JSON list so that the reply can be parsed with json.loads.
prompt_template = """
You emulate a user of our Movie advisor application. You emulate that you are trying to recall the title of a movie or deciding what to watch based on a specific plot or genre.
Formulate 5 simple questions this user might ask based on a provided record. Questions should be specific to the movie plot, but should not contain the exact movie title. Questions should not be too philosophical.
The record should contain the answer to the questions, and the questions should be complete and not too short.

The record:

title : {title}
plot : {plot}
genres : {genres}
director : {director}
year : {year}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [73]:
# Render the template for a single sample record (index 150) and print it
# so the final prompt text can be inspected.
prompt = prompt_template.format(**documents[150])
print(prompt)

You emulate a user of our Movie advisor application. You emulate that you are trying to recall the title of a movie or deciding what to watch based on a specific plot or genre.
Formulate 5 simple questions this user might ask based on a provided record. Questions should be specific to the movie plot, but should not contain the exact movie title. Questions should not be too philosophical.
The record should contain the answer to the questions, and the questions should be complete and not too short.

The record:

title : Generation Iron 2
plot : From the director of Generation Iron, comes the anticipated sequel that will depict 5 of the top bodybuilding and fitness mega-stars on a quest of achieving the ultimate physique and taking it to the next extreme level. In the world of social media and internet, the rules have changed as to what makes an iconic bodybuilding mass-monster. Starring Kai Greene, Calum Von Moger, Rich Piana, among others, this film will explore an all new generation of

In [74]:
def llm(prompt: str, model: str = 'gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the only message, with role "user".
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini',
            the value that used to be hard-coded, so existing calls are unaffected.

    Returns:
        The ``content`` of the first choice in the response.
    """
    # `client` is the module-level OpenAI client created in an earlier cell.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [75]:
# Ask the model for questions about the sample record; the reply is kept as raw text.
questions = llm(prompt)

In [76]:
# Parse the model's raw reply as JSON and display the result.
json.loads(questions)

['What kind of competition do the main figures in this film engage in regarding their physical appearance?',
 'How does social media influence the perception of bodybuilders in this movie?',
 'Who are some of the featured bodybuilders striving for physique perfection in this documentary?',
 'What year was this documentary about bodybuilding released?',
 'What is the main goal of the bodybuilders depicted in this film?']

In [98]:
# Build the search index over the movie records.
# NOTE(review): presumably `text_fields` are matched against the free-text query and
# `keyword_fields` are only used for exact-match filtering — confirm in the minsearch docs.
# 'director' appears in both lists, but `search` below always passes an empty filter_dict.
index = minsearch.Index(
    text_fields=['title', 'year', 'plot', 'genres', 'director'],
    keyword_fields=['director']
)

index.fit(documents)

<minsearch.Index at 0x12c8d36aa50>

## RAG flow

In [None]:
# NOTE(review): this import and client creation repeat an earlier cell of the
# notebook; consider keeping a single copy next to the other imports.
from openai import OpenAI

client = OpenAI()

In [99]:
def search(query: str) -> list:
    """Look up the records that match a free-text query.

    Runs the query against the module-level ``index`` with an empty filter
    and an empty boost mapping, requesting five results (``num_results=5``).
    """
    no_filters = {}
    no_boosts = {}
    return index.search(
        query=query,
        filter_dict=no_filters,
        boost_dict=no_boosts,
        num_results=5,
    )


def build_prompt(query: str, search_results: list) -> str:
    """Assemble the recommendation prompt from a question and retrieved movies.

    Each record in ``search_results`` must provide the keys ``title``, ``plot``,
    ``genres``, ``director`` and ``year``; any extra keys are ignored. The
    rendered records are placed under CONTEXT, separated by blank lines, and
    the finished prompt is stripped of surrounding whitespace.
    """
    # Spelled out piece by piece so the trailing space after "user." stays visible.
    instructions = (
        "You are a professional assistant in selecting movies.\n"
        "Your task is to recommend a movie from our movie dataset that best matches the request or description provided by user. \n"
        "Without any preamble, provide information about the movie that best matches the QUESTION based on the provided CONTEXT.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One "name : {name}" line per field, in the order shown to the model.
    movie_fields = ("title", "plot", "genres", "director", "year")
    record_layout = "\n".join(f"{name} : {{{name}}}" for name in movie_fields)

    rendered = [record_layout.format(**movie) + "\n\n" for movie in search_results]
    context = "".join(rendered)

    return instructions.format(question=query, context=context).strip()

def llm(prompt: str, model: str = 'gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API and return the reply text.

    NOTE(review): a function with the same name is defined earlier in this
    notebook; this later definition shadows it once the cell runs.

    Args:
        prompt: Text sent as the only message, with role "user".
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini',
            the value that used to be hard-coded, so existing calls are unaffected.

    Returns:
        The ``content`` of the first choice in the response.
    """
    # `client` is the module-level OpenAI client created in a previous cell.
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

def rag(query):
    """Answer a query with retrieval-augmented generation.

    Retrieves matching records with ``search``, turns them into a prompt with
    ``build_prompt`` and returns whatever ``llm`` replies to that prompt.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))


In [100]:
# Example user request that is passed through the RAG flow in the next cell.
query = 'advise me couple of  movies directed by Tim Burton?'

In [103]:
# Run the full retrieve -> build prompt -> LLM pipeline for the example query
# and print the model's reply as plain text.
answer = rag(query)
print(answer)

1. **Planet of the Apes**  
   - **Plot**: In 2029, aboard the United States Air Force space station Oberon, Leo Davidson works closely with trained primates. After a storm causes him to crash-land on a planet ruled by humanoid apes, he discovers the humans are treated as slaves. He fights for their freedom while unraveling the history of the apes and how they came to dominate the planet.
   - **Genres**: Science Fiction  
   - **Director**: Tim Burton  
   - **Year**: 2001  

2. **Beetlejuice**  
   - **Plot**: The spirits of a deceased couple are disturbed by a family that moves into their home, so they hire a mischievous spirit to help drive them out.
   - **Genres**: Comedy, Fantasy  
   - **Director**: Tim Burton  
   - **Year**: 1988  

3. **Frankenweenie**  
   - **Plot**: After young Victor loses his pet dog Sparky, he brings him back to life using science. However, Sparky's resurrection causes chaos in the neighborhood, and Victor must show everyone that his dog is still the s

# Retrieval evaluation