# Evaluation data generation

In [1]:
import pandas as pd
import minsearch
import hashlib
import json
import os

from tqdm.auto import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from openai import OpenAI

# Module-level client shared by the `llm` helper further down.
# NOTE(review): no credentials are passed explicitly — presumably the SDK picks up
# OPENAI_API_KEY from the environment; confirm before running on a fresh machine.
client = OpenAI()

## Ingestion

In [4]:
# Load the movie dataset. The path is relative, so it is resolved against the
# kernel's current working directory.
df = pd.read_csv('../data/movie_dataset.csv')
print('Shape:', df.shape)
# Preview the first two rows to sanity-check the columns.
df.head(2)

Shape: (5000, 7)


Unnamed: 0,id,title,year,plot,genres,director,vector
0,4dac245a,I Love Maria,1988,"On one evening, Hong Kong's largest gang, the ...",sci-fi / comedy,David Chung,[-7.36143067e-02 3.23128849e-02 2.23438442e-...
1,6667e983,Girl in Progress,2012,Grace is a single mom raising her fourteen-yea...,comedy-drama,Patricia Riggen,[ 3.08281817e-02 -6.33204207e-02 2.21934766e-...


In [5]:
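# NOTE: one-off preprocessing kept for reference only — apparently already applied,
# since the CSV loaded above already contains an `id` column.
# def generate_document_id(doc):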
#     combined = f"{doc['title']}-{doc['plot'][:10]}"
#     hash_object = hashlib.md5(combined.encode())
#     hash_hex = hash_object.hexdigest()
#     document_id = hash_hex[:8]
#     return document_id


# df = pd.read_csv('../data/movie_dataset.csv')

# df['id'] = df.progress_apply(lambda row: generate_document_id(row.to_dict()), axis = 1)
# cols = df.columns.tolist() 
# df = df[[cols[-1]] + cols[:-1]]

# df.to_csv('../data/movie_dataset.csv', index=False)

In [6]:
# Convert the DataFrame into a list of per-row dicts (column name -> value),
# so a single record can be unpacked straight into the prompt template.
documents = df.to_dict(orient='records')

In [7]:
# Prompt asking the model to write 5 user-style questions about one movie record.
# The {title}, {plot}, {genres}, {director} and {year} placeholders are filled with
# str.format; the reply is requested as a bare JSON array (no code fences) so that it
# can be handed directly to json.loads.
prompt_template = """
You emulate a user of our Movie advisor application. You emulate that you are trying to recall the title of a movie or deciding what to watch based on a specific plot or genre.
Formulate 5 simple questions this user might ask based on a provided record. Questions should be specific to the movie plot, but should not contain the exact movie title. Questions should not be too philosophical.
The record should contain the answer to the questions, and the questions should be complete and not too short.

The record:

title : {title}
plot : {plot}
genres : {genres}
director : {director}
year : {year}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [9]:
# Render the template for a single sample record and print it for inspection.
# Keys of the record that the template does not reference are ignored by str.format.
n = 150
prompt = prompt_template.format(**documents[n])
print(prompt)

You emulate a user of our Movie advisor application. You emulate that you are trying to recall the title of a movie or deciding what to watch based on a specific plot or genre.
Formulate 5 simple questions this user might ask based on a provided record. Questions should be specific to the movie plot, but should not contain the exact movie title. Questions should not be too philosophical.
The record should contain the answer to the questions, and the questions should be complete and not too short.

The record:

title : Up in the Air
plot : Ryan Bingham works for a Human Resources consultancy firm which specializes in termination assistance, and makes his living traveling to workplaces across the United States, conducting company layoffs and firings on behalf of employers. He also gives motivational speeches, using the analogy "What's in Your Backpack?" to extol the virtues of a life free of burdensome relationships with people as well as things. A frequent flyer, Ryan has no fixed abode

In [10]:
def llm(prompt: str, model: str = 'gpt-4o-mini'):
    """Send a single-turn user prompt to the chat completions API and return the reply.

    Uses the module-level ``client`` created earlier in the notebook.

    Args:
        prompt: Text sent as the only message of the conversation, with role "user".
        model: Name of the chat model to query. Defaults to 'gpt-4o-mini', the
            value that was previously hard-coded, so existing calls are unaffected.

    Returns:
        The ``content`` of the first choice's message in the API response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    # Only the first choice is inspected; the request does not ask for more than one.
    return response.choices[0].message.content

In [11]:
# Raw model reply for the sample prompt; the prompt asks for a JSON array of
# questions, which the next cell parses.
questions = llm(prompt)

In [12]:
# Parse the reply into a Python object; json.loads raises json.JSONDecodeError
# if the model did not return valid JSON.
json.loads(questions)

['What does Ryan Bingham do for a living, and how does it relate to his frequent travels?',
 "How does Ryan's perspective on relationships change throughout the movie?",
 'What challenges does Natalie face regarding her new layoffs program, and how does Ryan respond to it?',
 "What significant events occur during Ryan's sister's wedding that impact his character development?",
 'What realization does Ryan come to about his life and personal philosophies towards the end of the film?']