# Building Dynamic Context with Custom Data Lab

### Import modules

In [1]:
import openai
import os
import pandas as pd
import numpy as np

### Set the OpenAI API Key

In [None]:
# Read the key from the OPENAI_API_KEY environment variable so that no
# credential (real or placeholder) is written into the notebook itself.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Client object used by the embedding and chat helper functions below.
client = openai.OpenAI(api_key=openai.api_key)

### Define a function for generating text embeddings

In [3]:
def text_embedding(text: str, model: str = "text-embedding-ada-002") -> list:
    """Return the embedding vector for ``text``.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request. Defaults to the model
            that was previously hard-coded in this function.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list (a list of floats in the outputs shown in this notebook).
    """
    # Uses the module-level ``client`` created in the API-key cell.
    response = client.embeddings.create(model=model, input=text)
    return response.data[0].embedding

### Define a function for chat completion

In [4]:
def get_word_completion(prompt):
    """Send ``prompt`` as the user turn of a chat request and return the reply text.

    A fixed system message scopes the assistant to the 95th Oscar awards.
    A single completion is requested (``n=1``) with ``max_tokens=3000``, and the
    message content of the first choice is returned.
    """
    system_turn = {"role": "system", "content": "You answer questions about 95th Oscar awards."}
    user_turn = {"role": "user", "content": prompt}
    # Uses the module-level ``client`` created in the API-key cell.
    completion = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[system_turn, user_turn],
        max_tokens=3000,
        n=1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

### Load the Oscar dataset

##### The dataset was downloaded from https://www.kaggle.com/datasets/unanimad/the-oscar-award

In [5]:
# Read the Oscar nominations CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='./data/oscars.csv')
df

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
0,1927,1928,1,ACTOR,Richard Barthelmess,The Noose,False
1,1927,1928,1,ACTOR,Emil Jannings,The Last Command,True
2,1927,1928,1,ACTRESS,Louise Dresser,A Ship Comes In,False
3,1927,1928,1,ACTRESS,Janet Gaynor,7th Heaven,True
4,1927,1928,1,ACTRESS,Gloria Swanson,Sadie Thompson,False
...,...,...,...,...,...,...,...
10760,2022,2023,95,HONORARY AWARD,"To Euzhan Palcy, a masterful filmmaker who bro...",,True
10761,2022,2023,95,HONORARY AWARD,"To Diane Warren, for her genius, generosity an...",,True
10762,2022,2023,95,HONORARY AWARD,"To Peter Weir, a fearless and consummate filmm...",,True
10763,2022,2023,95,GORDON E. SAWYER AWARD,Iain Neil,,True


### Process the dataset

In [6]:
# Keep only the 2023 ceremony rows that name a film, and lower-case the category.
# The steps are chained and the column is rewritten with .assign(), so values
# are written to a fresh frame instead of being assigned onto a filtered slice
# (the pattern behind pandas' SettingWithCopyWarning).
df = (
    df.loc[df['year_ceremony'] == 2023]
    .dropna(subset=['film'])
    .assign(category=lambda frame: frame['category'].str.lower())
)
df

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False
...,...,...,...,...,...,...,...
10755,2022,2023,95,writing (original screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,True
10756,2022,2023,95,writing (original screenplay),Written by Steven Spielberg & Tony Kushner,The Fabelmans,False
10757,2022,2023,95,writing (original screenplay),Written by Todd Field,Tár,False
10758,2022,2023,95,writing (original screenplay),Written by Ruben Östlund,Triangle of Sadness,False


### Generate a text column (context)

In [7]:
# Create the column for all rows first
df['text'] = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film'] + ' to win the award'

# Find the rows where 'winner' is False and replace the 'text' for those rows
df.loc[df['winner'] == False, 'text'] = df['name'] + ' got nominated under the category, ' + df['category'] + ', for the film ' + df['film'] + ' but did not win'
df

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,text
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False,Austin Butler got nominated under the category...
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False,Colin Farrell got nominated under the category...
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True,Brendan Fraser got nominated under the categor...
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False,"Paul Mescal got nominated under the category, ..."
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False,"Bill Nighy got nominated under the category, a..."
...,...,...,...,...,...,...,...,...
10755,2022,2023,95,writing (original screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,True,Written by Daniel Kwan & Daniel Scheinert got ...
10756,2022,2023,95,writing (original screenplay),Written by Steven Spielberg & Tony Kushner,The Fabelmans,False,Written by Steven Spielberg & Tony Kushner got...
10757,2022,2023,95,writing (original screenplay),Written by Todd Field,Tár,False,Written by Todd Field got nominated under the ...
10758,2022,2023,95,writing (original screenplay),Written by Ruben Östlund,Triangle of Sadness,False,Written by Ruben Östlund got nominated under t...


In [8]:
# Spot-check one generated sentence: the row at position 100 of the filtered frame.
df['text'].iloc[100]

'Viktor Prášil, Frank Kruse, Markus Stemler, Lars Ginzel and Stefan Korte got nominated under the category, sound, for the film All Quiet on the Western Front but did not win'

### Generate embeddings for the text column (context)

In [9]:
# One text_embedding() call per row; the vectors are stored in a new 'embedding' column.
df = df.assign(embedding=df["text"].apply(text_embedding))

In [10]:
# Display the frame, which now includes the 'embedding' column.
df

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,text,embedding
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False,Austin Butler got nominated under the category...,"[-0.037987079471349716, -0.019986499100923538,..."
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False,Colin Farrell got nominated under the category...,"[-0.00808698870241642, -0.009935630485415459, ..."
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True,Brendan Fraser got nominated under the categor...,"[-0.004187682643532753, -0.016699425876140594,..."
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False,"Paul Mescal got nominated under the category, ...","[-0.010742668993771076, -0.003934294916689396,..."
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False,"Bill Nighy got nominated under the category, a...","[-0.013457710854709148, -0.010843760333955288,..."
...,...,...,...,...,...,...,...,...,...
10755,2022,2023,95,writing (original screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,True,Written by Daniel Kwan & Daniel Scheinert got ...,"[0.010547928512096405, 0.004263501614332199, 0..."
10756,2022,2023,95,writing (original screenplay),Written by Steven Spielberg & Tony Kushner,The Fabelmans,False,Written by Steven Spielberg & Tony Kushner got...,"[0.0015993294073268771, -0.03366387262940407, ..."
10757,2022,2023,95,writing (original screenplay),Written by Todd Field,Tár,False,Written by Todd Field got nominated under the ...,"[-0.014156533405184746, -0.034318070858716965,..."
10758,2022,2023,95,writing (original screenplay),Written by Ruben Östlund,Triangle of Sadness,False,Written by Ruben Östlund got nominated under t...,"[0.0017724850913509727, -0.031749479472637177,..."


### Define function to perform the dot product

In [11]:
def vector_similarity(vec1, vec2):
    """Return the dot product of two vectors.

    Each argument is converted to a NumPy array and squeezed first, so
    length-1 axes (for example a vector wrapped in an outer list) are dropped
    before the product is taken.
    """
    left = np.array(vec1).squeeze()
    right = np.array(vec2).squeeze()
    return np.dot(left, right)

### Generate the embedding for the query

In [12]:
# Question to be answered from the retrieved Oscar context.
query = "Did Avatar win any awards?"

In [13]:
# Embed the question with the same helper used for the context rows.
query_embedding = text_embedding(query)

In [14]:
# Number of components in the query's embedding vector.
len(query_embedding)

1536

### Perform similarity search on the text column (context) and add the score

In [15]:
# Score every row's embedding against the query embedding (dot product).
df["similarity"] = [
    vector_similarity(row_vector, query_embedding)
    for row_vector in df["embedding"]
]
df

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,text,embedding,similarity
10639,2022,2023,95,actor in a leading role,Austin Butler,Elvis,False,Austin Butler got nominated under the category...,"[-0.037987079471349716, -0.019986499100923538,...",0.793296
10640,2022,2023,95,actor in a leading role,Colin Farrell,The Banshees of Inisherin,False,Colin Farrell got nominated under the category...,"[-0.00808698870241642, -0.009935630485415459, ...",0.789289
10641,2022,2023,95,actor in a leading role,Brendan Fraser,The Whale,True,Brendan Fraser got nominated under the categor...,"[-0.004187682643532753, -0.016699425876140594,...",0.806645
10642,2022,2023,95,actor in a leading role,Paul Mescal,Aftersun,False,"Paul Mescal got nominated under the category, ...","[-0.010742668993771076, -0.003934294916689396,...",0.786227
10643,2022,2023,95,actor in a leading role,Bill Nighy,Living,False,"Bill Nighy got nominated under the category, a...","[-0.013457710854709148, -0.010843760333955288,...",0.786008
...,...,...,...,...,...,...,...,...,...,...
10755,2022,2023,95,writing (original screenplay),Written by Daniel Kwan & Daniel Scheinert,Everything Everywhere All at Once,True,Written by Daniel Kwan & Daniel Scheinert got ...,"[0.010547928512096405, 0.004263501614332199, 0...",0.784787
10756,2022,2023,95,writing (original screenplay),Written by Steven Spielberg & Tony Kushner,The Fabelmans,False,Written by Steven Spielberg & Tony Kushner got...,"[0.0015993294073268771, -0.03366387262940407, ...",0.784253
10757,2022,2023,95,writing (original screenplay),Written by Todd Field,Tár,False,Written by Todd Field got nominated under the ...,"[-0.014156533405184746, -0.034318070858716965,...",0.784760
10758,2022,2023,95,writing (original screenplay),Written by Ruben Östlund,Triangle of Sadness,False,Written by Ruben Östlund got nominated under t...,"[0.0017724850913509727, -0.031749479472637177,...",0.780681


### Filter the top n results

In [16]:
# Keep the 20 rows with the highest similarity score, best match first.
top_res = df.nlargest(n=20, columns='similarity')
top_res.head()

Unnamed: 0,year_film,year_ceremony,ceremony,category,name,film,winner,text,embedding,similarity
10715,2022,2023,95,best picture,"James Cameron and Jon Landau, Producers",Avatar: The Way of Water,False,"James Cameron and Jon Landau, Producers got no...","[-0.0033361848909407854, -0.02883896604180336,...",0.869932
10740,2022,2023,95,sound,"Julian Howarth, Gwendolyn Yates Whittle, Dick ...",Avatar: The Way of Water,False,"Julian Howarth, Gwendolyn Yates Whittle, Dick ...","[-0.004168398678302765, -0.022677678614854813,...",0.859168
10745,2022,2023,95,visual effects,"Joe Letteri, Richard Baneham, Eric Saindon and...",Avatar: The Way of Water,True,"Joe Letteri, Richard Baneham, Eric Saindon and...","[-0.00525031890720129, -0.0017376210307702422,...",0.858541
10725,2022,2023,95,production design,Production Design: Dylan Cole and Ben Procter;...,Avatar: The Way of Water,False,Production Design: Dylan Cole and Ben Procter;...,"[-0.00021826969168614596, -0.01915182918310165...",0.849645
10667,2022,2023,95,cinematography,Roger Deakins,Empire of Light,False,Roger Deakins got nominated under the category...,"[-0.007363494951277971, -0.022122584283351898,...",0.816673


### Build the context

In [17]:
# One retrieved sentence per line makes up the context block.
context = '\n'.join(top_res['text'].tolist())
print(context)

James Cameron and Jon Landau, Producers got nominated under the category, best picture, for the film Avatar: The Way of Water but did not win
Julian Howarth, Gwendolyn Yates Whittle, Dick Bernstein, Christopher Boyes, Gary Summers and Michael Hedges got nominated under the category, sound, for the film Avatar: The Way of Water but did not win
Joe Letteri, Richard Baneham, Eric Saindon and Daniel Barrett got nominated under the category, visual effects, for the film Avatar: The Way of Water to win the award
Production Design: Dylan Cole and Ben Procter; Set Decoration: Vanessa Cole got nominated under the category, production design, for the film Avatar: The Way of Water but did not win
Roger Deakins got nominated under the category, cinematography, for the film Empire of Light but did not win
Geoffrey Baumann, Craig Hammack, R. Christopher White and Dan Sudick got nominated under the category, visual effects, for the film Black Panther: Wakanda Forever but did not win
Chris Williams an

### Construct the prompt with the context and query

In [18]:
# Instruction line followed by the retrieved context wrapped in triple backticks.
# Adjacent literals are concatenated into a single string with the same
# leading newline, indentation and trailing newline as before.
prompt = (
    "\n"
    f"    From the data provided in three backticks, respond to the question {query}\n"
    f"    ```{context}```\n"
)

In [19]:
# Show the raw prompt string (repr form, so newlines appear as \n).
prompt

"\n    From the data provided in three backticks, respond to the question Did Avatar win any awards?\n    ```James Cameron and Jon Landau, Producers got nominated under the category, best picture, for the film Avatar: The Way of Water but did not win\nJulian Howarth, Gwendolyn Yates Whittle, Dick Bernstein, Christopher Boyes, Gary Summers and Michael Hedges got nominated under the category, sound, for the film Avatar: The Way of Water but did not win\nJoe Letteri, Richard Baneham, Eric Saindon and Daniel Barrett got nominated under the category, visual effects, for the film Avatar: The Way of Water to win the award\nProduction Design: Dylan Cole and Ben Procter; Set Decoration: Vanessa Cole got nominated under the category, production design, for the film Avatar: The Way of Water but did not win\nRoger Deakins got nominated under the category, cinematography, for the film Empire of Light but did not win\nGeoffrey Baumann, Craig Hammack, R. Christopher White and Dan Sudick got nominated

### Send the prompt to OpenAI

In [20]:
# Ask the chat model to answer the question using the prompt built above.
result = get_word_completion(prompt)

In [21]:
# Display the model's answer.
# NOTE(review): check the answer against the printed context — in the saved run
# the model answered "No" even though the context includes a winning
# Avatar: The Way of Water row (visual effects).
print(result)

No, "Avatar: The Way of Water" did not win any awards at the 95th Oscar awards.
