### Import the relevant libraries

In [1]:
import pandas as pd
import json
from openai import OpenAI
from dotenv import load_dotenv
import os
import time


# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()

True

### Import and inspect the dataset

In [2]:
# Load the movies dataset from a CSV file in the notebook's working directory.
df = pd.read_csv("./movies.csv")

# Preview the first rows; the rest of the notebook only uses the `genres` column.
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Extract the unique values of the Genres column

In [3]:
# Distinct, non-null genre strings found in the `genres` column.
all_genres = df["genres"].dropna().unique()

### Create folder for our transformed dataset

In [4]:
# Folder that will hold the generated .jsonl batch files.
movie_genre_folder = "movie_genre_batch_files"

# makedirs(..., exist_ok=True) creates the folder only when it is missing, without the
# check-then-create race of os.path.exists() + os.mkdir(). It also raises if the name is
# already taken by a regular file instead of silently continuing.
os.makedirs(movie_genre_folder, exist_ok=True)

### Create function for generating the .jsonl lines

In [5]:
def create_request_object(
    request_number,
    genres,
    model="text-embedding-3-small",
    encoding_format="float",
):
    """Build one request entry (one .jsonl line) for a batch embeddings job.

    Args:
        request_number: Number used to build the request's ``custom_id``
            (``"request-<request_number>"``).
        genres: Value sent as the embedding ``input``.
        model: Embedding model name placed in the request body. Defaults to
            the model the notebook originally hard-coded.
        encoding_format: Value of the body's ``encoding_format`` field.

    Returns:
        dict: A JSON-serializable mapping with the keys ``custom_id``,
        ``method``, ``url`` and ``body``.
    """
    return {
        "custom_id": f"request-{request_number}",
        "method": "POST",
        "url": "/v1/embeddings",
        "body": {
            "model": model,
            "input": genres,
            "encoding_format": encoding_format,
        },
    }


### Create function for writing .jsonl file into the transformed data folder

In [6]:
def create_batch_jsonl(batch_num, movie_genre_folder, genres, updated_idx_start):
    """Write one .jsonl batch file with one embedding request per genre.

    The file is created (or overwritten) at
    ``<movie_genre_folder>/movie_genre_batch_<batch_num>.jsonl``.

    Args:
        batch_num: Number used in the output file name.
        movie_genre_folder: Folder the file is written into.
        genres: Iterable of genre values; each becomes one request line.
        updated_idx_start: Offset added to the 1-based position of each genre
            to form its request number.
    """
    batch_path = f"{movie_genre_folder}/movie_genre_batch_{batch_num}.jsonl"
    with open(batch_path, "w") as batch_file:
        for position, genre in enumerate(genres, start=1):
            request_line = json.dumps(
                create_request_object(position + updated_idx_start, genre)
            )
            batch_file.write(request_line + "\n")

### Creating batches of .jsonl files from the unique Genres values

In [7]:
# Number of genre strings written to each batch file: 2000 when there are more than 2000
# genres, otherwise half of the list. max(1, ...) keeps the range() step below valid when
# there are fewer than two genres (a step of 0 raises ValueError).
batch_num = 2000 if len(all_genres) > 2000 else max(1, len(all_genres) // 2)

# Convert the array to a list once instead of on every iteration.
genre_list = all_genres.tolist()

# The range stops at len(genre_list), not len + 1, so no empty trailing batch file is written
# when the number of genres is an exact multiple of the batch size.
for batch_idx, new_beginning in enumerate(range(0, len(genre_list), batch_num)):
    batch_genres = genre_list[new_beginning: new_beginning + batch_num]

    # The slice start doubles as the request-number offset for this file.
    create_batch_jsonl(batch_idx, movie_genre_folder, batch_genres, updated_idx_start=new_beginning)
    print(f"{new_beginning = }\n {batch_num = }")
        

new_beginning = 0
 batch_num = 819
new_beginning = 819
 batch_num = 819
new_beginning = 1638
 batch_num = 819


### Initialize OpenAI client

In [8]:
# Create the API client used by every call below. No credentials are passed explicitly.
# NOTE(review): presumably the API key is read from the environment — confirm.
client = OpenAI()

### Create Batch Input File Object

In [9]:
# Upload one generated .jsonl file as the input of a batch job. The `with` block closes
# the file handle once the upload call returns (the inline open() never closed it).
# NOTE(review): only movie_genre_batch_1.jsonl is uploaded here; the other generated
# batch files are not submitted by this cell.
with open(f"{movie_genre_folder}/movie_genre_batch_1.jsonl", "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch",
    )

In [10]:
# Display the object returned by the upload call.
batch_input_file

FileObject(id='file-lbVRxGKZRFDAWrABG0PxhrHQ', bytes=156528, created_at=1727143446, filename='movie_genre_batch_1.jsonl', object='file', purpose='batch', status='processed', status_details=None)

In [11]:
# Keep the uploaded file's id; it is passed as `input_file_id` when the batch is created.
batch_input_file_id = batch_input_file.id

### Start the process of generating embeddings

In [12]:
# Create the batch job that runs every request in the uploaded file against the
# embeddings endpoint, with a 24h completion window.
# NOTE(review): the "nightly eval job" description looks copied from an API example and
# does not describe this genre-embedding job — consider updating it.
batch_creation_object = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/embeddings",
    completion_window="24h",
    metadata={
      "description": "nightly eval job"
    }
)


In [13]:
# Keep the batch id; it is used for every later status lookup.
batch_id = batch_creation_object.id
print(f"Batch embedding creation object initialized.\nBatch_ID: {batch_id}")

# Fetch the current state of the batch job right after creating it.
job = client.batches.retrieve(batch_id)
print(f"\nJob status:\n{job}")

Batch embedding creation object initialized.
Batch_ID: batch_VLtiwBeNL45luSgMvILJK5uI

Job status:
Batch(id='batch_VLtiwBeNL45luSgMvILJK5uI', completion_window='24h', created_at=1727143447, endpoint='/v1/embeddings', input_file_id='file-lbVRxGKZRFDAWrABG0PxhrHQ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1727229847, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'nightly eval job'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


### Polling every 60 seconds until the output_file_id value is no longer None

In [14]:
# Statuses after which a batch will not change any more. If one of them is reached
# without an output file, polling again cannot succeed.
TERMINAL_BATCH_STATUSES = ("completed", "failed", "expired", "cancelled")

output_file_id = job.output_file_id

# Poll the batch once a minute until it exposes an output file id.
while not output_file_id:
    # Stop instead of looping forever when the batch ended without producing output.
    if job.status in TERMINAL_BATCH_STATUSES:
        raise RuntimeError(
            f"Batch {batch_id} ended with status {job.status!r} without an output file "
            f"(error_file_id={job.error_file_id!r}, errors={job.errors!r})"
        )

    # Only announce a wait when one is really about to happen.
    print(f"Output Field Id value: {output_file_id}\nWaiting another 60 seconds")
    time.sleep(60)

    job = client.batches.retrieve(batch_id)
    output_file_id = job.output_file_id

    # Read each counter separately; the chained assignment `total = completed = ...`
    # used to overwrite `completed` with the total.
    completed = job.request_counts.completed
    failed = job.request_counts.failed
    total = job.request_counts.total
    print(f"Completion status: \n{completed = }\n{failed = }\n{total = }\n")

Output Field Id value: None
Waiting another 60 seconds
Completion status: 
completed = 819
failed = 0
total = 819
Output Field Id value: None
Waiting another 60 seconds
Completion status: 
completed = 819
failed = 0
total = 819
Output Field Id value: None
Waiting another 60 seconds
Completion status: 
completed = 819
failed = 0
total = 819
Output Field Id value: None
Waiting another 60 seconds
Completion status: 
completed = 819
failed = 0
total = 819
Output Field Id value: None
Waiting another 60 seconds
Completion status: 
completed = 819
failed = 0
total = 819
Output Field Id value: None
Waiting another 60 seconds
Completion status: 
completed = 819
failed = 0
total = 819
Output Field Id value: None
Waiting another 60 seconds
Completion status: 
completed = 819
failed = 0
total = 819
Output Field Id value: None
Waiting another 60 seconds
Completion status: 
completed = 819
failed = 0
total = 819
Output Field Id value: None
Waiting another 60 seconds
Completion status: 
completed = 8

In [15]:
# Display the output file id reported by the last retrieved job object.
job.output_file_id

'file-GMxSt9SwYr0iHT45p9DBwQpD'

In [16]:
# Store the output file id from the last retrieved job object and display it.
output_file_id = job.output_file_id
output_file_id

'file-GMxSt9SwYr0iHT45p9DBwQpD'

### Retrieving the embeddings

In [17]:
#Final Job result
job = client.batches.retrieve(batch_id)

output_file_id = job.output_file_id
output_file_id

'file-GMxSt9SwYr0iHT45p9DBwQpD'

In [18]:
# Request the content of the batch output file; it is read and saved to disk in the next cell.
file_content = client.files.content(output_file_id)

### Writing the embeddings to a .jsonl file

In [19]:
# Save the raw bytes of the batch output to gpt_output.jsonl in the working directory.
# Binary mode writes the bytes exactly as returned by read().
with open("gpt_output.jsonl", "wb") as f:
    f.write(file_content.read())