# Project 1 - Categorizing movies

Source code: https://cookbook.openai.com/examples/batch_processing 

## Setup

- We only need openAI, pandas ans json packages to do the entire project.
- Dont forget to setup the OPENAI_API_KEY as the env variable.

In [None]:
# !pip install openai --upgrade

In [53]:
import json
from openai import OpenAI
import pandas as pd

client = OpenAI()

## Loading And Formating Data

In [54]:
dataset_path = "imdb_top_1000.csv"

df = pd.read_csv(dataset_path)
df.head(1)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469


In [55]:
categorize_system_prompt = '''
Your goal is to extract movie categories from movie descriptions, as well as a 1-sentence summary for these movies.
You will be provided with a movie description, and you will output a json object containing the following information:

{
    categories: string[] // Array of categories based on the movie description,
    summary: string // 1-sentence summary of the movie based on the movie description
}

Categories refer to the genre or type of the movie, like "action", "romance", "comedy", etc. Keep category names simple and use only lower case letters.
Movies can have several categories, but try to keep it under 3-4. Only mention the categories that are the most obvious based on the description.
'''

def get_categories(description):
    response = client.chat.completions.create(
    model="gpt-4o-mini",
    temperature=0.1,
    # This is to enable JSON mode, making sure responses are valid json objects
    response_format={ 
        "type": "json_object"
    },
    messages=[
        {
            "role": "system",
            "content": categorize_system_prompt
        },
        {
            "role": "user",
            "content": description
        }
    ],
    )

    return response.choices[0].message.content

In [56]:
# Testing on a few examples
for _, row in df[:2].iterrows():
    description = row['Overview']
    title = row['Series_Title']
    result = get_categories(description)
    print(f"TITLE: {title}\nOVERVIEW: {description}\n\nRESULT: {result}")
    print("\n\n----------------------------\n\n")

TITLE: The Shawshank Redemption
OVERVIEW: Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.

RESULT: {
    "categories": ["drama"],
    "summary": "Two imprisoned men develop a deep bond over the years, discovering solace and redemption through their shared acts of kindness."
}


----------------------------


TITLE: The Godfather
OVERVIEW: An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.

RESULT: {
    "categories": ["crime", "drama"],
    "summary": "An aging crime lord hands over his empire to his hesitant son."
}


----------------------------




In [57]:
# Creating an array of json tasks

tasks = []

for index, row in df[:20].iterrows():
    
    description = row['Overview']
    
    task = {
        "custom_id": f"task-{index}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            # This is what you would have in your Chat Completions API call
            "model": "gpt-4o-mini",
            "temperature": 0.1,
            "response_format": { 
                "type": "json_object"
            },
            "messages": [
                {
                    "role": "system",
                    "content": categorize_system_prompt
                },
                {
                    "role": "user",
                    "content": description
                }
            ],
        }
    }
    
    tasks.append(task)

In [58]:
tasks[0]

{'custom_id': 'task-0',
 'method': 'POST',
 'url': '/v1/chat/completions',
 'body': {'model': 'gpt-4o-mini',
  'temperature': 0.1,
  'response_format': {'type': 'json_object'},
  'messages': [{'role': 'system',
    'content': '\nYour goal is to extract movie categories from movie descriptions, as well as a 1-sentence summary for these movies.\nYou will be provided with a movie description, and you will output a json object containing the following information:\n\n{\n    categories: string[] // Array of categories based on the movie description,\n    summary: string // 1-sentence summary of the movie based on the movie description\n}\n\nCategories refer to the genre or type of the movie, like "action", "romance", "comedy", etc. Keep category names simple and use only lower case letters.\nMovies can have several categories, but try to keep it under 3-4. Only mention the categories that are the most obvious based on the description.\n'},
   {'role': 'user',
    'content': 'Two imprisoned 

## Create The File

In [59]:
file_name = "batch_tasks_movies.jsonl"

with open(file_name, 'w') as file:
    for obj in tasks:
        file.write(json.dumps(obj) + '\n')

## Upload Batch File To OpenAI

In [60]:
batch_file = client.files.create(
  file=open(file_name, "rb"),
  purpose="batch"
)

## Create a Batch Job

In [61]:
batch_job = client.batches.create(
  input_file_id=batch_file.id,
  endpoint="/v1/chat/completions",
  completion_window="24h"
)

## Retrieve the Results File

In [63]:
batch_job = client.batches.retrieve(batch_job.id)
print(batch_job)
print(f"Job Status: {batch_job.status}")
print(f"Output file ID: {batch_job.output_file_id}")

Batch(id='batch_164fq12mAfxmTiqqsQVjf0ec', completion_window='24h', created_at=1722556303, endpoint='/v1/chat/completions', input_file_id='file-JZ5XwVR3LePsv06uQAa6ZGMU', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722642703, failed_at=None, finalizing_at=None, in_progress_at=1722556303, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=20))
Job Status: in_progress
Output file ID: None


## How to Find a File in OpenAI Files (Optional)

In [64]:
all_files = client.files.list().data

for file_object in all_files:
    if file_object.filename == "batch_H54kck8CoNnlDQkPgvWqVGDD_output.jsonl":
        print(file_object.id)
        break


result_file_id = "file-i73gA1w4SMyoFcyjYpzYpRXd"

file-i73gA1w4SMyoFcyjYpzYpRXd


## Extract and Explore Results

In [65]:
# result_file_id = batch_job.output_file_id
result = client.files.content(result_file_id).content

In [66]:
result_file_name = "batch_job_results_movies.jsonl"

with open(result_file_name, 'wb') as file:
    file.write(result)

In [67]:
# Loading data from saved file
results = []
with open(result_file_name, 'r') as file:
    for line in file:
        # Parsing the JSON string into a dict and appending to the list of results
        json_object = json.loads(line.strip())
        results.append(json_object)

In [68]:
# Reading only the first results
for res in results[:2]:
    task_id = res['custom_id']
    # Getting index from task id
    index = task_id.split('-')[-1]
    result = res['response']['body']['choices'][0]['message']['content']
    movie = df.iloc[int(index)]
    description = movie['Overview']
    title = movie['Series_Title']
    print(f"TITLE: {title}\nOVERVIEW: {description}\n\nRESULT: {result}")
    print("\n\n----------------------------\n\n")

TITLE: The Godfather
OVERVIEW: An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.

RESULT: {
    "player": "Jonathan Taylor",
    "team": "Indianapolis Colts",
    "sport": "Football",
    "gender": "Male"
}


----------------------------


TITLE: The Dark Knight
OVERVIEW: When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.

RESULT: {
    "player": null,
    "team": "OSU",
    "sport": "Football",
    "gender": null
}


----------------------------


