In [1]:
import json
import re

def clean_span(span):
    """
    Tidies the tail of a span that still carries list-closing punctuation.

    Surrounding whitespace is always removed. When the result ends with
    `"]` or `"],`, every trailing `"`, `]` and `,` character is dropped and
    a single closing `"` is put back. Any other input is returned with only
    the whitespace removed.

    Args:
        span (str): The span string to clean.

    Returns:
        str: The cleaned span string.
    """
    trimmed = span.strip()
    if not trimmed.endswith(('"],', '"]')):
        return trimmed
    # rstrip works on a character set, so one pass over {", ], ,} removes the
    # whole closing run; the closing quote is then restored explicitly.
    return trimmed.rstrip('"],') + '"'

def _spans_from_list_text(list_text):
    """Splits the text that follows ``Attribute:`` into clean span strings."""
    opening = list_text.find("[")
    closing = list_text.rfind("]")
    if opening != -1 and closing > opening:
        # Keep only what sits between the outermost brackets. This also drops
        # whatever follows the list on the line (the "," separator or a "}").
        body = list_text[opening + 1:closing]
    else:
        # No bracket pair on this line: fall back to the bare value.
        body = list_text.strip().rstrip(",").strip("[]")
    body = body.strip()
    if not body:
        return []
    # Split only where one quoted item ends and the next one begins, so that
    # commas inside a span do not break it apart.
    pieces = re.split(r'"\s*,\s*"', body)
    stripped = (piece.strip().strip('"').strip() for piece in pieces)
    return [span for span in stripped if span]


def parse_attribute_spans(response_str):
    """
    Parses the attribute spans from the given string and returns a dictionary.

    The response is read line by line; each line of interest looks like
    ``Attribute: ["span one", "span two"],``. Surrounding braces are optional,
    keys may be wrapped in quotes, and lines whose key is not a known
    attribute are ignored. Spans are returned without their surrounding
    quotes, and an empty list (``Attribute: [],``) yields ``[]``.

    Each list must sit on a single line; a list spread over several lines is
    not reassembled. If an attribute appears more than once, the last line
    wins.

    Args:
        response_str (str): The response string containing the attribute spans.

    Returns:
        dict: A dictionary with attributes as keys and lists of strings as values.
            Every known attribute is present, with ``[]`` when nothing was found.
    """
    attributes = ["Cinematography", "Direction", "Story", "Characters", "Production Design", "Unique Concept", "Emotions"]
    spans = {attr: [] for attr in attributes}

    for line in response_str.splitlines():
        key, separator, list_text = line.partition(":")
        if not separator:
            # Brace-only, blank and other non "key: value" lines carry no spans.
            continue
        # Tolerate an opening brace on the same line and JSON-style quoted keys.
        attr = key.strip().lstrip("{").strip().strip("\"'").strip()
        if attr in spans:
            spans[attr] = _spans_from_list_text(list_text)

    return spans

In [2]:
## Labeling Dataset
import pandas as pd
from openai import OpenAI
import csv
from dotenv import load_dotenv
import os, json

# Make the variables from a local .env file visible through os.environ,
# then build the OpenAI client from the key found there.
load_dotenv()
apikey = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=apikey)

# Read the IMDB reviews and discard duplicated and incomplete rows.
df = (
    pd.read_csv("../data/IMDB Dataset.csv")
    .drop_duplicates()
    .dropna()
)

# Everything from row 18500 onwards is what gets labelled. The slice counts
# rows of the cleaned frame, which is why the frame displayed below starts
# at a higher index label.
reviews = df["review"][18500:].tolist()
df[18500:]

Unnamed: 0,review,sentiment
18559,well i wasn't sure what the film was going to ...,positive
18560,I have a lot of time for all the Columbo films...,positive
18561,YETI deserves the 8 star rating because it is ...,positive
18562,I am and have been a serious collector of Chri...,negative
18563,"This year's Royal Rumble wasn't really bad, bu...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
## Labeling Dataset

import pandas as pd
from openai import OpenAI
import csv
from dotenv import load_dotenv
import os, json

# Same setup as the data-loading cell: expose the .env variables and build
# the OpenAI client, so this cell also works when run on its own.
load_dotenv()
apikey = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=apikey)

# System prompt sent with every labelling request. The keys in the output
# format must match `attributes` below (and the list inside
# parse_attribute_spans): a key the parser does not know is dropped, and an
# attribute the prompt never asks for always comes back empty.
sys_prompt = """
You are an assistant who gives for specific attributes. The attributes are Cinematography, Direction, Story, Characters, Production Design, Unique Concept, and Emotions. 

YOUR INPUT WOULD BE LIKE THIS:
Review: "The cinematography was stunning, but the story was weak. I loved the movie. There wasn't anything unique in the movie. characters could've been better tho."

YOU MUST FOLLOW THE OUTPUT FORMAT GIVEN BELOW. DON'T WRITE ANYTHING ELSE:
{
Cinematography: [list of strings with chunks where cinematography is discussed],
Direction: [list of strings with chunks where direction is discussed],
Story: [list of strings with chunks where story is discussed],
Characters: [list of strings with chunks where characters are discussed],
Production Design: [list of strings with chunks where production design is discussed],
Unique Concept: [list of strings with chunks where unique concept is discussed],
Emotions: [list of strings with chunks where emotions are discussed]
}

if something is not discussed, add empty list infront of it.

"""

# Attribute names the labelling pipeline works with, in output order.
attributes = ["Cinematography", "Direction", "Story", "Characters", "Production Design", "Unique Concept", "Emotions"]

def get_sentiment_spans(review):
    """
    Asks the chat model to label one review and returns the parsed spans.

    The review is sent to ``gpt-4o-mini`` together with ``sys_prompt``, and
    the text of the first choice is handed to ``parse_attribute_spans``.

    Args:
        review (str): The review text to label.

    Returns:
        dict: Whatever ``parse_attribute_spans`` returns for the model's reply.
            A reply without text content is parsed as an empty string.
    """
    prompt = f"Label the following review below:\nReview: {review}\n"
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": prompt},
        ],
    )

    # `content` is optional in the API response; treat a missing body as an
    # empty reply instead of failing on `None.strip()`.
    content = completion.choices[0].message.content
    response = (content or "").strip()
    # Manually parse the response to extract spans
    return parse_attribute_spans(response)

def _read_checkpoint(checkpoint_file):
    """Returns the index stored in the checkpoint file, or 0 if there is none."""
    if not os.path.exists(checkpoint_file):
        return 0
    with open(checkpoint_file, 'r') as file:
        text = file.read().strip()
    # An empty checkpoint file records nothing, so start from the beginning.
    return int(text) if text else 0


def _load_saved_records(output_file):
    """Returns every record already stored in the output file."""
    if not os.path.exists(output_file):
        return []
    with open(output_file, 'r') as file:
        text = file.read()
    decoder = json.JSONDecoder()
    records = []
    position = 0
    while position < len(text):
        if text[position].isspace():
            position += 1
            continue
        # raw_decode parses one JSON value and reports where it ended, so a
        # file holding several arrays written back to back can still be read.
        value, position = decoder.raw_decode(text, position)
        records.extend(value if isinstance(value, list) else [value])
    return records


def _write_records(output_file, records):
    """Rewrites the output file as a single JSON array."""
    temp_file = f"{output_file}.tmp"
    with open(temp_file, 'w') as file:
        json.dump(records, file, indent=4)
    # Swap the finished file into place so an interruption in the middle of a
    # write cannot leave a truncated output file behind.
    os.replace(temp_file, output_file)


def label_dataset_and_save(reviews, output_file, checkpoint_file, chunk_size=100):
    """
    Labels the reviews one by one and saves the results with resume support.

    Labelling starts at the index stored in ``checkpoint_file`` (0 when the
    file is missing or empty). After every ``chunk_size`` reviews, and after
    the last one, the output file is rewritten as one JSON array holding all
    records saved so far, and the checkpoint is advanced. The whole array is
    rewritten on each save, which keeps the file loadable with ``json.load``.

    If labelling is interrupted by an exception, the reviews labelled since
    the last save are written out and the checkpoint is updated before the
    exception propagates.

    Args:
        reviews (list): The review texts to label.
        output_file (str): Path of the JSON file that receives the records.
            Existing content is kept; a file made of several concatenated
            JSON arrays is read and merged into a single array.
        checkpoint_file (str): Path of the text file holding the index of the
            next review to label.
        chunk_size (int): Number of reviews between two saves. Defaults to 100.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1, or the checkpoint
            file does not contain an integer.
        json.JSONDecodeError: If the existing output file cannot be parsed.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    start_index = _read_checkpoint(checkpoint_file)
    saved_records = _load_saved_records(output_file)
    pending = []
    processed = start_index

    def flush():
        # Move the pending records into the saved set first, so a failing
        # write is not retried by the `finally` clause below.
        saved_records.extend(pending)
        pending.clear()
        _write_records(output_file, saved_records)
        with open(checkpoint_file, 'w') as file:
            file.write(str(processed))
        print(f"Processed and saved {processed} reviews")

    try:
        for i in range(start_index, len(reviews)):
            review = reviews[i]
            spans = get_sentiment_spans(review)
            pending.append({"review": review, **spans})
            processed = i + 1

            if processed % chunk_size == 0 or processed == len(reviews):
                flush()
    finally:
        # Keep what was labelled before an error or interruption.
        if pending:
            flush()

# Run the labelling job; the records and the resume checkpoint are both
# written under ../data.
output_file = "../data/new_spans_labeled.json"
checkpoint_file = "../data/checkpoint2.txt"
label_dataset_and_save(
    reviews,
    output_file=output_file,
    checkpoint_file=checkpoint_file,
)
 