In [None]:
# Open AI Prompt Engineering
# Date Created: 03/07/23

# Load in packages
import openai
import os
import re
import requests
import json
import sys
from num2words import num2words
import os
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
import tiktoken
from evaluation import evaluate, get_results_dict, save_results
from helper_functions import check_dir_exists, load_n_samples, load_balanced_n_samples, convert_labels
import datetime
import time
import math

In [None]:
# Read the development samples for the two classification tasks.
# Both frames are expected to expose a "text" and a "label" column, which the
# cells below read.
imdb = pd.read_csv(
    "../data/binary_movie_sentiment/clean_data/binary_movie_sentiment_dev_sample.csv"
)
binary_abuse = pd.read_csv(
    "../data/binary_abuse/clean_data/binary_abuse_dev_sample.csv"
)

In [None]:
%%time
# https://openai.com/pricing#language-models
# GPT-3.5 Turbo (as of 18/07/2023)
# 4K context   $0.0015 / 1K tokens  $0.002 / 1K tokens
# 16K context  $0.003 / 1K tokens   $0.004 / 1K tokens

# https://azure.microsoft.com/en-us/pricing/details/cognitive-services/openai-service/?cdn=disable
# Models           Per 1,000 tokens
# gpt-3.5-turbo    £0.001586

# Count the number of tokens in every IMDB review (used to estimate API cost).
tokenizer = tiktoken.get_encoding("cl100k_base") # should be the tokenizer for gpt3.5

# The token count is the length of the encoded id list. Decoding the ids back
# into per-token bytes, as done previously, yields a list of the same length
# and only adds work. Iterating over the column values also avoids relying on
# the frame having a default 0..n-1 index.
token_lengths = [len(tokenizer.encode(review_text)) for review_text in imdb["text"]]

print("IMDB: N=",len(token_lengths),"; Max Token Len=",max(token_lengths),"; Min Token Len=",min(token_lengths),"; Avg Token Len=",sum(token_lengths)/len(token_lengths))
print("\n")
print(imdb["label"].value_counts())


# IMDB: N= 1250 ; Max Token Len= 1407 ; Min Token Len= 8 ; Avg Token Len= 274.024


# False    625
# True     625
# Name: label, dtype: int64
# CPU times: total: 5.05 s
# Wall time: 5.18 s

In [None]:
%%time
# Count the number of tokens in every binary_abuse comment (used to estimate API cost).
tokenizer = tiktoken.get_encoding("cl100k_base") # should be the tokenizer for gpt3.5

# The token count is the length of the encoded id list. Decoding the ids back
# into per-token bytes, as done previously, yields a list of the same length
# and only adds work. Iterating over the column values also avoids relying on
# the frame having a default 0..n-1 index.
token_lengths = [len(tokenizer.encode(comment_text)) for comment_text in binary_abuse["text"]]

print("Wiki Abuse: N=",len(token_lengths),"; Max Token Len=",max(token_lengths),"; Min Token Len=",min(token_lengths),"; Avg Token Len=",sum(token_lengths)/len(token_lengths))
print("\n")
print(binary_abuse["label"].value_counts())


# Wiki Abuse: N= 2316 ; Max Token Len= 2858 ; Min Token Len= 2 ; Avg Token Len= 95.39896373056995


# False    2041
# True      275
# Name: label, dtype: int64
# CPU times: total: 3.61 s
# Wall time: 3.66 s

In [None]:
# API Access via text completion
# Credentials and endpoint are read from environment variables so that real
# secrets never have to be stored in the notebook. The literal fallbacks keep
# the previous placeholder values when a variable is not set.
# NOTE(review): the environment variable names are a suggestion - align them
# with whatever the project already uses.
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "masked")
base_url = os.environ.get("AZURE_OPENAI_ENDPOINT", "masked")
deployment_name = os.environ.get("AZURE_OPENAI_COMPLETIONS_DEPLOYMENT", "masked") # text-completion deployment used for testing

#deployment_name ="tos_gpt35" # gpt 3.5 final model for paper
# Full completions endpoint used by get_openai_response
url = base_url + "/openai/deployments/" + deployment_name + "/completions?api-version=2023-05-15"


def get_openai_response(prompt):
    """Gets a response from the OpenAI API from a given prompt

    The prompt is POSTed to the module-level completions ``url`` using the
    module-level ``api_key``. While the request times out or the connection
    drops, the call is retried indefinitely with a 5 second pause.

    Args:
        prompt: a text prompt to send to the API
    Returns:
        a string response from the API. For an HTTP 200 response this is the
        first completion with quotes and surrounding whitespace removed, where
        "Negative"/"Abusive" are normalised to "Yes" and "Not"/"Positive" to
        "No". For any other status the API error message is returned instead
        of raising (or the raw response body if no message can be parsed).
    """
    payload = {
        "prompt":prompt,
        "max_tokens":20,
        "top_p":.1,
        #"stop":[" ",".",","] # ideally only want one word responses
        "stop":[".","\n",",","<"] # get one sentence responses
    }

    while True:
        try:
            r = requests.post(url,
                              headers={
                                  "api-key": api_key,
                                  "Content-Type": "application/json"
                              },
                              json = payload,
                              timeout=20)
        # requests.ConnectionError is NOT a subclass of the builtin
        # ConnectionError, so it has to be listed explicitly for dropped
        # connections to be retried.
        except (requests.Timeout, requests.ConnectionError, ConnectionResetError, ConnectionError) as e:
            print(e)
            time.sleep(5) # if service is struggling, wait a few moments and try again - keep doing this until the request goes through
            continue
        break

    if r.status_code != 200:
        # Error bodies are not guaranteed to be JSON (or to carry an
        # "error.message" field), so fall back to the raw text.
        try:
            return r.json()["error"]["message"]
        except (ValueError, KeyError, TypeError):
            return r.text

    text_return = r.json()['choices'][0]['text']
    # strip double quotes, single quotes and surrounding whitespace
    text_return = text_return.replace('"', '').replace('\'', '').strip()

    # Normalise label-style answers to the Yes/No coding used downstream
    if text_return in ("Negative", "Abusive"):
        return "Yes"
    if text_return in ("Not", "Positive"):
        return "No"
    return text_return

In [None]:
# API Access via chat - 18/07/23 switching to this as gpt3.5 turbo seems optimized for chat over completions

# Credentials and endpoint are read from environment variables so that real
# secrets never have to be stored in the notebook. The literal fallbacks keep
# the previous placeholder values when a variable is not set.
# NOTE(review): the environment variable names are a suggestion - align them
# with whatever the project already uses.
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY", "masked")
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT", "masked")
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
# Rebinds the module-level deployment_name; get_openai_response_chat reads it as `engine`
deployment_name = os.environ.get("AZURE_OPENAI_CHAT_DEPLOYMENT", "masked")

def get_openai_response_chat(prompt, text):
    """Gets a response from the OpenAI chat API for a system prompt and a user message

    The request is sent to the module-level ``deployment_name``. While the
    request times out, the connection drops or the service reports that it is
    unavailable, the call is retried indefinitely with a 10 second pause.

    Args:
        prompt: the instruction, sent as the "system" message
        text: the text to classify, sent as the "user" message
    Returns:
        the content string of the first choice returned by the API
    """
    # Every name in this tuple must resolve, otherwise the `except` clause
    # itself raises NameError as soon as a request fails. The previously bare
    # RemoteDisconnected / APIConnectionError names were never imported:
    #   * http.client.RemoteDisconnected is a subclass of ConnectionResetError,
    #     so it is already covered;
    #   * APIConnectionError and Timeout live in openai.error (the latter is
    #     what `request_timeout` raises).
    retryable_errors = (
        requests.Timeout,
        requests.ConnectionError,
        ConnectionResetError,
        ConnectionError,
        openai.error.Timeout,
        openai.error.APIConnectionError,
        openai.error.ServiceUnavailableError,
    )

    while True:
        try:
            response = openai.ChatCompletion.create(
                engine=deployment_name,
                messages=[
                    {"role": "system", "content": prompt},
                    {"role": "user", "content": text}
                ],
                temperature=.1,
                max_tokens=20,
                stop=[".",",","<"],
                request_timeout=20,
            )
        except retryable_errors as e:
            print(e)
            time.sleep(10) # if service is struggling, wait a few moments and try again - keep doing this until the request goes through
            continue
        break

    text_return = response['choices'][0]['message']['content']

    return text_return

In [None]:
# attempted prompts
# p: "\n\nDoes this text contain abuse, Yes or No?\n\n"
# r: 'I\'m not sure what you mean by "abuse'
# p: "\n\nDoes this text contain abusive language, Yes or No?\n\n"
# r: No
# p: "\n\nDoes this text contain abusive language? Respond with either Yes or No.\n\n"
# r: 'No'
# p: \n\nIs this text abusive, Yes or No?\n\n
# r: I think this question violates the Terms of Service
# p: \n\nIs this text using abusive language, Yes or No?\n\n
# r: 'No'
# p: \n\nIt was? Abusive or Not Abusive?\n\n
# r: "I'm not sure what you're asking"
# p: \n\nIt was? Abusive Language or Not Abusive Language?\n\n
# r: I'm not sure what you're askin
# p: \n\nDoes the above user comment contain abuse? Yes or No. \n\n
# r: 'Posted by: Cato | 2005'
# p: "\n\nDoes the above user comment contain abuse, Yes or No? \n\n"
# r: 'No
# p: \n\nAbusive or Not Abusive? \n\n
# r: 'Posted by: The Brickmuppet at 11:47 PM | Comments (1) | Add'
# p: \n\nAbusive or Not Abusive? Which better characterizes the above comment?\n\n
# r: I'm not sure what you're asking
# p: Was that comment Abusive or Not Abusive?
# r: Enter your comment here
# p: \n\nDoes the above user comment contain abusive language, Yes or No? \n\n
# r: No
# p: You are a researcher evaluating abusive language in Wikipedia comments. Does the above comment contain abusive language, Yes or No? 
# r: 'No'

# p:\n\nDoes this text contain negative sentiment, Yes or No?\n\n
# r: No
# p:\n\nDoes this text contain negative sentiment, Yes or No? \n\n
# r: Yes
# p:\n\nDoes the above text contain negative sentiment, Yes or No? \n\n
# r: Yes
# p: \n\nIs this text negative, Yes or No?\n\n
# r:Was the above review useful to you?
# p: \n\nIt was? Negative or Not Negative?\n\n
# r: I'm not sure what to say about this movie
# p:\n\nDoes this text contain negative sentiment, Yes or No?\n\n
# p: \n\nClassify the sentiment of the above text using Positive or Negative.\n\nThe above text is 
# r: ' Negative'
# p: \n\nWhat is the sentiment of the above text using Positive or Negative.\n\nThe above text is 
# r: ' Negative'
# p: "Using Positive or Negative, classify the sentiment of the following movie review.\n\n .... \n\nThe sentiment of the movie review is
# r: ' Negative'


# Manual check of a single prompt against the completions endpoint
binary_abuse_prompt_1 = "\n\nDoes this text contain abuse, Yes or No?\n\n"
sample_text=imdb["text"][1]
# Use the prompt defined in this cell: binary_abuse_prompt_2 is only assigned
# in a later cell, so referencing it here fails on a fresh kernel.
prompt = sample_text + binary_abuse_prompt_1
print(prompt)

get_openai_response(prompt)

# Final Zero Shot Prompt Engineering

In [None]:
%%time
# Set global vars
TECH = "zero_shot_prompt_engineering"
TASK = "binary_abuse"

n_eval=-1
eval_set="dev_sample"
model_name = "gpt4"
run_time = "0" # no training
n_train= "0" # no training
seed= "0" # no training
balanced_train=False

# Set dirs
main_dir = os.getcwd()
data_dir = f"../data"
# NOTE: output_dir is fixed here, before TASK is reassigned inside the loop, so
# the results of every prompt (including the movie-sentiment ones) are saved
# under the binary_abuse directory.
output_dir = f'../results/{TASK}/{TECH}'

# TASK = "binary_abuse"
# True = Abusive; False = Not Abusive
binary_abuse_prompt_1 = "Using one word, does the internet comment contain toxic language, Yes or No?"
binary_abuse_prompt_2 = "Using one word, is this internet comment using toxic language, Yes or No?"
binary_abuse_prompt_3 = "You are a researcher who needs to classify comments on the internet as containing abusive language or not containing abusive language. Using one word, does the internet comment contain abusive language, Yes or No?"

# TASK = "binary_movie_sentiment"
# True= Positive; False= Negative
imdb_prompt_1 = "Using one word, does the movie review contain negative sentiment, Yes or No?"
imdb_prompt_2 = "Using one word, classify the sentiment of the movie review using 'Positive' or 'Negative'."
imdb_prompt_3 = "You are a researcher who needs to classify movie reviews as containing negative sentiment or not containing negative sentiment. Using one word, does the movie review contain negative sentiment, Yes or No?"

# Lists for loop (task_list[t] is the task that prompt_list[t] belongs to)
task_list = ["binary_abuse","binary_abuse","binary_abuse","binary_movie_sentiment","binary_movie_sentiment","binary_movie_sentiment"]
prompt_list = [binary_abuse_prompt_1,binary_abuse_prompt_2,binary_abuse_prompt_3,imdb_prompt_1,imdb_prompt_2,imdb_prompt_3]


####################################################################
eval_pred_all = [] # the master lists
eval_gold_all = [] # the master lists
# Iterate over every prompt. The loop previously started at 1, which skipped
# binary_abuse_prompt_1 even though the aggregation cell further down assigns
# all six prompts to the loaded results.
for t in range(len(prompt_list)):
    task_prompt = prompt_list[t]
    TASK = task_list[t]
    temp_df = binary_abuse
    if TASK == "binary_movie_sentiment":
        temp_df=imdb
    # iterate over the rows
    #for i in range(len(imdb["text"])):
    eval_gold_labels=[] # the t/p lists
    eval_pred=[] # the t/p lists
    for i in range(len(temp_df)):
        # grab the comment
        sample_text=temp_df["text"][i]

        # make it into a usable user chat
        if TASK == "binary_movie_sentiment":
            temp_text = "Movie review: " + sample_text

        if TASK == "binary_abuse":
            temp_text = "Internet comment: " + sample_text

        # for gpt 3 only since we can't use the chat api
        #temp_prompt = temp_text + "\nQuestion: "+ task_prompt +" \nResponse:"
        #eval_pred.append(get_openai_response(temp_prompt)) # Don't think i need the below

        # append gpt response to list
        eval_pred.append(get_openai_response_chat(task_prompt,temp_text))

        # append labels to list
        eval_gold_labels.append(temp_df["label"][i])

        time.sleep(2) # we should be fine vis-a-vis rate limits, but adding a small delay to help prevent any issues with ServiceUnavailableErrors
        if i % 50 == 0:
            print(t,i)

    # keep the raw (unmapped) responses of this prompt
    eval_pred_all.append(eval_pred)

    # Need to remember that these are flipped
    # abuse: "Yes" (abusive/toxic) -> 1, matching True = Abusive
    if TASK == "binary_abuse":
        eval_pred_int=list(pd.Series(eval_pred).map(dict(Yes=1, No=0)))

    # switch my pos/neg to match the other imdb prompts
    if task_prompt == imdb_prompt_2:
        eval_pred=list(pd.Series(eval_pred).map(dict(Negative="Yes", Positive="No")))

    # sentiment: "Yes" (negative sentiment) -> 0, matching False = Negative
    if TASK == "binary_movie_sentiment":
        eval_pred_int=list(pd.Series(eval_pred).map(dict(Yes=0, No=1)))

    # Deal with non yes/no answers: anything outside the mapping became NaN
    print("Number of improperly coded responses: ",sum(math.isnan(x) for x in eval_pred_int))

    # drop NaN predictions and the gold labels at the same positions
    cleaned_eval_pred_int = [x for x in eval_pred_int if not math.isnan(x)]
    cleaned_eval_gold_labels = [eval_gold_labels[i] for i, x in enumerate(eval_pred_int) if not math.isnan(x)]


    cleaned_eval_gold_labels=list(map(int,cleaned_eval_gold_labels))

    eval_gold_all.append(cleaned_eval_gold_labels)

    results = evaluate(cleaned_eval_gold_labels, cleaned_eval_pred_int)


    print(TASK)
    print(task_prompt)
    print(results)

    eval_gold_labels=cleaned_eval_gold_labels
    eval_preds=cleaned_eval_pred_int
    eval_result=results
    template=task_prompt


    datetime_str = str(datetime.datetime.now())
    results_dict = get_results_dict(TASK, TECH, model_name, run_time,
                    eval_gold_labels, eval_preds, eval_set,
                    n_train, n_eval, balanced_train, seed, datetime_str, template)
    # add test_result to results_dict
    results_dict.update(eval_result)
    save_str = f'mod={model_name}_n={n_train}_bal={balanced_train}_iteration={t}'
    save_results(output_dir, save_str, results_dict)


In [None]:
# Examples of Non-Response/Incorrect Response

# I cannot determine if the internet comment contains toxic language
# I'm sorry, but I cannot determine whether the internet comment contains toxic language without the actual comment
# N/A (not a comment)
# I cannot determine if the internet comment contains toxic language as it appears to be incomplete and lacks context
# Cannot determine as there is no internet comment provided
# I cannot determine if the internet comment contains toxic language
# Personal attack
# I cannot make a judgement without seeing the actual internet

In [None]:
# Save/Load the GPT predictions
#np.save("../results/binary_abuse/zero_shot_prompt_engineering/preds_2.npy",np.array([np.array(x, dtype="object") for x in eval_pred_all],dtype="object"),allow_pickle=True)
#preds=np.load("../results/binary_abuse/zero_shot_prompt_engineering/preds_2.npy",allow_pickle=True)

In [None]:
# Convert JSON to csv
def load_file(dir, file_name):
    """Load one saved results JSON file into a DataFrame.

    Args:
        dir: directory that contains the file, with or without a trailing
            path separator
        file_name: name of the JSON file inside ``dir``
    Returns:
        a DataFrame built with ``orient='index'``, i.e. one row per top-level
        key of the JSON object
    """
    # os.path.join also works when `dir` has no trailing separator, which
    # plain string concatenation (dir + file_name) did not.
    with open(os.path.join(dir, file_name)) as json_file:
        data = json.load(json_file)
    return pd.DataFrame.from_dict(data, orient='index')


zero_shot_list = []
folder="../results/binary_abuse/zero_shot_prompt_engineering/"

# Load every saved GPT-4 results file and append it to a list.
# os.listdir returns names in arbitrary order; sorting them puts the files in
# iteration order (the names end in iteration=<t>), so the rows line up with
# prompt_list when it is attached below.
for filepath in sorted(os.listdir(folder)):
#    if filepath.endswith((".json")):
    if filepath.startswith(("mod=gpt4")):
        print(filepath)
        result_frame=load_file(folder,filepath)
        zero_shot_list.append(result_frame)

# one row per results file
results_df = pd.concat(zero_shot_list, axis = 1).T
# number of evaluated samples that were kept for each prompt
results_df["n_used"]= results_df.eval_true.map(len)
results_df["zero_shot_prompt"] = prompt_list
#results_df.to_csv("../results/zero_shot_task_results_gpt4.csv")

In [None]:
# Load in the saved GPT predictions to see why non-response answers occur
# NOTE(review): `preds` is not assigned by any active code in this notebook; it
# only exists if the commented-out np.load(...) line in the "Save/Load the GPT
# predictions" cell has been run. TODO confirm which saved file is intended.
# preds[0..2] are attached as the raw responses to the three abuse prompts.
binary_abuse["p1"] = preds[0]
binary_abuse["p2"] = preds[1]
binary_abuse["p3"] = preds[2]
# imdb["p1"] = preds[1]
# imdb["p2"] = preds[2]
# imdb["p3"] = preds[3]

# Binary Abuse = raw p3 responses that are neither "Yes" nor "No"
# (this expression is not the last statement of the cell and is not printed,
# so its value is not shown when the cell runs as a whole)
binary_abuse.loc[(binary_abuse['p3'] != "Yes") & (binary_abuse['p3'] != "No")][["p3"]].values

# Wiki Toxic Review = for each prompt in turn, all rows where that prompt's
# response is neither "Yes" nor "No", shown with the responses to all three prompts
print(binary_abuse.loc[(binary_abuse['p1'] != "Yes") & (binary_abuse['p1'] != "No")][["p1","p2","p3"]])
print(binary_abuse.loc[(binary_abuse['p2'] != "Yes") & (binary_abuse['p2'] != "No")][["p1","p2","p3"]])
print(binary_abuse.loc[(binary_abuse['p3'] != "Yes") & (binary_abuse['p3'] != "No")][["p1","p2","p3"]])

# array([["I'm sorry"],
#        ['Cannot determine if the internet comment contains abusive language or not based on the given text'],
#        ['Abusive language: No'],
#        ["I'm sorry"],
#        ['Cannot determine if the internet comment contains abusive language or not as it does not contain any language'],
#        ['I cannot determine if this internet comment contains abusive language or not as it is not a complete sentence and'],
#        ['The given internet comment does not contain any language'],
#        ['The internet comment does not contain a clear indication of abusive language'],
#        ['I cannot determine whether this internet comment contains abusive language or not as it does not contain any text'],
#        ["I'm sorry"],
#        ['Sorry'],
#        ['I cannot determine if the internet comment contains abusive language or not as the comment is incomplete and ends abruptly'],
#        ['I cannot determine if the internet comment contains abusive language or not as it is incomplete and does not contain'],
#        ["I'm sorry"],
#        ['Cannot determine if the internet comment contains abusive language or not based on the given text'],
#        ["I'm sorry"],
#        ["I'm sorry"],
#        ['Sorry'],
#        ["I'm sorry"]], dtype=object)

# p1     p2      p3
# 52     52      18    
# 474    245     52    
# 476    474     245   
# 519    553     474   
# 534    756     491   
# 719    998     519   
# 756    1005    741   
# 975    1162    756   
# 998    1240    914   
# 1005   1254    998   
# 1111   1550    1005  
# 1240   1608    1024  
# 1254   1992    1162  
# 1290           1202  
# 1420           1290  
# 1550           1717  
# 1608           1825  
# 1683           1836  
# 1730           2090 
# 1825          
# 1992

In [None]:
# Movie Review = for each prompt in turn, all rows where that prompt's response
# is not one of its two expected answers (Yes/No for p1 and p3,
# Negative/Positive for p2), shown with the responses to all three prompts.
# NOTE(review): the imdb "p1"/"p2"/"p3" columns are only created by
# commented-out assignments in the previous cell, so this cell fails on a fresh
# kernel until they are populated. TODO confirm which saved predictions to attach.
print(imdb.loc[(imdb['p1'] != "Yes") & (imdb['p1'] != "No")][["p1","p2","p3"]])
print(imdb.loc[(imdb['p2'] != "Negative") & (imdb['p2'] != "Positive")][["p1","p2","p3"]])
print(imdb.loc[(imdb['p3'] != "Yes") & (imdb['p3'] != "No")][["p1","p2","p3"]])


#  p1    p2    p3   
# 367   230   348  
# 1070  417   496  
#       760   578  
#       868   592  
#       984   1057  
#       1009  1144  
#       1035  1165  
#       1087 

# p1       p2  p3
# No  Neutral  No
# No    Mixed  No

# Sensitivity Analysis

In [None]:
# Read the TMDB reviews used for the sensitivity analysis
tmdb = pd.read_csv("../data/tmdb/movie_sentiment_tmdb.csv")
# Integer gold label: 1 where binary_rating is 'Positive', 0 otherwise
tmdb["rating_int"] = np.where(tmdb["binary_rating"].eq("Positive"), 1, 0)
tmdb.head(25)

In [None]:
# Count the number of tokens in every TMDB review (used to estimate API cost).
tokenizer = tiktoken.get_encoding("cl100k_base") # should be the tokenizer for gpt3.5

# The token count is the length of the encoded id list. Decoding the ids back
# into per-token bytes, as done previously, yields a list of the same length
# and only adds work. Iterating over the column values also avoids relying on
# the frame having a default 0..n-1 index.
token_lengths = [len(tokenizer.encode(review_text)) for review_text in tmdb["reviews"]]

print("TMDB: N=",len(token_lengths),"; Max Token Len=",max(token_lengths),"; Min Token Len=",min(token_lengths),"; Avg Token Len=",sum(token_lengths)/len(token_lengths))
print("\n")
print(tmdb["binary_rating"].value_counts())


# TMDB: N= 855 ; Max Token Len= 1861 ; Min Token Len= 2 ; Avg Token Len= 211.87251461988305


# Positive    627
# Negative    228
# Name: binary_rating, dtype: int64

In [None]:
%%time
# Set global vars
TECH = "zero_shot_prompt_engineering"
TASK = "binary_movie_sentiment_sensitivity"

n_eval=-1
eval_set="dev_sample"
model_name = "gpt3.0"
run_time = "0" # no training
n_train= "0" # no training
seed= "0" # no training
balanced_train=False

# Set dirs
main_dir = os.getcwd()
data_dir = f"../data"
output_dir = f'../results/{TASK}/{TECH}'

# TASK = "binary_movie_sentiment"
# True= Positive; False= Negative
tmdb_prompt_1 = "Using one word, does the movie review contain negative sentiment, Yes or No?"
tmdb_prompt_2 = "Using one word, classify the sentiment of the movie review using 'Positive' or 'Negative'."
tmdb_prompt_3 = "You are a researcher who needs to classify movie reviews as containing negative sentiment or not containing negative sentiment. Using one word, does the movie review contain negative sentiment, Yes or No?"

# Lists for loop
task_list = ["binary_movie_sentiment_sensitivity","binary_movie_sentiment_sensitivity","binary_movie_sentiment_sensitivity"]
prompt_list = [tmdb_prompt_1,tmdb_prompt_2,tmdb_prompt_3]
model_list = ["gpt3.0","gpt3.5","gpt4.0"]

####################################################################
eval_pred_all = [] # the master lists
eval_gold_all = [] # the master lists

# iterate over the three models
for m in range(3):
    model_name = model_list[m]
    # set up api settings - 3.0 uses a different system so don't need these for 3.0
    if model_name=="gpt3.5":
        openai.api_key = "masked"
        openai.api_base ="masked" # gpt 3,3.5
        openai.api_type = "azure"
        openai.api_version = "2023-03-15-preview"
        deployment_name ="masked"
    if model_name=="gpt4.0":
        openai.api_key = "masked"
        openai.api_base = "masked"  # closing quote was missing, which made the whole cell a SyntaxError
        deployment_name ="masked"
        openai.api_type = "azure"
        openai.api_version = "2023-03-15-preview"
    # iterate over the three prompts
    for t in range(3):
        task_prompt = prompt_list[t]
        TASK = task_list[t]
        temp_df = tmdb

        # iterate over the rows
        eval_gold_labels=[] # the t/p lists
        eval_pred=[] # the t/p lists
        for i in range(len(temp_df)):
            # grab the comment
            sample_text=temp_df["reviews"][i]

            # make it into a usable user chat
            temp_text = "Movie review: " + sample_text

            if model_name=="gpt3.0":
                # for gpt 3 only since we can't use the chat api
                temp_prompt = temp_text + "\nQuestion: "+ task_prompt +" \nResponse:"
                eval_pred.append(get_openai_response(temp_prompt)) # Don't think i need the below

            if model_name!="gpt3.0":
                # append gpt response to list
                eval_pred.append(get_openai_response_chat(task_prompt,temp_text))

            # append labels to list
            eval_gold_labels.append(temp_df["rating_int"][i])

            time.sleep(1) # we should be fine vis-a-vis rate limits, but adding a small delay to help prevent any issues with ServiceUnavailableErrors
            if i % 50 == 0:
                print(t,i)

        # keep the raw (unmapped) responses of this model/prompt combination
        eval_pred_all.append(eval_pred)

        # Need to remember that these are flipped

        # switch my pos/neg to match the other imdb prompts
        # Only the chat responses need this: get_openai_response (the gpt3.0
        # path) has already normalised "Negative"/"Positive" to "Yes"/"No", and
        # remapping those again would turn every valid answer into NaN.
        if task_prompt == tmdb_prompt_2 and model_name != "gpt3.0":
            eval_pred=list(pd.Series(eval_pred).map(dict(Negative="Yes", Positive="No")))

        # "Yes" (negative sentiment) -> 0, "No" -> 1, matching rating_int (Positive = 1)
        if TASK == "binary_movie_sentiment_sensitivity":
            eval_pred_int=list(pd.Series(eval_pred).map(dict(Yes=0, No=1)))

        # Deal with non yes/no answers: anything outside the mapping became NaN
        print("Number of improperly coded responses: ",sum(math.isnan(x) for x in eval_pred_int))

        # drop NaN predictions and the gold labels at the same positions
        cleaned_eval_pred_int = [x for x in eval_pred_int if not math.isnan(x)]
        cleaned_eval_gold_labels = [eval_gold_labels[i] for i, x in enumerate(eval_pred_int) if not math.isnan(x)]


        cleaned_eval_gold_labels=list(map(int,cleaned_eval_gold_labels))

        eval_gold_all.append(cleaned_eval_gold_labels)

        results = evaluate(cleaned_eval_gold_labels, cleaned_eval_pred_int)


        print(TASK)
        print(task_prompt)
        print(results)

        eval_gold_labels=cleaned_eval_gold_labels
        eval_preds=cleaned_eval_pred_int
        eval_result=results
        template=task_prompt


        datetime_str = str(datetime.datetime.now())
        results_dict = get_results_dict(TASK, TECH, model_name, run_time,
                        eval_gold_labels, eval_preds, eval_set,
                        n_train, n_eval, balanced_train, seed, datetime_str, template)
        # add test_result to results_dict
        results_dict.update(eval_result)
        save_str = f'mod={model_name}_n={n_train}_bal={balanced_train}_iteration={t}'
        save_results(output_dir, save_str, results_dict)


In [None]:
# Convert JSON to csv
def load_file(dir, file_name):
    """Load one saved results JSON file into a DataFrame.

    Args:
        dir: directory that contains the file, with or without a trailing
            path separator
        file_name: name of the JSON file inside ``dir``
    Returns:
        a DataFrame built with ``orient='index'``, i.e. one row per top-level
        key of the JSON object
    """
    # os.path.join also works when `dir` has no trailing separator, which
    # plain string concatenation (dir + file_name) did not.
    with open(os.path.join(dir, file_name)) as json_file:
        data = json.load(json_file)
    return pd.DataFrame.from_dict(data, orient='index')


zero_shot_list = []
folder="../results/binary_movie_sentiment_sensitivity/zero_shot_prompt_engineering/"

# Load every saved results file and append it to a list.
# os.listdir returns names in arbitrary order; sorting them groups the files by
# model and, within a model, by iteration (the names follow
# mod=<model>_..._iteration=<t>), so the rows line up with prompt_list*3 below.
for filepath in sorted(os.listdir(folder)):
#    if filepath.endswith((".json")):
    if filepath.startswith(("mod")):     # include all
        print(filepath)
        result_frame=load_file(folder,filepath)
        zero_shot_list.append(result_frame)

# one row per results file
results_df = pd.concat(zero_shot_list, axis = 1).T
# number of evaluated samples that were kept for each model/prompt combination
results_df["n_used"]= results_df.eval_true.map(len)
# three prompts repeated once per model
results_df["zero_shot_prompt"] = prompt_list*3
results_df.head()


In [None]:
# Recompute the evaluation scores from the saved gold/prediction arrays

from evaluation import evaluate_dataframe

# Give the frame a plain 0..n-1 index
results_df.index = pd.RangeIndex(len(results_df))
# results_df['eval_true'] = results_df['eval_true'].apply(ast.literal_eval)
# results_df['eval_pred'] = results_df['eval_pred'].apply(ast.literal_eval)

# evaluate_dataframe is applied row by row; its outputs are expanded into the
# four new score columns
metric_columns = ['acc_new', 'f1_new', 'prec_new', 'recall_new']
results_df[metric_columns] = results_df.apply(evaluate_dataframe, axis=1, result_type="expand")
results_df.head()
#results_df.to_csv("../results/zero_shot_task_results_sensitvity.csv")