# Parsing of Llama3 70b outputs

In [None]:
#!pip install openai
#!pip install datasets

In [None]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [None]:
import torch
import time
import datasets
import openai
import json
import pandas as pd
from tqdm import tqdm

In [None]:
def set_seed(seed: int) -> None:
    """Seed the random number generators used by this notebook.

    Seeds NumPy, the standard ``random`` module and PyTorch (CPU generator and
    the current CUDA device), puts cuDNN into deterministic mode, exports
    ``PYTHONHASHSEED`` and prints a confirmation line.

    Args:
        seed: Value handed to every generator.
    """
    import os
    import random

    import numpy as np

    # Feed the same seed to each generator, in the same order as before.
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for seeder in seeders:
        seeder(seed)

    # cuDNN needs both switches: deterministic kernels and no autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # NOTE(review): hash randomisation of the running interpreter is presumably
    # fixed at start-up, so this mainly affects child processes - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

# Fix all local RNG seeds up front.
# NOTE(review): the only sampling visible in this file happens remotely in the
# OpenAI call, which these local seeds do not control - this may be unnecessary.
set_seed(42)  # not sure if needed

In [None]:
# Load the stored answers file into a DataFrame (its `answers` column is used below).
df =pd.read_json("llama3_70b/llama3_70B_zero_shot_prompt_answers.json")

# The labels live in the prompt file under the "true_labels" key.
with open("llama3_70b/llama3_70B_zero_shot_prompt.json") as json_data:
    d = json.load(json_data)
# assumes the labels have the same length and order as the answers - TODO confirm
df["true_labels"] = d["true_labels"]

# Placeholder column; it is overwritten with the parsed labels further down.
df["classified_label"] = 0
print(len(df))
df.head()

In [None]:
# Dump every raw answer for manual inspection before parsing.
for answer in tqdm(df.answers):
    print(answer)

In [None]:
import os
from getpass import getpass

# Do not hard-code the API key here: assigning "" would overwrite a key that is
# already exported in the environment, and pasting a real key would leak it
# with the notebook. Prompt (without echo) only when no key is available.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# The client reads OPENAI_API_KEY from the environment.
client = openai.OpenAI()

In [None]:
def generate(prompt, client, *, model="gpt-3.5-turbo-1106", max_tokens=64,
             temperature=0.8, top_p=0.9):
    """Send ``prompt`` to a chat-completion model and return the reply text.

    Args:
        prompt: Content of the user message.
        client: Object exposing ``chat.completions.create`` (for example the
            result of ``openai.OpenAI()``).
        model: Name of the chat model. The default matches the previously
            hard-coded value; "gpt-4o" was noted as an alternative.
        max_tokens: Upper bound on generated tokens (256 was noted as an
            alternative to the default of 64).
        temperature: Sampling temperature passed to the API.
        top_p: Nucleus-sampling parameter passed to the API.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    messages = [
        {"role": "system", "content": "You are an assistant that evaluates a text and answers a question."},
        {"role": "user", "content": prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )

    return response.choices[0].message.content

In [None]:
import re

def create_prompt(prompt, pre, post):
    """Wrap ``prompt`` between ``pre`` and ``post``, one part per line.

    Args:
        prompt: Text placed in the middle.
        pre: Text placed before ``prompt``.
        post: Text placed after ``prompt``.

    Returns:
        ``pre``, ``prompt`` and ``post`` joined by single newline characters
        (empty parts still contribute their newline).
    """
    parts = (pre, prompt, post)
    return "\n".join(parts)

def get_classification(prompt, client, pre="", post=""):
    """Ask the model which hypothesis ``prompt`` supports and parse the digit.

    The prompt is built with ``create_prompt`` and sent through ``generate``;
    the first digit found in the reply is taken as the label.

    Args:
        prompt: Text to be judged.
        client: Client object forwarded to ``generate``.
        pre: Text placed before ``prompt``.
        post: Text placed after ``prompt``.

    Returns:
        A ``(label, text)`` tuple. ``label`` is the first digit of the reply as
        an ``int``, or ``0`` when the reply contains no digit; ``text`` is the
        raw reply as returned by ``generate``.
    """
    # Named full_prompt so the built-in input() is not shadowed.
    full_prompt = create_prompt(prompt, pre, post)
    text = generate(full_prompt, client)

    # The reply content may be None; treat that like a reply without a digit
    # instead of letting re.search raise TypeError.
    match = re.search(r'\d', text or "")

    if match:
        # Convert to int so success and failure labels share one type
        # (the fallback below returns the int 0).
        return int(match.group()), text

    # No digit found: show prompt and reply so the case can be inspected.
    print("---"*50)
    print(full_prompt)
    print("---"*50)
    print(text)
    return 0, text


In [None]:
# Hand-written sample answer used to try out the prompt template.
text = 'The most plausible explanation is not Hypothesis 2. Kat was hungry, so it\'s unlikely she would decide to take a nap instead of eating. \n\nThe most plausible explanation is a hypothesis that is not provided, such as "Kat and her coworkers went to a nearby restaurant" or "Kat brought a lunch from home". However, if I had to choose between the two provided hypotheses, I would say that Hypothesis 1: Kat went to get a salad is more plausible than Hypothesis 2.'
# Alternative preamble; not referenced by any active code visible in this file.
support="You are given a text reasoning for Hypothesis one or two. Return only the number of the hypothesis that in the opinion of the text is more likely. No Yapping:\n"
# Preamble placed before the answer text.
pre="You are given a text reasoning for Hypothesis one or two. Return only the number of the hypothesis that in the opinion of the text is more likely. Don't give me your opinion, only return the number of the hypothesis the text states is more likely:\n"
# Question placed after the answer text.
post = "Given this text, which hypothesis does the author support more? Only return the number of the hypothesis (1 or 2, no other answers allowed)"
print(create_prompt(text, pre=pre, post=post)) # test if the prompt generation works

In [None]:
# Preview the prompts built for the first answers. Eleven prompts are printed:
# the loop only stops after the counter has passed 10.
for i, answer in enumerate(tqdm(df.answers), start=1):
    print(create_prompt(answer, pre, post))
    # Two separator lines between consecutive prompts.
    print("---"*50, "---"*50, sep="\n")
    if i > 10:
        break

In [None]:
#print(get_classification(text, client))
#support="You are given a text reasoning for Hypothesis one or two. Return only the number of the hypothesis that in the opinion of the text is more likely. Don't give me your opinion, only return the number of the hypothesis the text states is more likely:\n"
# Prompt parts used for the actual run: no preamble, only the trailing question.
pre = ""
# `post` starts with "\n" and create_prompt inserts another newline before it,
# so a blank line separates the answer from the question.
post = "\nGiven this text, which hypothesis does the author support more? Only return the number of the hypothesis (1 or 2, no other answers allowed)"


In [None]:
# Classify every answer, collecting the raw reply and the parsed label in two
# parallel lists that follow the order of df.answers.
answers_list = []
classification_list = []
for answer in tqdm(df.answers):
    # One call to get_classification per answer (it goes through `client`).
    classification, text = get_classification(answer, client, pre=pre, post = post)
    answers_list.append(text)
    classification_list.append(classification)
    #print(classification)

## Analyze the results

In [None]:
from collections import Counter

In [None]:
# Frequency of each parsed label (0 is the "no digit found" fallback).
Counter(classification_list)

In [None]:
# Attach the parsed labels and the raw replies to the DataFrame.
df["classified_label"] = classification_list
df["output_text"] = answers_list

# Persist the results so the analysis below can be re-run without new API calls.
df.to_csv('results_zero_shot_LLAMA_GPT3.5_resolved.csv', index=False)


In [None]:
# Reload the saved results; from here on `df` is whatever pandas parses from the CSV.
# NOTE(review): column dtypes are re-inferred on read, so the label columns are
# presumably numeric here - confirm before comparing them against ints.
df = pd.read_csv('results_zero_shot_LLAMA_GPT3.5_resolved.csv')
df

In [None]:
# Quick look at the first rows of the reloaded results.
df.head()

In [None]:
#df.true_labels = [int( res) for res in (df.true_labels)]

In [None]:
# Keep only rows whose parsed label is one of the two allowed answers (1 or 2).
df_reduced = df[df.classified_label.isin([1, 2])]
# Number of rows that can be scored.
len(df_reduced)

In [None]:
# Peek at the true label stored under index label 0.
df.true_labels[0]

In [None]:
# Accuracy on the rows with a valid label: the fraction of rows whose parsed
# label equals the true label. The mean of the boolean mask equals sum/len and
# yields NaN instead of raising ZeroDivisionError when df_reduced is empty.
(df_reduced.true_labels == df_reduced.classified_label).mean()

In [None]:
# Print the stored answer of the sixth correctly classified row (position 5);
# raises IndexError when fewer than six rows match.
print(df_reduced[df_reduced.true_labels == df_reduced.classified_label].iloc[5].answers                  )

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Accuracy over the rows with a valid parsed label (true labels first, predictions second).
acc = accuracy_score(df_reduced.true_labels , df_reduced.classified_label)
acc

In [None]:
# Weighted F1 over the rows with a valid parsed label.
f1 = f1_score(df_reduced.true_labels , df_reduced.classified_label, average='weighted')
f1

In [None]:
# Rows whose parsed label is not a valid answer. Taking everything outside
# {1, 2} catches every invalid digit; a hand-written list such as [0, 3, 4, 9]
# would silently miss others (5, 6, 7, 8).
df_failed = df[~df.classified_label.isin([1, 2])]
df_failed.head()

In [None]:
# Inspect the rows of df_failed whose parsed label is non-zero.
for _, row in df_failed[df_failed.classified_label != 0].iterrows():
    # Positional access goes through .iloc: plain row[2] on a Series with
    # string labels relies on a deprecated positional fallback.
    # NOTE(review): positions 2 and 5 are presumably the `answers` and
    # `output_text` columns - confirm against df.columns.
    print(row.iloc[2])
    print("--"*150)
    print(row.iloc[5])
    print("=="*150)
    print("=="*150)

In [None]:
# Inspect the rows of df_failed whose parsed label is 0.
for _, row in df_failed[df_failed.classified_label == 0].iterrows():
    # Positional access goes through .iloc: plain row[2] on a Series with
    # string labels relies on a deprecated positional fallback.
    # NOTE(review): positions 2 and 5 are presumably the `answers` and
    # `output_text` columns - confirm against df.columns.
    print(row.iloc[2])
    print("--"*150)
    print(row.iloc[5])
    print("=="*150)
    print("=="*150)

In [None]:
# Inspect every row of df_reduced.
for _, row in df_reduced.iterrows():
    # Positional access goes through .iloc: plain row[2] on a Series with
    # string labels relies on a deprecated positional fallback.
    # NOTE(review): positions 2 and 5 are presumably the `answers` and
    # `output_text` columns - confirm against df.columns.
    print(row.iloc[2])
    print("--"*150)
    print(row.iloc[5])
    print("=="*150)
    print("=="*150)

# END