In [1]:
import openai
import tiktoken
import time
import random
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import argparse
import os
import pickle
import numpy as np
from tqdm.auto import tqdm
import scipy.stats
from evaluate import load

# Never commit credentials with the notebook: take the key from the environment
# and stop immediately with a clear message when it has not been provided.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Metric object later used through `bertscore.compute(...)`.
bertscore = load("bertscore")

# Tokenizer and sequence-classification model are both loaded from the same checkpoint id.
dialogrpt_checkpoint = "microsoft/DialogRPT-updown"
tokenizer = AutoTokenizer.from_pretrained(dialogrpt_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(dialogrpt_checkpoint)
model.eval()  # switch the module to evaluation mode

@torch.no_grad()
def score(ctx, hyp):
    """Score a candidate reply with the module-level `model`.

    `ctx` and `hyp` are concatenated as-is (no separator is inserted here),
    encoded with the module-level `tokenizer`, and passed to `model`.

    Returns:
        The sigmoid of the model's logits, as a tensor. Runs with gradient
        tracking disabled.
    """
    token_ids = tokenizer.encode(ctx + hyp, return_tensors="pt")
    logits = model(token_ids, return_dict=True).logits
    return logits.sigmoid()


def preprocess(preds, refs, num_ignore_chars=9):
    """Strip a fixed-length prefix from every prediction and reference.

    Args:
        preds: iterable of predicted strings.
        refs: iterable of reference strings.
        num_ignore_chars: number of leading characters to drop from each
            string. The default of 9 is the length of "teacher: ".

    Returns:
        A tuple `(stripped_preds, stripped_refs)` of two lists.
    """
    # Honour the parameter; the slice used to be hardcoded to 9 regardless of the argument.
    stripped_preds = [pred[num_ignore_chars:] for pred in preds]
    stripped_refs = [ref[num_ignore_chars:] for ref in refs]
    return stripped_preds, stripped_refs



In [3]:
def query_gpt_with_retries(prompt_object, model = "gpt-4-0314", temperature = 1, max_tokens = 100, top_p = 1, frequency_penalty = 0, presence_penalty = 0, stop = None, logit_bias = None, n = 1, retries = 20):
    """Call `openai.ChatCompletion.create`, retrying when an attempt raises.

    Args:
        prompt_object: list of chat messages, each a dict with "role" and
            "content", e.g.
            [{"role": "system", "content": "Hello"},
             {"role": "user", "content": "Hi"},
             {"role": "assistant", "content": "How can I help you?"}]
        model: chat model name passed to the API and to the token counter.
        temperature, top_p, frequency_penalty, presence_penalty, stop, n:
            forwarded unchanged to the API.
        max_tokens: NOTE(review): the value passed in is currently ignored.
            Each attempt overwrites it with
            `num_tokens_from_messages(prompt_object) + 150`. This is kept as-is
            so existing results are unaffected; confirm whether it is intended.
        logit_bias: optional token-bias dict; `None` means no bias (an empty
            dict is sent).
        retries: maximum number of attempts.

    Returns:
        The API response of the first successful attempt, or `None` when every
        attempt raised (each exception is printed).
    """
    # Avoid a mutable default argument shared between calls.
    if logit_bias is None:
        logit_bias = {}

    while retries > 0:
        # Pause once, when exactly 10 attempts remain, before trying again.
        if retries == 10:
            time.sleep(10)
        try:
            max_tokens = num_tokens_from_messages(prompt_object, model=model) + 150
            response = openai.ChatCompletion.create(
                model=model,
                messages=prompt_object,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                frequency_penalty=frequency_penalty,
                presence_penalty=presence_penalty,
                logit_bias=logit_bias,
                stop=stop,
                n=n
            )
            return response
        except Exception as e:
            # Best-effort: report the failure and use up one attempt.
            print(str(e))
            retries -= 1
    return None


def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    """Count the tokens a list of chat messages occupies for `model`.

    The encoding is looked up for `model`, falling back to "cl100k_base" when
    the lookup raises KeyError. Only "gpt-3.5-turbo", "gpt-4-0314" and "gpt-4"
    are supported, and all three use the same counting rule.

    Raises:
        NotImplementedError: for any other model name.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    if model not in ("gpt-3.5-turbo", "gpt-4-0314", "gpt-4"):
        raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.
    See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")

    # every reply is primed with <im_start>assistant
    total = 2
    for message in messages:
        # every message follows <im_start>{role/name}\n{content}<im_end>\n
        total += 4
        total += sum(len(encoding.encode(text)) for text in message.values())
        if "name" in message:
            # if there's a name, the role is omitted (role is always 1 token)
            total -= 1
    return total

In [4]:
# load jsonl 
import os 
import json
import pandas as pd

# Parsed samples per split; each split starts with its own empty list.
data = {split_name: [] for split_name in ("train", "dev")}

# Split name -> basename (without the ".jsonl" extension) of the raw file for that split.
dataset_names = dict(
    train="train_with-reference",
    dev="dev_without-reference",
)

## sample prompt object

# messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": "Who won the world series in 2020?"},
#         {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
#         {"role": "user", "content": "Where was it played?"}
#     ]


# Read every split's JSONL file and turn each line into a sample dict with:
#   "context"           - chat messages (list of {"role", "content"}) for the OpenAI API
#   "response"          - "<speaker>: <text>" of the reference reply ("" when the line has none)
#   "dialogRPTcontext"  - all utterances flattened into one DialogRPT-style string
#   "dialogRPTresponse" - the reference reply in the same DialogRPT style ("" when absent)
for split in dataset_names:
    # Pin the encoding so decoding does not depend on the platform's locale default.
    with open(os.path.join("data", "raw", f"{dataset_names[split]}.jsonl"), "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # A blank line (e.g. a trailing newline at EOF) is not a record.
                continue
            data_line = json.loads(line)
            # utterances looks like:
            # [{'text': 'A) pull through', 'speaker': 'student'}, {'text': 'OK great', 'speaker': 'teacher'}, ...]
            utterances = data_line["utterances"]

            sample = {
                # Student turns become "user" messages, every other speaker becomes "assistant".
                "context": [
                    {
                        "role": "user" if utterance["speaker"] == "student" else "assistant",
                        "content": f"{utterance['speaker']}: {utterance['text']}",
                    }
                    for utterance in utterances
                ],
                "response": "",
                "dialogRPTcontext": "".join(
                    f"'{utterance['speaker']}': {utterance['text']} <|endoftext|> "
                    for utterance in utterances
                ),
                "dialogRPTresponse": "",
            }

            # The reference reply is only read when the line provides a "response" field.
            if "response" in data_line:
                reference = data_line["response"]
                sample["response"] = f"{reference['speaker']}: {reference['text']}"
                sample["dialogRPTresponse"] = f"'{reference['speaker']}': {reference['text']} <|endoftext|> "

            data[split].append(sample)

In [5]:
# Evaluate GPT-4 on the training dialogues, visited in random order (a full permutation
# of the train split, not a fixed-size sample). Only dialogues that contain the
# "reconstruct the question" prompt are sent to the API and scored.
avg_dialogRPT_score_gpt4 = 0.0   # running sum of DialogRPT scores of the GPT-4 replies
avg_dialogRPT_score_train = 0.0  # running sum of DialogRPT scores of the reference replies
avg_bertscore_results = 0.0      # running sum of BERTScore F1 (GPT-4 reply vs. reference)
num_scored = 0                   # number of examples that contributed to the sums above

num_few_shot_examples = 3
system_message = {"role": "system", "content": "You are acting as a teacher, and you are helping a student learn, be patient, helpful and kind. Don't be super imposing, give short responses to encourage learning, make the student feel comfortable and confident and help them learn."}

random_sample_idx_list = np.random.choice(len(data['train']), len(data['train']), replace=False)
for random_sample_idx in random_sample_idx_list:
    target = data['train'][random_sample_idx]
    prompt_object = target['context']

    exists = any("Let's try again, you reconstruct the question" in utterance['content'] for utterance in prompt_object)
    if not exists:
        continue
    print (prompt_object)

    # Prepend randomly drawn training dialogues (with their reference reply) as few-shot
    # examples. `_` is used so the loop does not clobber any outer loop variable.
    # NOTE(review): random.choice can draw the target dialogue itself, which would put
    # its reference reply in the prompt - consider excluding `random_sample_idx`.
    for _ in range(num_few_shot_examples):
        sample = random.choice(data['train'])
        prompt_object = sample['context'] + [{'role': 'assistant', 'content': sample['response']}] + [{'role': 'user', 'content': 'new conversation'},] + prompt_object
    # Build a new list instead of inserting in place, so stored contexts are never mutated.
    prompt_object = [system_message] + prompt_object
    response = target['response']

    gpt4_response = query_gpt_with_retries(prompt_object, model = "gpt-4-0314")
    if gpt4_response is None:
        # Every retry failed; skip this example instead of crashing on `None.choices`.
        print ("Skipping example: no response from the API")
        continue
    gpt4_response_str = gpt4_response.choices[0]['message']['content']

    # Use the same "'speaker': text <|endoftext|> " layout as the stored DialogRPT strings
    # (the end token used to be misspelled as " |<endoftext|>").
    dialogRPT_gpt4_response = gpt4_response_str.replace("teacher: ", "'teacher': ") + " <|endoftext|> "
    dialogRPTscore_gpt4response = score(target['dialogRPTcontext'], dialogRPT_gpt4_response).item()
    avg_dialogRPT_score_gpt4 += dialogRPTscore_gpt4response

    dialogRPTscore_trainresponse = score(target['dialogRPTcontext'], target['dialogRPTresponse']).item()
    avg_dialogRPT_score_train += dialogRPTscore_trainresponse

    # Compare like with like: the plain "teacher: ..." reference, not the DialogRPT-formatted
    # string that carries quotes and the <|endoftext|> marker.
    bertscore_gpt4response = bertscore.compute(predictions=[gpt4_response_str], references=[response], lang='en',
                            model_type='roberta-large')['f1'][0]
    avg_bertscore_results += bertscore_gpt4response
    num_scored += 1

# The accumulators hold sums; divide by the number of examples actually scored.
if num_scored > 0:
    print ("The average dialogRPT score with gpt4 is: ", avg_dialogRPT_score_gpt4/num_scored)
    print ("The average dialogRPT score with train is: ", avg_dialogRPT_score_train/num_scored)
    print ("The average F1 bertscore is: ", avg_bertscore_results/num_scored)
    

[{'role': 'assistant', 'content': "teacher: Let's try again, you reconstruct the question"}, {'role': 'user', 'content': 'student: Ok'}, {'role': 'assistant', 'content': "teacher: I don't know whether you have ever watched Star Wars."}, {'role': 'user', 'content': 'student: Have you ever watched Star Wars?'}, {'role': 'assistant', 'content': 'teacher: Yes, well done!'}, {'role': 'assistant', 'content': 'teacher: I wonder if your lizard sleeps a lot.'}, {'role': 'user', 'content': 'student: Does your lizard sleeps a lot'}, {'role': 'assistant', 'content': 'teacher: Almost! Can you spot a mistake?'}, {'role': 'user', 'content': 'student: Sleep'}, {'role': 'assistant', 'content': 'teacher: Correct!'}, {'role': 'assistant', 'content': 'teacher: Ok, now could you change the question into reported speech, please'}, {'role': 'user', 'content': 'student: Õķ'}]
teacher: Great! Let's start with the first one. 0.6589201092720032 0.8280081152915955
[{'role': 'user', 'content': 'student: Ok'}, {'ro