In [29]:
import openai
import tiktoken
import time
import random

openai.api_key = "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

In [16]:
def query_gpt_with_retries(prompt_object, model = "gpt-4-0314", temperature = 1, max_tokens = 100, top_p = 1, frequency_penalty = 0, presence_penalty = 0, stop = None, logit_bias = {}, n = 1, retries = 20):
    # this function queries gpt-3 with retries, because sometimes the api is down
    # the function returns the response from gpt-4
    # prompt_object is a list of dictionaries with role and content
    # For example, prompt_object = [{"role": "system", "content": "Hello"}, {"role": "user", "content": "Hi"}, {"role":"assistant", "content": "How can I help you?"}]
    
    while retries > 0:
        # print ("Trying to query gpt-4 with retries = ", retries)
        if retries == 10:
            time.sleep(10)
        try:
            max_tokens = num_tokens_from_messages(prompt_object, model=model) + 150
            response = openai.ChatCompletion.create(
                model=model,
                messages=prompt_object,
                temperature=temperature,
                max_tokens=max_tokens,
                top_p=top_p,
                frequency_penalty=frequency_penalty,
                presence_penalty=presence_penalty,
                logit_bias=logit_bias,
                stop=stop,
                n=n
            )
            return response
        except Exception as e:
            print(str(e))
            retries -= 1
    return None 


def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    """Returns the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    if model == "gpt-3.5-turbo":  # note: future models may deviate from this
        num_tokens = 0
        for message in messages:
            num_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":  # if there's a name, the role is omitted
                    num_tokens += -1  # role is always required and always 1 token
        num_tokens += 2  # every reply is primed with <im_start>assistant
        return num_tokens
    elif model == "gpt-4-0314" or model == "gpt-4":
        num_tokens = 0
        for message in messages:
            num_tokens += 4
            for key, value in message.items():
                num_tokens += len(encoding.encode(value))
                if key == "name":
                    num_tokens += -1
        num_tokens += 2
        return num_tokens
    else:
        raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.
    See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")

In [46]:
# load jsonl 
import os 
import json
import pandas as pd

data = {
    "train": [],
    "dev": [],
}

dataset_names = {
    "train": "train_with-reference",
    "dev": "dev_without-reference"
}

## sample prompt object

# messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": "Who won the world series in 2020?"},
#         {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
#         {"role": "user", "content": "Where was it played?"}
#     ]


for split in dataset_names:
    with open(os.path.join("data", "raw", f"{dataset_names[split]}.jsonl"), "r") as f:
        for line in f:
            data_line = json.loads(line)
            sample = {"context": "", "response": ""}
            utterances = data_line["utterances"]
            # [{'text': 'A) pull through', 'speaker': 'student'}, {'text': 'OK great', 'speaker': 'teacher'}, {'text': 'Not sure about the meaning of the second one... Does that person mean that being the prime minister he had to survive??', 'speaker': 'student'}] {'text': 'Ah yes good question - this is a bit ambiguous....', 'speaker': 'teacher'}
            # sample["context"] = "\n".join([f"{x['speaker']}: {x['text']}" for x in utterances])
            # make sample context into a prompt object
            sample["context"] = []
            for i in range(len(utterances)):
                
                if utterances[i]['speaker'] == 'student':
                    # sample["context"].append({"role": "user", "content": "new conversation"})
                    sample["context"].append({"role": "user", "content": utterances[i]['speaker'] + ": " + utterances[i]["text"]})
                else:
                    sample["context"].append({"role": "assistant", "content": utterances[i]['speaker'] + ": " + utterances[i]["text"]})
            if "response" in data_line.keys():
                response = data_line["response"]
                sample["response"] = response['speaker'] + ": " + response['text']
            print (sample)
            data[split].append(sample)

{'context': [{'role': 'user', 'content': 'student: A) pull through'}, {'role': 'assistant', 'content': 'teacher: OK great'}, {'role': 'user', 'content': 'student: Not sure about the meaning of the second one... Does that person mean that being the prime minister he had to survive??'}], 'response': 'teacher: Ah yes good question - this is a bit ambiguous....'}
{'context': [{'role': 'user', 'content': 'student: willpower?)'}, {'role': 'assistant', 'content': 'teacher: Yes!'}, {'role': 'assistant', 'content': 'teacher: So, a bit of willpower, and kids will be super successful!'}, {'role': 'assistant', 'content': 'teacher: Do you think it worked?'}, {'role': 'user', 'content': 'student: yes'}, {'role': 'assistant', 'content': 'teacher: No! :))'}, {'role': 'user', 'content': 'student: why?'}, {'role': 'assistant', 'content': 'teacher: because human beings are way more complex than that'}, {'role': 'assistant', 'content': "teacher: You know how in science it's very important that the results

In [23]:
data['train'][0]['context']

[{'role': 'system',
  'content': "You are acting as a teacher, and you are helping a student learn, be patient, helpful and kind. Don't be super imposing, give short responses to encourage learning, make the student feel comfortable and confident and help them learn."},
 {'role': 'user', 'content': 'student: A) pull through'},
 {'role': 'assistant', 'content': 'teacher: OK great'},
 {'role': 'user',
  'content': 'student: Not sure about the meaning of the second one... Does that person mean that being the prime minister he had to survive??'}]

In [33]:
sample['response']

'teacher: Ooh, good one!'

In [59]:
random_sample_idx = random.randint(0, len(data['train']))
prompt_object = data['train'][random_sample_idx]['context']
num_few_shot_examples = 3
# randomly sample from data['train'] and append before prompt_object
for i in range(num_few_shot_examples):
    sample = random.choice(data['train'])
    print (sample)
    prompt_object = sample['context'] + [{'role': 'assistant', 'content': sample['response']}] + [{'role': 'user', 'content': 'new conversation'},] + prompt_object
prompt_object.insert(0, {"role": "system", "content": "You are acting as a teacher, and you are helping a student learn, be patient, helpful and kind. Don't be super imposing, give short responses to encourage learning, make the student feel comfortable and confident and help them learn."})
response = data['train'][random_sample_idx]['response']

{'context': [{'role': 'assistant', 'content': "teacher: Sorry 'the'"}, {'role': 'assistant', 'content': "teacher: Ahhh sorry <STUDENT> no ..your 'the' is fine actually but you need to delete 'of' ok?"}, {'role': 'user', 'content': "student: both of charts illustrate the different reactions' about ashdown museum before and after  the renovation"}, {'role': 'user', 'content': 'student: both the charts?'}, {'role': 'user', 'content': 'student: sorry'}, {'role': 'user', 'content': "student: visitors' reactions"}, {'role': 'assistant', 'content': 'teacher: Ok thanks so...Botb charts illustrate the different reactions to Ashdown museum before and after its renovation = good'}, {'role': 'user', 'content': 'student: super'}, {'role': 'assistant', 'content': 'teacher: Look at this pattern: reply answer reaction etc TO something ok?'}, {'role': 'user', 'content': "student: it's strange"}, {'role': 'assistant', 'content': 'teacher: Not for me!'}, {'role': 'user', 'content': 'student: ahahha'}, {'

In [60]:
prompt_object

[{'role': 'system',
  'content': "You are acting as a teacher, and you are helping a student learn, be patient, helpful and kind. Don't be super imposing, give short responses to encourage learning, make the student feel comfortable and confident and help them learn."},
 {'role': 'assistant',
  'content': 'teacher: so a verb for each one if you can (different verb )'},
 {'role': 'user', 'content': 'student: use out'},
 {'role': 'user', 'content': 'student: supply'},
 {'role': 'user', 'content': 'student: demand'},
 {'role': 'assistant',
  'content': "teacher: OK nearly! 'Use UP' = good! for the second one"},
 {'role': 'user', 'content': 'student: rely on'},
 {'role': 'assistant',
  'content': "teacher: And the first one = 'running out/disappearing OK?"},
 {'role': 'user', 'content': 'student: yes,ok'},
 {'role': 'assistant',
  'content': "teacher: so the verbs 'supply' and 'demand' are nice verbs in many situations but the problem here is that you have 'very quickly' you see?"},
 {'rol

In [61]:
response

"teacher: Yes...I'm too impatient for yoga but here we are..."

In [62]:
query_gpt_with_retries(prompt_object, model = "gpt-4")

Rate limit reached for default-gpt-4 in organization org-rocrupyvzgcl4yf25rqq6d1v on requests per min. Limit: 200 / min. Please try again in 300ms. Contact support@openai.com if you continue to have issues.
Rate limit reached for default-gpt-4 in organization org-rocrupyvzgcl4yf25rqq6d1v on requests per min. Limit: 200 / min. Please try again in 300ms. Contact support@openai.com if you continue to have issues.
Rate limit reached for default-gpt-4 in organization org-rocrupyvzgcl4yf25rqq6d1v on requests per min. Limit: 200 / min. Please try again in 300ms. Contact support@openai.com if you continue to have issues.
Rate limit reached for default-gpt-4 in organization org-rocrupyvzgcl4yf25rqq6d1v on requests per min. Limit: 200 / min. Please try again in 300ms. Contact support@openai.com if you continue to have issues.
Rate limit reached for default-gpt-4 in organization org-rocrupyvzgcl4yf25rqq6d1v on requests per min. Limit: 200 / min. Please try again in 300ms. Contact support@openai.c

<OpenAIObject chat.completion id=chatcmpl-74eVlwRfYD1pu7lxcwgpcVw5NGU7c at 0x7fe6576e86d0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "teacher: But i think it is important to try different forms of exercise don't you?",
        "role": "assistant"
      }
    }
  ],
  "created": 1681343393,
  "id": "chatcmpl-74eVlwRfYD1pu7lxcwgpcVw5NGU7c",
  "model": "gpt-4-0314",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 18,
    "prompt_tokens": 621,
    "total_tokens": 639
  }
}

"teacher: We get increasingly short term students which is fine but it's more tiring/stressful"