In [3]:
import os 
import json
import pandas as pd


train_f = open(os.path.join("data", "raw", f"train_with-reference.jsonl"), "r")
data = []
for line in train_f:
    data.append(json.loads(line))
train_f.close()

dev_f = open(os.path.join("data", "raw", f"dev_without-reference.jsonl"), "r")
dev_data = []
for line in dev_f:
    dev_data.append(json.loads(line))
dev_f.close()

In [4]:
import random
from itertools import tee

def triplewise(iterable):
    a, b, c = tee(iterable, 3)
    next(b, None)
    next(c, None)
    next(c, None)
    return zip(a, b, c)

def get_trigrams(convo):
    texts = [utterance['text'] for utterance in convo['utterances']]
    trigrams = set(triplewise(texts))
    return trigrams

def has_overlap(convo1, convo2):
    convo1_trigrams = get_trigrams(convo1)
    convo2_trigrams = get_trigrams(convo2)

    return bool(convo1_trigrams.intersection(convo2_trigrams))

In [None]:
random.shuffle(data)
test_size = int(0.1 * len(data))

train_data = data[test_size:]
test_data = data[:test_size]

for convo_test in test_data:
    for convo_train in train_data:
        if has_overlap(convo_test, convo_train):
            train_data.remove(convo_train)
            test_data.append(convo_train)

print("Train set size:", len(train_data))
print("Test set size:", len(test_data))

In [24]:
# check overlap between train and dev
count = 0
ids = set()
for convo in train_data:
    for train_convo in train_data:
        if train_convo != convo and has_overlap(convo, train_convo):
            count += 1
            ids.add(convo['id'])
    #         print("Overlap found!", convo['id'], train_convo['id'])
    #         # print the 2 utterances and the 3 utterances that overlap
    #         print ("Convo 1:", convo['utterances'])
    #         print ("Convo 2:", train_convo['utterances'])
    #         print ("Overlap:", get_trigrams(convo).intersection(get_trigrams(train_convo)))
    #         break
    # break
print ("Number of overlaps between train and dev:", len(ids))

Number of overlaps between train and dev: 1236


In [10]:
import os 
import json
import pandas as pd
import random
from itertools import tee

def triplewise(iterable):
    a, b, c = tee(iterable, 3)
    next(b, None)
    next(c, None)
    next(c, None)
    return zip(a, b, c)

def get_trigrams(convo):
    texts = [utterance['content'] for utterance in convo['context']]
    trigrams = set(triplewise(texts))
    return trigrams

def has_overlap(convo1, convo2):
    convo1_trigrams = get_trigrams(convo1)
    convo2_trigrams = get_trigrams(convo2)

    return bool(convo1_trigrams.intersection(convo2_trigrams))


data = {
    "train": [],
    "dev": [],
    "test": []
}

dataset_names = {
    "train": "train_with-reference",
    "dev": "dev_without-reference"
}

## sample prompt object

# messages=[
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": "Who won the world series in 2020?"},
#         {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
#         {"role": "user", "content": "Where was it played?"}
#     ]


for split in dataset_names:
    with open(os.path.join("data", "raw", f"{dataset_names[split]}.jsonl"), "r") as f:
        for line in f:
            data_line = json.loads(line)
            sample = {"context": "", "response": "", "dialogRPTcontext": "", "dialogRPTresponse": ""}
            utterances = data_line["utterances"]
            # [{'text': 'A) pull through', 'speaker': 'student'}, {'text': 'OK great', 'speaker': 'teacher'}, {'text': 'Not sure about the meaning of the second one... Does that person mean that being the prime minister he had to survive??', 'speaker': 'student'}] {'text': 'Ah yes good question - this is a bit ambiguous....', 'speaker': 'teacher'}
            # sample["context"] = "\n".join([f"{x['speaker']}: {x['text']}" for x in utterances])
            # make sample context into a prompt object
            sample["context"] = []
            for i in range(len(utterances)):
                if utterances[i]['speaker'] == 'student':
                    # sample["context"].append({"role": "user", "content": "new utterance"})
                    sample["context"].append({"role": "user", "content": utterances[i]['speaker'] + ": " + utterances[i]["text"]})
                else:
                    sample["context"].append({"role": "assistant", "content": utterances[i]['speaker'] + ": " + utterances[i]["text"]})
                    
            dialogRPTcontext = ''
            for dialogue_line in utterances:
                dialogRPTcontext += "'" + dialogue_line['speaker'] + "': " + dialogue_line['text'] + ' <|endoftext|> '
            sample["dialogRPTcontext"] = dialogRPTcontext
                        
            
            if "response" in data_line.keys():
                response = data_line["response"]
                sample["response"] = response['speaker'] + ": " + response['text']
                
                dialogRPTresponse = "'" + response['speaker'] + "': " + response['text'] + ' <|endoftext|> ' 
                sample["dialogRPTresponse"] = dialogRPTresponse 

            # print (sample)
            data[split].append(sample)
            
# now split the train set into train and test using the overlap measure
random.shuffle(data["train"])
test_size = int(0.05 * len(data["train"]))
train_data = data["train"][test_size:]
test_data = data["train"][:test_size]

for convo_test in test_data:
    for convo_train in train_data:
        if has_overlap(convo_test, convo_train):
            train_data.remove(convo_train)
            test_data.append(convo_train)
            
data['train'] = train_data
data['test'] = test_data

In [12]:
print ("Train set size:", len(data['train']))
print ("Test set size:", len(data['test']))
print ("Dev set size:", len(data['dev']))

# ratios
print ("Train ratio:", len(data['train']) / (len(data['train']) + len(data['test']) + len(data['dev'])))
print ("Test ratio:", len(data['test']) / (len(data['train']) + len(data['test']) + len(data['dev'])))
print ("Dev ratio:", len(data['dev']) / (len(data['train']) + len(data['test']) + len(data['dev'])))

Train set size: 2147
Test set size: 600
Dev set size: 305
Train ratio: 0.7815799053512923
Test ratio: 0.21842009464870768
Dev ratio: 0.09993446920052425
