In [1]:
import gensim
import numpy as np

# Data preprocessing

Here, the aim is to make the 3 DataLoaders: Train, Val, Test.

In [2]:
# Relative paths of the CoNLL input files for the training and test splits
# ("scene_delim" per the file names).
train_path = "../../data/friends.train.scene_delim.conll"
test_path = "../../data/friends.test.scene_delim.conll"

In [3]:
def read_conllu(in_p):
    """Read a whitespace-separated CoNLL file into a list of sentences.

    Lines starting with "#" are skipped. A blank line closes the current
    sentence (so repeated blank lines yield empty sentences). Every other
    line is split on whitespace and reduced to three fields.

    Args:
        in_p: Path of the file to read.

    Returns:
        A list of sentences; each sentence is a list of token dicts with the
        keys "form" (column 3, lower-cased), "speaker" (column 9) and
        "referenced" (column 11).

    Raises:
        IndexError: If a token line has fewer than 12 columns.
    """
    with open(in_p, "r") as f:
        lines = f.read().split("\n")

    sentences = []
    tokens = []
    for line in lines:
        if line.startswith("#"):  # skip comments
            continue
        if line.strip() == "":  # end of sentence
            sentences.append(tokens)
            tokens = []
            continue
        # split() without arguments already collapses runs of whitespace,
        # so no separate whitespace normalisation is required.
        fields = line.split()
        tokens.append({
            "form": fields[3].lower(),
            "speaker": fields[9],
            "referenced": fields[11],
        })
    # Keep the final sentence when the file does not end with a blank line.
    if tokens:
        sentences.append(tokens)
    return sentences

In [4]:
def combine_scenes(list_scene, group_size=24):
    """Concatenate consecutive scenes into longer scenes.

    Args:
        list_scene: List of scenes, each a list of tokens.
        group_size: Number of consecutive scenes merged into one (default 24).
            The last group holds whatever scenes remain and may be shorter.

    Returns:
        A new list with one flat token list per group. The input lists are
        only read, never modified.

    Raises:
        ValueError: If group_size is less than 1.
    """
    if group_size < 1:
        raise ValueError("group_size must be at least 1")
    # Slicing past the end of a list is safe, so the last (short) group needs
    # no special case.
    return [
        [token for scene in list_scene[start:start + group_size] for token in scene]
        for start in range(0, len(list_scene), group_size)
    ]

In [5]:
def _entity_id(ref):
    """Return the integer entity id carried by one reference label, or -1."""
    if ref == "-":
        return -1
    # A mention that opens on this token but closes on a later one is
    # credited to that later token, so this token carries no label.
    if "(" in ref and ")" not in ref:
        return -1
    # NOTE(review): assumes several mentions on one token are separated by
    # "|" (CoNLL-style coreference column) - confirm against the data.
    # Prefer the first mention that closes here instead of gluing the digits
    # of every id together.
    parts = ref.split("|")
    closing = [part for part in parts if ")" in part]
    for part in closing or parts:
        digits = "".join(ch for ch in part if ch in "0123456789")
        if digits:
            return int(digits)
    raise ValueError("no entity id found in reference label %r" % (ref,))


def fix_cont_ref(list_scene):
    """Convert every token's "referenced" label to an integer entity id.

    When consecutive words reference the same entity, only the last word
    (the one whose label contains the closing parenthesis) keeps the id;
    tokens with "-" or with an opening-only label get -1.

    Args:
        list_scene: List of scenes, each a list of token dicts with a string
            "referenced" entry. The dicts are updated in place.

    Returns:
        The same list_scene object, after the update.

    Raises:
        ValueError: If a label other than "-" contains no digits.
    """
    for scene in list_scene:
        for token in scene:
            token["referenced"] = _entity_id(token["referenced"])
    return list_scene

In [6]:
def load_split(path):
    """Parse one CoNLL file, merge its scenes and convert reference labels to ints."""
    parsed = read_conllu(path)
    merged = combine_scenes(parsed)
    return fix_cont_ref(merged)


# Both splits go through exactly the same pipeline.
train = load_split(train_path)
test = load_split(test_path)

In [7]:
# Entity map: every non-blank line is "<id>\t<name>".
#   idx2char: id as a string  -> name with spaces replaced by "_"
#   char2idx: underscored name -> id as an int
mapping = {"idx2char": {}, "char2idx": {}}
with open("../../data/friends_entity_map.txt", "r") as f:
    for entry in f:
        entry = entry.rstrip("\n")
        # Skip blank lines so that neither a missing trailing newline nor a
        # stray empty line loses an entity or raises IndexError.
        if not entry.strip():
            continue
        i_c = entry.split("\t")
        name = i_c[1].replace(" ", "_")
        mapping["idx2char"][i_c[0]] = name
        mapping["char2idx"][name] = int(i_c[0])

# Dealing with Referenced
## All characters with frequency <= 20 are mapped to "\<OTH>" (the original paper uses a cut-off of 3)

In [8]:
# Step 1: replace every numeric entity id with its entity name;
# -1 (no reference) becomes "-".
for split in (train, test):
    for scene in split:
        for token in scene:
            entity = token["referenced"]
            token["referenced"] = "-" if entity == -1 else mapping["idx2char"][str(entity)]

# Step 2: count how often each label occurs over train and test together.
count = {}
for split in (train, test):
    for scene in split:
        for token in scene:
            label = token["referenced"]
            count[label] = count.get(label, 0) + 1

# Step 3: labels seen 20 times or fewer collapse into "<OTH>"
# (the cut-off is 20 here instead of the original paper's 3).
to_oth = {label: ("<OTH>" if freq <= 20 else label) for label, freq in count.items()}
for split in (train, test):
    for scene in split:
        for token in scene:
            token["referenced"] = to_oth[token["referenced"]]

# Step 4: number the remaining labels in order of first appearance
# (train is scanned before test).
char2idx = {}
for split in (train, test):
    for scene in split:
        for token in scene:
            char2idx.setdefault(token["referenced"], len(char2idx))

# Step 5: store the class index on every token.
for split in (train, test):
    for scene in split:
        for token in scene:
            token["referenced"] = char2idx[token["referenced"]]

In [9]:
# Number of distinct referenced labels counted before rare ones were merged into "<OTH>".
len(count)

402

In [10]:
# Number of label classes that received an index in char2idx.
len(char2idx)

65

In [11]:
import json

# Save the label -> index vocabulary as JSON.
with open("../../output/char2idx.json", "w") as out_file:
    json.dump(char2idx, out_file)

# Dealing with Speaker
### Replace with vocab from entity list

In [12]:
# Re-index the entity-map names with the new label vocabulary: a name that
# has its own class keeps that class index, every other name shares the
# index of "<OTH>".
fin = {
    name: (char2idx[name] if name in char2idx else char2idx["<OTH>"])
    for name in mapping["char2idx"]
}
mapping["char2idx"] = fin

In [13]:
# First names that occur in comma-separated speaker lists -> entity-map names.
full_names = {
    "Chandler": "Chandler_Bing",
    "Ross": "Ross_Geller",
    "Joey": "Joey_Tribbiani",
    "Phoebe": "Phoebe_Buffay",
    "Monica": "Monica_Geller",
    "Rachel": "Rachel_Green",
    "Carol": "Carol_Willick",
    "Susan": "Susan_Bunch",
}
# Collective speaker labels -> the individual characters they stand for.
boys = ['Chandler_Bing', 'Joey_Tribbiani', 'Ross_Geller']
gang = ['Chandler_Bing', 'Joey_Tribbiani', 'Ross_Geller', 'Phoebe_Buffay','Monica_Geller', 'Rachel_Green']
group_members = {"Boys": boys, "Gang": gang, "Everyone": gang}
# Speaker spellings that differ from the entity map.
respelled = {"Mr._Green": "Mr._Greene", "Joey's_Co-Star": "Joey's_Co-star"}


def speaker_indices(sp):
    """Map one raw speaker string to a class index, or to a list of indices.

    Comma-separated speakers and the collective labels yield a list with one
    index per character; any other speaker yields a single index.
    """
    lookup = mapping["char2idx"]
    if "," in sp:
        # NOTE(review): underscores are removed before splitting on ",", so
        # only names without underscores (or the first names listed above)
        # can be resolved here - confirm this matches the data.
        names = sp.replace("_", "").split(",")
        return [lookup[full_names.get(name, name)] for name in names]
    if sp in group_members:
        return [lookup[name] for name in group_members[sp]]
    return lookup[respelled.get(sp, sp)]


for split in (train, test):
    for scene in split:
        for token in scene:
            token["speaker"] = speaker_indices(token["speaker"])
print("--------Done---------------")

--------Done---------------


In [14]:
ohe_dim = len(char2idx)


def speaker_vector(chars):
    """Build a count vector of length ohe_dim from one index or a list of indices."""
    vec = np.zeros(ohe_dim)
    # A lone index is treated as a one-element list; repeated indices add up.
    for idx in (chars if isinstance(chars, list) else [chars]):
        vec[idx] += 1
    return vec


for split in (train, test):
    for scene in split:
        for token in scene:
            token["speaker"] = speaker_vector(token["speaker"])

# Dealing with Forms
### convert all "forms" to "vec" using Google's News 300 word2vec

In [15]:
# Collect every distinct form, in order of first appearance (train before test).
# The value is only a placeholder until the embedding is filled in below.
forms2vec = {}
for split in (train, test):
    for scene in split:
        for token in scene:
            forms2vec[token["form"]] = None
# load model
model = gensim.models.KeyedVectors.load_word2vec_format('../../../huggin_face/GoogleNews-vectors-negative300.bin.gz', binary=True) 
# generate embedding
for form in forms2vec:
    if form in model:
        forms2vec[form] = model[form]
    else:
        # Out-of-vocabulary forms get a zero vector of the same kind as the
        # model's vectors (a float numpy array, not a Python list of ints), so
        # every token ends up with the same container type and shape.
        # NOTE(review): float32 is gensim's default load dtype - confirm.
        forms2vec[form] = np.zeros(model.vector_size, dtype=np.float32)

In [16]:
# Replace each token's surface form with its embedding vector.
# The lookup key must be the token's own form; a leftover loop variable would
# give every token the same vector.
for split in (train, test):
    for scene in split:
        for token in scene:
            token["form"] = forms2vec[token["form"]]

# Generate DataLoaders and save them

In [17]:
# Size of train (in combined scenes) before it is split into train + val.
len(train)

565

In [18]:
# The first 500 combined scenes stay in train, the rest become val; test is
# used as it is. Both slices are taken before either name is rebound.
train, val = train[:500], train[500:]

In [19]:
def gen_matrices(dt_in):
    """Turn scenes of token dicts into [X, y] pairs.

    For every scene, X holds one [form, speaker] pair per token and y holds
    one single-element [referenced] list per token, both in token order.
    """
    def split_scene(scene):
        # One pass over the tokens, then separate inputs from targets.
        rows = [([tok["form"], tok["speaker"]], [tok["referenced"]]) for tok in scene]
        return [[x for x, _ in rows], [y for _, y in rows]]

    return [split_scene(scene) for scene in dt_in]

In [20]:
# Convert every split from token dicts to [X, y] pairs.
train, val, test = [gen_matrices(split) for split in (train, val, test)]

In [23]:
from torch.utils.data import DataLoader

# Same settings for all three splits: one [X, y] item per batch, shuffled.
loader_options = {"batch_size": 1, "shuffle": True}
train_dl = DataLoader(train, **loader_options)
val_dl = DataLoader(val, **loader_options)
test_dl = DataLoader(test, **loader_options)

In [25]:
# Write each DataLoader to its own pickle file.
import pickle

for pkl_path, loader in (
    ('../../output/train.pkl', train_dl),
    ('../../output/val.pkl', val_dl),
    ('../../output/test.pkl', test_dl),
):
    with open(pkl_path, 'wb') as out_file:
        pickle.dump(loader, out_file)