<a href="https://colab.research.google.com/github/NotARectangle/Honours2021/blob/main/TNGMainTraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers



Persona ID file

In [None]:
# Register the speaker tags and sequence delimiters as special tokens.
def setSpecTokens(model, tokenizer):
    """Add the dialogue special tokens to ``tokenizer`` and resize ``model``.

    Registers ``<bos>``, ``<eos>`` and ``<pad>`` plus one additional special
    token per speaker tag, prints the resulting bos/eos tokens, and resizes the
    model's token embeddings to the new tokenizer length.

    Args:
        model: object exposing ``resize_token_embeddings(int)``.
        tokenizer: object exposing ``add_special_tokens(dict)``, ``bos_token``,
            ``eos_token`` and ``__len__``.
    """
    # Every speaker tag ends with ":"; "PULASKI" previously lacked it, so the
    # tag "PULASKI:" was never recognised as a registered special token.
    special_tokens = [
        "PICARD:", "OTHER:", "CRUSHER:", "TROI:", "RIKER:", "DATA:",
        "LAFORGE:", "WESLEY:", "TASHA:", "WORF:", "COMPUTER:", "PULASKI:",
        "O'BRIEN:",
    ]
    tokenizer.add_special_tokens({
        "bos_token": "<bos>",
        "eos_token": "<eos>",
        "pad_token": "<pad>",
        "additional_special_tokens": special_tokens,
    })
    print(tokenizer.bos_token)
    print(tokenizer.eos_token)
    # The embedding matrix must grow to cover the newly added token ids.
    model.resize_token_embeddings(len(tokenizer))


def prepare_inputs(persona, history, reply, model, tokenizer):
    """Tokenize one dialogue example into flat model inputs.

    ``persona``, ``history`` and ``reply`` are concatenated with ``+`` into a
    single list of utterance strings, so all three must be lists of strings.
    In every utterance that starts with a speaker tag, each tag that is not in
    ``tokenizer.additional_special_tokens`` is rewritten to ``"OTHER:"``.

    Args:
        persona: list of persona strings.
        history: list of preceding utterance strings.
        reply: list holding the reply utterance(s); the last element is the
            one marked with token type 1.
        model: unused; kept so existing callers keep working.
        tokenizer: object exposing ``additional_special_tokens`` and
            ``encode(str) -> list[int]``.

    Returns:
        Tuple ``(words, sequence, token_type_ids)``: ``sequence`` is the list
        of per-utterance token-id lists, ``words`` is their concatenation, and
        ``token_type_ids`` has one entry per token in ``words`` (1 for tokens
        of the final utterance, 0 otherwise).
    """
    # "+" builds a new list, so the caller's lists are never modified.
    utterances = persona + history + reply

    speaker_token_re = "([A-Z]+ ?[A-Z]+ ?:)|[A-Z]:"
    known_speakers = tokenizer.additional_special_tokens
    for position, utterance in enumerate(utterances):
        if not re.match(speaker_token_re, utterance):
            continue
        rewritten = utterance
        for tag in re.findall(speaker_token_re, utterance):
            # findall yields "" when the un-grouped single-letter alternative
            # matches; those are skipped, as before.
            # NOTE(review): single-letter speakers are therefore never
            # rewritten - confirm whether that is intended.
            if tag != "" and tag not in known_speakers:
                # Accumulate on the already rewritten text; restarting from
                # the original string discarded every replacement but the last.
                rewritten = rewritten.replace(tag, "OTHER:")
        utterances[position] = rewritten

    # encode all inputs
    sequence = [tokenizer.encode(utterance) for utterance in utterances]

    # Identify the reply by position: comparing by value also flagged earlier
    # utterances whose tokens happen to equal the reply's.
    last_index = len(sequence) - 1
    words = []
    token_type_ids = []
    for index, seq in enumerate(sequence):
        t_type = 1 if index == last_index else 0
        # concatenate all tokens of the sequences together
        words.extend(seq)
        token_type_ids.extend([t_type] * len(seq))

    return words, sequence, token_type_ids


Dataloader file

In [None]:

# Load the dataset from a JSON file.
def load_dataset(filePath):
    """Read the UTF-8 encoded JSON file at ``filePath`` and return its content.

    Args:
        filePath: path of the JSON file to read.

    Returns:
        The object decoded by ``json.load``.
    """
    # The context manager closes the handle; the bare open() leaked it.
    with open(filePath, 'r', encoding="utf-8") as json_file:
        return json.load(json_file)

# Split every persona's utterances into a train and a test portion.
def seperate_train_test(data, test_size=0.20, random_state=None):
    """Split each persona's utterances into train and test subsets.

    Args:
        data: mapping of persona key to a dict with ``"PersonaID"`` and
            ``"utterances"`` entries.
        test_size: forwarded to ``train_test_split``; defaults to the
            previously hard-coded 0.20.
        random_state: forwarded to ``train_test_split``; pass a value to make
            the split reproducible. The default ``None`` keeps the previous
            unseeded behaviour.

    Returns:
        ``{"Train": {...}, "Test": {...}}`` where each inner dict maps the
        persona key to ``{"PersonaID": ..., "utterances": ...}`` holding that
        persona's share of the utterances.
    """
    split = {"Train": {}, "Test": {}}
    for persona, entry in data.items():
        u_train, u_test = train_test_split(
            entry["utterances"], test_size=test_size, random_state=random_state
        )
        persona_id = entry["PersonaID"]
        split["Train"][persona] = {"PersonaID": persona_id, "utterances": u_train}
        split["Test"][persona] = {"PersonaID": persona_id, "utterances": u_test}
    return split

# Prepares the input lists (ids, token types, labels, mask) for the model.
def prepare_inputs_from_data(data, model, tokenizer):
    """Build the model input lists for every persona in ``data``.

    For each utterance entry the persona, history and reply are tokenized via
    ``prepare_inputs``. Examples with 300 or more tokens are skipped. Labels
    copy the token ids of every sequence that contains the first token id of
    the encoded persona key; tokens of all other sequences get ``-100``.

    Args:
        data: mapping of persona key to a dict with ``"PersonaID"`` and
            ``"utterances"``; each utterance has ``"history"`` and ``"reply"``.
        model: passed through to ``prepare_inputs``.
        tokenizer: object exposing ``encode`` and ``pad_token_id``.

    Returns:
        Dict with the parallel lists ``"input_ids"``, ``"token_type_ids"``,
        ``"labels"`` and ``"attention_mask"`` (1 for every token that is not
        the pad token id, 0 otherwise).
    """
    input_dict = {"input_ids": [], "token_type_ids": [],
                  "labels": [], "attention_mask": []}
    for person in data:
        print(person)
        persona = data[person]["PersonaID"]
        # First token id of the persona key; encoded once, on first use,
        # instead of once per example.
        persona_token = None
        for utterance in data[person]["utterances"]:
            # tokenize and build the word sequence using prepare_inputs
            words, sequence, token_type_ids = prepare_inputs(
                persona, utterance["history"], utterance["reply"], model, tokenizer
            )
            if len(words) >= 300:
                continue  # over-long examples are left out
            if persona_token is None:
                persona_token = tokenizer.encode(person)[0]

            labels = []
            for seq in sequence:
                if persona_token in seq:
                    labels.extend(seq)
                else:
                    # don't count labels for other personas
                    labels.extend([-100] * len(seq))

            input_dict["input_ids"].append(words)
            input_dict["token_type_ids"].append(token_type_ids)
            input_dict["labels"].append(labels)

        print("finished while prep input")

    # Build the attention mask once for all examples; it used to be rebuilt
    # from scratch after every persona.
    pad_id = tokenizer.pad_token_id
    input_dict["attention_mask"] = [
        [0 if token == pad_id else 1 for token in ids]
        for ids in input_dict["input_ids"]
    ]
    return input_dict

# Build the padded tensor dataset.
def convert_to_tensors(input_dict, pad_value):
    """Turn the per-example lists of ``input_dict`` into padded tensors.

    Prints the length of the longest ``"input_ids"`` entry, then pads every
    key's sequences to a common length with ``pad_sequence(batch_first=True)``.

    Args:
        input_dict: mapping of key to a list of integer sequences.
        pad_value: padding value for every key except ``"labels"`` (padded
            with -100 so the loss ignores it) and ``"attention_mask"``
            (padded with 0).

    Returns:
        Dict with the same keys, each mapped to one padded tensor.
    """
    longest = max(len(ids) for ids in input_dict["input_ids"])
    print(longest)

    # Keys that do not use the caller-supplied padding value.
    fill_overrides = {"labels": -100, "attention_mask": 0}

    padded = {}
    for name, rows in input_dict.items():
        fill = fill_overrides.get(name, pad_value)
        row_tensors = [torch.tensor(row) for row in rows]
        padded[name] = pad_sequence(row_tensors, batch_first=True, padding_value=fill)
    return padded


Training file

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, AutoModel, Trainer, TrainingArguments

class tng_dataset(torch.utils.data.Dataset):
    """Dataset view over a dict of parallel per-example encodings.

    ``encodings`` maps a field name (it must include ``"input_ids"``) to an
    indexable collection with one entry per example.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return ``{field: tensor}`` for example ``idx`` as independent copies."""
        item = {}
        for key, val in self.encodings.items():
            value = val[idx]
            if isinstance(value, torch.Tensor):
                # torch.tensor(tensor) copy-constructs with a UserWarning on
                # every access; detach().clone() is the documented equivalent.
                item[key] = value.detach().clone()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])


def train(input_dict, model, save_path="drive/MyDrive/Honours2021/TNGv5"):
    """Fine-tune ``model`` with the Hugging Face ``Trainer`` and save it.

    Args:
        input_dict: dict with ``"train"`` and ``"test"`` entries, each a
            mapping accepted by ``tng_dataset``.
        model: the model to fine-tune; moved to CUDA when available.
        save_path: directory handed to ``trainer.save_model``. Defaults to the
            previously hard-coded location.
    """
    print("start training method")
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model.to(device)
    model.train()

    train_dataset = tng_dataset(input_dict["train"])
    test_dataset = tng_dataset(input_dict["test"])

    # The hand-built AdamW optimizer and DataLoader that used to be created
    # here were never passed to the Trainer, so they had no effect on training
    # and have been removed.
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=9,
        per_device_eval_batch_size=10,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # NOTE(review): eval_dataset is supplied, but no evaluation schedule is
    # configured above - confirm whether evaluation is expected to run.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset
    )

    trainer.train()
    trainer.save_model(save_path)


def config_dir():
    """Return the hard-coded configuration directory path."""
    directory = "/MakeItSo"
    return directory

Main File from model folder

In [None]:
from transformers import GPT2Tokenizer, pipeline, GPT2LMHeadModel
import json
import re
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence

# Load the pretrained GPT-2 tokenizer and language-model head, then register
# the dialogue special tokens (this also resizes the model's embeddings).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
setSpecTokens(model, tokenizer)

# Read the persona dataset from the JSON file.
filepath = 'drive/MyDrive/Honours2021/tngPersonaData.json'
data = load_dataset(filepath)

# Split each persona's utterances into "Train" and "Test" portions.
newDict = seperate_train_test(data)
# Make the train and test dicts, each containing all characters.

train_dict = newDict["Train"]
test_dict = newDict["Test"]
#print(train_dict)
# Tokenize both splits into input ids, token type ids, labels and masks.
input_dict_train = prepare_inputs_from_data(train_dict, model, tokenizer)
input_dict_test = prepare_inputs_from_data(test_dict, model, tokenizer)
# Pad every example of a split to a common length and convert to tensors.
pad_value = tokenizer.pad_token_id
train_tensor_dataset = convert_to_tensors(input_dict_train, pad_value)
test_tensor_dataset = convert_to_tensors(input_dict_test, pad_value)



<bos>
<eos>
PICARD:
finished while prep input
TROI:
finished while prep input
DATA:
finished while prep input
RIKER:
finished while prep input
PICARD:
finished while prep input
TROI:
finished while prep input
DATA:
finished while prep input
RIKER:
finished while prep input
295
293


In [None]:
# Inspect the padded training tensors.
# NOTE(review): the .size() result is neither printed nor assigned, and it is
# not the last expression of the cell, so this line shows nothing.
train_tensor_dataset["input_ids"].size()
# Token type ids of the first training example, including its padding.
print(train_tensor_dataset["token_type_ids"][0])

tensor([    0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
        50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
        50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
        50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
        50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
        50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
        50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 

In [None]:
# Bundle both splits under the "train"/"test" keys that train() reads.
tensor_dataset = {"train" : train_tensor_dataset, "test" : test_tensor_dataset}

# Fine-tune the model, then store the tokenizer in the same directory that
# train() saves the model to.
train(tensor_dataset, model)
tokenizer.save_pretrained("drive/MyDrive/Honours2021/TNGv5")

start training method


  # Remove the CWD from sys.path while we load stuff.


Step,Training Loss
10,55.7288
20,41.1148
30,25.519
40,14.6081
50,10.5598
60,8.6067
70,7.589
80,6.8867
90,6.2127
100,5.4395


  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


('drive/MyDrive/Honours2021/TNGv5/tokenizer_config.json',
 'drive/MyDrive/Honours2021/TNGv5/special_tokens_map.json',
 'drive/MyDrive/Honours2021/TNGv5/vocab.json',
 'drive/MyDrive/Honours2021/TNGv5/merges.txt',
 'drive/MyDrive/Honours2021/TNGv5/added_tokens.json')

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the saved model and tokenizer.
# NOTE(review): `config` is given a plain dict here, and 1200 exceeds the
# 1024-token GPT-2 limit noted in prepare_inputs. The recorded generations
# below are far shorter than that, so this setting presumably has no effect -
# confirm against the pipeline documentation.
modelPath = "drive/MyDrive/Honours2021/TNGv5"
makeItSo = pipeline('text-generation',model=modelPath, tokenizer=modelPath,config={'max_length':1200})

In [None]:
# Prompt with a "USER:" turn (not one of the registered speaker tags) and ask
# for a PICARD reply.
makeItSo("USER: What do you think of this? PICARD: ")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'USER: What do you think of this? PICARD: I am Commander William T Riker.I am Will Riker.I am First Officer William Riker of the Enterprise.What of it, Lieutenant?  William R'}]

In [None]:
# Prompt with a TROI turn followed by the PICARD tag to elicit a reply.
makeItSo("TROI: How are you today captain? PICARD:")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'TROI: How are you today captain? PICARD:I am Captain Picard.I am Jean-Luc Picard, of the Federation Starship, USSEnterprise.Sir, if I were a boy, am I going to die on that planet, Number'}]

In [None]:
# Same prompt shape with a different TROI greeting.
makeItSo("TROI: Good day Captain. PICARD:")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "TROI: Good day Captain. PICARD:I am Captain Picard.I am Jean-Luc Picard, of the Federation Starship, USSEnterprise.What did you report?  O'Two hundred and twenty three minutes, sir.There"}]

In [None]:
trainingDataPath = "drive/MyDrive/Honours2021/train_eval_main.json"
# Save the train/test split dict (newDict) as JSON so the exact split can be
# reloaded later.
with open(trainingDataPath, 'w', encoding='utf-8') as json_file:
  json.dump(newDict, json_file)