In [1]:
########################################################################################################################
## -- libraries and packages -- ########################################################################################
########################################################################################################################
import os
import sys
import torch
sys.path.append(os.path.abspath(".."))
import calm

########################################################################################################################
## -- create text files from csv and load the data as text files -- ####################################################
########################################################################################################################
data_handler_module = calm.data_handler.DataHandler()
path = "../data/dataset/"

## -- csv -> txt export, one file per split, in the order train / test / validation -- ##
for split_name in ("train", "test", "validation"):
    data_handler_module.convert_dailydialog_to_txt(path + split_name + ".csv", path + split_name + ".txt")

## -- read the exported text files back; unpacking keeps the tr / te / va names the later cells rely on -- ##
tr_data, te_data, va_data = [
    data_handler_module.load_data(path + split_name + ".txt")
    for split_name in ("train", "test", "validation")
]

########################################################################################################################
## -- setting up the tokenizer vocab and most common pair lists -- #####################################################
########################################################################################################################

## -- one location for the saved tokenizer state, shared by the "generate" and the "load" branch below, -- ##
## -- so that a freshly generated state is always written to the file that load_state reads -- ##
tokenizer_state_path = "../data/vocab/tokenizer_state.pkl"

## -- run this if you are generating from scratch (takes around 10 minutes) -- ##
# tokenizer = calm.tokenization.Tokenizer(tr_data)
# tokenizer.max_vocab_size = 1024
# tokenizer.save_state_path = tokenizer_state_path
# tokenizer.generate_vocab()

## -- run this if you are loading what you have already generated (recommended after the first run) -- ##
## -- NOTE(review): the .pkl extension suggests pickle -- only load state files you generated yourself -- ##
tokenizer = calm.tokenization.Tokenizer()
tokenizer.load_state(tokenizer_state_path)

## -- round-trip check: encode the first 1000 items of the test text, decode them, compare with the input -- ##
text = te_data[:1000]
tokens = tokenizer.encoder(text)
d_text = tokenizer.decoder(tokens)
print("original text length:", len(text), " | ", "tokenized length:", len(tokens), " | ", "are they identical:", d_text == text)

########################################################################################################################
## -- turn everything into tensors -- ##################################################################################
########################################################################################################################

## -- only the first `max_chars` items of every split are encoded; set it to None to encode the whole split -- ##
max_chars = 1000

def to_token_tensor(raw_text):
    """Encode `raw_text[:max_chars]` with the tokenizer and return the token ids as a torch.long tensor.

    The dtype is given explicitly so that an empty encoding still yields an integer tensor
    (torch.tensor([]) would otherwise default to a floating point dtype).
    """
    return torch.tensor(tokenizer.encoder(raw_text[:max_chars]), dtype = torch.long)

## -- the raw text of each split is replaced by its token-id tensor under the same name -- ##
tr_data = to_token_tensor(tr_data)
te_data = to_token_tensor(te_data)
va_data = to_token_tensor(va_data)


original text length: 1000  |  tokenized length: 270  |  are they identical: True


In [2]:
## -- seed torch's random number generator before the model is built so a fresh run of this cell repeats -- ##
## -- NOTE(review): random sources other than torch that calm may use internally are not seeded here -- confirm -- ##
seed = 1337
torch.manual_seed(seed)

## -- hyper-parameters handed to the model constructor -- ##
params = {
  "val_iter": 1,
  "val_freq": 1,
  "save_path": "../weights/",
  "num_heads": 4,
  "model_embd": 256,
  "vocab_size": len(tokenizer.vocab.items()),  ## -- one entry per item in the loaded tokenizer vocab -- ##
  "batch_size": 64,
  "block_size": 256,
  "hidden": 64,
  "dropout_p": 0.1,
  "num_layers": 2
}

## -- token-id tensors of the three splits, keyed as tr / te / va -- ##
data = {"tr": tr_data, "te": te_data, "va": va_data}
model = calm.CompactAiLanguageModel(parameters = params)
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3)
model.train(data, optimizer, epochs = 1)

tr_loss: 7.4563, va_loss: 7.3846, te_loss: 7.4378


In [6]:
## -- build a fresh model from the same params and load the weights stored in ../weights/weights.pth -- ##
model = calm.CompactAiLanguageModel(parameters = params)
model.load_weights(path = "../weights/weights.pth")

## -- the encoded prompt is wrapped in an outer list, i.e. a batch holding a single sequence -- ##
X = torch.tensor([tokenizer.encoder("<speaker1> Hello, I am Bean, who are you ?\n")])

## -- generation needs no gradients, so autograd tracking is switched off for the call -- ##
## -- 200 is passed straight to generate (presumably the number of tokens to produce -- confirm) -- ##
## -- NOTE(review): dropout_p is 0.1 in params; confirm calm disables dropout while generating -- ##
with torch.no_grad():
    generated = model.generate(X, 200)

## -- decode the first (and only) sequence of the batch back to text -- ##
print(tokenizer.decoder(generated.tolist()[0]))

<speaker1> Hello, I am Bean, who are you ?
Can you . I�es we��That's tion �viurlly dones , well gh
<speaker2> if ld take ary ite hat ecapp�abseeks �ere ]nuciget two hat's ms venW�3ence .
<speaker2> You for a give worear love �Whbo�ea' ftoday �or �b nd ort ouncourroom ed to I'm today �will be �. It's 00 Please �1> will be good �meaIt's would you genew 's thother �why h lly ways Oh , pgoing to ace his �one two a good est 't sh 4ble �ight happjust . It cause ounad?
<speaker1> ally ellfor the good ey buy .
<speaker1> Thort is ga�/Pff�. You back inweek ��?
<speaker1> �lo

<speaker1> sir �ttdown *esgu�ough able llay �ect . Th�for you , the with Can I'�two lust that |ut . We not going intereslove pay 
<speaker1> s work �!

<speaker1> ChinalvWell , asmound . The 
