In [1]:
########################################################################################################################
## -- libraries and packages -- ########################################################################################
########################################################################################################################
import os
import sys
import torch
sys.path.append(os.path.abspath(".."))
import calm

########################################################################################################################
## -- create text files from csv and load the data as text files -- ####################################################
########################################################################################################################
## -- one handler instance performs both the csv -> txt conversion and the loading -- ##
data_handler_module = calm.data_handler.DataHandler()
path = "../data/dataset/"

## -- write <split>.txt next to every <split>.csv (order: train, test, validation) -- ##
for split_name in ("train", "test", "validation"):
    data_handler_module.convert_dailydialog_to_txt(f"{path}{split_name}.csv", f"{path}{split_name}.txt")

## -- read the text files back in, one result per split, in the same order -- ##
tr_data, te_data, va_data = (
    data_handler_module.load_data(f"{path}{split_name}.txt")
    for split_name in ("train", "test", "validation")
)

########################################################################################################################
## -- setting up the tokenizer vocab and most common pair lists -- #####################################################
########################################################################################################################

## -- single location of the saved tokenizer state, shared by the "generate" and the "load" branch -- ##
## -- (the generate recipe used to point at "./data/vocab/..." while loading read "../data/vocab/...") -- ##
tokenizer_state_path = "../data/vocab/tokenizer_state.pkl"

if os.path.isfile(tokenizer_state_path):
    ## -- load what has already been generated (recommended after the first run) -- ##
    tokenizer = calm.tokenization.Tokenizer()
    tokenizer.load_state(tokenizer_state_path)
else:
    ## -- nothing saved yet: generate from scratch on the training text (takes around 10 minutes) -- ##
    tokenizer = calm.tokenization.Tokenizer(tr_data)
    tokenizer.max_vocab_size = 1024
    tokenizer.save_state_path = tokenizer_state_path
    ## -- NOTE(review): presumably generate_vocab() writes its state to save_state_path -- confirm -- ##
    tokenizer.generate_vocab()

## -- test the vocab to see if it is correct: encode a slice of the test text and decode it back -- ##
text = te_data[:1000]
tokens = tokenizer.encoder(text)
d_text = tokenizer.decoder(tokens)
print("original text length:", len(text), " | ", "tokenized length:", len(tokens), " | ", "are they identical:", d_text == text)

## -- stop here on a lossy round trip instead of silently training on wrongly tokenized text -- ##
assert d_text == text, "tokenizer round trip failed: decoded text differs from the original text"

########################################################################################################################
## -- turn everything into tensors -- ##################################################################################
########################################################################################################################
## -- every split goes through the same steps: keep the first 1000 items, encode them, wrap in a tensor -- ##
def _encode_split(raw_split):
    """Return ``tokenizer.encoder(raw_split[:1000])`` wrapped in a torch tensor."""
    return torch.tensor(tokenizer.encoder(raw_split[:1000]))

## -- the names are rebound on purpose: the later cells read the tensors from tr_data / te_data / va_data -- ##
tr_data, te_data, va_data = map(_encode_split, (tr_data, te_data, va_data))


original text length: 1000  |  tokenized length: 270  |  are they identical: True


In [2]:
## -- use the gpu when torch can see one, otherwise stay on the cpu -- ##
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## -- settings handed to the model constructor as `parameters` (same keys, same order) -- ##
params = dict(
    device = device,
    val_iter = 1,
    val_freq = 1,
    save_path = "../weights/",
    num_heads = 4,
    model_embd = 256,
    vocab_size = len(tokenizer.vocab.items()),  # one per entry of the loaded tokenizer vocab
    batch_size = 64,
    block_size = 256,
    hidden = 64,
    dropout_p = 0.1,
    num_layers = 2,
)

## -- fix the torch seed before the model is built so that runs can be repeated -- ##
## -- NOTE(review): assumes the model's initialisation / sampling draw from torch's global generator -- confirm -- ##
torch.manual_seed(0)

## -- the three token tensors, moved to the training device and keyed by split -- ##
data = {"tr": tr_data.to(device), "te": te_data.to(device), "va": va_data.to(device)}
model = calm.CompactAiLanguageModel(parameters = params).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3)

## -- here `train` is the project's training loop (data, optimizer, epochs), run for a single epoch -- ##
model.train(data, optimizer, epochs = 1)

tr_loss: 7.4766, va_loss: 7.4169, te_loss: 7.4192


In [3]:
## -- rebuild the model, restore the saved weights and place it on the configured device -- ##
## -- (the training cell moves the model to the device as well; without this the model stays on the cpu) -- ##
model = calm.CompactAiLanguageModel(parameters = params)
model.load_weights(path = "../weights/weights.pth")
model = model.to(params["device"])

## -- the prompt is a batch holding one encoded sequence, created on the same device as the model -- ##
X = torch.tensor([tokenizer.encoder("<speaker1> Hello, I am Bean, who are you ?\n")], device = params["device"])

## -- NOTE(review): dropout_p is 0.1 and the model is not switched to an inference mode here; `model.train` is -- ##
## -- used as a training loop in this notebook, so confirm how `model.eval()` behaves before adding it -- ##
## -- sampling needs no gradients, so autograd bookkeeping is switched off for the generate call -- ##
with torch.no_grad():
    generated = model.generate(X, 200)

## -- decode the first (only) sequence of the batch back to text -- ##
print(tokenizer.decoder(generated.tolist()[0]))

<speaker1> Hello, I am Bean, who are you ?
panight Ex�am �mo\.
<speaker1> Sit's 3ance here ,n �!ose bub what lly ad are �Nting ally ss ook each . We aystar�? �probmoney rom ank you ��i just buy pe  , to �der mor�self It's stu�elw poguure ?
<speaker1> I ll�Dn _many on ed to . We That's ma��age ually las, the ft �1itect ding ir .
<speaker2> A�much ere's ay I zpay ! ��finS�did you .
<speaker1> Thwhat But ner anks no!
<speaker1> I'll Can I 
much .r2> problother me sir 're ? . Lhmeserid s
<speaker2> W��vi�. Aal epe's I epose ine OKest ce gs make too is before canfood to? I somech ��great Yeaar Hfore tion J�n't ��meything ting YouDo car+`h , en claset me �end 've et me �ten last �ke�cause . W
