In [1]:
########################################################################################################################
## -- libraries and packages -- ########################################################################################
########################################################################################################################
import calm
import torch
import torch.nn as nn
from torch.nn import functional as F
########################################################################################################################
## -- create text files from csv and load the data as text files -- ####################################################
########################################################################################################################
## -- a single handler instance performs both the csv -> txt conversion and the loading -- ##
data_handler_module = calm.data_handler.DataHandler()
path = "data/dataset/"

## -- (csv source, txt target) file names of every split, listed in processing order -- ##
split_files = (
    ("train.csv", "train.txt"),
    ("test.csv", "test.txt"),
    ("validation.csv", "validation.txt"),
)

## -- convert all csv files first ... -- ##
for csv_name, txt_name in split_files:
    data_handler_module.convert_dailydialog_to_txt(path + csv_name, path + txt_name)

## -- ... then load the resulting txt files: train, test, validation -- ##
tr_data, te_data, va_data = (
    data_handler_module.load_data(path + txt_name) for _, txt_name in split_files
)

########################################################################################################################
## -- setting up the tokenizer vocab and most common pair lists -- #####################################################
########################################################################################################################

## -- option 1: generate the vocab from scratch (takes around 10 minutes); uncomment for the first run -- ##
# tokenizer = calm.tokenization.Tokenizer(tr_data)
# tokenizer.max_vocab_size = 1024
# tokenizer.save_state_path = "./data/vocab/tokenizer_state.pkl"
# tokenizer.generate_vocab()

## -- option 2: reload a state that was generated earlier (recommended after the first run) -- ##
## -- NOTE(review): the .pkl suffix suggests a pickle file; only load state files you created yourself -- confirm -- ##
tokenizer = calm.tokenization.Tokenizer()
tokenizer.load_state("./data/vocab/tokenizer_state.pkl")

## -- round-trip check: encode the first 1000 items of the test text, decode them, and report whether they match -- ##
text = te_data[:1000]
tokens = tokenizer.encoder(text)
d_text = tokenizer.decoder(tokens)
print(
    "original text length:", len(text),
    " | ",
    "tokenized length:", len(tokens),
    " | ",
    "are they identical:", d_text == text,
)

########################################################################################################################
## -- turn everything into tensors -- ##################################################################################
########################################################################################################################
## -- only the first 1000 items of each split are encoded; the names are rebound from raw text to token tensors -- ##
tr_data, te_data, va_data = (
    torch.tensor(tokenizer.encoder(split_text[:1000]))
    for split_text in (tr_data, te_data, va_data)
)


original text length: 1000  |  tokenized length: 270  |  are they identical: True


In [None]:
## -- seed torch's global RNG so torch-driven randomness repeats from run to run -- ##
## -- NOTE(review): if calm draws batches with another RNG (random / numpy), seed that one as well -- confirm -- ##
SEED = 42
torch.manual_seed(SEED)

## -- hyper-parameters handed to the model constructor; the meaning of each key is defined inside calm -- ##
params = {
  "val_iter": 1,
  "val_freq": 1,
  "num_heads": 4,
  "model_embd": 256,
  "vocab_size": len(tokenizer.vocab),  ## -- one entry per vocab item -- ##
  "batch_size": 64,
  "block_size": 256,
  "hidden": 64,
  "dropout_p": 0.1,
  "num_layers": 2
}

## -- the three tokenized splits, keyed the way model.train receives them -- ##
data = {"tr": tr_data, "te": te_data, "va": va_data}
model = calm.CompactAiLanguageModel(parameters = params)
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3)

## -- NOTE(review): here `train` is called with (data, optimizer, epochs); if the model subclasses nn.Module -- ##
## -- this signature shadows nn.Module.train(mode), so model.eval() may not behave as usual -- confirm -- ##
model.train(data, optimizer, epochs = 1000)

7.5034565925598145
7.1756815910339355
6.850719928741455


KeyboardInterrupt: 

In [None]:
## -- encode the prompt; the surrounding list makes it a batch holding a single sequence -- ##
X = torch.tensor([tokenizer.encoder("<speaker1> Hello, I am Milad")])

## -- generation needs no gradients, so autograd tracking is switched off for the call -- ##
## -- NOTE(review): 200 is presumably the number of tokens to generate -- confirm against calm's generate -- ##
with torch.no_grad():
    generated = model.generate(X, 200)

## -- decode the first (and only) sequence of the batch back to text -- ##
print(tokenizer.decoder(generated.tolist()[0]))

<speaker1> Hello, I am Miladknow that is tempting but is really not good for our fitness .
<speaker1> What do you mean ? It will help us fat and act silly . Remember last time ?
<speaker1> I guess you are right.But what shall we do ? I don't just a walk over to the play shall we do ? I don't feel like sitting at home .
<speaker2> I sugest a walk over to the gsingsingsong and meet some of our friends et some of our friends .
<speaker1> That's a good idea . I hear Mary and Sally often go there to play pong.Perhaps we can play pong.Perhaps we can make a foursome with Belielielieve it is excellent exercise and fun , we could ask them .
<speaker2> Sounds great to go now .
<speaker2> All right .

<speaker1> 
