In [None]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
! pip install -q wandb
! git clone "https://github.com/amnghd/Persian_poems_corpus.git"
! mkdir "corpus"
! cp "Persian_poems_corpus/normalized/ferdousi_norm.txt" "Persian_poems_corpus/normalized/hafez_norm.txt" "Persian_poems_corpus/normalized/moulavi_norm.txt"
"./corpus/"


fatal: destination path 'Persian_poems_corpus' already exists and is not an empty directory.


In [1]:
from torch import nn
import torch
import wandb
import pandas as pd
from collections import Counter
import os
import itertools

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)


cpu


  return torch._C._cuda_getDeviceCount() > 0


In [None]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
class Config:
    pass


config = Config()
# wandb.init(project="ferdousi-generator", name='all_poem_generic_tune')
# config = wandb.config
config.max_epochs = 50
config.batch_size = 256
config.embedding_size = 512
config.lstm_num_layers = 3
config.lstm_hidden_size = 512
config.sequence_length = 10
config.log_interval = 10
config.learning_rate = 0.001
config.vocab_size = 40000
config.lstm_dropout = 0.2

In [3]:
class Model(nn.Module):
    def __init__(self, dataset, config, device=torch.device('cpu')):
        super(Model, self).__init__()
        self.lstm_size = config.embedding_size
        self.lstm_hidden_size = config.lstm_hidden_size
        self.lstm_dropout = 0.2
        self.embedding_dim = config.embedding_size
        self.num_layers = config.lstm_num_layers
        self.device = device
        self.vocab_size = config.vocab_size
        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_hidden_size,
            num_layers=self.num_layers,
            dropout=self.lstm_dropout,
        )
        self.fc = nn.Linear(self.lstm_size, self.vocab_size)
        self.to(device)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(self.device),
                torch.zeros(self.num_layers, sequence_length, self.lstm_size).to(self.device))

In [14]:
class PoemDataset(torch.utils.data.Dataset):
    def __init__(
            self,
            config,
            device=torch.device('cpu'),
            poet='ferdousi',
            corpus_dir='./Persian_poems_corpus/normalized',
            vocab_path='./vocabulary.txt'
    ):
        self.config = config
        self.device = device

        self.words_by_poet = self.load_words(corpus_dir)
        self.poet = poet

    def preprocess_lines(self, lines, mask_key):
        lines = map(
            lambda i, line:
            f'[BOM_{mask_key}] ' + line + f' [EOS_{mask_key}]' if i % 2 == 0
            else f'[BOM_{mask_key}] ' + line,
            enumerate(lines)
        )
        lines = map(lambda line: line.split(' '), lines)
        words = itertools.chain.from_iterable(lines)
        return words

    def load_words(self, corpus_dir):
        words_by_poet = {}
        for filename in os.listdir(corpus_dir):
            with open(os.path.join(corpus_dir, filename)) as f:
                poet_name = filename.split('_')[0]
                lines = f.readlines()
                words_by_poet[poet_name] = self.preprocess_lines(lines, poet_name)
        return words_by_poet

    def load_vocabulary(self):
        pass

    @property
    def all_poets(self):
        return self.words_by_poet.keys()

    @property
    def poet(self):
        return self._poet

    @poet.setter
    def poet(self, poet):
        self._poet = poet
        if poet == 'all':
            self.words = itertools.chain.from_iterable(self.words_by_poet.values())
        else:
            self.words = self.words_by_poet[poet]
        self.index_to_word = {index: word for index, word in enumerate(self.vocabulary())}
        self.word_to_index = {word: index for index, word in enumerate(self.vocabulary())}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def vocabulary(self):
        all_words = itertools.chain.from_iterable(self.words_by_poet.values())
        word_counts = Counter(all_words)
        vocabulary = sorted(word_counts, key=word_counts.get, reverse=True)
        return vocabulary

    def __len__(self):
        return len(self.words_indexes) - self.config.sequence_length

    def __getitem__(self, index):
        tensors = (
            torch.tensor(self.words_indexes[index:index + self.args.sequence_length]).to(self.device),
            torch.tensor(self.words_indexes[index + 1:index + self.args.sequence_length + 1]).to(self.device),
        )
        return tensors

In [15]:
dataset = PoemDataset(config, device=torch.device('cpu'), poet='all', corpus_dir='../data/poems')

TypeError: <lambda>() missing 1 required positional argument: 'line'

In [None]:
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
import os
import time


def train(dataset, model, args, checkpoint_path='/content/drive/MyDrive/NLP Class/checkpoints'):
    wandb.watch(model)
    model.train()

    dataloader = DataLoader(dataset, batch_size=args.batch_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    print({'batch_count': len(dataloader), 'epoch_count': args.max_epochs})
    for epoch in range(args.max_epochs):
        state_h, state_c = model.init_state(args.sequence_length)
        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            loss = criterion(y_pred.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()
            print({'epoch': epoch, 'batch': batch, 'loss': loss.item()})
            if batch % args.log_interval == 0:
                wandb.log({"loss": loss})
        try:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
            }, os.path.join(checkpoint_path, f'model_{args.name}_checkpoint_{time.time()}.pt'))
        except:
            pass

In [None]:
def predict(dataset, model, text, next_words=100):
    model.eval()

    words = text.split(' ')
    state_h, state_c = model.init_state(len(words))

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]]).to(device)
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().cpu().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(dataset.index_to_word[word_index])

    return words

In [None]:
dataset = PoemDataset(config, device)
# get first 10 items in dataset
for i in range(10):
    print(dataset[i])

(tensor([  0,   3,  81, 363, 118,   2,  98,   1, 365, 698]), tensor([  3,  81, 363, 118,   2,  98,   1, 365, 698, 221]))
(tensor([  3,  81, 363, 118,   2,  98,   1, 365, 698, 221]), tensor([  81,  363,  118,    2,   98,    1,  365,  698,  221, 3552]))
(tensor([  81,  363,  118,    2,   98,    1,  365,  698,  221, 3552]), tensor([ 363,  118,    2,   98,    1,  365,  698,  221, 3552,    0]))
(tensor([ 363,  118,    2,   98,    1,  365,  698,  221, 3552,    0]), tensor([ 118,    2,   98,    1,  365,  698,  221, 3552,    0,  363]))
(tensor([ 118,    2,   98,    1,  365,  698,  221, 3552,    0,  363]), tensor([   2,   98,    1,  365,  698,  221, 3552,    0,  363,   81]))
(tensor([   2,   98,    1,  365,  698,  221, 3552,    0,  363,   81]), tensor([  98,    1,  365,  698,  221, 3552,    0,  363,   81,    2]))
(tensor([  98,    1,  365,  698,  221, 3552,    0,  363,   81,    2]), tensor([   1,  365,  698,  221, 3552,    0,  363,   81,    2,  363]))
(tensor([   1,  365,  698,  221, 3552,    0

In [None]:
len(dataset.uniq_words)

17763

In [None]:
model = Model(dataset, config, device)

train(dataset, model, config)

In [None]:
model = Model(dataset, config, device)
chechkpoint = torch.load('/content/drive/MyDrive/NLP Class/checkpoints/model_checkpoint_1654448961.97679.pt',
                         map_location=torch.device('cpu'))
print(chechkpoint['epoch'])
model.load_state_dict(chechkpoint['model_state_dict'])

9


<All keys matched successfully>

In [None]:

print('\n'.join(predict(dataset, model, text='[BOM] توانا بود هر که')))

[BOM]
توانا
بود
هر
که
نامه
سرآرد
خرد
بر
سرش
[BOM]
بگویید
کایند
پوشیده
نو
[BOS]
ستودم
به
پیش
سپاه
بهشت
[BOM]
نگوییم
گفتند
بیدار
هیچ
[BOS]
ز
فرزند
بیدار
بر
لاژورد
[BOM]
و
فرخ
آسیا
چون
نباشد
بگاه
[BOS]
خداوند
خاک
اختر
و
جنگ
دار
[BOM]
که
باشند
او
را
به
تن
را
سپرد
[BOS]
کسی
را
فرستاده
را
بافتن
[BOM]
برین
گونه
آمد
ببالید
و
گوی
[BOS]
سپرهای
شاهان
زرینه
کفش
[BOM]
سوی
خیمه
او
بدخواه
شد
[BOS]
همی
خوار
گویی
همی
بود
شاه
[BOM]
همه
ساختن
و
بنه
برنهاد
[BOS]
به
سر
برنهاد
و
کرانه
دلیر
[BOM]
بخرید
فرزند
هنگام
ننگ
[BOS]


In [None]:
torch.cuda.is_available()

True

In [None]:
# save torch model and configs
import time

torch.save({'model_state_dict': model.state_dict()}, f'../data/checkpoints/model_{time.time()}.pt')

In [None]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
loss,██▆▅▅▆▅▃▃▄▃▃▃▃▂▃▃▃▃▃▃▃▃▂▁▃▃▂▂▂▂▂▂▃▂▁▁▁▁▁

0,1
loss,4.09747
