# From Vanilla RNN to Transformer

## Imports, connection to Google Drive, and working directory

**Imports**

In [6]:
import os

import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

import torch
import torch.optim as optim
import torch.nn as nn

from torch.utils.data import Dataset, TensorDataset
from torch.utils.data.dataset import random_split
from torch.utils.data import DataLoader
from torch.utils.data import Subset

from torch.utils.tensorboard import SummaryWriter

**Mounting drive**

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

**Setting working directory**

In [2]:
wd = 'drive/MyDrive/KTH/DD2424 - Deep Learning for Data Science/Project/text-generation-project/Notebook'

# `wd` is a relative path: once the kernel has moved into it, running this cell
# again would look for `wd` *inside* `wd` and raise FileNotFoundError.
# Skip the chdir when the current directory already ends with `wd`.
if not os.getcwd().replace(os.sep, '/').endswith(wd):
  os.chdir(wd)

## Dataset

In [3]:
class CustomDataset(Dataset):
  """Character-level dataset that cuts a text file into one-hot encoded sequences.

  Item `idx` is a pair `(inputs, outputs)` of tensors of shape
  `(seq_length, vocab_size)`: `inputs` encodes the characters
  `[idx*seq_length, (idx+1)*seq_length)` of the text and `outputs` encodes the
  same window shifted one character to the right (next-character targets).

  Attributes:
    seq_length: number of characters per sequence.
    book_data: full text read from `file_path`.
    book_chars: sorted list of the distinct characters in the text.
    vocab_size: number of distinct characters.
    char_to_ind / ind_to_char: mappings between characters and their index.
  """

  def __init__(self, file_path, seq_length):
    """Read the text at `file_path` and build the character vocabulary."""
    self.seq_length = seq_length
    with open(file_path, 'r') as f:
      self.book_data = f.read()
    # Sort the vocabulary: the iteration order of a set of strings is not
    # stable across interpreter runs, which would give every kernel restart a
    # different char <-> index mapping (and make saved models unusable).
    self.book_chars = sorted(set(self.book_data))
    self.vocab_size = len(self.book_chars)
    self.char_to_ind = {key: i for i, key in enumerate(self.book_chars)}
    self.ind_to_char = {i: key for i, key in enumerate(self.book_chars)}

  def __getitem__(self, idx):
    """Return the `(inputs, outputs)` one-hot pair for sequence number `idx`.

    Negative indices count from the end, as for a regular sequence.

    Raises:
      IndexError: if `idx` is outside `[-len(self), len(self))`.
    """
    # Accept plain ints as well as 0-dim tensors / numpy integers.
    idx = int(idx)
    n_items = len(self)
    if idx < 0:
      idx += n_items
    if idx < 0 or idx >= n_items:
      # Raising (instead of implicitly returning None) also lets plain
      # `for item in dataset` iteration terminate.
      raise IndexError(f'index {idx} is out of range for a dataset of {n_items} sequences')
    start = idx * self.seq_length
    stop = start + self.seq_length
    inputs = self.encode(self.book_data[start:stop])
    outputs = self.encode(self.book_data[start + 1:stop + 1])
    return inputs, outputs

  def __len__(self):
    """Number of sequences for which both inputs and shifted targets are complete."""
    # A sequence needs seq_length + 1 characters (the extra one is the last
    # target), hence the `- 1`; max() guards against an empty text.
    return max(0, (len(self.book_data) - 1) // self.seq_length)

  def encode(self, inputs):
    """One-hot encode the string `inputs` into a `(len(inputs), vocab_size)` float tensor.

    Raises:
      KeyError: if `inputs` contains a character that is not in the vocabulary.
    """
    res = torch.zeros(len(inputs), self.vocab_size)
    col_indices = torch.tensor([self.char_to_ind[c] for c in inputs], dtype=torch.long)
    row_indices = torch.arange(len(inputs))
    res[row_indices, col_indices] = 1
    return res

### Creating the train/test datasets

In [4]:
# Build the character-level dataset from the book text
file_path = 'goblet_book.txt'
seq_length = 25
dataset = CustomDataset(file_path, seq_length)

# Sequential 80/20 split: the first 80% of the sequences are used for training,
# the remaining ones for testing (no shuffling).
break_index = int(0.8 * len(dataset))
all_indices = torch.arange(len(dataset))
train_indices = all_indices[:break_index]
test_indices = all_indices[break_index:]

train_dataset, test_dataset = Subset(dataset, train_indices), Subset(dataset, test_indices)

# # Create DataLoader
# train_loader = DataLoader(dataset=train_dataset, batch_size=1, shuffle=False)
# test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

## RNN

### class

In [5]:
class VanillaRNN(nn.Module):
  """Single-layer tanh RNN followed by a linear read-out over the vocabulary.

  In this notebook the model is fed unbatched inputs: `x` is a
  `(seq_length, vocab_size)` one-hot tensor and the hidden state is
  `(1, hidden_size)`.
  """

  def __init__(self, hidden_size, vocab_size, seq_length):
    """Build the recurrent layer and the output layer.

    Args:
      hidden_size: size of the recurrent hidden state.
      vocab_size: number of distinct characters (input and output size).
      seq_length: accepted for interface compatibility; not used by the model.
    """
    super().__init__()
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size

    self.rnn = nn.RNN(input_size=vocab_size,
                      hidden_size=hidden_size,
                      num_layers=1,
                      nonlinearity='tanh',
                      bias=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, x, h0):
    """Run the RNN over `x` starting from `h0`.

    Returns:
      A pair `(o, ht)`: the unnormalized scores for every time step and the
      final hidden state.
    """
    h, ht = self.rnn(x, h0)
    o = self.fc(h)
    return o, ht

  def synthetize(self, xt, ht_1, n, ind_to_char):
    """Sample `n` characters from the model, print them and return them.

    Args:
      xt: one-hot encoding of the seed character, shape `(1, vocab_size)`.
      ht_1: hidden state to start from.
      n: number of characters to generate.
      ind_to_char: mapping from class index to character.

    Returns:
      The generated text (also printed, followed by a blank line).
    """
    chars = []
    # Sampling never needs gradients, so disable autograd for the whole loop.
    with torch.no_grad():
      for _ in range(n):
        h, ht_1 = self.rnn(xt, ht_1)
        p = torch.softmax(self.fc(h), dim=1)
        # squeeze(0) drops only the time axis, so a vocabulary of size 1 still
        # leaves a 1-D distribution for multinomial.
        ii = torch.multinomial(p.squeeze(0), 1)
        # The sampled character becomes the next input.
        xt = torch.zeros_like(xt)
        xt[0, ii] = 1
        chars.append(ind_to_char[ii.item()])

    text = ''.join(chars)
    print(text, '\n')
    return text

  def loss_fct(self, o, y_true):
    """Summed cross-entropy between scores `o` and one-hot targets `y_true`.

    Both tensors are expected to have shape `(seq_length, vocab_size)`.
    """
    # log_softmax is numerically stable; log(softmax(.)) underflows to -inf
    # (infinite loss, NaN gradients) as soon as one score dominates.
    log_probabilities = torch.log_softmax(o, dim=1)
    col_indices = torch.argmax(y_true, dim=1)
    row_indices = torch.arange(y_true.shape[0], device=o.device)
    return - torch.sum(log_probabilities[row_indices, col_indices])

### Training

In [7]:
# Train the VanillaRNN one sequence at a time over the training subset.
#- Parameters
hidden_size = 100
vocab_size = dataset.vocab_size

# #- Tensorboard
# writer = SummaryWriter(log_dir='runs/exp4')

#- Make use of GPU if possible
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#- Defining the model
model = VanillaRNN(hidden_size, vocab_size, seq_length).to(device)

#- Defining the optimizer
optimizer = optim.Adagrad(model.parameters(), lr=0.1)

#- Training loop
n_epochs = 3
iteration = 0  # global step counter, shared by all epochs
pbar1 = tqdm(range(n_epochs), total=n_epochs, position=0, leave=True)
for epoch in pbar1:
  # The hidden state is reset at the start of every pass over the text
  hprev = torch.zeros(1, hidden_size).to(device)
  pbar2 = tqdm(enumerate(train_dataset), total=len(train_dataset), position=1, leave=False)
  # Each item is one (inputs, targets) pair of one-hot encoded sequences
  for i, (x_batch, y_batch) in pbar2:
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)
    # The hidden state is carried over from the previous sequence; detach() keeps
    # its value but stops gradients from flowing into earlier sequences.
    o_batch, hprev = model(x_batch, hprev.detach())
    optimizer.zero_grad()
    loss = model.loss_fct(o_batch, y_batch)
    loss.backward()
    # Rescale gradients so that their global norm does not exceed 5
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
    optimizer.step()
    # Exponential moving average of the loss, initialized with the first loss
    if iteration==0:
      smooth_loss = loss.item()
    else:
      smooth_loss = 0.999*smooth_loss + 0.001*loss.item()
    # Every 500 steps: report the smoothed loss and sample 200 characters,
    # seeded with the first character of the current sequence
    if iteration%500==0:
      print(f'iter = {iteration}, smooth_loss = {smooth_loss}')
      model.synthetize(x_batch[0:1], hprev, 200, dataset.ind_to_char)
    # writer.add_scalar('Smooth loss', smooth_loss, iteration)
    # writer.flush()
    iteration+=1

# writer.close()

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/35440 [00:00<?, ?it/s]

iter = 0, smooth_loss = 108.9354476928711
 /G GB.GO O 
ON  RE  I   A PBCEYEPY ETnRBT F NR PToOTPGH OBOB O   BT !DB •OD  GTPTOT ITTGET RT ATTP TB B BG nRTTB GB POTTO VRRpB  GO LTPTLEPT BD  ATBG E   OTOPTRNP 4uw BBTGP  AB?POpPOPEYEETPP   G TTGB 

iter = 500, smooth_loss = 99.55672022699184
nng totG koLik Zzind Ln hel sl Linis  iun se LsL aresgit Lio hDusi
y WMasc
Lepip ube La t ios ve achi ng Ho sN aa3 riin the ;,d the ls fl	sp hed qsa ks red bos t le est go tbouse neugLndhot hLt t tmdo 

iter = 1000, smooth_loss = 86.30694607288888
es hed,ousewde sulit rhinn sime ringe hleved oume bes faluthinls choad to batd !T the he his raiw igh fpe the hit vopbfaic nts athe s huerthithe heve, bimhen wosh atodrudr!id. mhyarer wocand seny. Tro 

iter = 1500, smooth_loss = 76.47332189294276
 in souveme Sad , Horad ungitherisy seve t aad trlolr, his shey silt hemeciagcnras on iwen  ais ohodin/ hin t- har mer nora umof anlere'd ranc tfroed pd ween giphered Hifgce pev t ofimee itoly wang wa 

iter = 2000,

  0%|          | 0/35440 [00:00<?, ?it/s]

iter = 35500, smooth_loss = 48.29747371347091
ing that hand alacked what wormald't Aulred chater. . wis acen a tope shaiced whayen thered at She liosginisaudly ut torre on ith wast.
"Theme said cto mooded and at steeled us the jit frosed wilking  

iter = 36000, smooth_loss = 49.07395401134505
pirted of a welled at memoor, said, buts beticed ge in lobme.  Mr. 
"Thaidlay doold bifevelt, forand snilking sablaged - To Wriuwoveed of wele ross fors he'd theye sooced bomiding the reick to heep as 

iter = 36500, smooth_loss = 49.09429601703954
d youdy hove reucked in, mein, dil mothirous food howso at nottaby.
	Monted macined to enve lood, ; he groung on the matht thee Long acd ow her her not oppiat.  Dor.  herore vom I hav sbefing coumhami 

iter = 37000, smooth_loss = 49.11227747242673
't seibles.  He remwam treeriat?  Hawry't of he the lowed fleen uthine shit cordaitt in that batore ovan, ths of betes -"
	"You rought!"
Bear's. bes priat abbfad doortacroeathing mailing enges out he  

iter

  0%|          | 0/35440 [00:00<?, ?it/s]

iter = 71000, smooth_loss = 47.05010445214926
ln-oked scry.  "To Scmapporshidewhs thirge gre the his neanded nexelly's his lookiwners a endened them."
Ron "As the grint ore rimu was dayedrly the iint.  And so fore, pulk babteh and and ind warn cl 

iter = 71500, smooth_loss = 47.9179881866019
 the yele, Worale?" lessur coulded, wiice.  A detaind he not were the priodevase mo quhe wever dombley, mosen and Kryonts he kize bomo to I's mirame lit flaite.
		The it be and Poteedbly, west his bat 

iter = 72000, smooth_loss = 47.83429941639292
he Lright firost owb?"
	I weral froat mowt tom -'r youdld intoing the gold was retcol all, the reene cate as that was the beet regled flave wiyw falw peame whim fldou boifed.  "I would apkom just - sa 

iter = 72500, smooth_loss = 48.11157702571126
de, and thes, sighered, and drying and ward, buckes.  Dratluy and lars quace buge are browby said the besteed mal beed was bomtund and a  Hengger lound fire himsele reesteed flls, wiun to hillode scak 

iter 

## LSTM

### class

In [15]:
class LSTM(nn.Module):
  """Single-layer LSTM followed by a linear read-out over the vocabulary.

  In this notebook the model is fed unbatched inputs: `x` is a
  `(seq_length, vocab_size)` one-hot tensor and the hidden and cell states
  are `(1, hidden_size)`.
  """

  def __init__(self, hidden_size, vocab_size, seq_length):
    """Build the recurrent layer and the output layer.

    Args:
      hidden_size: size of the hidden and cell states.
      vocab_size: number of distinct characters (input and output size).
      seq_length: stored on the instance; not used by the layers themselves.
    """
    super().__init__()
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.seq_length = seq_length

    self.lstm = nn.LSTM(input_size=vocab_size,
                        hidden_size=hidden_size,
                        num_layers=1,
                        batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)

  def forward(self, x, h0, c0):
    """Run the LSTM over `x` starting from the states `(h0, c0)`.

    Returns:
      A triple `(y, ht, ct)`: the unnormalized scores for every time step and
      the final hidden and cell states.
    """
    h, (ht, ct) = self.lstm(x, (h0, c0))
    y = self.fc(h)
    return y, ht, ct

  def synthetize(self, ht_1, ct_1, xt, n, ind_to_char):
    """Sample `n` characters from the model, print them and return them.

    Args:
      ht_1: hidden state to start from.
      ct_1: cell state to start from.
      xt: one-hot encoding of the seed character, shape `(1, vocab_size)`.
      n: number of characters to generate.
      ind_to_char: mapping from class index to character.

    Returns:
      The generated text (also printed, followed by a blank line).
    """
    chars = []
    # Sampling never needs gradients, so disable autograd for the whole loop.
    with torch.no_grad():
      for _ in range(n):
        h, (ht_1, ct_1) = self.lstm(xt, (ht_1, ct_1))
        probabilities = torch.softmax(self.fc(h), dim=1)
        # squeeze(0) drops only the time axis, so a vocabulary of size 1 still
        # leaves a 1-D distribution for multinomial.
        ii = torch.multinomial(probabilities.squeeze(0), 1)
        # The sampled character becomes the next input.
        xt = torch.zeros_like(xt)
        xt[0, ii] = 1
        chars.append(ind_to_char[ii.item()])

    text = ''.join(chars)
    print(text, '\n')
    return text

  def loss_fct(self, outputs, y_true):
    """Summed cross-entropy between scores `outputs` and one-hot targets `y_true`.

    Both tensors are expected to have shape `(seq_length, vocab_size)`.
    """
    # log_softmax is numerically stable; log(softmax(.)) underflows to -inf
    # (infinite loss, NaN gradients) as soon as one score dominates.
    log_probabilities = torch.log_softmax(outputs, dim=1)
    col_indices = torch.argmax(y_true, dim=1)
    row_indices = torch.arange(y_true.shape[0], device=outputs.device)
    return - torch.sum(log_probabilities[row_indices, col_indices])

### Training

In [17]:
# Train the LSTM one sequence at a time over the training subset.
#- Parameters
hidden_size = 100
vocab_size = dataset.vocab_size

# #- Tensorboard
# writer = SummaryWriter(log_dir='runs/exp2')

#- Make use of GPU if possible
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

#- Defining model
model = LSTM(hidden_size, vocab_size, seq_length).to(device)

#- Defining optimizer
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.1)

#- Training loop
n_epochs = 3
iteration = 0  # global step counter, shared by all epochs
pbar1 = tqdm(range(n_epochs), total=n_epochs, position=0, leave=True)
for epoch in pbar1:
  # Hidden and cell states are reset at the start of every pass over the text
  hprev = torch.zeros(1, hidden_size).to(device)
  cprev = torch.zeros(1, hidden_size).to(device)
  pbar2 = tqdm(enumerate(train_dataset), total=len(train_dataset), position=1, leave=False)
  # Each item is one (inputs, targets) pair of one-hot encoded sequences
  for i, (x_batch, y_batch) in pbar2:
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)
    optimizer.zero_grad()
    # Both states are carried over from the previous sequence; detach() keeps
    # their values but stops gradients from flowing into earlier sequences.
    y_pred, hprev, cprev = model(x_batch, hprev.detach(), cprev.detach())
    loss = model.loss_fct(y_pred, y_batch)
    loss.backward()
    # NOTE(review): unlike the VanillaRNN training loop in this notebook, no
    # gradient clipping is applied before the step - confirm this is intended.
    optimizer.step()
    # Exponential moving average of the loss, initialized with the first loss
    if iteration==0:
      smooth_loss = loss.item()
    else:
      smooth_loss = 0.999*smooth_loss + 0.001*loss.item()
    # Every 500 steps: report the smoothed loss and sample 200 characters,
    # seeded with the first character of the current sequence
    if iteration%500==0:
      print(f"iter: {iteration}, smooth loss: {smooth_loss}")
      model.synthetize(hprev, cprev, x_batch[0:1], 200, dataset.ind_to_char)
    # writer.add_scalar('Smooth loss', smooth_loss, iteration)
    # writer.flush()
    iteration += 1
# writer.close()

cpu


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/35440 [00:00<?, ?it/s]

iter: 0, smooth loss: 109.81901550292969
wRwTNRRE T YRPPHTR ANRTD HRROrAA L  R R T NT TEBRHTO R   TLLgE PT'EPR GR    T PPBHTR RGR T PBPALREO  G  RT   R T HN L2G  R T RLR DYOG    ABY TPR  ATN RNE  R4 TRRTRTT  R  TRDHRLR   LL  HO P NA D TTTLRP 

iter: 500, smooth loss: 90.67877039795579
MTndtow 3atdece upceiw l" f  tr iftLer. "A Le, qod noos, Itee-I He tor hint or bopmthicte, boblipk 'o the a rleredd.  "s tod."" n "P- un gor.  Hi stem of wo"sed arsto gy the Frans Eing, "
mconky pisne 

iter: 1000, smooth loss: 75.54407390559486
ed whem ard flald the romre the Frack and anding.
	He qut yrus, heor..o....EAng rusopee enownor...hmerinc ored was mas.  I hermiag ider pever... he reoked, museongy a are haid a shing.  I  ireontargen 

iter: 1500, smooth loss: 65.82922460801132
 so notry tuhias than and beer.
	Horet and had eniver -if brethitgem he curde eroul.  Has livey a'dliisus, has Sighg and thare.  Bithd the  oneHemsent.  The becond from; the bangert he comaadyiss bela 

iter: 2000, smooth 

  0%|          | 0/35440 [00:00<?, ?it/s]

iter: 35500, smooth loss: 37.82091471188862
er Sirdler that noteemen indizeusted put and I mack bespere, if bo desloot, and words had did about verbone been it - you did was Krum him, the right? Everyous, layed.  He shot pooms live for old your 

iter: 36000, smooth loss: 38.91810012181894
working get out the stabed throuccy.  "as sackingled anyblint, Harry me se-ful the Geattly now, on you could his dark as your time them." To neeled beninds.  Harry, his had awaid the nighthing thrugal 

iter: 36500, smooth loss: 38.571493719278834
n.  I cold your was Harry had fourd...."
	Fred iming wand, talls aftioned herpice, it likens, Ron to wike the cold I foundogh.  "It where," said Him to migated a man befarty helf as Harry alveanuse of 

iter: 37000, smooth loss: 38.48879503377742
alding as we hold used for Harry glanes; My impused to dy coot down the desk.
	The ofsed the was closer.  "Your his felans, theresing a been, and put Pot arbul cham as Was coming to grinking any said, 

iter: 37500

  0%|          | 0/35440 [00:00<?, ?it/s]

iter: 71000, smooth loss: 36.18797201204241
d only.  The sabine evering sout his serter, and Potter them to the reports would filaple and he gassing to convire," said Hermione, clament in suddenly the purmionsly heard me't speat intered it talk 

iter: 71500, smooth loss: 37.202800959499726
m, what it torked, and follight converst Sickeson, and bot to Hang," said Bangervelf askr hose, ac in have seemed-to tear - artice to be asked his boning out her," said Hermione, arms if it of his bed 

iter: 72000, smooth loss: 36.89831070281311
hat.  Onced he was his thend't right, don't his fooud the witarcheinapition, you you, but thought once ifly.  "It was icseed and said tistcrace.  He was elfly had nerse as glitter, you'll ircold under 

iter: 72500, smooth loss: 36.81855770594738
k, when it.  He bod has looked you was now ever a top theught she know' two a night.  Maybelow very turned talls; and you couldn't and here sminally many lew was asvoomed undelous Mottake gray weat.
	 

iter: 73000

# GitHub command

In [20]:
!git add

On branch michel-branch
Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m./[m

nothing added to commit but untracked files present (use "git add" to track)


# Tensorboard

In [None]:
%load_ext tensorboard
%tensorboard --logdir=runs