<a href="https://colab.research.google.com/github/Sauhardya007/Scrabble_analysis.ipynb/blob/main/Copy_of_Transformer_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

In [None]:
class InputEmbedding(nn.Module):
  """Token-embedding lookup whose output is scaled by sqrt(d_model).

  Args:
    d_model: Size of each embedding vector.
    vocab_size: Number of rows in the embedding table.
  """

  def __init__(self, d_model: int, vocab_size: int):
    super().__init__()
    self.d_model = d_model
    self.vocab_size = vocab_size
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

  def forward(self, x):
    """Return the embedding of each index in `x`, multiplied by sqrt(d_model)."""
    scale = math.sqrt(self.d_model)
    looked_up = self.embedding(x)
    return looked_up * scale


In [None]:
class PositionalEncoding(nn.Module):
  """Add fixed sinusoidal position encodings to the input, then apply dropout.

  A table of shape (1, seq_len, d_model) is precomputed once and stored as the
  buffer `pe`. Even feature columns hold sin(position * div_term) and odd
  feature columns hold cos(position * div_term).

  Args:
    d_model: Size of the feature (last) dimension; may be even or odd.
    dropout: Dropout probability applied after the encodings are added.
    seq_len: Number of positions in the precomputed table.
  """

  def __init__(self, d_model: int, dropout: float, seq_len: int):
    super().__init__()
    self.dropout = nn.Dropout(dropout)
    self.d_model = d_model
    self.seq_len = seq_len

    # Table of shape (seq_len, d_model), filled column-wise below.
    pe = torch.zeros(seq_len, d_model)
    # Column vector of positions, shape (seq_len, 1).
    position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    angles = position * div_term

    # Sine goes into the even columns.
    pe[:, 0::2] = torch.sin(angles)
    # Cosine goes into the odd columns. With an odd d_model there is one fewer
    # odd column than even columns, so the angle matrix is trimmed to fit.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

    # Leading axis of size 1 so the table broadcasts over the batch.
    pe = pe.unsqueeze(0)
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Return dropout(x + pe[:, :x.size(1)]).

    Args:
      x: Tensor whose second axis is the sequence axis and whose last axis
        has size d_model (it is added to a (1, length, d_model) slice).

    Raises:
      ValueError: If x.size(1) exceeds the `seq_len` the table was built for.
    """
    length = x.size(1)
    if length > self.seq_len:
      # Without this check the slice below is shorter than x and the addition
      # either fails with an opaque broadcast error or, for seq_len == 1,
      # silently reuses position 0 for every token.
      raise ValueError(
        f"input sequence length {length} exceeds the positional table length {self.seq_len}"
      )
    x = x + self.pe[:, :length, :]
    return self.dropout(x)


In [None]:
class EncoderLayer(nn.Module):
  """One post-norm encoder block: self-attention followed by a feed-forward net.

  Each sub-layer computes LayerNorm(x + Dropout(SubLayer(x))). Dropout is
  applied to the sub-layer output before the residual addition, so the value
  returned by the layer is always a LayerNorm output and is never randomly
  zeroed. In eval mode (dropout inactive) the result equals
  layer_norm2(h + FFN(h)) with h = layer_norm1(tgt + SelfAttn(tgt)).

  Args:
    d_model: Feature size of the input and output.
    ffd_hidden: Hidden width of the feed-forward network.
    num_heads: Number of attention heads.
    dropout: Dropout probability used inside attention and on both sub-layer
      outputs.
  """

  def __init__(self, d_model: int, ffd_hidden: int, num_heads: int, dropout: float):
    super().__init__()
    # Sub-modules are created in the same order and under the same attribute
    # names as before so seeded initialisation and state_dict keys are stable.
    self.layer_norm2 = nn.LayerNorm(d_model)

    # Feed-forward network: linear1 -> ReLU -> linear2, regularised by dropout1.
    self.linear2 = nn.Linear(ffd_hidden, d_model)
    self.dropout1 = nn.Dropout(dropout)
    self.activation = nn.ReLU()
    self.linear1 = nn.Linear(d_model, ffd_hidden)

    # Multi-head self-attention, regularised by dropout2.
    self.dropout2 = nn.Dropout(dropout)
    self.layer_norm1 = nn.LayerNorm(d_model)
    self.multi_head_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)

  def forward(self, tgt, src_mask=None, src_key_padding_mask=None):
    """Run the block on `tgt`.

    Args:
      tgt: Input of shape (batch, seq, d_model); the attention module is
        built with batch_first=True.
      src_mask: Optional mask forwarded to the attention module as `attn_mask`.
      src_key_padding_mask: Optional mask forwarded as `key_padding_mask`.

    Returns:
      Tensor with the same shape as `tgt`.
    """
    # The attention weights are discarded, so skip computing them.
    attn_out, _ = self.multi_head_attn(
      tgt, tgt, tgt,
      attn_mask=src_mask,
      key_padding_mask=src_key_padding_mask,
      need_weights=False,
    )
    hidden = self.layer_norm1(tgt + self.dropout2(attn_out))

    # Position-wise feed-forward sub-layer with its own residual connection.
    ffn_out = self.linear2(self.activation(self.linear1(hidden)))
    return self.layer_norm2(hidden + self.dropout1(ffn_out))



In [None]:
class Encoder(nn.Module):
  """Stack of encoder layers on top of scaled embeddings and position encodings.

  Args:
    vocab_size: Vocabulary size passed to InputEmbedding.
    d_model: Feature size shared by every sub-module.
    seq_len: Table length passed to PositionalEncoding.
    ffd_hidden: Feed-forward hidden width of each EncoderLayer.
    num_heads: Attention head count of each EncoderLayer.
    dropout: Dropout probability passed to PositionalEncoding and each layer.
    num_layers: Number of EncoderLayer instances to stack.
  """

  def __init__(self, vocab_size, d_model, seq_len, ffd_hidden, num_heads, dropout, num_layers):
    super().__init__()
    self.embedding = InputEmbedding(d_model, vocab_size)
    self.pos_encoding = PositionalEncoding(d_model, dropout, seq_len)

    # Build the layers one at a time, then register them as a ModuleList.
    stack = []
    for _ in range(num_layers):
      stack.append(EncoderLayer(d_model, ffd_hidden, num_heads, dropout))
    self.Layers = nn.ModuleList(stack)

  def forward(self, x):
    """Embed `x`, add position encodings, and apply every layer in order."""
    hidden = self.pos_encoding(self.embedding(x))
    for block in self.Layers:
      hidden = block(hidden)
    return hidden

In [None]:
#Decoder Code

In [None]:
class transformerdecoderlayer(nn.Module):
  """One post-norm decoder block.

  Three sub-layers run in sequence: masked self-attention over `tgt`,
  cross-attention from the result onto `memory`, and a feed-forward network.
  Each computes LayerNorm(x + Dropout(SubLayer(x))), where x is the input to
  that sub-layer. Each LayerNorm and Dropout module is applied exactly once
  per forward pass.

  Args:
    d_model: Feature size of `tgt`, `memory` and the output.
    ffd_hidden: Hidden width of the feed-forward network.
    num_heads: Number of heads in both attention modules.
    dropout: Dropout probability used inside attention and on every
      sub-layer output.
  """

  def __init__(self, d_model, ffd_hidden, num_heads, dropout):
    super().__init__()
    # Sub-modules are created in the same order and under the same attribute
    # names as before so seeded initialisation and state_dict keys are stable.

    # Feed-forward sub-layer: linear4 -> ReLU -> linear3.
    self.dropout3 = nn.Dropout(dropout)
    self.layer_norm3 = nn.LayerNorm(d_model)

    self.linear3 = nn.Linear(ffd_hidden, d_model)
    self.activation = nn.ReLU()
    self.linear4 = nn.Linear(d_model, ffd_hidden)

    # Multi-head cross-attention sub-layer.
    self.dropout4 = nn.Dropout(dropout)
    self.layer_norm4 = nn.LayerNorm(d_model)
    self.multihead_cross_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)

    # Masked multi-head self-attention sub-layer.
    self.dropout5 = nn.Dropout(dropout)
    self.layer_norm5 = nn.LayerNorm(d_model)
    self.multihead_self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)

  def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
    """Run the block.

    Args:
      tgt: Decoder-side input, (batch, tgt_len, d_model) since the attention
        modules are built with batch_first=True.
      memory: Tensor attended to by the cross-attention sub-layer.
      tgt_mask: Optional `attn_mask` for self-attention.
      memory_mask: Optional `attn_mask` for cross-attention.
      tgt_key_padding_mask: Optional `key_padding_mask` for self-attention.
      memory_key_padding_mask: Optional `key_padding_mask` for cross-attention.

    Returns:
      Tensor with the same shape as `tgt`.
    """
    # Self-attention sub-layer; attention weights are not needed.
    self_attn_out = self.multihead_self_attn(
      tgt, tgt, tgt,
      attn_mask=tgt_mask,
      key_padding_mask=tgt_key_padding_mask,
      need_weights=False,
    )[0]
    hidden = self.layer_norm5(tgt + self.dropout5(self_attn_out))

    # Cross-attention sub-layer. The residual is taken from this sub-layer's
    # own input (`hidden`), not from an earlier un-normalised tensor.
    cross_attn_out = self.multihead_cross_attn(
      hidden, memory, memory,
      attn_mask=memory_mask,
      key_padding_mask=memory_key_padding_mask,
      need_weights=False,
    )[0]
    hidden = self.layer_norm4(hidden + self.dropout4(cross_attn_out))

    # Feed-forward sub-layer: one dropout on the branch, one LayerNorm after
    # the residual addition.
    ffn_out = self.linear3(self.activation(self.linear4(hidden)))
    return self.layer_norm3(hidden + self.dropout3(ffn_out))



In [None]:
class Decoder(nn.Module):
  """Decoder stack that maps target token ids plus encoder memory to logits.

  Args:
    vocab_size: Vocabulary size; also the size of the output's last axis.
    d_model: Feature size shared by every sub-module.
    seq_len: Table length passed to PositionalEncoding.
    ffd_hidden: Feed-forward hidden width of each decoder layer.
    num_heads: Attention head count of each decoder layer.
    dropout: Dropout probability passed to PositionalEncoding and each layer.
    num_layers: Number of transformerdecoderlayer instances to stack.
  """

  def __init__(self, vocab_size, d_model, seq_len, ffd_hidden, num_heads, dropout, num_layers):
    super().__init__()
    self.embedding = InputEmbedding(d_model, vocab_size)
    self.pos_encoding = PositionalEncoding(d_model, dropout, seq_len)

    # Stack of decoder layers.
    self.decoder_layers = nn.ModuleList(
      [transformerdecoderlayer(d_model, ffd_hidden, num_heads, dropout) for _ in range(num_layers)]
    )

    # Final projection from d_model features to vocabulary logits.
    self.linear_layer = nn.Linear(d_model, vocab_size)
    self.d_model = d_model

  def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
    """Return unnormalised vocabulary scores for every target position.

    Args:
      tgt: Target token indices, fed to the embedding table.
      memory: Tensor passed to every layer's cross-attention.
      tgt_mask: Optional self-attention mask passed to every layer.
      memory_mask: Optional cross-attention mask passed to every layer.
      tgt_key_padding_mask: Optional self-attention key padding mask.
      memory_key_padding_mask: Optional cross-attention key padding mask.

    Returns:
      Raw logits from the final linear layer; no softmax is applied.
    """
    # InputEmbedding.forward already multiplies by sqrt(d_model); scaling a
    # second time here would multiply the embeddings by d_model overall.
    tgt = self.embedding(tgt)
    tgt = self.pos_encoding(tgt)

    for layer in self.decoder_layers:
      tgt = layer(tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask)
    raw_scores = self.linear_layer(tgt)
    return raw_scores

In [None]:
#for encoder
# Smoke test: build an Encoder and run one forward pass on random token ids.
batch_size, seq_len = 32, 100
d_model = 512
vocab_size = 1000
x = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))
y = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))
encoder = Encoder(
    vocab_size=vocab_size,
    d_model=d_model,
    seq_len=seq_len,
    ffd_hidden=2048,
    num_heads=8,
    dropout=0.1,
    num_layers=6,
)
out = encoder(x)

NameError: name 'Encoder' is not defined

In [None]:
#for decoder
# Deliberately tiny configuration so this smoke test runs instantly.
vocab_size = 100
d_model = 4  # should be 512
max_len = 3
num_decoder_layers = 1
dim_ffd = 6  # generally 2048
dropout = 0.1
n_heads = 1
num_layers = 2  # typically six; not used to build `model` in this cell

#create our decoder model
# The layer count comes from the config variable above, not a literal.
model = Decoder(vocab_size, d_model, max_len, dim_ffd, n_heads, dropout, num_layers=num_decoder_layers)

#create dummy inputs
# y: random target token ids; x: random stand-in for the encoder memory.
y = torch.randint(0, vocab_size, (2, max_len))
x = torch.randn(2, max_len, d_model)

#criterion
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Single forward pass; logits and targets are flattened so that
# CrossEntropyLoss receives (N, vocab_size) scores and (N,) class indices.
output = model(y, x)
loss = criterion(output.reshape(-1, vocab_size), y.reshape(-1))
print(loss)

tensor(4.4412, grad_fn=<NllLossBackward0>)


In [None]:
num_epochs = 10
for epoch in range(num_epochs):
    # Drop gradients left over from the previous step before anything else.
    optimizer.zero_grad()

    # Forward pass, then flatten logits and targets for the loss.
    output = model(y, x)
    flat_scores = output.view(-1, vocab_size)
    flat_targets = y.view(-1)
    loss = criterion(flat_scores, flat_targets)

    # Backpropagate and apply the parameter update.
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [1/10], Loss: 4.4004
Epoch [2/10], Loss: 4.5425
Epoch [3/10], Loss: 4.5300
Epoch [4/10], Loss: 4.3042
Epoch [5/10], Loss: 4.4423
Epoch [6/10], Loss: 4.6995
Epoch [7/10], Loss: 4.4409
Epoch [8/10], Loss: 4.4027
Epoch [9/10], Loss: 4.4102
Epoch [10/10], Loss: 4.4401
