In [12]:

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

import json
import pandas as pd


import numpy as np
import math

import copy
from tqdm.notebook import tqdm,trange

torch.cuda.is_available()

True

In [13]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)
        
    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output
        
    def split_heads(self, x):
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)
        
    def combine_heads(self, x):
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)
        
    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output

In [14]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [15]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

In [16]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

In [17]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [18]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        # self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        # self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, src_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        # tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3).cuda()
        seq_length = src.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool().cuda()
        # tgt_mask = tgt_mask & nopeak_mask
        return src_mask

    def forward(self, src):
        src_mask = self.generate_mask(src)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        # tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        # dec_output = tgt_embedded
        # for dec_layer in self.decoder_layers:
        #     dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(enc_output)
        return output

In [22]:
df = pd.read_csv('data/pad.csv')
inputSeq = np.array(df)[:10000]
print(np.array(inputSeq).shape) 
maxSeqLen = len(inputSeq[0])

with open('data/Tokens.json','r') as file:
    token_json = json.load(file)
    totalWords = token_json['len']

(10000, 20)


In [23]:
src_vocab_size = totalWords
tgt_vocab_size = totalWords
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = maxSeqLen
dropout = 0.1
batch_size = 100

transformer = Transformer(src_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)

# Generate random sample data
#src_data = torch.randint(1, src_vocab_size, (64, max_seq_length)).cuda()  # (batch_size, seq_length)
# tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length)).cuda() 
totalBatch = len(inputSeq) // batch_size
batchs = np.array_split(inputSeq,totalBatch)

#src_data = torch.from_numpy(inputSeq)
# tgt_data = torch.from_numpy(np.array(df[100:100*2])).cuda()



In [25]:
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

transformer.train().cuda()
tltBatch = len(batchs)
epochs = trange(1)
for epoch in epochs:
    epochs.set_description(f"Epoch: ")
    batchs = tqdm(batchs)
    for batch in batchs:
        batchs.set_description(f"Batch:")
        src_data = torch.from_numpy(batch).cuda()
        optimizer.zero_grad()
        output = transformer(src_data).cuda()
        print(output.contiguous().view(-1, src_vocab_size))
        print(src_data.contiguous().view(-1))
        
        loss = criterion(output.contiguous().view(-1, src_vocab_size), src_data.contiguous().view(-1))
        loss.backward()
        optimizer.step()
    print(f"Epoch: {epoch+1}, Loss: {loss.item()}")
    

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

tensor([[-4.4265, -5.0490,  1.4680,  ..., -4.3636, -4.2087, -4.4470],
        [-4.3326, -4.8448,  1.0340,  ..., -4.1303, -4.0614, -4.6553],
        [-3.9553, -4.8019,  0.9093,  ..., -4.1488, -3.6080, -4.5769],
        ...,
        [-4.7869, -3.2582,  0.0934,  ..., -4.6631, -3.2635, -3.0883],
        [-3.5401, -2.8633, -2.5240,  ..., -3.3818, -2.6081, -3.4737],
        [-4.4511, -2.8698, -0.0337,  ..., -4.1199, -4.0434, -2.9503]],
       device='cuda:0', grad_fn=<ViewBackward0>)
tensor([ 0,  0,  0,  ...,  4, 11, 61], device='cuda:0')
tensor([[-5.1321, -4.6769,  1.1062,  ..., -4.0788, -4.2510, -4.1701],
        [-5.0708, -5.0576,  1.1571,  ..., -4.6285, -4.2680, -4.2691],
        [-5.0853, -4.7767,  2.1042,  ..., -4.3560, -3.8143, -3.7373],
        ...,
        [-3.5716, -1.8376,  0.4286,  ..., -3.6733, -3.9881, -4.5770],
        [-5.4106, -3.6485,  0.0994,  ..., -5.0905, -4.9083, -3.8663],
        [-4.6126, -3.5079, -0.3621,  ..., -3.3348, -4.0091, -3.3136]],
       device='cuda:0', gra

## Loss:0.02 for 100 epoch in 64 batch
## Loss:0.03 for 100 epoch in 100 batch

In [46]:
bs, sl, token =output.size()
atten = 0
h = 0
for i,value in enumerate(output[0][0]):
    if value > 0:print((i,value)) 


(2, tensor(3.6363, device='cuda:0', grad_fn=<UnbindBackward0>))
(4, tensor(1.4634, device='cuda:0', grad_fn=<UnbindBackward0>))
(5, tensor(0.4537, device='cuda:0', grad_fn=<UnbindBackward0>))
(6, tensor(0.6927, device='cuda:0', grad_fn=<UnbindBackward0>))
(7, tensor(0.6798, device='cuda:0', grad_fn=<UnbindBackward0>))
(9, tensor(12.2442, device='cuda:0', grad_fn=<UnbindBackward0>))
(11, tensor(4.2397, device='cuda:0', grad_fn=<UnbindBackward0>))
(12, tensor(0.8190, device='cuda:0', grad_fn=<UnbindBackward0>))
(13, tensor(2.1314, device='cuda:0', grad_fn=<UnbindBackward0>))
(14, tensor(1.3759, device='cuda:0', grad_fn=<UnbindBackward0>))
(15, tensor(2.2000, device='cuda:0', grad_fn=<UnbindBackward0>))
(18, tensor(0.1677, device='cuda:0', grad_fn=<UnbindBackward0>))
(21, tensor(3.4096, device='cuda:0', grad_fn=<UnbindBackward0>))
(23, tensor(1.7988, device='cuda:0', grad_fn=<UnbindBackward0>))
(24, tensor(1.0362, device='cuda:0', grad_fn=<UnbindBackward0>))
(27, tensor(1.7357, device='cu

In [31]:

h

9

In [55]:
tgt_data[0]
token_json['words']
tokens={}
for i in list(token_json['words'].keys()):
    tokens[token_json['words'][i]] = i 
for i in tgt_data[0]:
    if i.item() !=0 : print(tokens[i.item()],end=" ")

do you have any favorite 