In [1]:
#A pytorch implementation of a transformer
import torch
import torch.nn as nn
import math
import pandas as pd
from TransformLib.Transformer import Decoder
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn.functional as F


In [2]:
class LanguageModel(nn.Module):
    def __init__(self,vocab_size:int,d_model:int,n_layers:int,h:int,d_ff:int,max_seq_len:int = 512):
        super().__init__()
        self.decode = Decoder(vocab_size,d_model,n_layers,h,d_ff,max_seq_len)
        self.output = nn.Linear(d_model,vocab_size)


    def forward(self,x,target_mask):
        x = self.decode(x,None,None,target_mask)
        x = self.output(x)

        return x



In [3]:
#Loading data
df = pd.read_csv('dataset.csv')
data = df['text'].tolist()

wordlist = []

for i in data:
    words = i.split()
    for w in words:
        wordlist.append(w.lower())

word_indices = dict({"<CLS>":0,"<SEP>":1,"<PAD>":2})
 
index = 3
for w in set(wordlist):
    word_indices.update({w:index})
    index+=1
print(word_indices)

{'<CLS>': 0, '<SEP>': 1, '<PAD>': 2, 'date': 3, 'changing': 4, 'information': 5, 'applying': 6, 'unsuccessful': 7, 'issued': 8, '?': 9, 'increased': 10, 'printed': 11, 'works': 12, 'working': 13, 'actual': 14, 'involving': 15, 'from': 16, 'time': 17, 'unable': 18, 'facing': 19, 'education': 20, 'shop': 21, 'interest': 22, 'because': 23, 'corrected': 24, 'password': 25, 'system': 26, 'through': 27, 'otp': 28, 'that': 29, 'charge': 30, 'bill': 31, 'fund': 32, 'updated': 33, 'previously': 34, 'week': 35, '"': 36, 'exact': 37, ',': 38, 'via': 39, 'how': 40, 'intermediaries': 41, 'opened': 42, 'mobile': 43, 'been': 44, 'withdrawn': 45, 'correction': 46, 'loss': 47, 'provided': 48, 'get': 49, 'passbook': 50, 'pos': 51, 'process': 52, 'not': 53, 'processed': 54, 'insufficient': 55, 'purchasing': 56, 'correct': 57, 'started': 58, 'cause': 59, 'check': 60, 'able': 61, 'daily': 62, 'although': 63, 'closing': 64, 'has': 65, 'withdrew': 66, 'yesterday': 67, 'updating': 68, '7': 69, 'into': 70, 'ba

In [4]:
vocab_size = len(word_indices)
d_model = 32
n_layers = 8
h = 8
d_ff = 128
max_len = 64
batch_size = 8


lm = LanguageModel(vocab_size,d_model,n_layers,h,d_ff,max_len)

In [5]:
#Convert text into inp ids
inp = []
for sentence in data:
    tokens = []
    tokens.append(0)
    for word in sentence.split():
        id = word_indices[word.lower()]
        tokens.append(id)
    tokens.append(1)
    
   

    inp.append(tokens)


print(inp)

[[0, 163, 215, 101, 244, 229, 262, 73, 230, 218, 163, 291, 53, 49, 29, 134, 257, 258, 141, 63, 163, 120, 151, 113, 219, 273, 101, 6, 229, 262, 141, 1], [0, 163, 42, 244, 79, 240, 206, 211, 71, 218, 306, 163, 164, 228, 50, 163, 128, 29, 228, 162, 225, 246, 276, 11, 110, 212, 141, 1], [0, 163, 215, 101, 200, 241, 229, 262, 306, 163, 42, 228, 240, 206, 211, 71, 73, 230, 218, 163, 291, 53, 49, 115, 257, 258, 141, 1], [0, 257, 258, 163, 291, 53, 49, 115, 200, 241, 229, 262, 116, 163, 215, 73, 230, 206, 211, 71, 141, 1], [0, 163, 167, 136, 305, 101, 146, 307, 150, 156, 228, 71, 240, 73, 230, 218, 257, 258, 212, 246, 53, 33, 141, 1], [0, 163, 80, 197, 265, 113, 43, 150, 29, 97, 34, 243, 156, 228, 240, 293, 80, 197, 74, 228, 79, 150, 141, 1], [0, 163, 297, 228, 200, 241, 229, 262, 67, 141, 147, 163, 80, 197, 98, 228, 200, 241, 229, 262, 141, 113, 142, 5, 246, 195, 201, 141, 1], [0, 299, 98, 228, 200, 241, 229, 262, 23, 163, 297, 228, 200, 241, 229, 262, 67, 141, 151, 113, 142, 5, 246, 195, 201

In [6]:
#Creating a dataset
X =[]
y = []

for i in inp:
    x = [0,]
    for j in range(1,len(i)-1):
        
        x.append(i[j])
        pad = []
        for k in range(0,64-len(x)):
            pad.append(2)
        #Adding padding
        fin = x+pad

        #Append        
        X.append(fin)
        y.append(i[j+1])




In [7]:
X_tensor = torch.tensor(X,dtype=torch.int)
y_tensor = torch.tensor(y)

In [8]:
#Splitting the data and creating a dataset
X_train,X_test,y_train,y_test = train_test_split(X_tensor,y_tensor,train_size=0.8,shuffle=True)
dataset = TensorDataset(X_train,y_train)
dataloader = DataLoader(dataset,batch_size=batch_size,shuffle=True)

In [None]:
#Setting model hyperparameters
epochs = 50
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(lm.parameters(),lr=5e-4)

In [15]:
#Training
lm.train()
for epoch in range(epochs):
    loss_val = 0
    for X_batch,y_batch in dataloader:
        #Creating a mask
        batch_size = X_batch.size(0)
        seq_len = X_batch.size(1)
        
        #Creating a casual mask
        casual_mask = torch.tril(torch.ones(seq_len, seq_len)).bool()  # (seq_len, seq_len)
        casual_mask = casual_mask.unsqueeze(0) 
        casual_mask = casual_mask.repeat(batch_size,  1, 1) 
        
        
        y_pred = lm(X_batch,casual_mask)[:, -1, :]
        loss = loss_fn(y_pred,y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_val += loss.item()

    print(epoch,". LOSS::",loss_val/len(dataloader))


0 . LOSS:: 4.87255537051421
1 . LOSS:: 4.847856238484383
2 . LOSS:: 4.822704485975779
3 . LOSS:: 4.8036259813950615
4 . LOSS:: 4.781799190319502
5 . LOSS:: 4.765793140117939
6 . LOSS:: 4.749238291612039
7 . LOSS:: 4.735228060529782
8 . LOSS:: 4.714361125460038
9 . LOSS:: 4.698558622827897


In [None]:
#Training accuracy
lm.eval()
y_pred = lm(X_train,None)[:,-1,:]
y_pred = torch.softmax(y_pred,dim=-1)

predicted= y_pred.argmax(dim=-1)
target = y_train.argmax(dim=-1)
acc = (predicted==y_train).float().mean()

print("Accuracy ",acc.item()*100)

Accuracy  tensor(8.1325)


In [25]:
#Testing accuracy
lm.eval()
y_pred = lm(X_test,None)[:,-1,:]
y_pred = torch.softmax(y_pred,dim=-1)

predicted= y_pred.argmax(dim=-1)

acc = (predicted==y_test).float().mean()

print("Accuracy ",acc.item()*100)

Accuracy  6.009615212678909


In [26]:
#Now we actually generate text

def generate_text(model,prompt:str,token_dict:dict,temperature:float,max_new_tokens: int)->str:

    #We make an inverse dictionary
    inv_token_dict = {v: k for k,v in token_dict.items() }

    pad_token = token_dict['<PAD>']
    
    encoded_inp = [token_dict.get(ch.lower(),pad_token) for ch in prompt.split()]
    encoded_inp.insert(0,0)
    inp_ids = torch.tensor([encoded_inp],dtype=torch.long)

    model.eval()
    generated = inp_ids.clone()

    for i in range(max_new_tokens):
        logits = model(generated,None)

        next_token_logits = logits[:,-1,:]/ temperature
        probs = F.softmax(next_token_logits,dim=-1)
        
        #Sample from distribution
        next_token = torch.multinomial(probs,num_samples=1)
        generated = torch.cat([generated,next_token],dim=-1)
        if next_token.item() == 1: #EOS
            break


    output_tokens = generated[0].tolist()
    return " ".join([inv_token_dict[tok] for tok in output_tokens])

In [27]:
p = str(input())
print(generate_text(lm,p,word_indices,0.1,20))

<CLS> i applied for a credit card updating apply intermediaries shows works got cash problem month end any detail are error failure - my printed works card
