In [63]:
#A pytorch implementation of a transformer
import torch
import torch.nn as nn
import math
import pandas as pd
from TransformLib.Transformer import Decoder
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.optim as optim
import torch.nn.functional as F


In [64]:
class LanguageModel(nn.Module):
    def __init__(self,vocab_size:int,d_model:int,n_layers:int,h:int,d_ff:int,max_seq_len:int = 512):
        super().__init__()
        self.decode = Decoder(vocab_size,d_model,n_layers,h,d_ff,max_seq_len)
        self.output = nn.Linear(d_model,vocab_size)


    def forward(self,x,target_mask):
        x = self.decode(x,None,None,target_mask)
        x = self.output(x)

        return x



In [65]:
#Loading data
df = pd.read_csv('dataset.csv')
data = df['text'].tolist()

wordlist = []

for i in data:
    words = i.split()
    for w in words:
        wordlist.append(w.lower())

word_indices = dict({"<CLS>":0,"<SEP>":1,"<PAD>":2})
 
index = 3
for w in set(wordlist):
    word_indices.update({w:index})
    index+=1
print(word_indices)

{'<CLS>': 0, '<SEP>': 1, '<PAD>': 2, 'related': 3, 'mobile': 4, 'credit': 5, 'birth': 6, 'you': 7, 'card': 8, 'error': 9, '.': 10, 'complete': 11, 'amount': 12, 'digital': 13, 'gets': 14, 'sorry': 15, 'password': 16, 'online': 17, 'correct': 18, 'what': 19, 'name': 20, 'date': 21, 'updating': 22, 'are': 23, 'information': 24, 'out': 25, 'pos': 26, 'attached': 27, 'making': 28, 'as': 29, 'approved': 30, 'gateway': 31, 'any': 32, 'i': 33, 'settlement': 34, '?': 35, 'successful': 36, 'using': 37, 'many': 38, 'change': 39, 'when': 40, 'application': 41, 'insufficient': 42, 'recognised': 43, 'after': 44, 'reset': 45, 'works': 46, 'large': 47, 'also': 48, 'atm': 49, 'security': 50, 'applying': 51, 'ten': 52, 'personal': 53, 'cash': 54, 'shows': 55, 'things': 56, 'number': 57, 'your': 58, 'process': 59, 'facing': 60, 'has': 61, 'cascading': 62, 'want': 63, 'on': 64, 'status': 65, 'collected': 66, 'system': 67, 'activation': 68, 'entering': 69, 'please': 70, 'old': 71, 'another': 72, 'kolkata'

In [66]:
vocab_size = len(word_indices)
d_model = 32
n_layers = 8
h = 8
d_ff = 128
max_len = 64
batch_size = 8


lm = LanguageModel(vocab_size,d_model,n_layers,h,d_ff,max_len)

In [67]:
#Convert text into inp ids
inp = []
for sentence in data:
    tokens = []
    tokens.append(0)
    for word in sentence.split():
        id = word_indices[word.lower()]
        tokens.append(id)
    tokens.append(1)
    
   

    inp.append(tokens)


print(inp)

[[0, 33, 261, 257, 286, 5, 8, 186, 203, 95, 33, 121, 255, 204, 198, 153, 161, 280, 10, 129, 33, 117, 168, 232, 210, 144, 257, 51, 5, 8, 10, 1], [0, 33, 260, 286, 307, 295, 110, 58, 151, 95, 40, 33, 220, 131, 294, 33, 146, 198, 131, 120, 20, 226, 127, 237, 64, 214, 10, 1], [0, 33, 261, 257, 87, 215, 5, 8, 40, 33, 260, 131, 295, 110, 58, 151, 186, 203, 95, 33, 121, 255, 204, 32, 161, 280, 10, 1], [0, 161, 280, 33, 121, 255, 204, 32, 87, 215, 5, 8, 275, 33, 261, 186, 203, 110, 58, 151, 10, 1], [0, 33, 139, 297, 41, 257, 211, 277, 57, 78, 131, 151, 295, 186, 203, 95, 161, 280, 214, 226, 255, 108, 10, 1], [0, 33, 63, 105, 39, 232, 4, 57, 198, 183, 305, 189, 78, 131, 295, 224, 63, 105, 82, 131, 307, 57, 10, 1], [0, 33, 77, 131, 87, 215, 5, 8, 179, 10, 96, 33, 63, 105, 182, 131, 87, 215, 5, 8, 10, 232, 3, 24, 226, 248, 124, 10, 1], [0, 70, 182, 131, 87, 215, 5, 8, 192, 33, 77, 131, 87, 215, 5, 8, 179, 10, 168, 232, 3, 24, 226, 248, 124, 10, 1], [0, 33, 139, 297, 41, 257, 211, 131, 4, 57, 78, 

In [68]:
#Creating a dataset
X =[]
y = []

for i in inp:
    x = [0,]
    for j in range(1,len(i)-1):
        
        x.append(i[j])
        pad = []
        for k in range(0,64-len(x)):
            pad.append(2)
        #Adding padding
        fin = x+pad

        #Append        
        X.append(fin)
        y.append(i[j+1])




In [69]:
X_tensor = torch.tensor(X,dtype=torch.int)
y_tensor = torch.tensor(y)

In [70]:
#Splitting the data and creating a dataset
X_train,X_test,y_train,y_test = train_test_split(X_tensor,y_tensor,train_size=0.8,shuffle=True)
dataset = TensorDataset(X_train,y_train)
dataloader = DataLoader(dataset,batch_size=batch_size,shuffle=True)

In [71]:
#Setting model hyperparameters
epochs = 10
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(lm.parameters(),lr=5e-5)

In [None]:
#Training
lm.train()
for epoch in range(epochs):
    loss_val = 0
    for X_batch,y_batch in dataloader:
        #Creating a mask
        batch_size = X_batch.size(0)
        seq_len = X_batch.size(1)
        
        #Creating a casual mask
        casual_mask = torch.tril(torch.ones(seq_len, seq_len)).bool()  # (seq_len, seq_len)
        casual_mask = casual_mask.unsqueeze(0) 
        casual_mask = casual_mask.repeat(batch_size,  1, 1) 
        
        
        y_pred = lm(X_batch,casual_mask)[:, -1, :]
        loss = loss_fn(y_pred,y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_val += loss.item()

    print(epoch,". LOSS::",loss_val/len(dataloader))


0 . LOSS:: 5.1012709851448355


In [None]:

lm.eval()
y_pred = lm(X_test,None)[:,-1,:]
y_pred = torch.softmax(y_pred,dim=-1)



In [None]:

predicted= y_pred.argmax(dim=-1)
target = y_test.argmax(dim=-1)
acc = (predicted==target).float().mean()

print("Accuracy ",acc*100)

Accuracy  tensor(0.)


In [None]:
#Now we actually generate text

def generate_text(model,prompt:str,token_dict:dict,temperature:float,max_new_tokens: int)->str:

    #We make an inverse dictionary
    inv_token_dict = {v: k for k,v in token_dict.items() }

    pad_token = token_dict['<PAD>']
    
    encoded_inp = [token_dict.get(ch.lower(),pad_token) for ch in prompt.split()]
    encoded_inp.insert(0,0)
    inp_ids = torch.tensor([encoded_inp],dtype=torch.long)

    model.eval()
    generated = inp_ids.clone()

    for i in range(max_new_tokens):
        logits = model(generated,None)

        next_token_logits = logits[:,-1,:]/ temperature
        probs = F.softmax(next_token_logits,dim=-1)
        
        #Sample from distribution
        next_token = torch.multinomial(probs,num_samples=1)
        generated = torch.cat([generated,next_token],dim=-1)
        if next_token.item() == 1: #EOS
            break


    output_tokens = generated[0].tolist()
    return " ".join([inv_token_dict[tok] for tok in output_tokens])

In [None]:
p = str(input())
print(generate_text(lm,p,word_indices,0.1,20))

<CLS> credit till fullfill copy intermediaries has mentioned not updated or get have information any - allowed birth . what how intermediaries
