In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from tqdm import tqdm
import sklearn
from torch.utils.data import DataLoader, Dataset
import string
import checker

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
train_data = pd.read_csv('train_data.csv')
eval_data = pd.read_csv('eval_data.csv')

In [4]:
char_vocab = [ord(char) - 96 for char in list(string.ascii_lowercase)]
char_vocab = [0] + char_vocab

def to_word(x):
    output = [chr(i + 96) for i in x]
    str = ''
    return str.join(output)

In [5]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.data = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data.iloc[idx]

        original_sen = [ord(char) - 96 for char in list(sample['Sentence'])]
        transformed_sen = [ord(char) - 96 for char in list(sample['Transformed sentence'])]

        features = torch.tensor(original_sen, dtype=torch.long)
        target = torch.tensor(transformed_sen, dtype=torch.long)

        if self.transform:
            features = self.transform(features)

        return features, target

In [20]:
batch_size = 500
learning_rate = 0.1
hidden_size1 = 256
hidden_size2 = 30
num_epochs = 1
num_classes = 1000

sequence_length = 8
num_layers = 1
input_size = 1000

In [21]:
train_dataset = CustomDataset(train_data)
eval_dataset = CustomDataset(eval_data)

train_loader = DataLoader(dataset=train_dataset, batch_size= batch_size, shuffle = True)
eval_loader = DataLoader(dataset = eval_dataset, batch_size = batch_size, shuffle = False)

In [30]:
class MyModel(nn.Module):

    def __init__(self, num_layers, input_size, hidden_size1, hidden_size2, num_classes):
        super(MyModel, self).__init__()
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        self.num_classes = num_classes

        self.embedding = nn.Embedding(num_embeddings=len(char_vocab), embedding_dim=input_size)
        self.encoder = nn.LSTM(input_size = input_size, hidden_size=hidden_size1, num_layers=num_layers, dropout = 0.7, batch_first = True, bidirectional=True)
        self.decoder = nn.LSTM(input_size = input_size+hidden_size2, hidden_size=hidden_size2, num_layers=num_layers, dropout = 0.7, batch_first = True)
        self.dropout = nn.Dropout(p = 0.7)
        self.linear = nn.Linear(hidden_size2, 20)
        self.bridge = nn.Linear(2 * hidden_size1, hidden_size2)
        self.relu = nn.ReLU()
        self.softmax = nn.LogSoftmax(dim = -1)
        self.output = nn.Linear(20, num_classes)

    def forward(self, x, target=None):
        if target != None:
            embeddings = torch.zeros(x.size(0), x.size(1), self.input_size)
            embeddings_target = torch.zeros(target.size(0), target.size(1), self.input_size)
            for i in range(len(x)):
                input_indicies = [char_vocab.index(char) for char in x[i]]
                input_indicies_target = [char_vocab.index(char) for char in target[i]]

                embeddings[i] = self.embedding(torch.tensor(input_indicies))
                embeddings_target[i] = self.embedding(torch.tensor(input_indicies_target))

            h0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size1)
            c0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size1)

            out, (_, _) = self.encoder(embeddings, (h0, c0))
            out1 = out[:, -1, :self.hidden_size1]
            out2 = out[:, 0, :self.hidden_size1]
            out = torch.cat([out1, out2], -1)

            out = self.bridge(out)
            out = self.relu(out)
            

            c1 = torch.zeros(self.num_layers, out.size(0), out.size(1))
            h1 = out.repeat(self.num_layers,1,1)

            embeddings_target = torch.cat([embeddings_target, out.unsqueeze(1).repeat(1, sequence_length, 1)], -1)

            final_out, (_, _) = self.decoder(embeddings_target, (h1, c1))

            final_out = self.linear(final_out)
            final_out = self.relu(final_out)
            final_out = self.dropout(final_out)
            final_out = self.output(final_out)
            #final_out = self.softmax(final_out)

            return final_out

        else:
            return self.inference(x)

    def inference(self, x):
        embeddings = torch.zeros(x.size(0), x.size(1), self.input_size)
        for i in range(len(x)):
            input_indicies = [char_vocab.index(char) for char in x[i]]
            embeddings[i] = self.embedding(torch.tensor(input_indicies))
        
        h0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size1)
        c0 = torch.zeros(2*self.num_layers, x.size(0), self.hidden_size1)

        out1, (_, _) = self.encoder(embeddings, (h0, c0))
        out2 = out1[:, -1, :self.hidden_size1]
        out3 = out1[:, 0, :self.hidden_size1]
        out1 = torch.cat([out2, out3], -1)

        out1 = self.bridge(out1)
        out1 = self.relu(out1)

        c1 = torch.zeros(self.num_layers, out1.size(0), out1.size(1))
        h1 = out1.repeat(self.num_layers, 1, 1)

        output_sequence = torch.zeros(sequence_length, x.size(0), num_classes)
        predictions = torch.zeros(sequence_length, batch_size)
        input = self.embedding(torch.tensor([0])).view(1, -1)
        input = input.unsqueeze(0).expand(x.size(0), -1, -1)
        input = torch.cat([input, out1.unsqueeze(1)], -1)

        for i in range(sequence_length):
            out, (h, c) = self.decoder(input, (h1, c1))

            out = self.linear(out)
            out = self.relu(out)
            out = self.dropout(out)
            out = self.output(out)
            #out = self.softmax(out)
            output_sequence[i] = out.squeeze(1)

            c1 = c

            pred_index = self.lookup_table(out.squeeze(1))
            predictions[i] = pred_index
            input = self.embedding(pred_index.to(torch.int32)).view(x.size(0), 1, -1)
            input = torch.cat([input, out1.unsqueeze(1)], -1)
        predictions = predictions.view(-1, sequence_length)
        pred = to_word(predictions[0].to(torch.int32))


        return output_sequence.view(x.size(0), sequence_length, -1), predictions, pred

    def lookup_table(self, out):
        min_dist = float('inf')
        min_index = 0
        pred_index = torch.zeros(out.size(0))

        for i in range(len(out)):
            for j in range(len(char_vocab)):
                embed = self.embedding(torch.tensor([j]))
                dist = torch.norm(out[i] - embed, p=2)
                if dist < min_dist:
                    min_dist = dist
                    min_index = j
            min_dist = float('inf')
            pred_index[i] = min_index
            min_index = 0
        
        return pred_index



        

        
        

In [31]:
model = MyModel(num_layers, input_size, hidden_size1, hidden_size2, num_classes)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [32]:
n_total_steps = len(train_loader)


patience = 10
patience_counter = 0
best_epoch = 0
best_val = float("inf")

for epoch in range(num_epochs):
    for i, (X_train, Y_train) in enumerate(train_loader):
        input = torch.zeros(Y_train.size(0), Y_train.size(1))
        for j in range(len(Y_train)):
            input[j] = torch.cat((torch.tensor([0]), Y_train[j, :-1]), 0)
        input = input.to(device)

        X_train = X_train.to(device)

        outputs = model(X_train, input)

        Y_train_embed = torch.zeros(batch_size, sequence_length, input_size)
        for j in range(len(Y_train)):
            input_inidcies = [char_vocab.index(char) for char in Y_train[j]]
            Y_train_embed[j] = model.embedding(torch.tensor(input_inidcies))
        

        loss = criterion(outputs, Y_train_embed)

        optimizer.zero_grad()        
        loss.backward()

        optimizer.step()


        if (i+1)%7 == 0:
            print(f'epoch {epoch+1}/{num_epochs}, step {i+1}/{n_total_steps}, loss  {loss.item():.4f}')

    val_loss = 0.0
    with torch.no_grad():
        for i, (X_val, Y_val) in enumerate(eval_loader):
            X_val = X_val.to(device)
            outputs, _, _ = model(X_val)
            
            Y_val_embed = torch.zeros(batch_size, sequence_length, input_size)
            for i in range(len(Y_train)):
                input_inidcies = [char_vocab.index(char) for char in Y_val[i]]
                Y_val_embed[i] = model.embedding(torch.tensor(input_inidcies))

            val_loss += criterion(outputs, Y_val_embed)

    val_loss /= len(eval_loader)
    print(f'val loss {val_loss}')

    if val_loss < best_val:
        best_val = val_loss
        best_epoch = epoch
        patience_counter = 0
    else:
        patience_counter+=1

    if patience_counter >= patience:
        print(f'EARLY STOPPING AT EPOCH {epoch + 1} BEST EPOCH IS {best_epoch + 1}')
        break


KeyboardInterrupt: 

In [None]:
with torch.no_grad():
    for i, (X_val, Y_val) in enumerate(eval_loader):
        X_val = X_val.to(device)
        outputs, predictions, _ = model(X_val)
        
        Y_val_embed = torch.zeros(batch_size, sequence_length, input_size)
        for i in range(len(Y_train)):
            input_inidcies = [char_vocab.index(char) for char in Y_val[i]]
            Y_val_embed[i] = model.embedding(torch.tensor(input_inidcies))

        print(criterion(outputs, Y_val_embed))

        for j in range(len(Y_val)):
            print('-----------')
            print(f'Original string : {to_word(X_val[j])}')
            print(f'Transformed String : {to_word(Y_val[j])}')
            print(f'Predicted String: {to_word(predictions[j].to(torch.int32))}')


tensor(0.0793)
-----------
Original string : atxwvepa
Transformed String : ldlhgjuj
Predicted String: ````````
-----------
Original string : iaayjhyr
Transformed String : mffolhhl
Predicted String: ````````
-----------
Original string : nmnqjobz
Transformed String : xjwbqnnq
Predicted String: ````````
-----------
Original string : puembxfo
Transformed String : nqxwxmtb
Predicted String: ````````
-----------
Original string : kpxqtnng
Transformed String : dlbdbgvx
Predicted String: ````````
-----------
Original string : nqalgpgq
Transformed String : fwkibsou
Predicted String: ````````
-----------
Original string : bryqeggb
Transformed String : ulafntih
Predicted String: ````````
-----------
Original string : soactzsr
Transformed String : ontitdlb
Predicted String: ````````
-----------
Original string : fjopbuvo
Transformed String : epprjfot
Predicted String: ````````
-----------
Original string : ldqwmeda
Transformed String : sdgztwup
Predicted String: ````````
-----------
Original stri

In [19]:
checker.evaluate(model)

Obtaining results for training data:
Train dataset results:
Number of predictions with 0 correct predictions: 7000
Number of predictions with 1 correct predictions: 0
Number of predictions with 2 correct predictions: 0
Number of predictions with 3 correct predictions: 0
Number of predictions with 4 correct predictions: 0
Number of predictions with 5 correct predictions: 0
Number of predictions with 6 correct predictions: 0
Number of predictions with 7 correct predictions: 0
Number of predictions with 8 correct predictions: 0
Points: 0.0
Obtaining metrics for eval data:
Eval dataset results:
Number of predictions with 0 correct predictions: 2000
Number of predictions with 1 correct predictions: 0
Number of predictions with 2 correct predictions: 0
Number of predictions with 3 correct predictions: 0
Number of predictions with 4 correct predictions: 0
Number of predictions with 5 correct predictions: 0
Number of predictions with 6 correct predictions: 0
Number of predictions with 7 correc