In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np

Preprocessing and loading data

In [3]:
def embedd(data):
    x = []
    y = []

    for _, row in data.iterrows():
        x_word = row['Sentence']
        y_word = row['Transformed sentence']

        x_letters = list(x_word)
        y_letters = list(y_word)

        x_numericalized_word = [ord(letter) - ord('a') + 1 for letter in x_letters]
        y_numericalized_word = [0] + [ord(letter) - ord('a') + 1 for letter in y_letters]

        x.append(x_numericalized_word)
        y.append(y_numericalized_word)

    return x, y

train_data = pd.read_csv('train_data.csv')
x_train, y_train = embedd(train_data)

x_train = np.array(x_train)
y_train = np.array(y_train)

y_train_dup = np.copy(y_train)
y_train_dup = np.delete(y_train_dup, 0, 1)
y_train = np.delete(y_train, -1, 1)

x_train = torch.LongTensor(x_train)
y_train = torch.LongTensor(y_train)
y_train_dup = torch.LongTensor(y_train_dup)

Positional Encoding Class

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len):
        super(PositionalEncoding, self).__init__()
        self.positional_encoding = self.get_positional_encoding(d_model, max_len)
        self.dropout = nn.Dropout(0.1)

    def get_positional_encoding(self, d_model, max_len):
        position = torch.arange(0, max_len).view(-1, 1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        return pe

    def forward(self, x):
        x = x + self.positional_encoding[:x.size(0), :]
        return self.dropout(x)

Transformer Class

In [5]:
class TransformerWithPositionalEncoding(nn.Module):
    def __init__(self, n_tokens, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_len, dropout=0.2):
        super(TransformerWithPositionalEncoding, self).__init__()
        self.embedding = nn.Embedding(n_tokens, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers, num_decoder_layers, dropout=dropout, batch_first=True,activation='relu', dim_feedforward=300)
        self.fc = nn.Linear(d_model, n_tokens)

    def forward(self, src, tgt):
        src = self.embedding(src)
        src = self.pos_encoder(src)
        tgt = self.embedding(tgt)
        tgt = self.pos_encoder(tgt)
        output = self.transformer(src, tgt)
        output = self.fc(output)
        return output

Train function

In [6]:
def train(model, loader, optimizer, criterion, num_epochs, batch_size):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        for x_batch, y_batch, y_dup_batch in loader:

            optimizer.zero_grad()
            output = model(x_batch, y_batch)
            loss = criterion(output.view(-1, 27), y_dup_batch.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print('Epoch: {}, Loss: {}'.format(epoch, total_loss / len(loader)))

Training the model

In [7]:
n_tokens = 27  
d_model = 300
dropout = 0.2
num_epochs = 100
batch_size = 32
nhead = 2
num_encoder_layers = 2
num_decoder_layers = 2
max_len = x_train.shape[0]
dim_feedforward = 300

train_dataset = torch.utils.data.TensorDataset(x_train, y_train, y_train_dup)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

model = TransformerWithPositionalEncoding(n_tokens, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_len, dropout)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

train(model, train_loader, optimizer, criterion, num_epochs, batch_size)

Epoch: 0, Loss: 2.9905494010611755
Epoch: 1, Loss: 2.50844508218983
Epoch: 2, Loss: 2.408306003161217
Epoch: 3, Loss: 2.3678059654148744
Epoch: 4, Loss: 2.336822781932953
Epoch: 5, Loss: 2.3244878004675043
Epoch: 6, Loss: 2.309739756257567
Epoch: 7, Loss: 2.300274648622835
Epoch: 8, Loss: 2.2858866186446796
Epoch: 9, Loss: 2.2816131964121777
Epoch: 10, Loss: 2.271703597073141
Epoch: 11, Loss: 2.264275477901441
Epoch: 12, Loss: 2.256296829545879
Epoch: 13, Loss: 2.254573964637164
Epoch: 14, Loss: 2.243974271974607
Epoch: 15, Loss: 2.2387758596847047
Epoch: 16, Loss: 2.230601513222472
Epoch: 17, Loss: 2.222371438866881
Epoch: 18, Loss: 2.214102838681713
Epoch: 19, Loss: 2.2077877989642696
Epoch: 20, Loss: 2.196656235821171
Epoch: 21, Loss: 2.193313281830043
Epoch: 22, Loss: 2.1835959327819685
Epoch: 23, Loss: 2.179123483292044
Epoch: 24, Loss: 2.1674943645250853
Epoch: 25, Loss: 2.163578525525794
Epoch: 26, Loss: 2.155235687891642
Epoch: 27, Loss: 2.150220865528333
Epoch: 28, Loss: 2.141

Saving and loading the model

In [8]:
torch.save(model.state_dict(), 'model_final.pt')
model.load_state_dict(torch.load('model_final.pt'))

<All keys matched successfully>

Predict function

In [9]:
def predict(model, string):
    model.eval()
    x_letters = list(string)
    x_numericalized_word = [ord(letter) - ord('a') + 1 for letter in x_letters]
    x_numericalized_word = torch.LongTensor(x_numericalized_word)
    x_numericalized_word = x_numericalized_word.unsqueeze(0)
    y_numericalized_word = [0] 
    y_numericalized_word = torch.LongTensor(y_numericalized_word)
    y_numericalized_word = y_numericalized_word.unsqueeze(0)

    for i in range(8):
        #print(x_numericalized_word.shape, y_numericalized_word.shape)
        output = model(x_numericalized_word, y_numericalized_word)
        output = output.argmax(dim=2)
        y_numericalized_word = torch.cat((y_numericalized_word, output[:, -1].unsqueeze(0)), dim=1)

    y_numericalized_word = y_numericalized_word.squeeze(0)
    y_numericalized_word = y_numericalized_word.tolist()
    y_numericalized_word = [chr(letter + ord('a') - 1) for letter in y_numericalized_word]
    y_word = ''.join(y_numericalized_word)
    return y_word

Checker script

In [11]:
def predict(model, input_str):
    model.eval()
    input_str = [ord(c) - 96 for c in input_str]
    input_str = np.array([np.array(input_str)])
    input_str = torch.from_numpy(input_str)
    output = model(input_str, input_str)
    output = output.detach().numpy()
    output = output.argmax(axis=2)
    output = output[0]
    output = [chr(c + 96) for c in output]
    output = "".join(output)
    return output

# Function to check how many characters match in the two strings
def check(pred: str, true: str):
    correct = 0
    for a, b in zip(pred, true):
        if a == b:
            correct += 1
    return correct

# Function to score the model's performance
def evaluate(model):
    print("Obtaining metrics for eval data:")
    eval_data = pd.read_csv("eval_data.csv").to_numpy()
    results = {
        "pred": [],
        "true": [],
        "score": [],
    }
    correct = [0 for _ in range(9)]
    for x, y in eval_data:
        pred = predict(model, x)
        print(f"Predicted: {pred}, True: {y}")
        score = check(pred, y)
        results["pred"].append(pred)
        results["true"].append(y)
        results["score"].append(score)

        correct[score] += 1
    print("Eval dataset results:")
    for num_chr in range(9):
        print(
            f"Number of predictions with {num_chr} correct predictions: {correct[num_chr]}"
        )
    points = sum(correct[4:6]) * 0.5 + sum(correct[6:])
    marks = round(min(2, points / 1400 * 2) * 2) / 2  # Rounds to the nearest 0.5
    print(f"Points: {points}")
    print(f"Marks: {marks}")
    # Save predicitons and true sentences to inspect manually if required.
    pd.DataFrame.from_dict(results).to_csv("results_eval.csv", index=False)


evaluate(model)

Obtaining metrics for eval data:
Predicted: xaaxaaax, True: ldlhgjuj
Predicted: yyyayyay, True: mffolhhl
Predicted: qnqnnnnn, True: xjwbqnnq
Predicted: eoxfpebe, True: nqxwxmtb
Predicted: nknknxxp, True: dlbdbgvx
Predicted: ggngpgpg, True: fwkibsou
Predicted: gbggbbbg, True: ulafntih
Predicted: rssrssrs, True: ontitdlb
Predicted: ovfoooof, True: epprjfot
Predicted: daddqdad, True: sdgztwup
Predicted: uuuguguq, True: sqcdadyx
Predicted: fqouuoqq, True: ojsllopa
Predicted: araaanna, True: zdtsndhk
Predicted: zqvzyzfq, True: kiwtuwyj
Predicted: fffffbfb, True: jxfwiaky
Predicted: aaqivinr, True: cxqzjrox
Predicted: tttrefte, True: vyakrkdv
Predicted: vtttzzvd, True: rfoaeevr
Predicted: bbbdgaga, True: bgbcvwei
Predicted: rrrarrra, True: yrozhdru
Predicted: irooxiii, True: gnulhwmv
Predicted: ribigibg, True: kflttcgt
Predicted: xxzzxxwr, True: tlcmfsqf
Predicted: gvmmbvee, True: kwnizswj
Predicted: nwnwwwnn, True: ebyojfqs
Predicted: sstsnsbb, True: eyfqqwxb
Predicted: huuuhuhu, True: prgz