In [1]:
import torch
import torch.nn as nn
from torchtext.vocab import GloVe
import torch.optim as optim
import pickle
import os
import random
import nltk
import json
import re
import numpy as np
import pandas as pd
import subprocess

from nltk import word_tokenize
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from time import time
from transformers import BertTokenizer
from transformers import BertModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class ARGS():
    def __init__(self):
        self.model_type = "lstm_lstm"
        self.data_dir = "/kaggle/input/math-data"
        self.batch_size = 32
        self.num_workers = 4
        self.epochs = 80
        self.en_hidden = 512
        self.de_hidden = 512
        self.en_num_layers = 1
        self.de_num_layers = 1
        self.embed_dim = 300
        self.processed_data = "/kaggle/input/math-data"
        self.checkpoint_dir = "/kaggle/working/checkpoints"
        self.result_dir = "/kaggle/working/results"

In [2]:
os.mkdir("/kaggle/working/results")
os.mkdir("/kaggle/working/checkpoints")

In [3]:
# import shutil
# shutil.rmtree("/kaggle/working/results")
# shutil.rmtree("/kaggle/working/checkpoints")

UTILS

In [4]:
def tokenize_problem(problem):
    return word_tokenize(problem)

def tokenize_formula(formula):
    all_formulas = formula.split("|")
    formula_split = []
    for s in all_formulas:
        s_brk = re.match(r"([a-zA-Z]+)\(([^,]+),([^)]+)\)", s)
        if s_brk:
            formula_split.extend([s_brk.group(1),"(", s_brk.group(2),",", s_brk.group(3), ")","|"])
    return formula_split


def build_vocab(pth):
    # make file encoder.vocab dn decoder.vocab 
    # data is the directory which contain train.json
    
    encoder_vocab = set()
    decoder_vocab = set()
    encoder_word2idx = {}
    encoder_idx2word = {}
    decoder_word2idx = {}
    decoder_idx2word = {}
    pth2 = os.path.join(pth, "train.json")
    data = json.load(open(pth2))
    for d in data:
        problem = d["Problem"]
        linear_formula = d["linear_formula"]
        problem = tokenize_problem(problem)
        linear_formula = tokenize_formula(linear_formula)
        for p in problem:
            encoder_vocab.add(p)
        for l in linear_formula:
            decoder_vocab.add(p)

    encoder_vocab = list(encoder_vocab)
    decoder_vocab = list(decoder_vocab)
    encoder_vocab.sort()
    decoder_vocab.sort()

    for i, word in enumerate(encoder_vocab):
        encoder_word2idx[word] = i
        encoder_idx2word[i] = word
    for i, word in enumerate(decoder_vocab):
        decoder_word2idx[word] = i
        decoder_idx2word[i] = word

    with open(os.path.join(pth, "encoder.vocab"), "x") as file:
        for word in encoder_vocab:
            file.write(word + "\n")
    with open(os.path.join(pth, "decoder.vocab"), "x") as file:
        for word in decoder_vocab:
            file.write(word + "\n")

    # make pickle file for word2idx and idx2word
    with open(os.path.join(pth, "encoder_word2idx.pickle"), "xb") as file:
        pickle.dump(encoder_word2idx, file)
    with open(os.path.join(pth, "encoder_idx2word.pickle"), "xb") as file:
        pickle.dump(encoder_idx2word, file)
    with open(os.path.join(pth, "decoder_word2idx.pickle"), "xb") as file:
        pickle.dump(decoder_word2idx, file)
    with open(os.path.join(pth, "decoder_idx2word.pickle"), "xb") as file:
        pickle.dump(decoder_idx2word, file)

    print("Encoder Vocab Size = {}, Decoder Vocab Size = {}".format(len(encoder_vocab), len(decoder_vocab)) )
    print("Encoder word2idx Size = {}, Decoder word2idx Size = {}".format(len(encoder_word2idx), len(decoder_word2idx)))

def load_checkpoint(args, chkpt = "best"):

    if chkpt == "best":
        model_name = os.path.join(args.checkpoint_dir, "best_loss_checkpoint_{}.pth".format(args.model_type))
        status_file = os.path.join(args.checkpoint_dir, "best_loss_chkpt_status_{}.json".format(args.model_type))
    else:
        model_name = os.path.join(args.checkpoint_dir, "latest_checkpoint_{}.pth".format(args.model_type))
        status_file = os.path.join(args.checkpoint_dir, "latest_chkpt_status_{}.json".format(args.model_type))

    assert os.path.isfile(model_name), f"Model path/name invalid: {model_name}"
    
    net = torch.load(model_name)
    with open(status_file, "r") as file:
        model_dict = json.load(file)
    print(f"\n|--------- Model Load Success. Trained Epoch: {str(model_dict['epoch'])}")

    return net


ENCODER BLOCK

In [5]:
SPL_TOKEN = ["<pad>", "<unk>", "<sos>", "<eos>"]

class GloveEmbeddings():
    def __init__(self, embed_dim,  word_to_idx):
        self.embed_dim = embed_dim
        self.word_to_idx = word_to_idx
        self.spl_tokens = SPL_TOKEN
        self.vocab_size = len(word_to_idx)

    def get_embedding_matrix(self):
        # Load pre-trained GloVe embeddings
        glove = GloVe(name='6B', dim=self.embed_dim)
        embed_matrix = torch.zeros((self.vocab_size, self.embed_dim))

        embed_matrix[0] = torch.zeros(self.embed_dim)    # Padding token
        for i in range(1,len(SPL_TOKEN)):            
            embed_matrix[i] = torch.randn(self.embed_dim)    # Start-of-sentence token
            
        for k, v in self.word_to_idx.items():
            if k in SPL_TOKEN:
                continue
            else:            
                if k in glove.stoi:
                    embed_matrix[v] = torch.tensor(glove.vectors[glove.stoi[k]])
                else:
                    embed_matrix[v] = embed_matrix[1]
        return embed_matrix


class LSTMEncoder(nn.Module):
    # Bidirectional LSTM Encoder
    def __init__(self, input_size , embed_dim, hidden_units =1024, num_layers = 1, embed_matrix = None, p = 0.3):
        super(LSTMEncoder, self).__init__()
        # parameters
        self.input_size = input_size
        self.embed_dim = embed_dim
        self.hidden_units = hidden_units
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embed_matrix = embed_matrix
        self.embedding = nn.Embedding(input_size, embed_dim)
        if embed_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(embed_matrix)
        else:
            self_embedding  = nn.Embedding(input_size, embed_dim, padding_idx = 0)

        self.LSTM = nn.LSTM(input_size=embed_dim, hidden_size=hidden_units, num_layers = num_layers, batch_first = True, bidirectional = True)
        self.hidden = nn.Linear(2*hidden_units, hidden_units)   
        self.cell = nn.Linear(2*hidden_units, hidden_units)
        

    def forward(self,x):
        # apply dropout to the embeddings 
        x = self.dropout(self.embedding(x))
        # apply LSTM
        output, (hidden,cell) = self.LSTM(x)
        hidden = self.hidden(torch.cat((hidden[0:1], hidden[1:2]), dim = 2)) # concatenate the forward and backward LSTM hidden states
        cell = self.cell(torch.cat((cell[0:1], cell[1:2]), dim = 2)) # concatenate the forward and backward LSTM cell states
        return output, (hidden, cell)
    

#BERT encoder
class BertEncoder(nn.Module):
    def __init__(self, model_type, bert_tune_layers, hidden_units=1024):
        super(BertEncoder, self).__init__()
        self.model_type = model_type
        self.hidden_units = hidden_units
        self.bert_tune_layers = bert_tune_layers
        bert_model = BertModel.from_pretrained('bert-base-cased')

        if self.model_type == "bert_lstm_attn_frozen":
            print("BERT Encoder with frozen embeddings.")
            for name, param in bert_model.named_parameters():
                param.requires_grad = False
    
        elif self.model_type == "bert_lstm_attn_tuned":
            self.bert_tune_layers = bert_tune_layers
            print("BERT Encoder with tuned embeddings.")
            if self.bert_tune_layers != -1:
                for param in bert_model.parameters():
                    param.requires_grad = False
                for param in bert_model.encoder.layer[-self.bert_tune_layers:].parameters():
                    param.requires_grad = True
            
        print("Total Bert Params = {}, Total Trainable Params = {}".format(sum(p.numel() for p in bert_model.parameters()), sum(p.numel() for p in bert_model.parameters() if p.requires_grad)))
        self.encoder = bert_model

    def forward(self, x, attn_mask):
        outputs = self.encoder(input_ids = x, attention_mask = attn_mask)
        encodings = outputs.last_hidden_state
        return encodings

DECODER BLOCK

In [6]:
class LSTMDecoder(nn.Module):
    def __init__(self, input_size, embed_dim, hidden_units = 1024, num_layers = 1, p = 0.3):
        super(LSTMDecoder, self).__init__()
        # parameters
        self.input_size = input_size
        self.embed_dim = embed_dim
        self.hidden_units = hidden_units
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embed_dim)
        self.LSTM = nn.LSTM(input_size = embed_dim, hidden_size = hidden_units, num_layers = num_layers, batch_first = True)
        self.fc = nn.Linear(in_features = hidden_units, out_features = input_size)
    def forward(self, x, hidden_cell):
        # apply dropout to the embeddings
        x = self.dropout(self.embedding(x))
        x = x.unsqueeze(1) # unsqueeze the embeddings to add a dimension
        # apply LSTM
        out, (h_t, c_t) = self.LSTM(x, hidden_cell)
        out = self.fc(out)
        return out, (h_t, c_t)
    
class AttentionNetwork(nn.Module):
    def __init__(self, hidden_units):
        super(AttentionNetwork, self).__init__()
        self.hidden_units = hidden_units
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        self.attn = nn.Linear(hidden_units * 3, hidden_units)
        self.v = nn.Linear(hidden_units, 1, bias=False)

    def forward(self, ht, encoder_out):
        ht = ht.repeat(encoder_out.shape[1], 1 , 1)
        ht = ht.transpose(0,1)
        energy = torch.cat([ht, encoder_out], dim=2)
        energy = self.attn(energy)
        energy = self.relu(energy)        
        attention = self.v(energy)
        attention = attention.squeeze(2)
        attention_weights = self.softmax(attention)
        attention_weights = attention_weights.unsqueeze(1)
        context = torch.bmm(attention_weights, encoder_out)
        return context, attention_weights
    
class LSTMAttnDecoder(nn.Module):
    def __init__(self, input_size, embed_dim, hidden_units=1024, num_layers=1, p = 0.3, bidirectional=False):
        super(LSTMAttnDecoder, self).__init__()
        self.input_size = input_size
        self.embed_dim = embed_dim
        self.hidden_units = hidden_units
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, self.embed_dim, padding_idx=0)
        self.attention = AttentionNetwork(hidden_units)
        self.LSTM = nn.LSTM(2*hidden_units + embed_dim, hidden_units, num_layers = num_layers, bidirectional = bidirectional, dropout=p, batch_first=True)
        self.fc = nn.Linear(hidden_units, input_size)

    def forward(self, x, h0_c0, encoder_out):
        x = self.dropout(self.embedding(x)) #
        x = x.unsqueeze(1)
        context, _ = self.attention(h0_c0[0], encoder_out)
        x = torch.cat([x, context], dim=2)
        decoder_out, (ht, ct) = self.LSTM(x, h0_c0)
        out = self.fc(decoder_out)
        return out, (ht, ct)
    

class AttentionNetworkBert(nn.Module):
    def __init__(self, hidden_units):
        super(AttentionNetworkBert, self).__init__()
        self.hidden_units = hidden_units
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        self.attn = nn.Linear(hidden_units * 2, hidden_units)
        self.v = nn.Linear(hidden_units, 1, bias=False)

    def forward(self, ht, encoder_out): 
        ht = ht.repeat(encoder_out.shape[1], 1 , 1)
        ht = ht.transpose(0,1)
        energy = torch.cat([ht, encoder_out], dim=2)
        energy = self.attn(energy)
        energy = self.relu(energy)        
        attention = self.v(energy)
        attention = attention.squeeze(2)
        attention_weights = self.softmax(attention)
        attention_weights = attention_weights.unsqueeze(1)
        context = torch.bmm(attention_weights, encoder_out)
        return context, attention_weights

class LSTMAttnDecoderBert(nn.Module):
    def __init__(self, input_size, embed_dim, hidden_units=1024, num_layers=1, p = 0.3, bidirectional=False):
        super(LSTMAttnDecoderBert, self).__init__()
        self.input_size = input_size
        self.embed_dim = embed_dim
        self.hidden_units = hidden_units
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, self.embed_dim, padding_idx=0)
        self.attention = AttentionNetworkBert(hidden_units)
        self.LSTM = nn.LSTM(hidden_units + embed_dim, hidden_units, num_layers = num_layers, bidirectional = bidirectional, dropout=p, batch_first=True)
        self.fc = nn.Linear(hidden_units, input_size)

    def forward(self, x, h0_c0, encoder_out):
        x = self.dropout(self.embedding(x))
        x = x.unsqueeze(1)
        context, attention_weights = self.attention(h0_c0[0], encoder_out)
        x = torch.cat([x, context], dim=2)
        decoder_out, (ht, ct) = self.LSTM(x, h0_c0)
        out = self.fc(decoder_out)
        
        return out, (ht, ct)

SEQ2SEQ

In [26]:
class Seq2Seq(nn.Module):
    def __init__(self,args):
        super(Seq2Seq, self).__init__()
        self.args=args

        #extra see if needed 
        self.model_type = args.model_type
        self.embed_dim = args.embed_dim        
        self.encoder_hidden_units = args.en_hidden
        self.decoder_hidden_units = args.de_hidden
        self.encoder_num_layers = args.en_num_layers
        self.decoder_num_layers = args.de_num_layers
        self.processed_data = args.processed_data

        self.encoder_word2idx = self.get_encoder_word2idx()
        self.decoder_word2idx = self.get_decoder_word2idx()
        self.encoder_input_size = len(self.encoder_word2idx)
        self.decoder_input_size = len(self.decoder_word2idx)
        self.encoder = self.get_encoder()
        self.decoder = self.get_decoder()

    def get_encoder_word2idx(self):
        with open(os.path.join(self.processed_data, "encoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        with open(os.path.join(self.processed_data, "encoder_idx2word.pickle"), "rb") as file:
            idx2word = pickle.load(file)
                
        
        return word2idx
    
    def get_decoder_word2idx(self):
        
        with open(os.path.join(self.processed_data, "decoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        with open(os.path.join(self.processed_data, "decoder_idx2word.pickle"), "rb") as file:
            idx2word = pickle.load(file)
        
        return word2idx

    def get_encoder(self):
        print("Loading GloVe embeddings...")
        glove = GloveEmbeddings(self.embed_dim, self.encoder_word2idx)
        embedding_matrix = glove.get_embedding_matrix()
        print("Loading Encoder...")
        encoder = LSTMEncoder(input_size = self.encoder_input_size, embed_dim = self.embed_dim, 
                              hidden_units=self.encoder_hidden_units, num_layers=self.encoder_num_layers, p = 0.3, embed_matrix=embedding_matrix).to(device)
        
        return encoder
    
    def get_decoder(self):
        print("Loading Seq2Seq LSTM Decoder...")
        decoder = LSTMDecoder(input_size = self.decoder_input_size, embed_dim = self.embed_dim, 
                          hidden_units=self.decoder_hidden_units, num_layers=self.decoder_num_layers, p = 0.3).to(device)
        return decoder
    
    def forward(self, problem, linear_formula, tf_ratio=0.6):
        batch_size = problem.shape[0]
        target_len = linear_formula.shape[1]

        _ ,(hidden, cell) = self.encoder(problem)
        target_vocab_size = self.decoder_input_size
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(device)
        x =  linear_formula[:,0]
        for t in range(1, target_len):
            output,( hidden, cell) = self.decoder(x,( hidden, cell))
            output = output.squeeze(1)
            outputs[:,t,:] = output
            x = output.argmax(dim=1)
            x = linear_formula[:,t] if random.random() < tf_ratio else x
            # x = output.argmax(1)
        return outputs

SEQ2SEQ ATTENTION

In [8]:
class Seq2SeqAttn(nn.Module):
    def __init__(self,args):
        super(Seq2SeqAttn, self).__init__()
        self.args = args
        self.model_type = args.model_type
        self.embed_dim = args.embed_dim
        self.encoder_hidden_units = args.en_hidden
        self.decoder_hidden_units = args.de_hidden
        self.encoder_num_layers = args.en_num_layers
        self.decoder_num_layers = args.de_num_layers
        self.processed_data = args.processed_data

        self.encoder_word2idx = self.get_encoder_word2idx()
        self.decoder_word2idx = self.get_decoder_word2idx()
        self.encoder_input_size = len(self.encoder_word2idx)
        self.decoder_input_size = len(self.decoder_word2idx)
        self.encoder = self.get_encoder()
        self.decoder = self.get_decoder()

        self.start_token = self.decoder_word2idx["<sos>"]
        self.end_token = self.decoder_word2idx["<eos>"]

    def get_encoder_word2idx(self):
        with open(os.path.join(self.processed_data, "encoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        
        return word2idx

    def get_decoder_word2idx(self):
        with open(os.path.join(self.processed_data, "decoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)        
        
        return word2idx

    def get_encoder(self):
        print("Loading GloVe embeddings...")
        glove = GloveEmbeddings(self.embed_dim, self.encoder_word2idx)
        embedding_matrix = glove.get_embedding_matrix()
        print("Loading Encoder...")
        encoder = LSTMEncoder(input_size = self.encoder_input_size, embed_dim = self.embed_dim, 
                              hidden_units=self.encoder_hidden_units, num_layers=self.encoder_num_layers, p = 0.3, embed_matrix=embedding_matrix)
        return encoder

    def get_decoder(self):
        decoder = LSTMAttnDecoder(input_size = self.decoder_input_size, embed_dim = self.embed_dim, 
                        hidden_units=self.decoder_hidden_units, num_layers=self.decoder_num_layers, p = 0.3)
        return decoder

    def forward(self, problem, linear_formula, tf_ratio=0.6):
        batch_size = problem.shape[0]
        target_len = linear_formula.shape[1]
        
        encoder_out ,(hidden, cell) = self.encoder(problem)
        target_vocab_size = self.decoder_input_size
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(device)
        x =  linear_formula[:,0]
        for t in range(1, target_len):
            output,( hidden, cell) = self.decoder(x,( hidden, cell), encoder_out)
            output = output.squeeze(1)
            outputs[:,t,:] = output
            x = output.argmax(dim=1)
            x = linear_formula[:,t] if random.random() < tf_ratio else x # teacher force ratio
        return outputs

BERT2SEQUENCE ATTENTION

In [9]:
class Bert2SeqAttn(nn.Module):
    def __init__(self, args):
        super(Bert2SeqAttn, self).__init__()
        self.args = args
        self.model_type = args.model_type
        self.embed_dim = args.embed_dim        
        self.encoder_hidden_units = args.en_hidden
        self.decoder_hidden_units = args.de_hidden
        self.encoder_num_layers = args.en_num_layers
        self.decoder_num_layers = args.de_num_layers
        self.processed_data = args.processed_data
        self.encoder_word2idx = self.get_encoder_word2idx()
        self.decoder_word2idx = self.get_decoder_word2idx()
        self.encoder_input_size = len(self.encoder_word2idx)
        self.decoder_input_size = len(self.decoder_word2idx)
        self.encoder = self.get_encoder()
        self.decoder = self.get_decoder()
        self.start_token = self.decoder_word2idx["<sos>"]
        self.end_token = self.decoder_word2idx["<eos>"]

    def get_encoder_word2idx(self):
        with open(os.path.join(self.processed_data, "encoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        
        return word2idx
    
    def get_decoder_word2idx(self):
        
        with open(os.path.join(self.processed_data, "decoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)        
        
        return word2idx

    def get_encoder(self):
        print("Loading Bert Encoder...")

        encoder = BertEncoder(self.model_type, self.args.bert_tune_layers, self.encoder_hidden_units)
        return encoder

    def get_decoder(self):
        print("Loading Seq2Seq LSTM Attention Decoder...")
        
        decoder = LSTMAttnDecoderBert(input_size = self.decoder_input_size, embed_dim = self.embed_dim, 
                    hidden_units=self.decoder_hidden_units, num_layers=self.decoder_num_layers, p = 0.3)
            
        return decoder

    def forward(self, problem, attn_mask, linear_formula, tf_ratio=0.6):
        batch_size = problem.shape[0]
        max_target_len = linear_formula.shape[1]
        
        encoder_out = self.encoder(problem, attn_mask)

        target_vocab_size = self.decoder_input_size
        outputs = torch.zeros(batch_size, max_target_len, target_vocab_size).to(device)
        words = torch.zeros(batch_size, max_target_len).to(device)
        
        hidden = torch.zeros(1, batch_size, self.decoder_hidden_units).to(device)
        cell = torch.zeros(1, batch_size, self.decoder_hidden_units).to(device)
        x = linear_formula[:,0]            
        words[:, 0] = linear_formula[:,0]
        for t in range(1, max_target_len):
            # print("DECODER INPUT SHAPES x.shape, hidden.shape, cell.shape, encoder_out.shape", x.shape, hidden.shape, cell.shape, encoder_out.shape)
            output, (hidden, cell) = self.decoder(x, (hidden, cell), encoder_out)
#             print("Seq2seq out shape", output.shape)
            output = output.squeeze(1)
            outputs[:,t,:] = output
            x = output.argmax(dim=1)
            x = linear_formula[:,t] if random.random() < tf_ratio else x
            words[:, t] = x
        return outputs, words

DATASET

TEXTTOMATH

In [10]:
class TextToMathDataset(Dataset):
    def __init__(self, file_path, data_prefix = "train"):
        self.file_path = file_path
        pth = os.path.join(self.file_path, f"{data_prefix}.json")
        
        print(pth)
        self.data  = json.load(open(pth))
        self.data = self.data[:100]
        print("Dataset Length =", len(self.data))
#         print("Dataset Length =", len(self.data))

        with open(os.path.join(file_path, "encoder.vocab"), "r") as file:
            vocab = file.readlines()
        self.encoder_vocab = vocab
        
        with open(os.path.join(file_path, "decoder.vocab"), "r") as file:
            vocab = file.readlines()
        self.decoder_vocab = vocab

        with open(os.path.join(file_path, "encoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        with open(os.path.join(file_path, "encoder_idx2word.pickle"), "rb") as file:
            idx2word = pickle.load(file)

        self.en_word2idx = word2idx
        self.en_idx2word = idx2word

        with open(os.path.join(file_path, "decoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        with open(os.path.join(file_path, "decoder_idx2word.pickle"), "rb") as file:
            idx2word = pickle.load(file)
            
        self.de_word2idx = word2idx
        self.de_idx2word = idx2word

        print("Encoder Vocab Size = {}, Decoder Vocab Size = {}".format(len(self.en_word2idx), len(self.de_word2idx)))
        print("Encoder word2idx Size = {}, Decoder word2idx Size = {}".format(len(self.en_word2idx), len(self.de_word2idx)))

    def __len__(self):  
        return len(self.data)
    

    def __getitem__(self, index):
        problem = self.data[index]["Problem"]
        problem = ["<sos>"] + tokenize_problem(problem) + ["<eos>"]

        linear_formula = self.data[index]["linear_formula"]
        formula_split = ["<sos>"] +tokenize_formula(linear_formula) + ["<eos>"]
        
        aa = self.data[index]["answer"]
        problem = [self.en_word2idx[q] if q in self.en_word2idx else self.en_word2idx["<unk>"] for q in problem]
        formula_split = [self.de_word2idx[q] if q in self.de_word2idx else self.de_word2idx["<unk>"] for q in formula_split]
        
        answer = {"Problem" : problem , "linear_formula" : formula_split, "answer" : aa }
        return answer
    

def collate(batch):
    max_len_problem = max([len(sample['Problem']) for sample in batch])
    max_len_formula = max([len(sample['linear_formula']) for sample in batch])

    
    problem_lens = torch.zeros(len(batch), dtype=torch.long)
    padded_problem = torch.zeros((len(batch), max_len_problem), dtype=torch.long)
    
    formula_lens = torch.zeros(len(batch), dtype=torch.long)
    padded_formula = torch.zeros((len(batch), max_len_formula), dtype=torch.long)

    answers = torch.zeros(len(batch), dtype=torch.long)
    for idx in range(len(batch)):
        problem = batch[idx]['Problem']
        linear_formula = batch[idx]['linear_formula']
        ans = batch[idx]['answer']
        prob_len = len(problem)
        lf_len = len(linear_formula)
        problem_lens[idx] = prob_len
        formula_lens[idx] = lf_len
        problem_tensor = torch.LongTensor(problem)
        linear_formula_tensor = torch.LongTensor(linear_formula)
        padded_problem[idx, :prob_len] = problem_tensor
        padded_formula[idx, :lf_len] = linear_formula_tensor
        answers[idx] = ans
        
    ret = {'problem': padded_problem, 'problem_lens': problem_lens, 'linear_formula': padded_formula, 'formula_lens': formula_lens, 'answer' : answers}

    return ret


TEXT2MATHBERT

In [11]:
class Text2MathBertDataset(Dataset):
    def __init__(self, file_path, data_prefix = "train"):
        self.file_path = file_path
        pth = os.path.join(self.file_path, f"{data_prefix}.json")
        
        print(pth)
        self.data  = json.load(open(pth))
        self.data = self.data[:100]
        print("Dataset Length =", len(self.data))
        
        with open(os.path.join(file_path, "decoder.vocab"), "r") as file:
            vocab = file.readlines()
        self.decoder_vocab = vocab
        
        with open(os.path.join(file_path, "decoder_word2idx.pickle"), "rb") as file:
            word2idx = pickle.load(file)
        with open(os.path.join(file_path, "decoder_idx2word.pickle"), "rb") as file:
            idx2word = pickle.load(file)
            
        self.de_word2idx = word2idx
        self.de_idx2word = idx2word

        self.en_tokenizer =  BertTokenizer.from_pretrained("bert-base-cased")

        print("Encoder Vocab Size = , Decoder Vocab Size = {}".format( len(self.de_word2idx)))
        
    def __len__(self):        
        return len(self.data)
    
    def __getitem__(self, index):
        problem = self.data[index]["Problem"]
        problem = self.en_tokenizer.encode(problem)
        
        linear_formula = self.data[index]["linear_formula"]
        formula_split = ["<sos>"] +tokenize_formula(linear_formula) + ["<eos>"]
        formula_split = [self.de_word2idx[q] if q in self.de_word2idx else self.de_word2idx["<unk>"] for q in formula_split]
        aa = self.data[index]["answer"]

        answer = {"Problem" : problem , "linear_formula" : formula_split, "answer" : aa }
            
        return answer

def collate_bert(batch):
    
    max_len_problem = max([len(sample['Problem']) for sample in batch])
    max_len_formula = max([len(sample['linear_formula']) for sample in batch])
    
    problem_lens = torch.zeros(len(batch), dtype=torch.long)
    padded_problem = torch.zeros((len(batch), max_len_problem), dtype=torch.long)
    problem_attn_mask = torch.zeros((len(batch), max_len_problem), dtype=torch.long)
    
    formula_lens = torch.zeros(len(batch), dtype=torch.long)
    padded_formula = torch.zeros((len(batch), max_len_formula), dtype=torch.long)

    answers = torch.zeros(len(batch), dtype=torch.long)
    for idx in range(len(batch)):
        
        problem = batch[idx]['Problem']
        linear_formula = batch[idx]['linear_formula']
        
        prob_len = len(problem)
        lf_len = len(linear_formula)
        problem_lens[idx] = prob_len
        formula_lens[idx] = lf_len

        
        padded_problem[idx, :prob_len] = torch.LongTensor(problem)
        problem_attn_mask[idx, :prob_len] = torch.ones((1, prob_len), dtype=torch.long)

        padded_formula[idx, :lf_len] = torch.LongTensor(linear_formula)

        answers[idx] = batch[idx]['answer']
        
    ret = {'problem': padded_problem, 'problem_lens': problem_lens, 'problem_attn_mask': problem_attn_mask, 'linear_formula': padded_formula, 'formula_lens': formula_lens, 'answer' : answers}

    return ret


BEAM SEARCH

In [12]:
def beam_search(args, model, en_ht, en_ct, start_token, end_token, max_target_len = 80, beam_size = 10):
    beam = [([start_token], (en_ht, en_ct), 0)]
#     print("Beam Search input hidden and cell shape, start and end tokens", en_ht.shape, en_ct.shape, start_token, end_token)

    i = 0
    while i < max_target_len -1:
#         print("i : " , i)
        new_beam = []
        for sequence, (ht, ct), score in beam:
            prev_token = [sequence[-1]] #get first token for each beam
            prev_token = torch.LongTensor(prev_token).to(device)

            decoder_out, (ht, ct) = model.decoder(prev_token, (ht, ct)) #pass through decoder

            decoder_out = decoder_out.squeeze(1)
            top_info = decoder_out.topk(beam_size, dim=1) #get top k=beam_size possible word indices and their values
            top_vals, top_inds = top_info

            for j in range(beam_size):
                new_word_idx = top_inds[0][j]                
                new_seq = sequence + [new_word_idx.item()]
                new_word_prob = torch.log(top_vals[0][j])
                updated_score = score - new_word_prob
                new_candidate = (new_seq, (ht, ct), updated_score)
                new_beam.append(new_candidate)

        # new_beam = sorted(new_beam, reverse=False, key=lambda x: x[2])
        new_beam.sort(key=lambda x: x[2])
        beam = new_beam[:beam_size]
        i += 1
        

    best_candidate = beam[0][0] #return best candidate based on score
    decoded_words = torch.zeros(1, max_target_len)

    for t in range(max_target_len):
        decoded_words[:, t] = torch.LongTensor([best_candidate[t]])
    
    return decoded_words

In [13]:
def beam_search_attn_decoder(args, model, encoder_out, en_ht, en_ct, start_token, end_token, max_target_len = 80, beam_size = 3):
    beam = [([start_token], (en_ht, en_ct), 0)]
    # print("Beam Search input hidden and cell shape, start and end tokens", en_ht.shape, en_ct.shape, start_token, end_token)

    i = 0
    while i < max_target_len -1:
        new_beam = []
        for sequence, (ht, ct), score in beam:
            prev_token = [sequence[-1]] #get first token for each beam
            prev_token = torch.LongTensor(prev_token).to(device)

            decoder_out, (ht, ct) = model.decoder(prev_token, (ht, ct), encoder_out) #pass through decoder

            decoder_out = decoder_out.squeeze(1)
            top_info = decoder_out.topk(beam_size, dim=1) #get top k=beam_size possible word indices and their values
            top_vals, top_inds = top_info

            for j in range(beam_size):
                new_word_idx = top_inds[0][j]                
                new_seq = sequence + [new_word_idx.item()]
                new_word_prob = torch.log(top_vals[0][j])
                updated_score = score - new_word_prob
                new_candidate = (new_seq, (ht, ct), updated_score)
                new_beam.append(new_candidate)

        # new_beam = sorted(new_beam, reverse=True, key=lambda x: x[2])
        new_beam.sort(key=lambda x: x[2])
        beam = new_beam[:beam_size]
        i += 1

    best_candidate = beam[0][0] #return best candidate based on score
    decoded_words = torch.zeros(1, max_target_len)

    for t in range(max_target_len):
        decoded_words[:, t] = torch.LongTensor([best_candidate[t]])
    
    return decoded_words

HELPER FUNCTIONS

In [14]:
def convert_idx_sentence(args, output, problem, linear_formula, answer, de_idx2word, en_idx2word, mode):
    """
    write to a file named my_output.json
    format for all problem should be
    {
        "Problem" : <problem>
        "answer" : <answer>
        "predicted" : <predicted_output>
        "linear_formula" : <original linear formula>
    }
    """
    output = output.cpu().detach().numpy()
    linear_formula = list(linear_formula)
    problem = list(problem)
    batch_size = output.shape[0]
    if(mode == "dev"):
        output_file = os.path.join("/kaggle/working/",f"dev_output.json")
    elif (mode == "test"):
        output_file = os.path.join("/kaggle/working/",f"test_output.json")
    else:
        print("Mode not recognized")
        return 
    # assert len(output) == len(linear_formula)

    # convert all the tensors into words usinf idx2word dictionary
    for b in range(batch_size):
        prb = ""
        ans = ""
        predicted = ""
        lf = ""
        for i in range(len(output[b])):
#             if(mode == "test"):
#                 print("Size : ",  len(de_idx2word))
#                 print("output[b][i] : " , output[b][i])
#                 print("temp ",de_idx2word[(int)(output[b][i])] )
            temp = de_idx2word[(int)(output[b][i])]
            predicted += temp
            if(temp =="<eos>"):
                break

        for i in range(len(linear_formula[b])):
            temp = de_idx2word[(int)(linear_formula[b][i].item())]
            lf += temp 
            if(temp =="<eos>"):
                break

        for i in range(len(problem[b])):
            temp= en_idx2word[(int)(problem[b][i].item())]
            prb += temp + " "
            if(temp == "<eos>"):
                break

        
        ans += str(answer[b].item())

        data = {
            "Problem" : prb,
            "answer" : ans,
            "predicted" : predicted,
            "linear_formula" : lf
        }

        with open(output_file, "a") as file:
            json.dump(data, file)
            file.write("\n")
    return


INFER

In [15]:
def evaluator_dev(args, model):
    # on current model given , need to modify for any model
    dev_dataset = TextToMathDataset(args.processed_data, "dev")
    dev_loader = DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=collate, num_workers=args.num_workers)
    de_word2idx = dev_dataset.de_word2idx
    de_idx2word = dev_dataset.de_idx2word
    en_idx2word = dev_dataset.en_idx2word

    model.eval()

    print("Evaluating model on val data on given model")

    start_token = de_word2idx["<sos>"]
    end_token = de_word2idx["<eos>"]

    target_vocab_size = len(de_word2idx)
    decoder_hidden_units = args.de_hidden

    #file to be made
    my_file = "my_output.json"
    my_file = os.path.join(args.processed_data, my_file)

    for i, batch in enumerate(dev_loader):
        problem = batch['problem'].to(device)
        linear_formula = batch['linear_formula'].to(device)
        answer = batch['answer'].to(device)
        
        batch_size = problem.shape[0]
        max_target_len = linear_formula.shape[1]
        # max_target_len = 500

        words = torch.zeros(batch_size, max_target_len).to(device)

        output , (hidden, cell)  = model.encoder(problem)

        #beam search
        for b in range(batch_size):
#             print(f"at i: {i} and inside batch :{b}")
            words[b,:] = beam_search(args, model , hidden[:,b,:].unsqueeze(1), cell[:,b,:].unsqueeze(1), start_token, end_token, max_target_len = max_target_len, beam_size = 10)
        # convert_idx_sentence(args, words, problem, linear_formula, answer)
        convert_idx_sentence(args, words, problem, linear_formula, answer, de_idx2word, en_idx2word, "dev")
    return

TRAIN

In [16]:
def evaluator_dev_attn(args, model):
    # on current model given , need to modify for any model
    dev_dataset = TextToMathDataset(args.processed_data, "dev")
    dev_loader = DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=True, collate_fn=collate, num_workers=args.num_workers)
    de_word2idx = dev_dataset.de_word2idx
    de_idx2word = dev_dataset.de_idx2word
    en_idx2word = dev_dataset.en_idx2word

    model.eval()

    print("Evaluating model on val data on given model")

    start_token = de_word2idx["<sos>"]
    end_token = de_word2idx["<eos>"]

    target_vocab_size = len(de_word2idx)
    decoder_hidden_units = args.de_hidden

    #file to be made
    my_file = "my_output.json"
    my_file = os.path.join(args.processed_data, my_file)

    for i, batch in enumerate(dev_loader):
        problem = batch['problem'].to(device)
        linear_formula = batch['linear_formula'].to(device)
        answer = batch['answer'].to(device)

        batch_size = problem.shape[0]
        max_target_len = linear_formula.shape[1]
        #max_target_len = 500

        words = torch.zeros(batch_size, max_target_len).to(device)
        output, (hidden,cell)= model.encoder(problem)
    
        #beam search
        for b in range(batch_size):
#             print(f" i : {i} , b : {b}")
            words[b,:] = beam_search_attn_decoder(args, model , output[b,:,:].unsqueeze(0), hidden[:,b,:].unsqueeze(1), cell[:,b,:].unsqueeze(1), start_token, end_token, max_target_len = max_target_len, beam_size = 10)
        convert_idx_sentence(args, words, problem, linear_formula, answer, de_idx2word, en_idx2word, "test")
    # print("Running evaluation script for test ... ")
    return

TRAIN AND DEV

In [17]:
def train_S2S():
    print(device)
    args = ARGS()
    pth = "/kaggle/input/math-data"
    args.processed_data = os.path.join(pth, "processed_data")
    args.data_dir = os.path.join(pth, "data")
    print(args.processed_data)
    train_dataset = TextToMathDataset(args.processed_data, "train")
    dev_dataset = TextToMathDataset(args.processed_data, "dev")
    train_loader = DataLoader(train_dataset, batch_size = args.batch_size, shuffle = True, num_workers = args.num_workers, collate_fn=collate)
    dev_loader = DataLoader(dev_dataset, batch_size = args.batch_size, shuffle = False, num_workers = args.num_workers, collate_fn=collate)

    #------------------------------------------------
    model = Seq2Seq(args).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.001) 
#     schedulers = {
#         "stepLR" : torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1, last_epoch=- 1, verbose=False),
#         "cosineLR" : torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, verbose=False)
#     }

#     current_scheduler = schedulers["cosineLR"]
    # ----------------------------------
    loss_tracker = defaultdict(list)
    time_tracker = defaultdict(list)
    # val_accuracy_tracker = defaultdict(list)
    min_loss = 1000000
    best_epoch = 0
    start = time()

    for epoch in range(args.epochs):
        print("\n\n-------------------Epoch = ", epoch, "------------------------------\n")
        model.train()
        epoch_loss =[]
        total_loss = 0
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()

            prb = batch["problem"].to(device)
            lf = batch["linear_formula"].to(device)

            output = model(prb, lf)

            output = output.reshape(-1, output.shape[2])
            lf = lf.reshape(-1)
            
            loss = criterion(output, lf)
            loss.backward()
            epoch_loss.append(loss.item())
            total_loss += loss.item()
            optimizer.step()

        end1 = time()
        print(f"Epoch {epoch}, Loss = {total_loss/len(train_loader)}")

        dev_loss =[]
        model.eval()
        for i, batch in enumerate(dev_loader):
            prb = batch["problem"].to(device)
            lf = batch["linear_formula"].to(device)

            output  = model(prb, lf)

            output = output.reshape(-1, output.shape[2])
            lf = lf.reshape(-1)

            loss = criterion(output, lf)
            dev_loss.append(loss.item())


        end2 = time()
        avg_dev_loss = np.mean(dev_loss)
        avg_epoch_loss = np.mean(epoch_loss)

        loss_tracker["train"].append(avg_epoch_loss)
        loss_tracker["dev"].append(avg_dev_loss)

        print(f"Epoch {epoch}, Train Loss = {avg_epoch_loss}, Dev Loss = {avg_dev_loss}")
        # print(f"Epoch Time = {end2 - end1}")
        time_tracker["train"].append(round((end1 - start)/60 , 2))
        # loss_tracker["dev"].append(avg_dev_loss)

        with open(os.path.join(args.result_dir, "loss_tracker{}.json".format(args.model_type)), "w") as outfile:
            json.dump(loss_tracker, outfile)

        torch.save(model, os.path.join(args.checkpoint_dir, "latest_checkpoint_{}.pth".format(args.model_type)))

        model_state = {
                'epoch': epoch,
                'train_loss' : avg_epoch_loss,
                'prev_best_epoch': best_epoch
        }

        with open(os.path.join(args.checkpoint_dir, "latest_chkpt_status_{}.json".format(args.model_type)), "w") as outfile:
            json.dump(model_state, outfile)

        #save the model whose loss is minimum
        if avg_dev_loss < min_loss:
            min_loss = avg_dev_loss
            best_epoch = epoch
            torch.save(model, os.path.join(args.checkpoint_dir, "best_checkpoint_{}.pth".format(args.model_type)))
            print("Best Model saved at epoch = ", epoch)

    print("Training and Dev Complete for Seq2Seq model")
    print("Dev evaluater ...")
    
    #all epochs done
    #load the best model and return
    best_model = torch.load(os.path.join(args.checkpoint_dir, "best_checkpoint_{}.pth".format(args.model_type)))
#     evaluator_dev(args, best_model)
    print("Dev test")
    evaluator_dev(args, best_model)
    return args , best_model
    

In [18]:
def train_S2SAttn():
    print(device)
    args = ARGS()
    pth = "/kaggle/input/math-data"
    args.processed_data = os.path.join(pth, "processed_data")
    args.data_dir = os.path.join(pth, "data")
    print(args.processed_data)

    train_dataset = TextToMathDataset(args.processed_data, "train")
    dev_dataset = TextToMathDataset(args.processed_data, "dev")
    train_loader = DataLoader(train_dataset, batch_size = args.batch_size, shuffle = True, num_workers = args.num_workers, collate_fn=collate)
    dev_loader = DataLoader(dev_dataset, batch_size = args.batch_size, shuffle = False, num_workers = args.num_workers, collate_fn=collate)

    #------------------------------------------------
    model = Seq2SeqAttn(args).to(device)

    criterion = nn.CrossEntropyLoss(ignore_index = 0)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)  #adam or SGD ??
    schedulers = {
        "stepLR" : torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1, last_epoch=- 1, verbose=False),
        "cosineLR" : torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, verbose=False)
    }

    current_scheduler = schedulers["cosineLR"]
    # ----------------------------------
    loss_tracker = defaultdict(list)
    time_tracker = defaultdict(list)
    # val_accuracy_tracker = defaultdict(list)
    min_loss = 1000000
    best_epoch = 0
    start = time()

    for epoch in range(args.epochs):
        print("\n\n-------------------Epoch = ", epoch, "------------------------------\n")
        model.train()
        epoch_loss =[]
        total_loss = 0
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()

            prb = batch["problem"].to(device)
            lf = batch["linear_formula"].to(device)

            output = model(prb, lf)

            output = output.reshape(-1, output.shape[2])
            lf = lf.reshape(-1)
            
            loss = criterion(output, lf)
            loss.backward()
            epoch_loss.append(loss.item())
            total_loss += loss.item()
            optimizer.step()

        current_scheduler.step()
        end1 = time()
        print(f"Epoch {epoch}, Loss = {total_loss/len(train_loader)}")

        dev_loss =[]
        model.eval()
        for i, batch in enumerate(dev_loader):
            prb = batch["problem"].to(device)
            lf = batch["linear_formula"].to(device)

            output = model(prb, lf)

            output = output.reshape(-1, output.shape[2])
            lf = lf.reshape(-1)

            loss = criterion(output, lf)
            dev_loss.append(loss.item())


        end2 = time()
        avg_dev_loss = np.mean(dev_loss)
        avg_epoch_loss = np.mean(epoch_loss)

        loss_tracker["train"].append(avg_epoch_loss)
        loss_tracker["dev"].append(avg_dev_loss)

        print(f"Epoch {epoch}, Train Loss = {avg_epoch_loss}, Dev Loss = {avg_dev_loss}")
        # print(f"Epoch Time = {end2 - end1}")
        time_tracker["train"].append(round((end1 - start)/60 , 2))
        with open(os.path.join(args.result_dir, "loss_tracker{}.json".format(args.model_type)), "w") as outfile:
            json.dump(loss_tracker, outfile)

        torch.save(model, os.path.join(args.checkpoint_dir, "latest_checkpoint_{}.pth".format(args.model_type)))

        model_state = {
                'epoch': epoch,
                'train_loss' : avg_epoch_loss,
                'prev_best_epoch': best_epoch
        }

        with open(os.path.join(args.checkpoint_dir, "latest_chkpt_status_{}.json".format(args.model_type)), "w") as outfile:
            json.dump(model_state, outfile)

        #save the model whose loss is minimum
        if avg_dev_loss < min_loss:
            min_loss = avg_dev_loss
            best_epoch = epoch
            torch.save(model, os.path.join(args.checkpoint_dir, "best_checkpoint_{}.pth".format(args.model_type)))
            print("Best Model saved at epoch = ", epoch)

    print("Training and Dev Complete for Seq2Seq Attn model")
    #all epochs done
    #load the best model and return
    best_model = torch.load(os.path.join(args.checkpoint_dir, "best_checkpoint_{}.pth".format(args.model_type)))
    evaluator_dev(args,best_model)
    return args, best_model


In [19]:
def train_BertS2SAtten():
    print(device)
    args = ARGS()
    pth = "/kaggle/input/math-data"
    args.processed_data = os.path.join(pth, "processed_data")
    args.data_dir = os.path.join(pth, "data")
    print(args.processed_data)
    # build_vocab(args.processed_data)
    train_dataset = Text2MathBertDataset(args.processed_data, "train")
    dev_dataset = Text2MathBertDataset(args.processed_data, "dev")
    train_loader = DataLoader(train_dataset, batch_size = args.batch_size, shuffle = True, num_workers = args.num_workers, collate_fn=collate)
    dev_loader = DataLoader(dev_dataset, batch_size = args.batch_size, shuffle = False, num_workers = args.num_workers, collate_fn=collate)

    #------------------------------------------------
    model = Bert2SeqAttn(args).to(device)

    criterion = nn.CrossEntropyLoss(ignore_index = 0)
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)  #adam or SGD ??
#     schedulers = {
#         "stepLR" : torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1, last_epoch=- 1, verbose=False),
#         "cosineLR" : torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epochs, verbose=False)
#     }

#     current_scheduler = schedulers["cosineLR"]
    # ----------------------------------
    loss_tracker = defaultdict(list)
    time_tracker = defaultdict(list)
    # val_accuracy_tracker = defaultdict(list)
    min_loss = 1000000
    best_epoch = 0
    start = time()

    for epoch in range(args.epochs):
        print("\n\n-------------------Epoch = ", epoch, "------------------------------\n")
        model.train()
        epoch_loss =[]
        total_loss = 0
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()

            prb = batch["problem"].to(device)
            lf = batch["linear_formula"].to(device)
            attn_mask = batch["problem_attn_mask"].to(device)

            output, _ = model(prb, attn_mask, lf)

            output = output.reshape(-1, output.shape[2])
            lf = lf.reshape(-1)

            loss = criterion(output, lf)
            loss.backward()
            epoch_loss.append(loss.item())
        
            total_loss += loss.item()
            optimizer.step()

        end1 = time()
        print(f"Epoch {epoch}, Loss = {total_loss/len(train_loader)}")

        dev_loss =[]
        model.eval()
        for i, batch in enumerate(dev_loader):
            prb = batch["problem"].to(device)
            lf = batch["linear_formula"].to(device)
            attn_mask = batch["problem_attn_mask"].to(device)

            output, _ = model(prb, attn_mask, lf)

            output = output.reshape(-1, output.shape[2])
            lf = lf.reshape(-1)

            loss = criterion(output, lf)
            dev_loss.append(loss.item())

        end2 = time()
        avg_dev_loss = np.mean(dev_loss)
        avg_epoch_loss = np.mean(epoch_loss)

        loss_tracker["train"].append(avg_epoch_loss)
        loss_tracker["dev"].append(avg_dev_loss)

        print(f"Epoch {epoch}, Train Loss = {avg_epoch_loss}, Dev Loss = {avg_dev_loss}")
        # print(f"Epoch Time = {end2 - end1}")
        time_tracker["train"].append(round((end1 - start)/60 , 2))
        with open(os.path.join(args.result_dir, "loss_tracker{}.json".format(args.model_type)), "w") as outfile:
            json.dump(loss_tracker, outfile)

        torch.save(model, os.path.join(args.checkpoint_dir, "latest_checkpoint_{}.pth".format(args.model_type)))

        model_state = {
                'epoch': epoch,
                'train_loss' : avg_epoch_loss,
                'prev_best_epoch': best_epoch
        }

        with open(os.path.join(args.checkpoint_dir, "latest_chkpt_status_{}.json".format(args.model_type)), "w") as outfile:
            json.dump(model_state, outfile)

        #save the model whose loss is minimum
        if avg_dev_loss < min_loss:
            min_loss = avg_dev_loss
            best_epoch = epoch
            torch.save(model, os.path.join(args.checkpoint_dir, "best_checkpoint_{}.pth".format(args.model_type)))
            print("Best Model saved at epoch = ", epoch)

    print("Training and Dev Complete for Bert2SeqAttn model")
    #all epochs done
    #load the best model and return
    best_model = torch.load(os.path.join(args.checkpoint_dir, "best_checkpoint_{}.pth".format(args.model_type)))
    return best_model

TEST 

In [20]:
def test_S2S(args,model):
    print(device)
#     args = ARGS()
    pth = "/kaggle/input/math-data"
#     args.processed_data = os.path.join(pth, "processed_data")
#     args.data_dir = os.path.join(pth, "data")
    print(args.processed_data)

    test_dataset = TextToMathDataset(args.processed_data, "test")
    test_loader = DataLoader(test_dataset, batch_size = args.batch_size, shuffle = False, num_workers = args.num_workers, collate_fn=collate)
    dev_dataset = TextToMathDataset(args.processed_data, "dev")
    dev_loader = DataLoader(test_dataset, batch_size = args.batch_size, shuffle = False, num_workers = args.num_workers, collate_fn=collate)
    de_word2idx = test_dataset.de_word2idx
    en_word2idx = test_dataset.en_word2idx
    
    de_idx2word = test_dataset.de_idx2word
    en_idx2word = test_dataset.en_idx2word
    # decoder_hidden_units = model.decoder_hidden_units
    #------------------------------------------------
    # model = torch.load(os.path.join(args.checkpoint_dir, "latest_checkpoint_Seq2Seq.pth")).to(device) #load the latest checkpoint
    # model = torch.load(os.path.join(args.checkpoint_dir, "best_checkpoint_Seq2Seq.pth")).to(device) #load the best checkpoint
    # my_file = ".json"
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index = 0)
    loss_tracker = defaultdict(list)

    start_token = de_word2idx["<sos>"]
    end_token = de_word2idx["<eos>"]
    print(f" Start Token {start_token} and end token {end_token}")
    for i, batch in enumerate(test_loader):
        problem = batch['problem'].to(device)
        linear_formula = batch['linear_formula'].to(device)
        answer = batch['answer'].to(device)

        batch_size = problem.shape[0]
        max_target_len = linear_formula.shape[1]
        #max_target_len = 500

        words = torch.zeros(batch_size, max_target_len).to(device)
        output ,(hidden, cell) = model.encoder(problem)

        #beam search
        for b in range(batch_size):
            print(f" i : {i} , b : {b}")
            words[b,:] = beam_search(args, model , hidden[:,b,:].unsqueeze(1), cell[:,b,:].unsqueeze(1), start_token, end_token, max_target_len = max_target_len, beam_size = 10)
        convert_idx_sentence(args, words, problem, linear_formula, answer, de_idx2word, en_idx2word, "test")
    # print("Running evaluation script for test ... ")
    # subprocess.call(f"python3 evaluator.py {my_file}")
    print("Testing Complete. JSON created")
    return


In [21]:
def test_S2S_Atten(args, model):
    print(device)
#     args = ARGS()
    pth = "/kaggle/input/math-data"
#     args.processed_data = os.path.join(pth, "processed_data")
#     args.data_dir = os.path.join(pth, "data")
    print(args.processed_data)

    test_dataset = TextToMathDataset(args.processed_data, "test")
    test_loader = DataLoader(test_dataset, batch_size = args.batch_size, shuffle = False, num_workers = args.num_workers, collate_fn=collate)
    de_word2idx = test_dataset.de_word2idx
    en_word2idx = test_dataset.en_word2idx
    
    de_idx2word = test_dataset.de_idx2word
    en_idx2word = test_dataset.en_idx2word
    # decoder_hidden_units = model.decoder_hidden_units
    #------------------------------------------------
    # model = torch.load(os.path.join(args.checkpoint_dir, "latest_checkpoint_Seq2Seq.pth")).to(device) #load the latest checkpoint
    # model = torch.load(os.path.join(args.checkpoint_dir, "best_checkpoint_Seq2Seq.pth")).to(device) #load the best checkpoint
    # my_file = ".json"
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index = 0)
    loss_tracker = defaultdict(list)

    start_token = de_word2idx["<sos>"]
    end_token = de_word2idx["<eos>"]
    print("start token : " , start_token)
    for i, batch in enumerate(test_loader):
        problem = batch['problem'].to(device)
        linear_formula = batch['linear_formula'].to(device)
        answer = batch['answer'].to(device)

        batch_size = problem.shape[0]
        max_target_len = linear_formula.shape[1]
        #max_target_len = 500

        words = torch.zeros(batch_size, max_target_len).to(device)
        output, (hidden,cell)= model.encoder(problem)
    
        #beam search
        for b in range(batch_size):
#             print(f" i : {i} , b : {b}")
            words[b,:] = beam_search_attn_decoder(args, model , output[b,:,:].unsqueeze(0), hidden[:,b,:].unsqueeze(1), cell[:,b,:].unsqueeze(1), start_token, end_token, max_target_len = max_target_len, beam_size = 10)
        convert_idx_sentence(args, words, problem, linear_formula, answer, de_idx2word, en_idx2word, "test")
    # print("Running evaluation script for test ... ")
    # subprocess.call(f"python3 evaluator.py {my_file}")
    print("Testing Complete. JSON created")
    return


In [22]:
def test_BertS2SAtten(model):
    print(device)
    args = ARGS()
    pth = "/kaggle/input/math-data"
    args.processed_data = os.path.join(pth, "processed_data")
    args.data_dir = os.path.join(pth, "data")
    print(args.processed_data)

    test_dataset = Text2MathBertDataset(args.processed_data, "test")
    test_loader = DataLoader(test_dataset, batch_size = args.batch_size, shuffle = False, num_workers = args.num_workers, collate_fn=collate_bert)
    de_word2idx = test_dataset.de_word2idx
    en_word2idx = test_dataset.en_word2idx
    # decoder_hidden_units = model.decoder_hidden_units
    #------------------------------------------------
    # model = torch.load(os.path.join(args.checkpoint_dir, "latest_checkpoint_Seq2Seq.pth")).to(device) #load the latest checkpoint
    # model = torch.load(os.path.join(args.checkpoint_dir, "best_checkpoint_Seq2Seq.pth")).to(device) #load the best checkpoint
    # my_file = ".json"
    model.eval()
    criterion = nn.CrossEntropyLoss(ignore_index = 0)
    loss_tracker = defaultdict(list)

    start_token = de_word2idx["<sos>"]
    end_token = de_word2idx["<eos>"]
    for i, batch in enumerate(test_loader):
        problem = batch['problem'].to(device)
        linear_formula = batch['linear_formula'].to(device)
        answer = batch['answer'].to(device)


        attn_mask = batch['problem_attn_mask'].to(device)
        batch_size = problem.shape[0]
        max_target_len = linear_formula.shape[1]
        #max_target_len = 500

        output = model.encoder(problem, attn_mask)
        hidden = torch.zeros(1, batch_size, model.decoder_hidden_units).to(device)
        cell = torch.zeros(1, batch_size, model.decoder_hidden_units).to(device)


        for b in range(batch_size):
            words = beam_search_attn_decoder(args, model , output[b,:,:].unsqueeze(0), hidden[:,b,:].unsqueeze(1), cell[:,b,:].unsqueeze(1), start_token, end_token, max_target_len = max_target_len, beam_size = 10)
        convert_idx_sentence(args, words, problem, linear_formula, answer, de_word2idx, en_word2idx, "test")

EVALUATOR RUN

In [23]:
# import gc 
# gc.collect()

In [24]:
# torch.cuda.empty_cache()

MAIN

In [27]:
if __name__ == "__main__":
    args, model = train_S2S()
    test_S2S(args,model)

cuda
/kaggle/input/math-data/processed_data
/kaggle/input/math-data/processed_data/train.json
Dataset Length = 19791
Encoder Vocab Size = 9998, Decoder Vocab Size = 113
Encoder word2idx Size = 9998, Decoder word2idx Size = 113
/kaggle/input/math-data/processed_data/dev.json
Dataset Length = 2961
Encoder Vocab Size = 9998, Decoder Vocab Size = 113
Encoder word2idx Size = 9998, Decoder word2idx Size = 113
Loading GloVe embeddings...


  embed_matrix[v] = torch.tensor(glove.vectors[glove.stoi[k]])


Loading Encoder...
Loading Seq2Seq LSTM Decoder...


-------------------Epoch =  0 ------------------------------

Epoch 0, Loss = 0.5183365292762901
Epoch 0, Train Loss = 0.5183365292762901, Dev Loss = 0.4364081326351371
Best Model saved at epoch =  0


-------------------Epoch =  1 ------------------------------

Epoch 1, Loss = 0.4363763352106769
Epoch 1, Train Loss = 0.4363763352106769, Dev Loss = 0.41652331989939495
Best Model saved at epoch =  1


-------------------Epoch =  2 ------------------------------

Epoch 2, Loss = 0.4287314866586725
Epoch 2, Train Loss = 0.4287314866586725, Dev Loss = 0.4185991117390253


-------------------Epoch =  3 ------------------------------

Epoch 3, Loss = 0.42042519652881993
Epoch 3, Train Loss = 0.42042519652881993, Dev Loss = 0.4109187445012472
Best Model saved at epoch =  3


-------------------Epoch =  4 ------------------------------

Epoch 4, Loss = 0.41545360371687107
Epoch 4, Train Loss = 0.41545360371687107, Dev Loss = 0.4021707973493

KeyboardInterrupt: 