In [None]:
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

In [None]:
# Load spaCy's small English pipeline once, at module level, so every call to
# Vocab.tokenize reuses the same tokenizer instead of reloading the model.
# The model must be installed first with: python -m spacy download en_core_web_sm
spacy_eng = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """
    Normalize raw text before tokenization.

    The text is lowercased, ASCII punctuation (``string.punctuation``) is
    removed, runs of whitespace are collapsed to a single space, and leading
    and trailing whitespace is stripped.

    Args:
        text: The value to clean. Missing values (``None`` or float NaN, which
            is how pandas represents an empty CSV cell) are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        str: The normalized text (``""`` for missing input).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()  # Lowercase the text
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text)  # Collapse whitespace runs
    return text.strip()  # Remove leading and trailing spaces

In [1]:
class Vocab:
    """Word-level vocabulary mapping tokens to integer ids and back.

    Ids 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``; regular words receive ids starting at 4.

    Args:
        min_freq: Number of occurrences a word needs (within one
            ``build_vocab`` call) before it is added to the vocabulary.
    """

    def __init__(self, min_freq):
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {v: k for k, v in self.itos.items()}
        self.min_freq = min_freq

    def __len__(self):
        """Return the number of entries, special tokens included."""
        return len(self.itos)

    @staticmethod
    def tokenize(text):
        """Split ``text`` into lowercased tokens using the spaCy tokenizer."""
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocab(self, sentences):
        """Add every word that occurs at least ``min_freq`` times in ``sentences``.

        Words receive ids in the order in which they reach the frequency
        threshold. The method may be called more than once: existing entries
        keep their ids and new words are appended after them. Frequencies are
        counted per call, not accumulated across calls.

        Args:
            sentences: Iterable of strings to tokenize and count.
        """
        freqs = {}
        for sentence in sentences:
            for word in self.tokenize(sentence):
                freqs[word] = freqs.get(word, 0) + 1
                # ">=" (rather than "==") also covers min_freq <= 0, and the
                # membership test keeps ids stable across repeated calls.
                if freqs[word] >= self.min_freq and word not in self.stoi:
                    idx = len(self.itos)  # next free id; ids are contiguous
                    self.stoi[word] = idx
                    self.itos[idx] = word

    def numericalize(self, text):
        """Convert ``text`` to a list of token ids, using ``<UNK>`` for unknown words."""
        unk_idx = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk_idx) for token in self.tokenize(text)]

In [None]:
class QADataset(Dataset):
    """Question/answer pairs read from a CSV file, served as token-id tensors.

    The CSV must contain ``question`` and ``answer`` columns. Both columns are
    cleaned with ``preprocess_text`` and a single shared ``Vocab`` is built
    from all questions and answers.

    Args:
        file_path: Path of the CSV file to load.
        min_freq: Minimum word frequency passed to ``Vocab``.
    """

    def __init__(self, file_path, min_freq=5):
        self.df = pd.read_csv(file_path)
        # Empty cells are read as NaN; replace them with "" so the text
        # cleaning step always receives a string.
        self.questions = self.df["question"].fillna("").astype(str).apply(preprocess_text)
        self.answers = self.df["answer"].fillna("").astype(str).apply(preprocess_text)
        self.vocab = Vocab(min_freq)
        self.vocab.build_vocab(self.questions.tolist() + self.answers.tolist())

    def __len__(self):
        """Return the number of question/answer pairs."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(question_ids, answer_ids)`` for row ``idx``.

        Each element is a 1-D tensor of token ids wrapped in ``<SOS>`` ... ``<EOS>``.
        """
        question = self.questions.iloc[idx]
        answer = self.answers.iloc[idx]

        sos = self.vocab.stoi["<SOS>"]
        eos = self.vocab.stoi["<EOS>"]
        question_ids = [sos] + self.vocab.numericalize(question) + [eos]
        answer_ids = [sos] + self.vocab.numericalize(answer) + [eos]

        return torch.tensor(question_ids), torch.tensor(answer_ids)
    
class BatchCollator:
    """Collate function that pads variable-length (question, answer) pairs.

    Args:
        pad_idx: Token id used to fill the padded positions.
    """

    def __init__(self, pad_idx):
        self.pad_idx = pad_idx

    def _pad(self, sequences):
        """Stack 1-D tensors into one time-major (max_len, batch) tensor."""
        return pad_sequence(sequences, batch_first=False, padding_value=self.pad_idx)

    def __call__(self, batch):
        """Return ``(questions_padded, answers_padded)`` for a list of sample pairs."""
        question_seqs = tuple(pair[0] for pair in batch)
        answer_seqs = tuple(pair[1] for pair in batch)
        return self._pad(question_seqs), self._pad(answer_seqs)

In [None]:
def create_data_loader(file_path, batch_size=32, num_workers=8, shuffle=True, pin_memory=True):
    """Build the QA dataset from ``file_path`` and wrap it in a padded-batch DataLoader.

    Args:
        file_path: CSV file handed to ``QADataset``.
        batch_size: Number of samples per batch.
        num_workers: Number of DataLoader worker processes.
        shuffle: Whether the DataLoader shuffles the samples.
        pin_memory: Whether the DataLoader pins host memory.

    Returns:
        tuple: ``(loader, dataset)`` - the DataLoader and the underlying ``QADataset``.
    """
    qa_dataset = QADataset(file_path)
    # Batches are padded with the vocabulary's <PAD> id.
    collate = BatchCollator(qa_dataset.vocab.stoi["<PAD>"])

    qa_loader = DataLoader(
        qa_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory,
        collate_fn=collate,
    )
    return qa_loader, qa_dataset

In [None]:
## Implementation of Encoder and decoder architecture using the context vectors for attention mechanism
class Encoder(nn.Module):
    """Bidirectional LSTM encoder that also produces the decoder's initial state.

    The LSTM outputs keep both directions (feature size ``2 * hidden_size``)
    and are used as attention memory. The final hidden and cell states of the
    two directions are concatenated per layer and projected back down to
    ``hidden_size``, which is the state width the unidirectional ``Decoder``
    LSTM and its attention layer expect.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and only triggers a warning.
        self.lstm = nn.LSTM(
            embed_size,
            hidden_size,
            num_layers,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)

    def _merge_directions(self, state, projection):
        """Project a (num_layers * 2, batch, H) state to (num_layers, batch, H)."""
        batch_size = state.shape[1]
        # nn.LSTM orders the final states layer by layer, forward then backward.
        per_layer = state.reshape(self.lstm.num_layers, 2, batch_size, -1)
        both_directions = torch.cat((per_layer[:, 0], per_layer[:, 1]), dim=2)
        return projection(both_directions)

    def forward(self, x):
        """Encode a batch of token ids.

        Args:
            x: Long tensor of token ids, time-major: ``(src_len, batch)``.

        Returns:
            tuple: ``(encoder_outputs, hidden, cell)`` where ``encoder_outputs``
            is ``(src_len, batch, 2 * hidden_size)`` and ``hidden`` / ``cell``
            are ``(num_layers, batch, hidden_size)``.
        """
        embedded = self.embedding(x)
        encoder_outputs, (hidden, cell) = self.lstm(embedded)
        hidden = self._merge_directions(hidden, self.fc_hidden)
        cell = self._merge_directions(cell, self.fc_cell)
        return encoder_outputs, hidden, cell
    
"""
#Implementation using BERT for better semantic understanding
class ContextualEncoder(nn.Module):
    def __init__(self, hidden_size, num_layers, p):
        super(ContextualEncoder, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.lstm = nn.LSTM(self.bert.config.hidden_size, hidden_size, num_layers, bidirectional=True, dropout=dropout)
        #self.fc_hidden = nn.Linear(hidden_size * 2, hidden_size)
        #self.fc_cell = nn.Linear(hidden_size * 2, hidden_size)

    def forward(self, x):
        with torch.no_grad():
            outputs = self.bert(x)
        encoder_outputs = outputs.last_hidden_state
        hidden, cell = self.lstm(encoder_states)
        hidden = self.fc_hidden(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        cell = self.fc_cell(torch.cat((cell[0:1], cell[1:2]), dim=2))
        return encoder_outputs, hidden, cell
"""

class Decoder(nn.Module):
    """Single-step LSTM decoder with additive attention over the encoder outputs.

    Each call consumes one target token per batch element, attends over the
    bidirectional encoder outputs using the decoder's top-layer hidden state,
    and returns vocabulary scores for the next token.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: Decoder LSTM hidden size; encoder outputs are expected to
            have feature size ``2 * hidden_size``.
        output_size: Number of output classes (target vocabulary size).
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the embeddings and between stacked
            LSTM layers.
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers, dropout):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and only triggers a warning.
        self.lstm = nn.LSTM(
            hidden_size * 2 + embed_size,
            hidden_size,
            num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.attention = nn.Linear(hidden_size * 3, 1)
        self.fc_out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_outputs, hidden, cell):
        """Run one decoding step.

        Args:
            x: Long tensor ``(batch,)`` with the current input token ids.
            encoder_outputs: ``(src_len, batch, 2 * hidden_size)`` attention memory.
            hidden: ``(num_layers, batch, hidden_size)`` LSTM hidden state.
            cell: ``(num_layers, batch, hidden_size)`` LSTM cell state.

        Returns:
            tuple: ``(predictions, hidden, cell)`` with ``predictions`` of shape
            ``(batch, output_size)`` and the updated LSTM states.
        """
        x = x.unsqueeze(0)  # (1, batch): a sequence of length one
        embed = self.dropout(self.embedding(x))

        sequence_length = encoder_outputs.shape[0]
        # Attend with the top layer only; slicing keeps the leading dim so the
        # repeat yields (src_len, batch, hidden_size) for any num_layers.
        h_reshaped = hidden[-1:].repeat(sequence_length, 1, 1)
        energy = torch.tanh(self.attention(torch.cat((h_reshaped, encoder_outputs), dim=2)))
        attention_weights = torch.softmax(energy, dim=0)  # normalize over source positions

        # Weighted sum over source positions: (s, n, 1) x (s, n, l) -> (1, n, l)
        context_vector = torch.einsum("snk,snl->knl", attention_weights, encoder_outputs)
        lstm_input = torch.cat((context_vector, embed), dim=2)
        outputs, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        predictions = self.fc_out(outputs).squeeze(0)
        return predictions, hidden, cell

In [None]:
## Implementation of Sequence to Sequence model using LSTM cells 
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Module returning ``(encoder_outputs, hidden, cell)`` for a source batch.
        decoder: Module called as ``decoder(x, encoder_outputs, hidden, cell)``;
            it must expose ``fc_out.out_features`` as the target vocabulary size.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Return per-step vocabulary scores of shape ``(trg_len, batch, vocab)``.

        Row 0 of the result stays zero because decoding starts from ``trg[0]``.
        At every step the next input is the ground-truth token with
        probability ``teacher_forcing_ratio``, otherwise the model's own
        highest-scoring prediction.
        """
        num_steps = trg.shape[0]
        num_samples = src.shape[1]
        vocab_size = self.decoder.fc_out.out_features

        memory, hidden, cell = self.encoder(src)
        outputs = torch.zeros(num_steps, num_samples, vocab_size).to(src.device)

        decoder_input = trg[0]
        for step in range(1, num_steps):
            step_scores, hidden, cell = self.decoder(decoder_input, memory, hidden, cell)
            outputs[step] = step_scores
            if torch.rand(1).item() < teacher_forcing_ratio:
                decoder_input = trg[step]
            else:
                decoder_input = step_scores.argmax(1)

        return outputs

In [None]:
# define hyperparameters
epochs=100
lr=3e-4
embed_size=300
hidden_size=1024
num_layers=1
dropout=0.5
batch_size=32

# Initialize and Train the Model
def train(qa_file, epochs=epochs, lr=lr, embed_size=embed_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout, batch_size=batch_size, num_workers=8, shuffle=True, pin_memory=True):
    """Train the attention-based seq2seq model on a question/answer CSV file.

    Args:
        qa_file: Path of the CSV file passed to ``create_data_loader``.
        epochs: Number of passes over the dataset.
        lr: Adam learning rate.
        embed_size: Embedding dimension for encoder and decoder.
        hidden_size: LSTM hidden size.
        num_layers: Number of LSTM layers in encoder and decoder.
        dropout: Dropout probability.
        batch_size: Samples per batch.
        num_workers: DataLoader worker processes.
        shuffle: Whether to shuffle the dataset each epoch.
        pin_memory: Whether to pin host memory (only applied when CUDA is used).

    Returns:
        tuple: ``(model, train_losses)`` - the trained ``Seq2Seq`` model and a
        list with the mean training loss of every epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Pinned memory only helps host-to-GPU copies; on CPU it just emits a warning.
    data_loader, dataset = create_data_loader(
        qa_file,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=shuffle,
        pin_memory=pin_memory and device.type == "cuda",
    )

    # Questions and answers share one vocabulary.
    input_size = len(dataset.vocab)
    output_size = len(dataset.vocab)

    encoder = Encoder(input_size, embed_size, hidden_size, num_layers, dropout).to(device)
    decoder = Decoder(input_size, embed_size, hidden_size, output_size, num_layers, dropout).to(device)
    model = Seq2Seq(encoder, decoder).to(device)

    optimizer = optim.Adam(model.parameters(), lr=lr)
    pad_idx = dataset.vocab.stoi["<PAD>"]
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)  # padding does not contribute to the loss

    train_losses = []
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for questions, answers in data_loader:
            questions, answers = questions.to(device), answers.to(device)

            optimizer.zero_grad()
            output = model(questions, answers)
            # Step 0 is the <SOS> position and is never predicted, so drop it
            # from both the scores and the targets before flattening.
            output = output[1:].reshape(-1, output.shape[-1])
            targets = answers[1:].reshape(-1)
            loss = criterion(output, targets)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # Report the epoch mean rather than whatever the last batch produced;
        # an empty loader yields NaN instead of an unbound-variable error.
        epoch_loss = running_loss / num_batches if num_batches else float("nan")
        train_losses.append(epoch_loss)
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss:.4f}")

    return model, train_losses


model, train_losses = train("path/to/your/qa_dataset.csv")
# Expose the sub-modules at notebook scope for later cells (e.g. checkpointing).
encoder, decoder = model.encoder, model.decoder

plt.figure(figsize=(12, 6))

# Plot training and validation losses
plt.plot(range(1, len(train_losses) + 1), train_losses, label='Train')
#plt.plot(range(1, epochs + 1), val_losses, label='Validation')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
# nn.Module has no .save() method; persist the learned weights with torch.save.
# Reload later with <module>.load_state_dict(torch.load(path)).
torch.save(encoder.state_dict(), "encoder_final")
torch.save(decoder.state_dict(), "decoder_final")