### Mounting Google Drive - to run when start Google Colab

In [None]:
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

    # Import the required libraries
from sklearn.model_selection import train_test_split

import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import time
import math

import pprint  # for pretty printing
pp = pprint.PrettyPrinter()

import os

# Paths setup
if "IN_COLAB" in os.environ:
    ROOT_PATH = '/content/drive/MyDrive/Colab Notebooks/'
else:
    ROOT_PATH = './'



Mounted at /content/drive


### Concatenate batch corpus - no need relunch it unless new pre-processed corpus

In [None]:
# List of file paths to concatenate
file_paths = [
    '/content/drive/MyDrive/Colab Notebooks/corpus_cleaned_lyrics_batch_1.txt',
    '/content/drive/MyDrive/Colab Notebooks/corpus_cleaned_lyrics_batch_2.txt',
    '/content/drive/MyDrive/Colab Notebooks/corpus_cleaned_lyrics_batch_3.txt',
    '/content/drive/MyDrive/Colab Notebooks/corpus_cleaned_lyrics_batch_4.txt',
    '/content/drive/MyDrive/Colab Notebooks/corpus_cleaned_lyrics_batch_5.txt',
    '/content/drive/MyDrive/Colab Notebooks/corpus_cleaned_lyrics_batch_6.txt',
    '/content/drive/MyDrive/Colab Notebooks/corpus_cleaned_lyrics_batch_7.txt',
    '/content/drive/MyDrive/Colab Notebooks/corpus_cleaned_lyrics_batch_8.txt',
    '/content/drive/MyDrive/Colab Notebooks/corpus_cleaned_lyrics_batch_9.txt'
]
# Output file path
CORPUS_UNCLEANED_FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/concatenated_preprocessed_corpus.txt'

# Function to concatenate files
def concatenate_files(file_paths, output_file_path):
    with open(output_file_path, 'w') as output_file:
        for file_path in file_paths:
            with open(file_path, 'r') as input_file:
                content = input_file.read()
                output_file.write(content)

# Call the function to concatenate files
concatenate_files(file_paths, CORPUS_UNCLEANED_FILE_PATH)

print(f'Concatenated files to {CORPUS_UNCLEANED_FILE_PATH}')


Concatenated files to /content/drive/MyDrive/Colab Notebooks/concatenated_preprocessed_corpus.txt


### Define files path - run each time

In [2]:
# Path to the initial preprocessed corpus file. This file contains raw data that has undergone
# some basic cleaning but may require further processing to be ready for model training.
CORPUS_UNCLEANED_FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/concatenated_preprocessed_corpus.txt'

# Path to the fully cleaned corpus file. This file is the result of applying an additional layer
# of cleaning to the initial preprocessed corpus, making it suitable for more advanced data analysis
# and model training.
CORPUS_FILE_PATH = '/content/drive/MyDrive/Colab Notebooks/cleaned_corpus.txt'

# The directory where various output files like models and processed data will be saved.
SAVE_PATH = '/content/drive/MyDrive/Colab Notebooks/save_files/'

# Path to the pre-trained Word2Vec model. This model provides embeddings for the words
# in the corpus and is used to convert words to vectors for machine learning tasks.
WORD2VEC_PATH = '/content/drive/MyDrive/Colab Notebooks/GoogleNews-vectors-negative300.bin'


### Extra Corpus cleaning: to run only when extra cleaning is needed

additional cleaning steps on a pre-processed corpus of text data:

In [None]:
"""
This code adds an advanced layer of cleaning to a pre-processed text corpus, specifically tailored for natural language processing applications.
The additional cleaning steps include:

1. **Further Verse Preprocessing**: It refines each verse in the corpus by removing specific markers (like `<winner>`) and handling special cases
such as `<battle_start>`. Unnecessary spaces are also condensed.

2. **Word Count Filtering**: Verses are assessed for their word count, and those with a word count below a set threshold (default 10) are excluded.
This helps in maintaining a quality standard in the dataset.

3. **Corpus Consolidation**: The entire corpus is processed verse by verse, ensuring only those battles (group of verses) that meet the word count
criteria are retained. This results in a more compact and relevant dataset.

4. **Missing Word Identification**: The code identifies words in the corpus that are not present in a pre-trained Word2Vec model.
This is crucial for aligning the corpus vocabulary with the model's vocabulary.

5. **Removal of Low-Frequency Missing Words**: Words that are both missing from the Word2Vec model and have low occurrence in the corpus are removed.
This step refines the corpus further, eliminating rare or potentially irrelevant words.

Overall, these steps significantly enhance the corpus’s quality by ensuring consistency, relevance, and alignment with the Word2Vec model,
thereby making it more suitable for training robust NLP models.
"""
import re
from gensim.models import KeyedVectors

# This function preprocesses individual verses in the corpus. It performs operations such as
# removing specific markers, consolidating battle start markers, and removing excessive spaces.
def preprocess_verse(verse):
    verse = verse.replace("<eos> <sos>", " ")
    verse = re.sub(r"<winner> (challenger|defender|unknown)", "", verse)
    verse = verse.replace("<battle_start> ", "<battle_start>")
    verse = re.sub(' +', ' ', verse)
    return verse.strip()

# Counts the number of words in a verse, used to determine if a battle verse should be kept.
def count_words_in_verse(verse):
    return len(verse.split())

# Determines if a battle should be kept based on the minimum word count criteria.
def should_keep_battle(battle, min_words=10):
    for verse in battle:
        if verse.startswith("<sos>") and count_words_in_verse(verse) < min_words:
            return False
    return True

# Processes the entire corpus, applying the preprocessing rules and filtering out battles
# that don't meet the word count criteria.
def process_corpus(input_file_path, output_file_path, min_words=10):
    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        current_battle = []
        current_verse = ""

        for line in infile:
            line = line.strip()

            if line == "<battle_start>":
                if current_battle and should_keep_battle(current_battle, min_words):
                    outfile.write('\n'.join(current_battle) + '\n')
                current_battle = [line]
                current_verse = ""
            elif line in ["<challenger>", "<defender>", "<battle_end>"]:
                if current_verse:
                    current_battle.append(preprocess_verse(current_verse))
                    current_verse = ""
                current_battle.append(line)
            else:
                current_verse += line + " "

        if current_battle and should_keep_battle(current_battle, min_words):
            outfile.write('\n'.join(current_battle) + '\n')

# Paths to the input and output files for processing the corpus.
output_file_path = '/content/drive/MyDrive/Colab Notebooks/final_preprocessed_corpus.txt'
process_corpus(CORPUS_UNCLEANED_FILE_PATH, output_file_path)

# Preprocesses text by converting to lowercase, removing punctuation and special characters,
# and splitting into words.
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    return text.strip().split()

# Finds words in the corpus that are missing from the Word2Vec model, with their occurrence counts.
def find_missing_words_with_counts(corpus_path, word2vec_model):
    missing_words_counts = {}

    with open(corpus_path, 'r', encoding='utf-8') as file:
        for line in file:
            words = preprocess_text(line)
            for word in words:
                if word not in word2vec_model:
                    missing_words_counts[word] = missing_words_counts.get(word, 0) + 1

    return missing_words_counts

# Removes words from the corpus that are missing in the Word2Vec model and occur less frequently
# than a specified threshold.
def remove_low_frequency_missing_words(corpus_path, output_path, missing_words, min_occurrence=10):
    with open(corpus_path, 'r', encoding='utf-8') as infile, open(output_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            words = preprocess_text(line)
            cleaned_line = ' '.join([word for word in words if missing_words.get(word, 0) >= min_occurrence])
            if cleaned_line:
                outfile.write(cleaned_line + '\n')

# Path to the word2vec model.
word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

corpus_path = '/content/drive/MyDrive/Colab Notebooks/final_preprocessed_corpus.txt'
missing_words_counts = find_missing_words_with_counts(corpus_path, word2vec)
low_freq_missing_words = {word: count for word, count in missing_words_counts.items() if count < 10}

# Path to the new, cleaned corpus file.
remove_low_frequency_missing_words(corpus_path, CORPUS_FILE_PATH, low_freq_missing_words)


### Word Tokenizer - to run each time

In [None]:
"""
This code defines an `EnhancedWordLevelTokenizer` class for creating a custom word-level tokenizer. Its primary functions include:

1. **Training on a corpus:** It learns a mapping from words to indices based on the frequency of words in a given text corpus.
2. **Handling special tokens:** It can recognize and prioritize special tokens like `<sos>`, `<eos>`, etc.
3. **Encoding and decoding:** Converts texts into sequences of indices and vice versa, facilitating the processing of text data for machine learning models.
4. **State preservation:** The tokenizer's configuration can be saved to and loaded from a file, ensuring consistency across different sessions or applications.

"""

import os
from collections import Counter

# Class Definition: EnhancedWordLevelTokenizer
# This class is designed for creating a word-level tokenizer that can handle both standard words and special tokens.
# It can train on a corpus, encode and decode texts, and save/load its state for future use.
class EnhancedWordLevelTokenizer:
    # Initialization Method
    # Initializes the tokenizer with word-index and index-word mappings, and accepts special tokens.
    def __init__(self, special_tokens=None):
        self.word_index = {}  # Dictionary mapping words to indices
        self.index_word = {}  # Dictionary mapping indices to words
        self.special_tokens = special_tokens or []  # List of special tokens

    # Training Method
    # Trains the tokenizer on a list of texts. It processes the texts to create a word frequency map.
    # Special tokens are added first to ensure they have priority.
    def train(self, texts):
        word_counts = Counter()
        for text in texts:
            # Split the text into words and special tokens
            words = text.replace('>', '> ').replace('<', ' <').split()
            word_counts.update(words)

        # Add special tokens to the tokenizer
        for token in self.special_tokens:
            self.add_word(token)

        # Add words to the tokenizer based on their frequency
        for word, _ in word_counts.most_common():
            self.add_word(word)

    # Add Word Method
    # Adds a word to the tokenizer's dictionaries if it's not already present.
    def add_word(self, word):
        if word not in self.word_index:
            index = len(self.word_index)
            self.word_index[word] = index
            self.index_word[index] = word

    # Encoding Method
    # Converts a text into a sequence of indices representing the words.
    def encode(self, text):
        # Split the text into words and special tokens for encoding
        words = text.replace('>', '> ').replace('<', ' <').split()
        return [self.word_index.get(word, self.word_index["<unk>"]) for word in words]

    # Decoding Method
    # Converts a sequence of indices back into a text string.
    def decode(self, indices):
        decoded_words = []
        for index in indices:
            word = self.index_word.get(index, "<unk>")
            # Add the word or special token to the decoded list
            decoded_words.append(word)
        return ' '.join(decoded_words).strip()

    # Save Method
    # Saves the tokenizer's state (word-index mapping) to a file for later use.
    def save(self, path):
        with open(path, 'w') as file:
            for word, index in self.word_index.items():
                file.write(f'{word}\t{index}\n')

    # Load Method
    # Loads the tokenizer's state from a file, reconstructing the word-index and index-word mappings.
    def load(self, path):
        with open(path, 'r') as file:
            for line in file:
                word, index = line.strip().split('\t')
                self.word_index[word] = int(index)
                self.index_word[int(index)] = word

# Usage Example
# Define special tokens that will be treated uniquely by the tokenizer
special_tokens = ["<sos>", "<eos>", "<pad>", "<unk>", "<battle_start>", "<challenger>", "<defender>", "<winner>", "<battle_end>"]
tokenizer = EnhancedWordLevelTokenizer(special_tokens=special_tokens)

# Read and process the corpus file to prepare for tokenizer training
with open(CORPUS_FILE_PATH, 'r', encoding='utf-8') as file:
    lines = [line.strip() for line in file if line.strip()]

# Train the tokenizer with the processed lines and save its state
tokenizer.train(lines)
tokenizer.save(os.path.join(SAVE_PATH, 'word_level_tokenizer_rev1.txt'))


### Split Corpus train-validation-test

In [None]:
# This script is designed to split a text corpus into training, validation, and testing datasets.
# It is especially useful for preparing data for machine learning models in natural language processing.

def split_corpus(corpus_file, save_path, test_size, valid_size):
    """
    Splits the corpus into training, validation, and testing datasets.

    Args:
    corpus_file (str): Path to the corpus file.
    save_path (str): Directory where the split files will be saved.
    test_size (float): Proportion of the dataset to include in the test split.
    valid_size (float): Proportion of the training dataset to include in the validation split.

    The function reads the entire corpus, splits it into individual battles (assuming each battle ends with '<battle_end>\n'),
    and then divides these battles into training, validation, and test sets based on the specified proportions.
    """

    with open(corpus_file, 'r', encoding='utf-8') as f:
        # Read and split the corpus into separate battles
        data_cleaned = f.read().strip().split('<battle_end>\n')

    # Calculate the indices for splitting the data
    num_battles = len(data_cleaned)
    test_idx = int(num_battles * (1 - test_size))
    valid_idx = int(test_idx * (1 - valid_size))

    # Split the data into training, validation, and testing sets
    train_battles = data_cleaned[:valid_idx]
    valid_battles = data_cleaned[valid_idx:test_idx]
    test_battles = data_cleaned[test_idx:]

    # Write each dataset to a separate file
    for battles, name in [(train_battles, 'train'), (valid_battles, 'valid'), (test_battles, 'test')]:
        with open(os.path.join(save_path, f'{name}.txt'), 'w') as file:
            for battle in battles:
                file.write(battle + '<battle_end>\n')  # Append '<battle_end>\n' to maintain the original format

# Example usage of the function
split_corpus(CORPUS_FILE_PATH, SAVE_PATH, test_size=0.1, valid_size=0.1)


### Corpus split into paired verse
Corpus class to manage and tokenize the text corpus into training, validation, and testing sets for the LMST model. It processes the given corpus by splitting and encoding the text based on roles (challenger and defender) in battle rap verses, enabling these structured, tokenized data pairs to be used for the model training and evaluation.

In [None]:
import os
import torch
from collections import Counter

class Corpus(object):
    """
    A class to handle the tokenization of a text corpus for training, validation, and testing in NLP models.

    Attributes:
    tokenizer (EnhancedWordLevelTokenizer): An instance of tokenizer used to encode the text.
    train (list): List of tokenized verse pairs (challenger and defender) for training.
    valid (list): List of tokenized verse pairs for validation.
    test (list): List of tokenized verse pairs for testing.

    Methods:
    tokenize(path): Tokenizes the verses from a file at the given path.
    """

    def __init__(self, path, tokenizer):
        # Initialize the corpus with tokenized training, validation, and testing sets
        self.tokenizer = tokenizer
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        """
        Tokenizes a text file.

        Args:
        path (str): Path to the file to be tokenized.

        Returns:
        List of tokenized verse pairs (challenger, defender) in the file.

        The function reads the file line by line, distinguishing between challenger and defender verses,
        and tokenizes each verse using the provided tokenizer. Each battle is represented as a pair of tokenized verses.
        """
        verse_pairs = []
        challenger_verse, defender_verse = [], []
        is_challenger = True

        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line == "<battle_start>":
                    challenger_verse, defender_verse = [], []
                elif line == "<challenger>":
                    is_challenger = True
                elif line == "<defender>":
                    is_challenger = False
                elif line.startswith("<sos>"):
                    verse_content = line[5:].split("<eos>")[0].strip()  # Extract verse content
                    verse = self.tokenizer.encode(verse_content)
                    if is_challenger:
                        challenger_verse.extend(verse)
                    else:
                        defender_verse.extend(verse)
                elif line == "<battle_end>":
                    if challenger_verse and defender_verse:
                        verse_pairs.append((torch.tensor(challenger_verse, dtype=torch.long),
                                            torch.tensor(defender_verse, dtype=torch.long)))
                    challenger_verse, defender_verse = [], []

        return verse_pairs

# Example usage of the Corpus class
corpus = Corpus(SAVE_PATH, tokenizer)

# Print verses from the challenger and defender for the first few battles
num_battles_to_display = 5  # Number of battles to display
for i, (challenger_verse, defender_verse) in enumerate(corpus.train):
    if i >= num_battles_to_display:
        break

    challenger_text = ' '.join([tokenizer.index_word[idx] for idx in challenger_verse.numpy()])
    defender_text = ' '.join([tokenizer.index_word[idx] for idx in defender_verse.numpy()])

    print(f"Bataille {i+1}:")
    print("Challenger:", challenger_text)
    print("Defender:", defender_text)
    print("-" * 30)  # Separator for clarity


Bataille 1:
Challenger: have fun tryna make a comeback have fun tryna have sex with a dick like a thumb tack prolly spend hey day bitten watching pornhub i am so hungry look for milfs like its i found your mom and she is my top hoe she kinda old but still gimme top tho she kinda fishy smell like a flounder but who cares i am still going to pound her call me lil clumsy because i am a klutz tryna fuck the pussy but put it in they butts then i pull out and have hey mom on suck my nuts do not come at me tryna be lyrical because if i have to i will go political build you up break you down like the fucking wall take you out by your knees i make hey bitch ass fall to the ground shut the fuck up ion want to here sound like honestly how can one be so fucking dumb maybe hey daddy pulled out and you was born off of premium anybody got brains because this bitch really needs sum
Defender: hell you either know it or you own it i could give you some credit i understand you a little desperate i have l

### Embedding & LSTM model - to run each time
The provided script defines a neural network model for natural language processing tasks using Long Short-Term Memory (LSTM). The model features a hybrid embedding layer that integrates both pre-trained and trainable embeddings, enhancing its ability to handle a diverse vocabulary including out-of-vocabulary words. The LSTM layers in the model capture the temporal dependencies in the sequence data. This model is particularly suited for tasks involving sequential text data, like language modeling, text generation, or even more complex tasks such as sentiment analysis or machine translation. The inclusion of a hybrid embedding layer makes the model robust in handling a wide range of words, improving its overall performance on various natural language processing tasks.

In [None]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from gensim.models import KeyedVectors

class HybridEmbeddingLayer(nn.Module):
    """
    A layer that combines trainable embeddings with pretrained embeddings.

    Attributes:
    embedding (torch.nn.Embedding): An embedding layer.

    Methods:
    forward(x): Forward pass for embedding lookup.
    """

    def __init__(self, num_embeddings, embedding_dim, pretrained_embeddings=None, oov_indices=None):
        """
        Initialize the HybridEmbeddingLayer.

        Args:
        num_embeddings (int): The size of the vocabulary.
        embedding_dim (int): The size of each embedding vector.
        pretrained_embeddings (Tensor, optional): A tensor containing pretrained embeddings.
        oov_indices (list, optional): Indices of out-of-vocabulary words.
        """
        super(HybridEmbeddingLayer, self).__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
            if oov_indices is not None:
                self.embedding.weight.data[oov_indices].normal_()

    def forward(self, x):
        return self.embedding(x)

class LSTMModel(nn.Module):
    """
    A LSTM based sequence model.

    Attributes:
    ntoken (int): Number of tokens in the vocabulary.
    nhid (int): Number of hidden units per layer.
    nlayers (int): Number of LSTM layers.
    initrange (float): Range for weight initialization.
    drop (torch.nn.Dropout): Dropout layer.
    encoder (HybridEmbeddingLayer): Embedding layer.
    rnn (torch.nn.LSTM): LSTM network.
    decoder (torch.nn.Linear): Linear decoder layer.

    Methods:
    create_weight_matrix(ntoken, emb_dim, word2vec): Creates the weight matrix for embeddings.
    init_weights(): Initializes weights.
    init_hidden(bsz): Initializes hidden states.
    forward(input, hidden, lengths): Forward pass of the model.
    """

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.2, initrange=0.1, word2vec_path=None):
        """
        Initialize the LSTM model.

        Args:
        ntoken (int): Number of tokens in the vocabulary.
        ninp (int): Size of the embeddings.
        nhid (int): Number of hidden units per LSTM layer.
        nlayers (int): Number of LSTM layers.
        dropout (float): Dropout probability.
        initrange (float): Range for weight initialization.
        word2vec_path (str, optional): Path to pretrained Word2Vec embeddings.
        """
        super(LSTMModel, self).__init__()
        self.ntoken = ntoken
        self.nhid = nhid
        self.nlayers = nlayers
        self.initrange = initrange

        weights_matrix = None
        oov_indices = []

        if word2vec_path:
            word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
            weights_matrix, oov_indices = self.create_weight_matrix(ntoken, ninp, word2vec)

        self.drop = nn.Dropout(dropout)
        self.encoder = HybridEmbeddingLayer(ntoken, ninp, pretrained_embeddings=weights_matrix, oov_indices=oov_indices)
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=(dropout if nlayers > 1 else 0))
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

    def create_weight_matrix(self, ntoken, emb_dim, word2vec):
        """
        Creates a weight matrix for the embedding layer from a Word2Vec model.

        Args:
        ntoken (int): Number of tokens.
        emb_dim (int): Embedding dimension.
        word2vec (gensim.models.KeyedVectors): Pre-trained Word2Vec model.

        Returns:
        Tuple of weights_matrix (Tensor) and oov_indices (list).
        """
        weights_matrix = torch.zeros(ntoken, emb_dim)
        oov_indices = []
        for i, word in enumerate(vocabulary):
            try:
                weights_matrix[i] = torch.from_numpy(word2vec[word])
            except KeyError:
                oov_indices.append(i)
        return weights_matrix, oov_indices

    def init_weights(self):
        """
        Initialize weights of the model.
        """
        nn.init.uniform_(self.encoder.embedding.weight, -self.initrange, self.initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -self.initrange, self.initrange)

    def init_hidden(self, bsz):
        """
        Initialize the hidden states of the LSTM.

        Args:
        bsz (int): Batch size.

        Returns:
        A tuple of tensors representing the initial hidden state and cell state.
        """
        weight = next(self.parameters()).data
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))

    def forward(self, input, hidden, lengths):
        """
        Forward pass of the LSTM model.

        Args:
        input (Tensor): Input tensor.
        hidden (tuple): Initial hidden and cell states.
        lengths (Tensor): The length of each sequence in the batch.

        Returns:
        Output tensor and the final hidden state.
        """
        emb = self.drop(self.encoder(input))

        packed_emb = pack_padded_sequence(emb, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, hidden = self.rnn(packed_emb, hidden)

        max_length = input.size(1)
        output, _ = pad_packed_sequence(packed_output, batch_first=True, total_length=max_length)

        return output, hidden


### Batching & Padding - to run each time
This script creates a custom Dataset and DataLoader for handling rap battle verses. The Dataset class PaddedBatchedRapBattles is designed to equalize the length of verses within each pair and sort them by length for efficient batching. It also includes a custom collation function to handle the padding and batching of variable-length sequences. This setup is particularly useful for tasks involving sequential data, such as language modeling or text generation, where input sequences can vary in length.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

class PaddedBatchedRapBattles(Dataset):
    """
    A Dataset class for handling rap battle verse pairs, ensuring they are padded and batched efficiently.

    Attributes:
    pad_idx (int): Index used for padding.
    sorted_verse_pairs (list): List of verse pairs, sorted by their length.

    Methods:
    equalize_verse_pairs_lengths(verse_pairs): Equalizes the length of verses in each pair.
    sort_verse_pairs_by_length(verse_pairs): Sorts verse pairs by their total length.
    __len__(): Returns the number of verse pairs.
    __getitem__(idx): Retrieves a verse pair based on the index.
    collate_fn(batch): Custom collate function to batch verses.
    """

    def __init__(self, verse_pairs, pad_idx):
        """
        Initializes the dataset with verse pairs that are equalized in length and sorted.

        Args:
        verse_pairs (list of tuples): List of tuples where each tuple contains two verses (challenger and defender).
        pad_idx (int): Index used for padding.
        """
        self.pad_idx = pad_idx
        equalized_verse_pairs = self.equalize_verse_pairs_lengths(verse_pairs)
        self.sorted_verse_pairs = self.sort_verse_pairs_by_length(equalized_verse_pairs)

    def equalize_verse_pairs_lengths(self, verse_pairs):
        """
        Equalizes the lengths of verses in each verse pair by padding them.

        Args:
        verse_pairs (list of tuples): Verse pairs to be equalized.

        Returns:
        List of equalized verse pairs.
        """
        equalized_verse_pairs = []
        for challenger_verse, defender_verse in verse_pairs:
            max_length = max(len(challenger_verse), len(defender_verse))
            challenger_verse_padded = list(challenger_verse) + [self.pad_idx] * (max_length - len(challenger_verse))
            defender_verse_padded = list(defender_verse) + [self.pad_idx] * (max_length - len(defender_verse))
            equalized_verse_pairs.append((challenger_verse_padded, defender_verse_padded))
        return equalized_verse_pairs

    def sort_verse_pairs_by_length(self, verse_pairs):
        """
        Sorts verse pairs by the total length of their verses.

        Args:
        verse_pairs (list): Verse pairs to be sorted.

        Returns:
        Sorted list of verse pairs.
        """
        return sorted(verse_pairs, key=lambda x: len(x[0]) + len(x[1]))

    def __len__(self):
        return len(self.sorted_verse_pairs)

    def __getitem__(self, idx):
        challenger_verse, defender_verse = self.sorted_verse_pairs[idx]
        return torch.tensor(challenger_verse, dtype=torch.long), torch.tensor(defender_verse, dtype=torch.long)

    def collate_fn(self, batch):
        """
        Custom collation function to group verses in a batch, handling variable-length sequences.

        Args:
        batch (list): Batch of verse pairs.

        Returns:
        Tuple of tensors representing padded challenger and defender verses.
        """
        challenger_batch, defender_batch = zip(*batch)
        challenger_padded = pad_sequence([v.clone().detach() for v in challenger_batch], batch_first=True, padding_value=self.pad_idx)
        defender_padded = pad_sequence([v.clone().detach() for v in defender_batch], batch_first=True, padding_value=self.pad_idx)
        return challenger_padded, defender_padded

def create_dataloader(verse_pairs, batch_size, pad_idx):
    """
    Creates a DataLoader for the rap battle verses dataset.

    Args:
    verse_pairs (list): List of verse pairs.
    batch_size (int): Size of each batch.
    pad_idx (int): Padding index.

    Returns:
    DataLoader object.
    """
    dataset = PaddedBatchedRapBattles(verse_pairs, pad_idx)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=dataset.collate_fn)

# `pad_idx` define
PAD_IDX = tokenizer.word_index["<pad>"]


### Batching & Padding - Test - No need run each time
This script demonstrates how to use the PaddedBatchedRapBattles dataset and create_dataloader function to create a DataLoader for training. It initializes the dataset with training data, specifying the padding index to handle variable-length sequences. A DataLoader is then created with a specified batch size. The script iterates through the first two batches of the DataLoader, displaying the contents of each batch for both challenger and defender verses. This setup is essential for training models on sequential data, where efficient batching and padding are crucial for handling varying sequence lengths.

In [None]:
# Example of creating the dataset with padding and batching
# Initialize the dataset using the training data and padding index
train_dataset_example = PaddedBatchedRapBattles(corpus.train, PAD_IDX)

# Use the create_dataloader function to create a DataLoader
# You can adjust the batch size as per your requirements
batch_size_example = 2
train_loader_example = create_dataloader(corpus.train, batch_size_example, PAD_IDX)

# Displaying details of the batches
# Iterate through the DataLoader to access batches
for i, batch in enumerate(train_loader_example):
    if i >= 2:  # Limit to 2 batches for demonstration purposes
        break

    # Extract challenger and defender verses from the batch
    challenger_batch, defender_batch = batch
    print(f"Batch {i + 1} :")
    print("Challenger Batch:", challenger_batch)
    print("Defender Batch:", defender_batch)
    print("-" * 40)  # Separator for clarity


Batch 1 :
Challenger Batch: tensor([[   79,    74,    13,   477,    28,    20,    17,    89,    38,     9,
           125,    13,    30,    47,   143,    10,    39,    27,    11,   536,
           244,   487,    10,    13,    17,  5301,  3878,    14,     9,    16,
            65,    13,   259,    17,   170,    13,    12, 13992,   104,    20,
            11,  1953,  2014,    49,    10,    65,    13,   221,   471,    84,
            22,    13,    18,   148,    49,    10,    30,    19,     9,    16,
            35,   584,    63,    80,    17,   302,   104,   389,    58,    11,
          4658,    75,   122,    12,  2865,     9,    37,    20,    18,   636,
             9,    30,    19,   189,   438,  2014,  2014,  2014,  1495,     9,
            34,  1670,    21,    79,    18,  2834,     9,   635,    24,    45,
            30,    19,    34,  1799,    23,    41,   260,     9,    31,   477,
            28,    60,     9,    16,    12,  6708,    42,     9,    16,    39,
            28,    11,  

### Batching & Padding - on all corpus - to run each time
This script sets up the DataLoaders for training, validation, and testing phases in a machine learning workflow. It first defines appropriate batch sizes for training and evaluation. Then, it initializes datasets for each phase (training, validation, testing) using the PaddedBatchedRapBattles class, which ensures that the data is consistently padded and batched. This class is particularly useful for sequential data like text, where inputs can vary in length. Finally, the script creates DataLoaders for these datasets, which will allow for efficient iteration over the data during model training and evaluation. The DataLoader for training uses a larger batch size compared to the evaluation DataLoader, reflecting common practice in machine learning to use smaller batches for evaluation to reduce memory requirements and potentially increase validation/test accuracy.

In [None]:
from torch.utils.data import DataLoader

# Define batch sizes for training and evaluation
BATCH_SIZE = 8  # Number of samples per batch during training
EVAL_BATCH_SIZE = 4  # Number of samples per batch during evaluation

# Initialize datasets for training, validation, and testing
# Using the PaddedBatchedRapBattles class ensures consistent padding and batch processing
train_dataset = PaddedBatchedRapBattles(corpus.train, PAD_IDX)
val_dataset = PaddedBatchedRapBattles(corpus.valid, PAD_IDX)
test_dataset = PaddedBatchedRapBattles(corpus.test, PAD_IDX)

# Create DataLoaders for the datasets
# DataLoaders provide an efficient way to iterate over the datasets in batches
# The create_dataloader function is used to create these loaders with specified batch sizes
train_loader = create_dataloader(corpus.train, BATCH_SIZE, PAD_IDX)
val_loader = create_dataloader(corpus.valid, EVAL_BATCH_SIZE, PAD_IDX)
test_loader = create_dataloader(corpus.test, EVAL_BATCH_SIZE, PAD_IDX)

# Now, train_loader, val_loader, and test_loader can be used in the training and evaluation loop


### Train - Validate - Evaluate functions - to run each time
This code provides the functions train, validate, and evaluate for an LSTM model. Each function performs a key step in the machine learning workflow: training the model with backpropagation, validating it on a separate dataset, and evaluating its performance on test data. The functions handle data from a DataLoader, which supplies inputs for the challenger and defender in a rap battle scenario. The model is trained on the challenger's verses first, with the hidden state then passed to the defender's verses, simulating a conversational context. Loss is computed based on the defender's output, reflecting the responsive nature of the training scenario. The model is optimized to predict the defender's response based on the challenger's input, considering the flow of the conversation.

In [None]:
import torch
from tqdm import tqdm

def train(model, train_loader, criterion, optimizer, device, pad_idx):
    """
    Train the model for one epoch.

    Args:
        model: The LSTM model to be trained.
        train_loader: DataLoader for the training data.
        criterion: Loss function.
        optimizer: Optimization algorithm.
        device: Device to train on (e.g., 'cuda' or 'cpu').
        pad_idx: Index of the padding token in the vocabulary.

    Returns:
        Average training loss for the epoch.
    """
    model.train()  # Set the model to training mode
    total_train_loss = 0

    for batch_num, (challenger_inputs, defender_inputs) in enumerate(tqdm(train_loader, desc="Training")):
        # Transfer input data to the specified device (e.g., GPU)
        challenger_inputs, defender_inputs = challenger_inputs.to(device), defender_inputs.to(device)

        # Clear previous gradients
        optimizer.zero_grad()

        # Initialize the hidden state of the LSTM
        hidden = model.init_hidden(challenger_inputs.size(0))

        # Forward pass through the model for the challenger
        _, hidden = model(challenger_inputs, hidden, challenger_inputs.ne(pad_idx).sum(1))

        # Forward pass for the defender, using the updated hidden state from the challenger's pass
        output, _ = model(defender_inputs, hidden, defender_inputs.ne(pad_idx).sum(1))

        # Decode the LSTM output to token space
        output = model.decoder(output)

        # Flatten the output and targets for loss computation
        targets_flat = defender_inputs.view(-1)
        output_flat = output.view(-1, model.ntoken)

        # Calculate loss, ignoring the padding
        loss = criterion(output_flat, targets_flat)
        loss = loss.masked_select(targets_flat.ne(pad_idx)).mean()
        total_train_loss += loss.item()

        # Perform backpropagation and optimize the model
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_loader)
    return avg_train_loss

def validate(model, val_loader, criterion, device, pad_idx):
    """
    Validate the model on the validation dataset.

    Args:
        model: The LSTM model to be validated.
        val_loader: DataLoader for the validation data.
        criterion: Loss function.
        device: Device for validation (e.g., 'cuda' or 'cpu').
        pad_idx: Index of the padding token in the vocabulary.

    Returns:
        Average validation loss.
    """
    model.eval()  # Set the model to evaluation mode
    total_val_loss = 0

    with torch.no_grad():
        for val_batch_num, (challenger_inputs, defender_inputs) in enumerate(tqdm(val_loader, desc="Validation")):
            challenger_inputs, defender_inputs = challenger_inputs.to(device), defender_inputs.to(device)
            hidden = model.init_hidden(challenger_inputs.size(0))
            _, hidden = model(challenger_inputs, hidden, challenger_inputs.ne(pad_idx).sum(1))
            output, _ = model(defender_inputs, hidden, defender_inputs.ne(pad_idx).sum(1))
            output = model.decoder(output)
            targets_flat = defender_inputs.view(-1)
            output_flat = output.view(-1, model.ntoken)
            loss = criterion(output_flat, targets_flat)
            loss = loss.masked_select(targets_flat.ne(pad_idx)).mean()
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_loader)
    return avg_val_loss

def evaluate(model, eval_loader, criterion, device, pad_idx):
    """
    Evaluate the model on the test dataset.

    Args:
        model: The LSTM model to be evaluated.
        eval_loader: DataLoader for the test data.
        criterion: Loss function.
        device: Device for evaluation (e.g., 'cuda' or 'cpu').
        pad_idx: Index of the padding token in the vocabulary.

    Returns:
        Average loss on the test data.
    """
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for batch_num, (challenger_inputs, defender_inputs) in enumerate(tqdm(eval_loader, desc="Evaluating")):
            challenger_inputs, defender_inputs = challenger_inputs.to(device), defender_inputs.to(device)
            hidden = model.init_hidden(challenger_inputs.size(0))
            _, hidden = model(challenger_inputs, hidden, challenger_inputs.ne(pad_idx).sum(1))
            output, _ = model(defender_inputs, hidden, defender_inputs.ne(pad_idx).sum(1))
            output = model.decoder(output)
            targets_flat = defender_inputs.view(-1)
            output_flat = output.view(-1, model.ntoken)
            loss = criterion(output_flat, targets_flat)
            loss = loss.masked_select(targets_flat.ne(pad_idx)).mean()
            total_loss += loss.item()

        avg_loss = total_loss / len(eval_loader)
    return avg_loss

### Hyper-parameters setup - to run each time

In [None]:
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
import os
import time
from gensim.models import KeyedVectors

# Define constants and setup for the LSTM model training
EMBEDDING_SIZE = 400  # Size of word embeddings
HIDDEN_SIZE = 512     # Number of features in the hidden state of the LSTM
N_LAYERS = 3          # Number of stacked LSTM layers
DROPOUT = 0.3         # Dropout rate for regularization
EPOCHS = 20           # Number of training epochs
LEARNING_RATE = 0.01  # Learning rate for the optimizer
WEIGHT_DECAY = 5e-4   # Weight decay (L2 penalty) for regularization
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Training device

# Load pre-trained Word2Vec embeddings
word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

# Map words to indices for the model
word_index = tokenizer.word_index  # Assume tokenizer is already initialized
N_TOKENS = len(word_index)         # Total number of unique tokens in the vocabulary

# Initialize the LSTM model with the specified parameters
model = LSTMModel(
    N_TOKENS, EMBEDDING_SIZE, HIDDEN_SIZE, N_LAYERS,
    dropout=DROPOUT, word2vec_path=WORD2VEC_PATH
).to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss(reduction='none')
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Learning rate scheduler to reduce the learning rate when a metric has stopped improving
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, verbose=True)



### Training of the LSTM model - to run for each new training
This script outlines the training and evaluation loop for an LSTM model. It includes training and validation phases for each epoch, adjusts the learning rate based on validation loss, and logs key metrics like epoch duration, learning rate, and losses. The script saves the best model based on validation loss and the final model after all epochs. Additionally, it performs quick evaluations on the test data after each epoch and a final evaluation at the end.

This script includes early stopping. The training will stop if there's no improvement in the validation loss for a number of epochs equal to patience. After early stopping or completion of all epochs, the best model (based on validation loss) is loaded for final evaluation on the test data

In [None]:
import time
import torch
import os

# Training and evaluation loop with early stopping
best_val_loss = float('inf')
patience = 5  # Number of epochs to wait after last time validation loss improved.
counter = 0  # Counter for early stopping

for epoch in range(EPOCHS):
    start_time = time.time()

    # Training phase
    train_loss = train(model, train_loader, criterion, optimizer, device, PAD_IDX)

    # Validation phase
    val_loss = validate(model, val_loader, criterion, device, PAD_IDX)

    # Update learning rate
    scheduler.step(val_loss)

    # Log epoch details
    epoch_time = time.time() - start_time
    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch: {epoch+1}, Time: {epoch_time:.2f}s, LR: {current_lr:.6f}, "
          f"Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    # Check for improvement in validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_path = os.path.join(SAVE_PATH, 'best_model.pth')
        torch.save(model.state_dict(), best_model_path)
        print(f"Improved validation loss: {val_loss:.4f}. Saving model to {best_model_path}")
        counter = 0  # Reset counter after improvement
    else:
        counter += 1  # Increment counter if no improvement
        print(f"No improvement in validation loss for {counter} epoch(s).")

    # Early stopping check
    if counter >= patience:
        print("Stopping training early due to no improvement in validation loss.")
        break

    # Quick evaluation on test data
    quick_test_loss = evaluate(model, test_loader, criterion, device, PAD_IDX)
    print(f"Quick Test Loss: {quick_test_loss:.4f}")

# Load the best model for final evaluation
model.load_state_dict(torch.load(best_model_path))
print(f"Loaded best model from {best_model_path} for final evaluation.")

# Final evaluation on the test data
final_test_loss = evaluate(model, test_loader, criterion, device, PAD_IDX)
print(f"Final Test Loss: {final_test_loss:.4f}")


Training: 100%|██████████| 1653/1653 [08:08<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:21<00:00, 17.52it/s]


Epoch: 1, Time: 509.64s, LR: 0.001000, Training Loss: 3.6181, Validation Loss: 2.0465
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.80it/s]


Quick Test Loss: 1.9797


Training: 100%|██████████| 1653/1653 [08:11<00:00,  3.36it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.56it/s]


Epoch: 2, Time: 512.89s, LR: 0.001000, Training Loss: 1.5689, Validation Loss: 1.1785
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.83it/s]


Quick Test Loss: 1.1188


Training: 100%|██████████| 1653/1653 [08:09<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:21<00:00, 17.47it/s]


Epoch: 3, Time: 510.66s, LR: 0.001000, Training Loss: 0.8714, Validation Loss: 0.6967
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.85it/s]


Quick Test Loss: 0.6520


Training: 100%|██████████| 1653/1653 [08:09<00:00,  3.37it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.53it/s]


Epoch: 4, Time: 510.81s, LR: 0.001000, Training Loss: 0.4753, Validation Loss: 0.4293
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.92it/s]


Quick Test Loss: 0.3972


Training: 100%|██████████| 1653/1653 [08:10<00:00,  3.37it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.63it/s]


Epoch: 5, Time: 511.42s, LR: 0.001000, Training Loss: 0.2690, Validation Loss: 0.3049
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.81it/s]


Quick Test Loss: 0.2818


Training: 100%|██████████| 1653/1653 [08:08<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.54it/s]


Epoch: 6, Time: 509.90s, LR: 0.001000, Training Loss: 0.1513, Validation Loss: 0.2433
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.86it/s]


Quick Test Loss: 0.2218


Training: 100%|██████████| 1653/1653 [08:09<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:21<00:00, 17.51it/s]


Epoch: 7, Time: 510.65s, LR: 0.001000, Training Loss: 0.0904, Validation Loss: 0.2070
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 19.01it/s]


Quick Test Loss: 0.1885


Training: 100%|██████████| 1653/1653 [08:09<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.72it/s]


Epoch: 8, Time: 510.40s, LR: 0.001000, Training Loss: 0.0636, Validation Loss: 0.2309


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.96it/s]


Quick Test Loss: 0.2155


Training: 100%|██████████| 1653/1653 [08:08<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.57it/s]


Epoch: 9, Time: 509.45s, LR: 0.001000, Training Loss: 0.0753, Validation Loss: 0.1873
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 19.03it/s]


Quick Test Loss: 0.1745


Training: 100%|██████████| 1653/1653 [08:09<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:21<00:00, 17.50it/s]


Epoch: 10, Time: 510.64s, LR: 0.001000, Training Loss: 0.0388, Validation Loss: 0.1765
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.77it/s]


Quick Test Loss: 0.1641


Training: 100%|██████████| 1653/1653 [08:07<00:00,  3.39it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.69it/s]


Epoch: 11, Time: 508.77s, LR: 0.001000, Training Loss: 0.0257, Validation Loss: 0.1630
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.92it/s]


Quick Test Loss: 0.1530


Training: 100%|██████████| 1653/1653 [08:07<00:00,  3.39it/s]
Validation: 100%|██████████| 368/368 [00:21<00:00, 17.50it/s]


Epoch: 12, Time: 508.85s, LR: 0.001000, Training Loss: 0.0188, Validation Loss: 0.1577
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 19.06it/s]


Quick Test Loss: 0.1482


Training: 100%|██████████| 1653/1653 [08:08<00:00,  3.39it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.54it/s]


Epoch: 13, Time: 509.03s, LR: 0.001000, Training Loss: 0.0139, Validation Loss: 0.1560
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.86it/s]


Quick Test Loss: 0.1459


Training: 100%|██████████| 1653/1653 [08:09<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:21<00:00, 17.45it/s]


Epoch: 14, Time: 510.17s, LR: 0.001000, Training Loss: 0.0110, Validation Loss: 0.1546
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.82it/s]


Quick Test Loss: 0.1431


Training: 100%|██████████| 1653/1653 [08:08<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.55it/s]


Epoch: 15, Time: 509.90s, LR: 0.001000, Training Loss: 0.0087, Validation Loss: 0.1552


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.80it/s]


Quick Test Loss: 0.1418


Training: 100%|██████████| 1653/1653 [08:10<00:00,  3.37it/s]
Validation: 100%|██████████| 368/368 [00:21<00:00, 17.48it/s]


Epoch: 16, Time: 511.62s, LR: 0.001000, Training Loss: 0.0074, Validation Loss: 0.1473
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.78it/s]


Quick Test Loss: 0.1376


Training: 100%|██████████| 1653/1653 [08:08<00:00,  3.39it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.54it/s]


Epoch: 17, Time: 509.18s, LR: 0.001000, Training Loss: 0.0059, Validation Loss: 0.1454
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.88it/s]


Quick Test Loss: 0.1330


Training: 100%|██████████| 1653/1653 [08:09<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.60it/s]


Epoch: 18, Time: 510.15s, LR: 0.001000, Training Loss: 0.0052, Validation Loss: 0.1473


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.85it/s]


Quick Test Loss: 0.1340


Training: 100%|██████████| 1653/1653 [08:08<00:00,  3.38it/s]
Validation: 100%|██████████| 368/368 [00:21<00:00, 17.45it/s]


Epoch: 19, Time: 509.62s, LR: 0.001000, Training Loss: 0.0044, Validation Loss: 0.1409
New best model saved to /content/drive/MyDrive/Colab Notebooks/save_files/best_model8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.87it/s]


Quick Test Loss: 0.1308


Training: 100%|██████████| 1653/1653 [08:08<00:00,  3.39it/s]
Validation: 100%|██████████| 368/368 [00:20<00:00, 17.54it/s]


Epoch: 20, Time: 509.17s, LR: 0.001000, Training Loss: 0.0036, Validation Loss: 0.1443


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.81it/s]


Quick Test Loss: 0.1321
Final model saved to /content/drive/MyDrive/Colab Notebooks/save_files/model_final8.pth


Evaluating: 100%|██████████| 408/408 [00:21<00:00, 18.99it/s]

Final Test Loss: 0.1315





### Generate function - to run when generation needed
This script defines a function that generates a response verse to a given input verse using a trained model. The function tokenizes the input, feeds it through the model iteratively, and builds a response verse based on the model's predictions, using temperature scaling and top-k sampling for more nuanced and controlled text generation.

In [None]:
import torch
import torch.nn.functional as F

def generate_response(model, input_verse, tokenizer, max_length=50, temperature=1.0, topk=10):
    """
    Generates a response verse given an input verse using the trained model.

    Parameters:
    model (nn.Module): The trained neural network model.
    input_verse (str): The input verse to which the model will generate a response.
    tokenizer: The tokenizer used for encoding and decoding the words.
    max_length (int): The maximum length of the response verse.
    temperature (float): The temperature for sampling. A higher value generates more random outputs.
    topk (int): The number of top probable next words to consider for sampling.

    Returns:
    str: The generated response verse.
    """
    model.eval()  # Set the model to evaluation mode
    hidden = model.init_hidden(1)  # Initialize the hidden state
    device = next(model.parameters()).device  # Get the device of the model

    # Tokenize the input verse. Unknown words are replaced with <unk> token
    input_tokens = [tokenizer.word_index.get(w, tokenizer.word_index['<unk>']) for w in input_verse.split()]

    # Generate response
    response = []
    for i in range(max_length):
        with torch.no_grad():
            # Convert input tokens to tensor and move to the model's device
            input_tensor = torch.tensor([input_tokens], dtype=torch.long).to(device)
            lengths = torch.tensor([len(input_tokens)], dtype=torch.long).to(device)

            # Forward pass through the model
            output, hidden = model(input_tensor, hidden, lengths)

            # Apply temperature scaling to the last time-step output
            last_output = output[0, -1, :].squeeze().div(temperature)
            word_weights = F.softmax(last_output, dim=-1)  # Softmax to get probabilities

            # Select topk indices to sample from
            topk_indices = torch.topk(word_weights, topk).indices
            chosen_word_index = topk_indices[torch.randint(0, topk, (1,))].item()

            if chosen_word_index == tokenizer.word_index.get('<eos>', -1):
                break  # Stop if end-of-sequence token is generated

            # Append the chosen word to the response and update the input tokens
            response.append(tokenizer.index_word.get(chosen_word_index, "<unk>"))
            input_tokens.append(chosen_word_index)

    return ' '.join(response)  # Join the response words into a single string


### Generation execution
This script focuses on loading a pre-trained model, evaluating it on test data, and then using it to generate responses to a given challenger verse. The responses vary based on different combinations of temperature and topk values, demonstrating the model's ability to create diverse and contextually relevant verses. The script is useful for testing and showcasing the capabilities of the trained model in a rap battle context.

In [None]:
import torch
from tqdm import tqdm
import os

# Load the best model
best_model_path = os.path.join(SAVE_PATH, 'best_model7.pth')
if os.path.exists(best_model_path):
    # Load the state dictionary of the best model if it exists
    model.load_state_dict(torch.load(best_model_path))
    print("Model loaded successfully.")
else:
    # Print a message if the model file does not exist
    print(f"Model not found in {best_model_path}. Ensure the path is correct.")

# Evaluate the model on test data
test_loss = evaluate(model, test_loader, criterion, device, PAD_IDX)
print(f'Test Loss: {test_loss:.2f}')  # Print the calculated test loss

# Example Challenger verse for generating a response
challenger_verse = "i am the best in the world and you bitch who are you do you think you can win against me your dick is so small that nobody can see it"

# Define expanded ranges for temperature and topk values for response generation
temperatures = [0.7, 0.75, 0.8, 0.85]  # Temperature values to control randomness
topks = [10, 12, 15]  # Top-k values to limit choices in sampling

# Generate and display responses for each combination of temperature and topk
for temp in temperatures:
    for topk in topks:
        # Generate a response from the defender's perspective
        defender_verse = generate_response(model, challenger_verse, tokenizer, max_length=50, temperature=temp, topk=topk)
        print(f"\nDefender's response with temperature {temp} and topk {topk}:")
        print(defender_verse)  # Print the generated response



Model loaded successfully.


Evaluating: 100%|██████████| 408/408 [00:08<00:00, 46.15it/s]


Test Loss: 1.24

Defender's response with temperature 0.7 and topk 10:
him already fuck rap or would you any been emma again already should hell has keep ya over hey am next down something again heart you do i been but time be ass for been would ass made do i had from a some you been emma would him then

Defender's response with temperature 0.7 and topk 12:
back feel find there being that why over should well made do i being lines but going hell yeah should hell then flow make dead im but time a heart going heart been can or would back want words had you been do were fuckin with keep sick already my

Defender's response with temperature 0.7 and topk 15:
find think ya because going rap been do stop the has thats but yeah has thats am ass own little lyrics much something with some got you had can <defender> can <defender> i mind that been made because shit mean here dick from been then high should time high ass

Defender's response with temperature 0.75 and topk 10:
find because should a