#### Preparing data for the training

To train the text classification model, I use each speech as a sample and its party affiliation as the tag.
Before that, I want to look at the quality of the speeches. Some might be too short or too long.

In [1]:
'''import Libraries'''
import os
import sys
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import pickle as pkl

import spacy
from collections import Counter
from itertools import chain

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Sampler, BatchSampler, Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import warnings
warnings.filterwarnings("ignore")

from datasets import load_dataset

In [2]:
def regularized_f1(train_f1, dev_f1, threshold=0.0015):
    """
    Score a model by its development F1, rejecting overfitted models.

    The gap ``train_f1 - dev_f1`` is compared against ``threshold``: when the
    gap is strictly smaller, ``dev_f1`` is returned unchanged; otherwise the
    score is 0.
    """
    gap = train_f1 - dev_f1
    if gap < threshold:
        return dev_f1
    return 0


def save_metrics(*args, path, fname):
    """
    Append one row of metrics to a CSV file, creating it on first use.

    If the file does not exist yet it is created (together with the directory
    ``path``) and the header row is written. When positional values are given
    they are stringified and appended as one comma-separated row; calling the
    function without positional values only makes sure the file exists.

    Args:
        *args: Values of one row, in header order (config, epoch, train_loss,
            train_acc, train_f1, val_loss, val_acc, val_f1).
        path: Directory of the metrics file, with or without a trailing
            separator. An empty string means the current directory.
        fname: File name of the metrics file inside ``path``.
    """
    header = (
        "config",
        "epoch",
        "train_loss",
        "train_acc",
        "train_f1",
        "val_loss",
        "val_acc",
        "val_f1",
    )

    # os.makedirs("") raises, so only create a directory when one was named.
    if path:
        os.makedirs(path, exist_ok=True)

    # Join with a separator: plain ``path + fname`` puts the file *next to*
    # the directory whenever ``path`` lacks a trailing slash.
    target = os.path.join(path, fname)

    if not os.path.isfile(target):
        with open(target, "w", newline="\n") as f:
            f.write(",".join(header) + "\n")

    if args:
        with open(target, "a", newline="\n") as f:
            f.write(",".join(str(arg) for arg in args) + "\n")

def seed_everything(seed: int) -> None:
    """
    Seed every random number generator used here and make cuDNN deterministic.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for reproducible runs.

    Args:
        seed: The seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed when the interpreter starts, so this only
    # affects processes launched afterwards that inherit the environment.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe without CUDA: the call is ignored when no GPU is available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The benchmark autotuner may pick a different convolution algorithm on
    # every run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False


seed_everything(1234)

In [3]:
# Hyper-parameters shared by the cells of this notebook.
VOCAB_SIZE = 5_000 # vocabulary size setting (20_000 noted as an alternative)
BATCH_SIZE = 32  # samples per batch, used by the samplers and DataLoaders
NUM_EPOCHS = 15
MAX_LEN = 128 # maximum number of tokens kept per speech (256 noted as an alternative)
LEARNING_RATE = 1e-4
seed_value = 43
# NOTE(review): this re-seeds torch's global RNG with 43 and thereby replaces
# the torch seed set by seed_everything(1234); `random` and NumPy keep the
# earlier seed. Confirm that two different seeds are intended.
torch.manual_seed(seed_value)

<torch._C.Generator at 0x1f7bd0ebef0>

In [4]:
# Load the pickled dataset; the cells below read its 'text' and 'party' columns.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle('data/dataset.pkl')

In [10]:
# Split the first 140 rows into consecutive train / dev / test parts.
# Slice ends are exclusive, so each part has to start exactly where the
# previous one stopped; starting one row later silently drops that row.
train_data = df[:100]
dev_data = df[100:120]
test_data = df[120:140]

In [6]:
class SpacyVocab:
    """
    Token-to-index vocabulary on top of a spaCy pipeline.

    ``build_vocab`` assigns the first indices to the special tokens and the
    following ones to the most frequent corpus tokens. Calling the instance
    converts a text into a list of token indices, truncated to ``max_len``.
    """

    def __init__(self, spacy_model_name="de_core_news_md", max_vocab_size=10000, specials=('<UNK>', '<PAD>')):
        """
        Args:
            spacy_model_name: Name of the spaCy model to load.
            max_vocab_size: Upper bound on the vocabulary size, special
                tokens included.
            specials: Special tokens; they receive indices 0, 1, ... in the
                given order. Unknown tokens are mapped to '<UNK>'.
        """
        # Load the spaCy model. Disable parts of the pipeline that are not needed to speed up processing
        self.nlp = spacy.load(spacy_model_name, disable=["ner", "parser", "tagger", "attribute_ruler", "lemmatizer"])
        self.max_vocab_size = max_vocab_size
        # Copy the sequence so the instance never shares a mutable default
        # (or the caller's list) with anyone else.
        self.specials = list(specials)
        self.vocab = {}
        self.unk_index = None

    def build_vocab(self, sentences):
        """
        Build the vocabulary from an iterable of texts.

        Args:
            sentences: Iterable of strings to tokenize and count.
        """
        # Count tokens while streaming through spaCy's bulk ``pipe`` so that
        # the parsed documents do not all have to be kept in memory at once.
        token_counts = Counter(token.text for doc in self.nlp.pipe(sentences) for token in doc)

        # Special tokens come first; remember where unknown tokens map to.
        self.vocab = {token: idx for idx, token in enumerate(self.specials)}
        self.unk_index = self.vocab.get('<UNK>')

        # Fill the remaining slots with the most frequent tokens.
        first_free = len(self.specials)
        most_common_tokens = token_counts.most_common(self.max_vocab_size - first_free)
        for idx, (token, _) in enumerate(most_common_tokens, start=first_free):
            self.vocab[token] = idx

    def __call__(self, sentence, max_len):
        """
        Convert a text into a list of at most ``max_len`` token indices.

        Args:
            sentence: The text to encode.
            max_len: Maximum number of tokens to keep (from the start).

        Returns:
            A list of vocabulary indices; tokens missing from the vocabulary
            are replaced by the index of '<UNK>'.

        Raises:
            KeyError: If '<UNK>' is not part of the vocabulary (for example
                because ``build_vocab`` has not been called yet).
        """
        # Truncating before the lookup gives the same result as truncating
        # the indices afterwards, without looking up tokens that get dropped.
        tokens = [token.text for token in self.nlp(sentence)][:max_len]
        if not tokens:
            return []

        # Resolve the fallback index once instead of once per token.
        unk = self.unk_index if self.unk_index is not None else self.vocab['<UNK>']
        return [self.vocab.get(text, unk) for text in tokens]

# Create the vocabulary with the configured size limit. Without the explicit
# argument the class default (10000) is used and VOCAB_SIZE has no effect.
spacy_vocab = SpacyVocab(max_vocab_size=VOCAB_SIZE)

# Build the vocabulary from the training split only; tokens that occur only
# in the dev/test speeches are mapped to <UNK> when those are encoded.
spacy_vocab.build_vocab(train_data['text'])

In [9]:
# Encode every training speech as a list of vocabulary indices (at most MAX_LEN each).
train_idx = [
    spacy_vocab(speech, MAX_LEN)
    for speech in train_data['text']
]

In [11]:
# Encode every development speech as a list of vocabulary indices (at most MAX_LEN each).
dev_idx = [
    spacy_vocab(speech, MAX_LEN)
    for speech in dev_data['text']
]

In [12]:
# Encode every test speech as a list of vocabulary indices (at most MAX_LEN each).
test_idx = [
    spacy_vocab(speech, MAX_LEN)
    for speech in test_data['text']
]

In [13]:
class ParliamentDataset(Dataset):
    """
    Dataset of (token index sequence, label) pairs.

    Args:
        seq: Sequence of token index lists, one per sample.
        lbl: Labels aligned with ``seq`` by position. Any iterable works,
            including a pandas Series whose index does not start at 0.
    """

    def __init__(self, seq, lbl):
        self.seq = seq
        # Store the labels as a plain list so that ``idx`` is always a
        # position. Indexing a pandas Series with ``[]`` looks up index
        # *labels*, which raises KeyError for a slice such as df[100:120]
        # whose index starts at 100.
        self.lbl = list(lbl)

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a ``(sequence, label)`` tensor pair."""
        # Explicit dtype: an empty sequence would otherwise become float32.
        return torch.tensor(self.seq[idx], dtype=torch.long), torch.tensor(self.lbl[idx])

    def __len__(self):
        """Return the number of samples."""
        return len(self.seq)

In [14]:
# Pass the labels as plain lists: the dev/test slices keep the row labels of
# `df` as their index, so positional lookups (label 0, 1, ...) on the Series
# itself would raise KeyError inside the Dataset.
train_set = ParliamentDataset(train_idx, train_data['party'].tolist())
dev_set = ParliamentDataset(dev_idx, dev_data['party'].tolist())
test_set = ParliamentDataset(test_idx, test_data['party'].tolist())

In [15]:
class GroupedSampler(Sampler):
    """
    Sampler that yields indices ordered by sequence length inside random chunks.

    The (index, length) pairs are shuffled on every iteration, cut into
    chunks of ``batch_size * 100`` and each chunk is sorted by length, so
    neighbouring indices refer to sequences of similar length.
    """

    def __init__(self, seqs, batch_size):
        self.seqs = seqs
        self.batch_size = batch_size

        # One (index, length) tuple per sequence.
        self.lengths = list(enumerate(map(len, seqs)))

    def __iter__(self):
        """
        Return an iterator over all indices: shuffled first, then sorted by
        length within each chunk of ``batch_size * 100`` entries.
        """
        # The shuffle is done in place, so it carries over between epochs.
        random.shuffle(self.lengths)

        chunk_size = self.batch_size * 100
        ordered = []
        for start in range(0, len(self.lengths), chunk_size):
            chunk = self.lengths[start:start + chunk_size]
            # Stable sort on the length component of each pair.
            chunk.sort(key=lambda pair: pair[1])
            # Keep only the index of every pair.
            ordered.extend(position for position, _ in chunk)

        return iter(ordered)

    def __len__(self):
        """Return the number of sequences."""
        return len(self.seqs)

In [16]:
# Order the training indices by length within shuffled chunks ...
train_grouped_sampler = GroupedSampler(train_idx, BATCH_SIZE)
# ... and cut that index stream into batches, keeping the final partial batch.
train_sampler = BatchSampler(
    sampler=train_grouped_sampler,
    batch_size=BATCH_SIZE,
    drop_last=False,
)

In [17]:
def collate_batch(batch, padding_value=1):
    """
    Collate (sequence, label) pairs into padded batch tensors.

    Args:
        batch: List of ``(sequence, label)`` pairs; a sequence may be a list
            of token indices or a 1-D tensor.
        padding_value: Index used to fill sequences up to the longest one in
            the batch. The default 1 is the position of '<PAD>' in the
            default special tokens of ``SpacyVocab``.

    Returns:
        A tuple ``(padded_sequences, labels, lengths)``: a long tensor of
        shape (batch, longest sequence), a long tensor of labels and a long
        tensor with the unpadded length of every sequence.
    """
    sequences, labels = zip(*batch)  # Batch is a list of (sequence, label) pairs

    # Original lengths of the sequences (before padding)
    lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.long)

    # as_tensor reuses tensors that are already long; torch.tensor() would
    # copy-construct each one and emit a UserWarning per sequence.
    padded_sequences = pad_sequence(
        [torch.as_tensor(seq, dtype=torch.long) for seq in sequences],
        batch_first=True,
        padding_value=padding_value,
    )

    # Labels to tensor
    labels = torch.tensor(labels, dtype=torch.long)

    return padded_sequences, labels, lengths


In [18]:
# Build one DataLoader per split.
NUM_WORKERS = 1  # author's note: should be <= 2; 0 is presumably fine as well

# Training batches come from train_sampler (length-grouped, shuffled), so
# neither batch_size nor shuffle is passed here.
train_loader = DataLoader(
    dataset=train_set,
    batch_sampler=train_sampler,
    num_workers=NUM_WORKERS,
    collate_fn=collate_batch,
)

# Evaluation splits are read in their stored order, without shuffling or a custom sampler.
dev_loader = DataLoader(
    dataset=dev_set,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=collate_batch,
)

test_loader = DataLoader(
    dataset=test_set,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=collate_batch,
)
