# CNN with FastText word-embeddings Training

## Dataset loading

5k Sample set:

train
https://drive.google.com/file/d/19t64Y_q5X9_2XkcePFCloFeMGj-zziDc/view?usp=drive_link

test
https://drive.google.com/file/d/1GgUb75IjJYkFygqWHZWiTiSZ9htWXsG-/view?usp=sharing

valid
https://drive.google.com/file/d/1Rq3OpD0ZfQ23zrJntAb-5X9p3GeQCVBy/view?usp=drive_link


Full dataset:

train
https://drive.google.com/file/d/1cA6QNMauKvZr4W62UZS9ysgrGEbURg7Y/view?usp=drive_link

test
https://drive.google.com/file/d/1hja3qBUB8BvTbhtpuaPHDkuvKOuOda7M/view?usp=drive_link

valid
https://drive.google.com/file/d/1f6p93RDkV9AeaWcprzKMGKjtgQr0opU0/view?usp=sharing

New full dataset (the version downloaded in the cell below):

train
https://drive.google.com/file/d/1mNUOXt3ZiERCH401TkSGXx7-LpbmbMaK/view?usp=sharing

test
https://drive.google.com/file/d/16p0td9GgJRb9AP8i4HlX-xZGI2u849uA/view?usp=sharing

valid
https://drive.google.com/file/d/1FhT3m_ApKzX615JzshB5-d-j6S91-6oz/view?usp=sharing

In [None]:
# All training dataset
! gdown 1mNUOXt3ZiERCH401TkSGXx7-LpbmbMaK # new all train
! gdown 1FhT3m_ApKzX615JzshB5-d-j6S91-6oz # new all valid
! gdown 16p0td9GgJRb9AP8i4HlX-xZGI2u849uA # new all test


! mkdir open_llm
! mv train.jsonl valid.jsonl test.jsonl open_llm/

Downloading...
From (original): https://drive.google.com/uc?id=1mNUOXt3ZiERCH401TkSGXx7-LpbmbMaK
From (redirected): https://drive.google.com/uc?id=1mNUOXt3ZiERCH401TkSGXx7-LpbmbMaK&confirm=t&uuid=9ff659f8-9bb3-48d2-9d69-5dc3097cad2c
To: /content/train.jsonl
100% 292M/292M [00:04<00:00, 70.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=1FhT3m_ApKzX615JzshB5-d-j6S91-6oz
To: /content/valid.jsonl
100% 55.1M/55.1M [00:00<00:00, 60.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=16p0td9GgJRb9AP8i4HlX-xZGI2u849uA
To: /content/test.jsonl
100% 39.1M/39.1M [00:00<00:00, 57.3MB/s]


In [None]:
! mkdir models # used for saving models

## Library imports

In [None]:
# Standard Libraries
import os
import re
import gc
import json
import pickle
import requests
from zipfile import ZipFile

# Data Science & Machine Learning Libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Natural Language Processing Libraries
import nltk
from nltk.tokenize import word_tokenize

# PyTorch Libraries
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.cuda.amp import autocast, GradScaler
import torch.nn as nn
import torch.nn.functional as F

# Visualization Libraries
import matplotlib.pyplot as plt

# Progress Bar Libraries
from tqdm import tqdm, tqdm_notebook


In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
from nltk.corpus import stopwords

nltk.download('stopwords')

STOPWORDS = set(stopwords.words('english'))

nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Path to save files in Google Drive
drive_path = '/content/drive/MyDrive/Thesis/Models/CNN/'

# Ensure the folder exists
os.makedirs(drive_path, exist_ok=True)

Mounted at /content/drive


## Choose runtime environment

In [None]:
# Check if CUDA (GPU) is available
# Pick the compute device: fall back to the CPU when no CUDA GPU is visible,
# otherwise use the GPU and report a few facts about device 0.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print("Using CPU")
else:
    device = torch.device("cuda")
    gpu_name = torch.cuda.get_device_name(0)
    # (major, minor) compute capability, read from the device properties.
    gpu_capability = tuple(
        getattr(torch.cuda.get_device_properties(0), field)
        for field in ("major", "minor")
    )
    print(f"Using GPU: {gpu_name}")
    print(f"Compute Capability: {gpu_capability}")
    print(f"CUDA Device Count: {torch.cuda.device_count()}")
    print(f"Memory Allocated: {torch.cuda.memory_allocated(0)} bytes")
    print(f"Memory Reserved: {torch.cuda.memory_reserved(0)} bytes")

Using GPU: Tesla T4
Compute Capability: (7, 5)
CUDA Device Count: 1
Memory Allocated: 0 bytes
Memory Reserved: 0 bytes


## Data preparation

In [None]:
# Read the training and validation splits (JSON Lines: one record per line).
df_train = pd.read_json("open_llm/train.jsonl", lines=True)

df_valid = pd.read_json("open_llm/valid.jsonl", lines=True)

# Remember how many rows belong to the training split; later cells use this
# count to cut the combined arrays back into train / validation parts.
training_len = len(df_train)

# Stack train on top of valid in a single frame with a fresh 0..N-1 index, so
# both splits go through the same preprocessing and share one vocabulary.
df = pd.concat([df_train, df_valid], ignore_index=True)

# The per-split frames are no longer needed once they are combined.
del df_train, df_valid
gc.collect()  # Force garbage collection to release memory

df

Unnamed: 0,uid,text,extra,source,label
0,[urlsf_subset00]-[15],The dangers of Illinois as a ‘right to work’ s...,"{'source': 'openweb', 'variant': 'original'}",openweb,0
1,[urlsf_subset00]-[15],"The governor of Illinois, Gov. Rauner, has req...","{'source': 'chatgpt', 'variant': 'original'}",chatgpt,1
2,[urlsf_subset00]-[83],Check current weather conditions\n\nIt’s going...,"{'source': 'openweb', 'variant': 'original'}",openweb,0
3,[urlsf_subset00]-[83],Check current weather conditions It’s going to...,"{'variant': 'original', 'source': 'llama'}",llama,1
4,[urlsf_subset00]-[89],"On Thursday, the president of the United State...","{'source': 'openweb', 'variant': 'original'}",openweb,0
...,...,...,...,...,...
120929,[urlsf_subset06]-[390176],Diego Maradona has paid tribute to the late Al...,"{'source': 'chatgpt', 'variant': 'original'}",chatgpt,1
120930,[urlsf_subset06]-[390305],Tymee Holds A Guerilla Performance\n\n[by Yanc...,"{'source': 'openweb', 'variant': 'original'}",openweb,0
120931,[urlsf_subset06]-[390305],Tymee Holds A Guerilla Performance\n\n[by Yanc...,"{'variant': 'original', 'source': 'llama'}",llama,1
120932,[urlsf_subset06]-[390316],South Korea President Moon Jae-in requested a ...,"{'source': 'openweb', 'variant': 'original'}",openweb,0


In [None]:
def text_process(mess):
    """
    Normalize one raw text string.

    The string is stripped of every character that is neither a word
    character (letters, digits, underscore) nor whitespace -- this covers
    ASCII punctuation as well as Unicode quotes -- and is then lowercased.
    Stopwords are kept and whitespace (including newlines) is left untouched.

    Args:
        mess (str): Raw input text.

    Returns:
        str: The punctuation-free, lowercase text.
    """
    # Strip punctuation first, then fold case, in a single expression.
    return re.sub(r"[^\w\s]", "", mess).lower()


# Clean every document in place (punctuation removed, lowercased).
df['text'] = df['text'].apply(text_process)

# These two columns are not used by the rest of the notebook.
df = df.drop(['extra', 'source'], axis=1)

# Plain NumPy views of the model input (texts) and target (labels); row order
# is the order of `df`, i.e. training rows first, validation rows after.
texts = np.array(df.text)
labels = np.array(df.label)

df

Unnamed: 0,uid,text,label
0,[urlsf_subset00]-[15],the dangers of illinois as a right to work sta...,0
1,[urlsf_subset00]-[15],the governor of illinois gov rauner has reques...,1
2,[urlsf_subset00]-[83],check current weather conditions\n\nits going ...,0
3,[urlsf_subset00]-[83],check current weather conditions its going to ...,1
4,[urlsf_subset00]-[89],on thursday the president of the united states...,0
...,...,...,...
120929,[urlsf_subset06]-[390176],diego maradona has paid tribute to the late al...,1
120930,[urlsf_subset06]-[390305],tymee holds a guerilla performance\n\nby yanch...,0
120931,[urlsf_subset06]-[390305],tymee holds a guerilla performance\n\nby yanch...,1
120932,[urlsf_subset06]-[390316],south korea president moon jaein requested a c...,0


## FastText download


In [None]:
URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
FILE = "fastText"
ZIP_FILE = "crawl-300d-2M.vec.zip"

# Where the archive is stored and where the extracted vectors end up
# (the archive name minus its ".zip" suffix: fastText/crawl-300d-2M.vec).
zip_path = os.path.join(FILE, ZIP_FILE)
vec_path = zip_path[:-len(".zip")]

# Test for the extracted vector file rather than the directory: the directory
# is created before the download starts, so an interrupted download would
# otherwise be mistaken for a finished one on the next run.
if os.path.isfile(vec_path):
    print("fastText exists.")
else:
    os.makedirs(FILE, exist_ok=True)

    # Stream the archive to disk in small chunks; the `with` block closes the
    # connection, and raise_for_status() stops us from saving an HTTP error
    # page as if it were the zip file.
    with requests.get(URL, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(zip_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

    with ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(FILE)

# Data preparation

## Input tokenization and encoding

In [None]:
def tokenize(texts):
    """
    Tokenize texts, build a vocabulary, and determine the maximum sentence length.

    Args:
        texts (Iterable[str]): The text data (strings) to be processed.

    Returns:
        tokenized_texts (List[List[str]]): One token list per input text, in
            input order.
        word2idx (Dict[str, int]): Mapping from each unique token to a unique
            integer index. Two indices are reserved: '<pad>' is 0 and '<unk>'
            is 1; real tokens are numbered from 2 in order of first appearance.
        max_len (int): Number of tokens in the longest tokenized text (0 when
            `texts` is empty).

    Process:
        1. Reserves '<pad>' (index 0) and '<unk>' (index 1) in the vocabulary.
        2. Tokenizes each text with `nltk.word_tokenize`.
        3. Adds each previously unseen token to the vocabulary.
        4. Tracks the length of the longest tokenized text.

    Example:
        texts = ["Hello world!", "How are you?"]
        tokenized_texts, word2idx, max_len = tokenize(texts)

        # tokenized_texts = [['Hello', 'world', '!'], ['How', 'are', 'you', '?']]
        # word2idx = {'<pad>': 0, '<unk>': 1, 'Hello': 2, 'world': 3, '!': 4,
        #             'How': 5, 'are': 6, 'you': 7, '?': 8}
        # max_len = 4
    """
    print("Tokenizing texts...\n")

    max_len = 0
    tokenized_texts = []
    # Reserved entries come first so their indices are fixed: 0 and 1.
    word2idx = {'<pad>': 0, '<unk>': 1}

    for text in texts:
        tokenized_text = nltk.word_tokenize(text)
        tokenized_texts.append(tokenized_text)

        for token in tokenized_text:
            if token not in word2idx:
                # The next free index is simply the current vocabulary size.
                word2idx[token] = len(word2idx)

        max_len = max(max_len, len(tokenized_text))

    return tokenized_texts, word2idx, max_len


def encode(tokenized_texts, word2idx, max_len):
    """
    Encode tokenized texts into a numpy array of token indices, with padding.

    Args:
        tokenized_texts (List[List[str]]): List of tokenized texts. The lists
            are only read; they are not modified.
        word2idx (Dict[str, int]): Dictionary mapping each token to its index.
            The indices of '<pad>' and '<unk>' are looked up here and default
            to 0 and 1 respectively when the entries are absent.
        max_len (int): Length of every encoded row.

    Returns:
        np.array: Array of shape (len(tokenized_texts), max_len) holding token
            indices.

    Process:
        1. Tokens beyond `max_len` are dropped so that every row has the same
           length (this only matters when `max_len` was computed on other data).
        2. Each token is replaced by its index; tokens missing from `word2idx`
           get the '<unk>' index.
        3. Rows shorter than `max_len` are right-padded with the '<pad>' index.

    Example:
        tokenized_texts = [['hello', 'world'], ['how', 'are', 'you']]
        word2idx = {'<pad>': 0, '<unk>': 1, 'hello': 2, 'world': 3, 'how': 4, 'are': 5, 'you': 6}
        max_len = 3

        result = encode(tokenized_texts, word2idx, max_len)
        # result = np.array([[2, 3, 0], [4, 5, 6]])
    """
    pad_idx = word2idx.get('<pad>', 0)
    # The fallback for unknown tokens must be an *index*; falling back to the
    # literal string '<unk>' would put text into the integer id matrix.
    unk_idx = word2idx.get('<unk>', 1)

    input_idxs = []

    for tokenized_text in tokenized_texts:
        # Build a new id list instead of padding the caller's token list in place.
        ids = [word2idx.get(token, unk_idx) for token in tokenized_text[:max_len]]
        ids.extend([pad_idx] * (max_len - len(ids)))  # add padding
        input_idxs.append(ids)

    return np.array(input_idxs)


## Load Pretrained Vectors

In [None]:
def load_pretrained_vectors(word2idx, fname, embedding_dim=300):
    """
    Build an embedding matrix for a vocabulary from a text vector file.

    Every vocabulary row starts out as uniform noise in [-0.25, 0.25); the
    '<pad>' row (index 0 if the vocabulary has no '<pad>' entry) is set to
    zeros. Rows whose word appears in the file are then overwritten with the
    vector read from it.

    Args:
        word2idx (Dict[str, int]): Vocabulary mapping words to row indices.
        fname (str): Path to the vector file. Each data line is a word followed
            by its space-separated components; an optional first line made only
            of digits and spaces is read as "<count> <dimension>".
        embedding_dim (int): Expected vector dimension (default: 300).

    Returns:
        np.array: Matrix of shape (len(word2idx), embedding_dim).

    Raises:
        AssertionError: If the header announces a dimension other than
            `embedding_dim`.
    """

    print("Loading pretrained vectors...")

    # Random start for all rows, then zero out the padding row.
    vocab_size = len(word2idx)
    embeddings = np.random.uniform(-0.25, 0.25, (vocab_size, embedding_dim))
    embeddings[word2idx.get('<pad>', 0)] = np.zeros((embedding_dim,))

    count = 0
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        header = fin.readline().strip()
        has_header = header.replace(' ', '').isdigit()
        if has_header:
            # Header line: "<number of words> <dimension>".
            _, d = map(int, header.split())
        else:
            # No header: the first line is data, so start over from the top.
            d = embedding_dim
            fin.seek(0)

        assert d == embedding_dim, \
            f"Embedding dimension mismatch: expected {embedding_dim}, found {d}"

        for line in tqdm(fin, desc="Loading vectors"):
            word, *vector = line.rstrip().split(' ')
            if word not in word2idx:
                continue
            count += 1
            embeddings[word2idx[word]] = np.array(vector, dtype=np.float32)

    print(f"Loaded {count} / {len(word2idx)} pretrained vectors.")
    return embeddings

## Apply input functions to the dataset

In [None]:
# Tokenize, build vocabulary, encode tokens
# Tokenize every text, build the vocabulary, and record the longest length.
tokenized_texts, word2idx, max_len = tokenize(texts)
# Turn the token lists into one (num_texts, max_len) matrix of token indices.
input_ids = encode(tokenized_texts, word2idx, max_len)

# Path to pretrained embeddings
embedding_file = "fastText/crawl-300d-2M.vec"

# Load pretrained vectors
pretrained_embeddings = load_pretrained_vectors(word2idx, embedding_file)
# float32 copy of the NumPy matrix, ready to be handed to the model.
embeddings_tensor = torch.tensor(pretrained_embeddings, dtype=torch.float32)

Tokenizing texts...

Loading pretrained vectors...


Loading vectors: 1999995it [00:43, 45509.22it/s]


Loaded 186989 / 621876 pretrained vectors.


## Create PyTorch DataLoader

In [None]:
def data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50):
    """
    Wrap training and validation arrays in PyTorch DataLoaders.

    Each (inputs, labels) pair is converted to tensors, bundled into a
    TensorDataset, and exposed through a DataLoader that yields
    (input_batch, label_batch) tuples.

    Args:
        train_inputs (List or np.array): Encoded training inputs.
        val_inputs (List or np.array): Encoded validation inputs.
        train_labels (List or np.array): Labels aligned with `train_inputs`.
        val_labels (List or np.array): Labels aligned with `val_inputs`.
        batch_size (int, optional): Number of samples per batch. Default is 50.

    Returns:
        Tuple[DataLoader, DataLoader]: (train_dataloader, val_dataloader).

    Notes:
        - The training loader reshuffles the data on every pass.
        - The validation loader keeps the original order.
    """
    def _as_dataset(inputs, targets):
        # Pair inputs with targets as tensors of matching first dimension.
        return TensorDataset(torch.tensor(inputs), torch.tensor(targets))

    train_dataloader = DataLoader(_as_dataset(train_inputs, train_labels),
                                  shuffle=True,
                                  batch_size=batch_size)
    val_dataloader = DataLoader(_as_dataset(val_inputs, val_labels),
                                shuffle=False,
                                batch_size=batch_size)

    return train_dataloader, val_dataloader

In [None]:
# The combined arrays hold the training rows first, so the first
# `training_len` rows are the training split and the remainder is validation.
train_inputs = input_ids[:training_len]
train_labels = labels[:training_len]
val_inputs = input_ids[training_len:]
val_labels = labels[training_len:]

# Load data to PyTorch DataLoader
train_dataloader, val_dataloader = \
data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50)

## CNN Model creation

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_NLP(nn.Module):
    """A 1D Convolutional Neural Network for Sentence Classification."""
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[3, 4, 5],
                 num_filters=[100, 100, 100],
                 num_classes=2,
                 dropout=0.5):
        """
        The constructor for CNN_NLP class.

        Args:
            pretrained_embedding (torch.Tensor): Pretrained embeddings with
                shape (vocab_size, embed_dim). When given, `vocab_size` and
                `embed_dim` are taken from its shape.
            freeze_embedding (bool): Set to False to fine-tune pretrained
                vectors. Default: False
            vocab_size (int): Needs to be specified when pretrained word
                embeddings are not used.
            embed_dim (int): Dimension of word vectors. Needs to be specified
                when pretrained word embeddings are not used. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100]
            num_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.5

        Note:
            Index 0 is treated as the padding token in both embedding modes,
            so its vector receives no gradient updates.
        """

        super(CNN_NLP, self).__init__()
        # Embedding layer
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            # padding_idx=0 matches the from-scratch branch below; without it
            # the padding vector would be trained whenever the embeddings are
            # not frozen.
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=freeze_embedding,
                                                          padding_idx=0)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network: one Conv1d per filter size, run in parallel on the
        # same embedded sequence.
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        # Fully-connected layer and Dropout. The classifier input is the
        # concatenation of all pooled feature maps; use a plain Python int
        # for the layer size rather than a NumPy scalar.
        self.fc = nn.Linear(int(sum(num_filters)), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input_ids):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                num_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (b, max_len, embed_dim)
        x_embed = self.embedding(input_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

        # Max pooling over the whole sequence. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]

        # Concatenate x_pool_list to feed the fully connected layer.
        # Output shape: (b, sum(num_filters))
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)

        # Compute logits. Output shape: (b, num_classes)
        logits = self.fc(self.dropout(x_fc))

        return logits

## Optimizer

In [None]:
import torch.optim as optim

def initialize_model(pretrained_embedding=None,
                    freeze_embedding=False,
                    vocab_size=None,
                    embed_dim=300,
                    filter_sizes=[3, 4, 5],
                    num_filters=[100, 100, 100],
                    num_classes=2,
                    dropout=0.5,
                    learning_rate=0.01):
    """Instantiate a CNN model and an optimizer.

    Args:
        pretrained_embedding (torch.Tensor): Optional pretrained embedding
            matrix forwarded to `CNN_NLP`.
        freeze_embedding (bool): Forwarded to `CNN_NLP`.
        vocab_size (int): Forwarded to `CNN_NLP`.
        embed_dim (int): Forwarded to `CNN_NLP`.
        filter_sizes (List[int]): Convolution kernel sizes.
        num_filters (List[int]): Number of filters per kernel size; must have
            the same length as `filter_sizes`.
        num_classes (int): Number of output classes.
        dropout (float): Dropout rate of the model.
        learning_rate (float): Learning rate of the Adadelta optimizer.

    Returns:
        Tuple[CNN_NLP, torch.optim.Adadelta]: The model, already moved to the
        notebook-level `device`, and its optimizer.

    Raises:
        AssertionError: If `filter_sizes` and `num_filters` differ in length.
    """

    assert (len(filter_sizes) == len(num_filters)), \
        "filter_sizes and num_filters need to be of the same length."

    # Report the configuration that is about to be used
    print("Initializing the model...")
    print(f"Model parameters:\n"
          f"\tDropout rate: {dropout}\n"
          f"\tLearning rate: {learning_rate}\n"
          f"\tFilters: {filter_sizes}\n"
          f"\tEmbeddings: ({embed_dim}, {vocab_size})\n"
          f"\tNum. filters: {num_filters}\n")


    # Instantiate CNN model. `num_classes` and `dropout` are passed through
    # from the arguments so the model matches the settings printed above.
    cnn_model = CNN_NLP(pretrained_embedding=pretrained_embedding,
                        freeze_embedding=freeze_embedding,
                        vocab_size=vocab_size,
                        embed_dim=embed_dim,
                        filter_sizes=filter_sizes,
                        num_filters=num_filters,
                        num_classes=num_classes,
                        dropout=dropout)

    # Send model to `device` (GPU/CPU)
    cnn_model.to(device)

    # Instantiate Adadelta optimizer
    optimizer = optim.Adadelta(cnn_model.parameters(),
                               lr=learning_rate,
                               rho=0.95)

    return cnn_model, optimizer

## Training Loop


In [None]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Seed every random number generator the notebook relies on.

    Covers Python's `random`, NumPy's global generator, PyTorch's default
    generator and all CUDA devices, in that order.

    Args:
        seed_value (int): Seed shared by all generators. Default: 42
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

from sklearn.metrics import f1_score

def evaluate(model, val_dataloader):
    """Score the model on a validation DataLoader.

    The model is switched to evaluation mode and run over every batch without
    gradient tracking. Uses the notebook-level `device` and `loss_fn`.

    Args:
        model (nn.Module): Model that maps a batch of token ids to logits.
        val_dataloader (DataLoader): Yields (input_ids, labels) batches.

    Returns:
        Tuple[float, float]: Mean of the per-batch losses and the weighted F1
        score computed over all validation samples.
    """
    model.eval()

    batch_losses, all_preds, all_targets = [], [], []

    for batch in val_dataloader:
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)

        # Forward pass only; no gradients are needed for evaluation.
        with torch.no_grad():
            logits = model(b_input_ids)

        batch_losses.append(loss_fn(logits, b_labels).item())

        # Predicted class = index of the largest logit in each row.
        all_preds.extend(torch.argmax(logits, dim=1).flatten().cpu().numpy())
        all_targets.extend(b_labels.cpu().numpy())

    # Weighted average: per-class F1 weighted by class support.
    f1 = f1_score(all_targets, all_preds, average='weighted')
    avg_val_loss = np.mean(batch_losses)

    return avg_val_loss, f1


def train(model, optimizer, train_dataloader, val_dataloader=None, epochs=10, early_stopping_patience=5, save_best_model=True):
    """Train the model with mixed precision, early stopping, and best-weight restore.

    Uses the notebook-level `device`, `loss_fn` and `evaluate`.

    Args:
        model (nn.Module): Model to train; updated in place.
        optimizer (torch.optim.Optimizer): Optimizer over `model`'s parameters.
        train_dataloader (DataLoader): Yields (input_ids, labels) training batches.
        val_dataloader (DataLoader, optional): Validation batches. When None,
            no evaluation, early stopping or best-weight tracking takes place.
        epochs (int): Maximum number of epochs. Default: 10
        early_stopping_patience (int): Stop once the validation loss has not
            improved for this many consecutive epochs. Default: 5
        save_best_model (bool): When True, a copy of the weights is taken at
            every new lowest validation loss and loaded back into `model` when
            training ends, so `model` finishes with its best-epoch weights
            instead of its last-epoch weights. Default: True

    Returns:
        None. Progress is printed as one table row per epoch.
    """

    best_f1 = 0 # Initialize best f1 score
    best_val_loss = float('inf')  # Initialize best validation loss as infinity
    epochs_no_improve = 0  # Early stopping counter
    scaler = GradScaler()  # For mixed precision training
    best_model_state = None  # Copy of the weights at the best validation loss

    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Train F1':^9} | {'Val Loss':^10} | {'Val F1':^9} | {'Elapsed':^9}")
    print("-" * 72)

    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        t0_epoch = time.time()
        total_loss = 0
        train_preds = []
        train_labels = []

        model.train()

        for step, batch in enumerate(train_dataloader):
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            optimizer.zero_grad()

            # Mixed precision forward pass
            with autocast():
                logits = model(b_input_ids)
                loss = loss_fn(logits, b_labels)  # Compute loss

            # Backward pass and optimization with mixed precision
            scaler.scale(loss).backward()  # Scale loss for mixed precision
            scaler.step(optimizer)        # Apply optimizer step
            scaler.update()               # Update scaler for next step

            total_loss += loss.item()

            # Collect training predictions and labels for F1 score
            preds = torch.argmax(logits, dim=1).flatten()
            train_preds.extend(preds.cpu().numpy())
            train_labels.extend(b_labels.cpu().numpy())

        # Compute the average training loss and F1 score
        avg_train_loss = total_loss / len(train_dataloader)
        train_f1 = f1_score(train_labels, train_preds, average='weighted')  # Training F1 score

        # =======================================
        #               Evaluation
        # =======================================
        if val_dataloader is None:
            # No validation data: report the training metrics only.
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.4f} | {train_f1:^9.4f} | {'N/A':^10} | {'N/A':^9} | {time_elapsed:^9.2f}")
            continue

        val_loss, val_f1 = evaluate(model, val_dataloader)

        # Track the best F1 score (reported at the end; not used for selection)
        if val_f1 > best_f1:
            best_f1 = val_f1

        # Model selection and early stopping are driven by the validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0

            if save_best_model:
                # state_dict() hands back references to the live parameters,
                # which later optimizer steps overwrite; clone every tensor
                # (onto the CPU) to keep a real snapshot of this epoch.
                best_model_state = {name: tensor.detach().cpu().clone()
                                    for name, tensor in model.state_dict().items()}
        else:
            epochs_no_improve += 1

        # Print epoch performance (before a possible early stop, so the last
        # epoch is always reported)
        time_elapsed = time.time() - t0_epoch
        print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.4f} | {train_f1:^9.4f} | {val_loss:^10.4f} | {val_f1:^9.4f} | {time_elapsed:^9.2f}")

        if epochs_no_improve >= early_stopping_patience:
            print(f"\nStopping early at epoch {epoch_i + 1}.")
            break

    # Put the best-epoch weights back into the model so that whatever is done
    # with `model` afterwards (evaluation, saving) uses the selected weights.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    print("\nTraining complete!")
    if val_dataloader is not None:
        print(f"Best F1-score: {best_f1:.3f}")
        print(f"Best validation loss: {best_val_loss:.3f}")


### CNN-non-static training

In [None]:
# Fix all random seeds before the model is created so that the run starts
# from the same random state each time.
set_seed(42)

# Initialize hyperparameters
freeze_embedding = False  # False = "non-static": the embeddings are trainable
filter_sizes = [2, 3, 4, 5]
num_filters = [200, 300, 400, 500]  # one entry per entry of filter_sizes
learning_rate = 0.1
dropout = 0.6
vocab_size = len(word2idx)
embed_dim = 300
save_best_model = True

epochs = 20
early_stopping_patience = 5

# Notebook-level placeholder that the saving cell checks; nothing in this
# cell assigns it a value other than None.
best_model_state = None

# Initialize model and optimizer
cnn_non_static, optimizer = initialize_model(pretrained_embedding=embeddings_tensor,
                                            freeze_embedding=freeze_embedding,
                                            vocab_size=vocab_size,
                                            embed_dim=embed_dim,
                                            learning_rate=learning_rate,
                                            filter_sizes=filter_sizes,
                                            num_filters=num_filters,
                                            dropout=dropout)

# Train the model
train(cnn_non_static,
      optimizer,
      train_dataloader,
      val_dataloader,
      epochs=epochs,
      early_stopping_patience=early_stopping_patience,
      save_best_model=save_best_model)

Initializing the model...
Model parameters:
	Dropout rate: 0.6
	Learning rate: 0.1
	Filters: [2, 3, 4, 5]
	Embeddings: (300, 621876)
	Num. filters: [200, 300, 400, 500]

Start training...

 Epoch  |  Train Loss  | Train F1  |  Val Loss  |  Val F1   |  Elapsed 
------------------------------------------------------------------------
   1    |    0.5160    |  0.7317   |   0.3578   |  0.8502   |  431.98  
   2    |    0.3443    |  0.8490   |   0.2896   |  0.8788   |  431.73  
   3    |    0.2703    |  0.8860   |   0.2588   |  0.8935   |  431.49  
   4    |    0.2229    |  0.9088   |   0.2560   |  0.8930   |  431.25  
   5    |    0.1832    |  0.9264   |   0.2708   |  0.8831   |  431.06  
   6    |    0.1486    |  0.9418   |   0.2259   |  0.9069   |  431.04  
   7    |    0.1226    |  0.9538   |   0.2433   |  0.8995   |  430.82  
   8    |    0.0984    |  0.9639   |   0.2333   |  0.9067   |  430.85  
   9    |    0.0783    |  0.9733   |   0.2353   |  0.9077   |  430.60  
  10    |    0.064

## Save the model and essential information

In [None]:
# Save embeddings tensor (pretrained word embeddings).
# torch.save needs a file path, not a directory; keep the file in the CNN
# folder (`drive_path`) next to the other artifacts written by this cell.
embeddings_tensor_path = os.path.join(drive_path, 'cnn_embeddings_tensor.pth')
torch.save(embeddings_tensor, embeddings_tensor_path)

# Save the word2idx dictionary (tokenizer mapping)
word2idx_path = os.path.join(drive_path, 'cnn_word2idx.pkl')
with open(word2idx_path, 'wb') as file:
    pickle.dump(word2idx, file)

# Save max_len (maximum length of sequences)
max_len_path = os.path.join(drive_path, 'cnn_max_len.pkl')
with open(max_len_path, 'wb') as file:
    pickle.dump(max_len, file)

# Save the model weights: use the notebook-level `best_model_state` when one
# has been set, otherwise the weights currently held by `cnn_non_static`.
best_model_path = os.path.join(drive_path, 'cnn_best_model.pth')
if save_best_model and best_model_state is not None:
    torch.save(best_model_state, best_model_path)
else:
    torch.save(cnn_non_static.state_dict(), best_model_path)

# Save the CNN model parameters as a JSON file
model_params = {
    'vocab_size': vocab_size,
    'embedding_dim': embed_dim,
    'dropout': dropout,
    'filter_sizes': filter_sizes,
    'num_filters': num_filters
}
params_save_path = os.path.join(drive_path, 'cnn_model_params.json')
with open(params_save_path, 'w') as f:
    json.dump(model_params, f, indent=4)

print(f"Model parameters saved to: {params_save_path}")

# Output paths to confirm the saving
print(f"Embeddings tensor saved to: {embeddings_tensor_path}")
print(f"word2idx saved to: {word2idx_path}")
print(f"max_len saved to: {max_len_path}")
print(f"Best CNN model saved to: {best_model_path}")
print(f"CNN Model parameters saved to: {params_save_path}")

Model parameters saved to: /content/drive/MyDrive/Thesis/Models/CNN/cnn_model_params.json
Embeddings tensor saved to: /content/drive/MyDrive/Thesis/Models/CNN/cnn_embeddings_tensor.pth
word2idx saved to: /content/drive/MyDrive/Thesis/Models/CNN/cnn_word2idx.pkl
max_len saved to: /content/drive/MyDrive/Thesis/Models/CNN/cnn_max_len.pkl
Best CNN model saved to: /content/drive/MyDrive/Thesis/Models/CNN/cnn_best_model.pth
CNN Model parameters saved to: /content/drive/MyDrive/Thesis/Models/CNN/cnn_model_params.json
