In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fakereddit/multimodal_validate.tsv
/kaggle/input/fakereddit/multimodal_train.tsv
/kaggle/input/fakereddit/multimodal_test_public.tsv


# Fakeddit Multimodal Fake News Detection Challenge

The Main objective is to create a multiModal model capable of detecting fake news by extracting the information from text, title and image.

# Importing the required libraries

In [30]:
#importing the core libraries required for the task
import os
import numpy as np
import keras
import pandas as pd
from tqdm.notebook import tqdm
from collections import defaultdict
from textwrap import wrap
from PIL import Image, ImageFile, UnidentifiedImageError

In [31]:
# Importing Pytorch and transformers
import torch
# Import nn module for building stacked layers and optimizers
from torch import nn, optim
import torchvision
from torchvision import datasets, models, transforms
# Import modules for dataset configuration and loading
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
import torch.optim.lr_scheduler as lr_scheduler

#importing visual libraries
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

In [32]:
# Importing model evaluation tools
from sklearn.metrics import confusion_matrix, classification_report
import sklearn
from sklearn.model_selection import train_test_split

# Initializing and Preprocessing the Fakeddit-Benchmark dataset 

In [33]:
# Helper function to read the tsv file and convert it into a dataframe. 
# We use the read.csv() method and provide delimiter as '/t' to let the function know its a tsv file
def initialize_dataframe(path, file):
    dataframe = pd.read_csv(os.path.join(path, file), delimiter="\t")
    
    # Dropping the redundant columns 
    if "Unnamed: 0" in dataframe.columns:
        dataframe = dataframe.drop(["Unnamed: 0"], 1)
    
    # Return dataframe
    return dataframe

In [34]:
# Read input into dataframes
df_test = initialize_dataframe("dataset", "/kaggle/input/fakereddit/multimodal_test_public.tsv")
df_validate = initialize_dataframe("dataset", "/kaggle/input/fakereddit/multimodal_train.tsv")
df_train = initialize_dataframe("dataset", "/kaggle/input/fakereddit/multimodal_validate.tsv")

**As The training dataset contains over 700,000 samples, we use train-test-split to first use only 20%-30% of the total training data for actually training the model. Then we further divide the training data normally into a 80-20 split, keeping 80% for training and rest 20% for testing purposes. 
Stratify function is applied in order to keep the per class sample distribution from original Fakeddit source dataset.**

In [35]:
# Splitting complete Fakeddit-dataset into 20% training dataframe
# and 80% backup dataframe
df_train, df_backup = train_test_split(
    df_train,
    test_size=0.8,
    shuffle=True,
    
   stratify = df_train["6_way_label"]
)

In [36]:
# Keeping 80% of data samples for training and 20% for testing purposes
df_train, df_test = train_test_split(
    df_train,
    test_size=0.2,
    shuffle=True,
   
    stratify=df_train["6_way_label"]
)

In [37]:
# Dividing test split dataframe by factor 0,5 to have identically
# sized splits for validation and testing
df_test, df_validate = train_test_split(
    df_test,
    test_size=0.5,
    shuffle=True,
   
    stratify=df_test["6_way_label"]
)

In [38]:
# Fake News subtypes in order of Fakeddit benchmark dataset labeling
CLASS_NAMES = ["True", "Satire", "False Conn.", "Impost. Content", "Man. Content", "Mis. Content"]

**We will be using the DistillBert Model to process the text and title information**

In [39]:
# Importing needed modules for DistilBert model
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig

# Loading DistilBert tokenizers adjusted for lower case English text corpus
# for tokenization of title input sequence
title_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Processing the input data

** PostDataset contains all relevant information per batch and DataLoader iterates over complete Dataset to feed batches of size 16 to model.
Initially it doesn't contain any information regarding the reddit images or titles**

In [40]:
class PostDataset(Dataset):
    
    # Constructor initialized with relevant attributes plus tokenizer information
    def __init__(self, post_id, title, label, title_tokenizer, max_len):
        self.post_id = post_id
        self.title = title
        self.label = label
        self.title_tokenizer = title_tokenizer
        self.max_length = max_len
        
    # Returns length of the dataset for internal looping 
    def __len__(self):
        return len(self.label)
    
    # Internal function to fetch next sample within dataset object
    def __getitem__(self, idx):
        # Iteration function to retireve next sample
        post_id = self.post_id[idx]
        title = self.title[idx]
        label = self.label[idx]

        # Saving id, clean_title and label entries per post
        # in sample dictionary
        sample = {
            "post_id": post_id,
            "clean_title": title,
            "label": label
        }
        
        # Return sample dictionary containing all needed attributes
        return sample

**Train transform is specifically used to process image data for the train_model() function.Additional data augmentation is performed by random crop resizing and flipping image horizontally in order to artificially inflate the underlying training set split**

In [41]:
# Transform function for image processing (training)
# Performing data augmentation by random resizing, cropping
# and flipping images in order to artificially create new
# image data per training epoch
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.255]
    )
])

**Internal collate_batch() function implements the processing / sample preparation logic to convert all information into a readable format for the neural network model. Per batch a total of 16 samples are fetched, whereas the images are processed and attributes post_id, image tensor, and 6-way-label are stacked within a single batch dictionary. One batch holds above mentioned information for 16 fetched samples. batch_size of 16 was chosen in alignment with previously used DistilBert configuration.**

In [42]:
def collate_batch(batch):
    
    # List to save processed batch samples
    batch_processed = []
    
    # Iteration over input batch of size 16
    for i in range(len(batch)):
        
        # Saving attributes in local variables
        post_id = batch[i]["post_id"]
        title = batch[i]["clean_title"]
        label = batch[i]["label"]
        
        # Leveraging DistilBertTokenizer to generate
        # encoding of input text sequence
        encoding = title_tokenizer.encode_plus(
            title,
            max_length=80,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # Try-Except-Else clause to process image data
        # Fetch images from image_set folder via post_id, transform and reshape tensor
        try:
            image_path = df_train.iloc[i]["image_url"]
            image = Image.open(image_path)
        # Handling FileNotFoundError and randomly initializing pixels
        except FileNotFoundError:
            image = torch.rand(3, 224, 224)
            image = torch.unsqueeze(image, 0)
        # Handling UnidentifiedImageError and randomly initializing pixels
        except UnidentifiedImageError:
            image = torch.rand(3, 224, 224)
            image = torch.unsqueeze(image, 0)
        # Handling OSError and randomly initializing pixels
        except OSError:
            image = torch.rand(3, 224, 224)
            image = torch.unsqueeze(image, 0)
        # Else: Convert image to RGB, process with train_transform
        # and reshape to tensor of shape = [1, 3, 224, 224] for
        # [sample_count, color_channels, height in pixel, width in pixel]
        else:
            image = image.convert("RGB")
            image = train_transform(image)
            image = torch.unsqueeze(image, 0)
        

        # Storing processed attributes of sample in sample
        # dictionary: post_id, title (text), input_ids,
        # attention_mask, image and label
        sample = {
            "post_id": post_id,
            "title": title,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "image": image.flatten(),
            "label": torch.tensor(label, dtype=torch.long)
        }
        
        # Append current samples dictionary to processed
        # batch list --> List of sample dictionaries
        batch_processed.append(sample)
        
    # Complex operation in order to unpack list of dictionaries and
    # merge dictionary entries into correct PyTorch tensor for forward processing
    postId = []
    titles = []
    
    # For-loop to stack sample dictionary keys into appropriate format
    for i in range(len(batch_processed)):
        # If first sample of batch, initialize attribute tensors and reshape
        if i == 0:
            postId.append(batch_processed[i]["post_id"])
            titles.append(batch_processed[i]["title"])
            input_ids_tensor = batch_processed[i]["input_ids"].reshape(-1, 80)
            attention_mask_tensor = batch_processed[i]["attention_mask"].reshape(-1, 80)
            image_tensor = batch_processed[i]["image"].reshape(-1, 3, 224, 224)
            label_tensor = batch_processed[i]["label"].reshape(-1,)
            continue

        # Stack attributes of sample dictionary keys to generate correct tensor shape
        postId.append(batch_processed[i]["post_id"])
        titles.append(batch_processed[i]["title"])
        input_ids_tensor = torch.cat((input_ids_tensor, batch_processed[i]["input_ids"].reshape(-1, 80)))
        attention_mask_tensor = torch.cat((attention_mask_tensor, batch_processed[i]["attention_mask"].reshape(-1, 80)))
        image_tensor = torch.cat((image_tensor, batch_processed[i]["image"].reshape(-1, 3, 224, 224)))
        label_tensor = torch.cat((label_tensor, batch_processed[i]["label"].reshape(-1,)))
    
    # Returning batch list of sample dictionaries containing 16 processed samples
    return {
        "post_id": postId,
        "title": titles,
        "input_ids": input_ids_tensor,
        "attention_mask": attention_mask_tensor,
        "image": image_tensor,
        "label": label_tensor
    }

In [43]:
def create_data_loader(df, title_tokenizer, max_len, batch_size):
    
    # Initialization of PostDataset and assignment
    # to dataset variable
    dataset = PostDataset(
                post_id = df["id"].to_numpy(),
                title = df["clean_title"].to_numpy(),
                label = df["6_way_label"].to_numpy(),
                title_tokenizer = title_tokenizer,
                max_len = max_len
              )
    
    # Forwarding dataset variable, batch_size and collate_batch function
    # to PyTorch DataLoader module. Returns Iterable DataLoader object
    return DataLoader(dataset, batch_size=batch_size, collate_fn=collate_batch, num_workers=2, pin_memory=True, prefetch_factor=2)

**Now we repeat the same process for the testing and validation data**

In [44]:
# Transform function for image processing (validation and testing)
# No data augmentation in validation and test data splits in order to
# define constant validation and testing process
val_test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.255]
    )
])

In [45]:
def collate_batch_val_test(batch):
    
    # List to save processed batch samples
    batch_processed = []
    
    # Iteration over input batch
    for i in range(len(batch)):
        
        # Iteration over input batch of size 16
        post_id = batch[i]["post_id"]
        title = batch[i]["clean_title"]
        label = batch[i]["label"]
        
        # Leveraging DistilBertTokenizer to generate
        # encoding of input text sequence
        encoding = title_tokenizer.encode_plus(
            title,
            max_length=80,
            padding="max_length",
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # Try-Except-Else clause to process image data
        # Fetch images from image_set folder via post_id, transform and reshape tensor
        try:
            image_paths = df_test.iloc[i]["image_url"]
            image = Image.open(image_paths)
        # Handling FileNotFoundError and randomly initializing pixels
        except FileNotFoundError:
            image = torch.rand(3, 224, 224)
            image = torch.unsqueeze(image, 0)
        # Handling UnidentifiedImageError and randomly initializing pixels
        except UnidentifiedImageError:
            image = torch.rand(3, 224, 224)
            image = torch.unsqueeze(image, 0)
        # Handling OSError and randomly initializing pixels
        except OSError:
            image = torch.rand(3, 224, 224)
            image = torch.unsqueeze(image, 0)
        # Else: Convert image to RGB, process with train_transform
        # and reshape to tensor of shape = [1, 3, 224, 224] for
        # [sample_count, color_channels, height in pixel, width in pixel]
        else:
            image = image.convert("RGB")
            image = train_transform(image)
            image = torch.unsqueeze(image, 0)
        

        # Storing processed attributes of sample in sample
        # dictionary: post_id, title (text), input_ids,
        # attention_mask, image and label
        sample = {
            "post_id": post_id,
            "title": title,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "image": image.flatten(),
            "label": torch.tensor(label, dtype=torch.long)
        }
        
        # Append current samples dictionary to processed
        # batch list --> List of sample dictionaries
        batch_processed.append(sample)
        
    # Complex operation in order to unpack list of dictionaries and
    # merge dictionary entries into correct PyTorch tensor for forward processing
    postId = []
    titles = []
    
    # For-loop to stack sample dictionary keys into appropriate format
    for i in range(len(batch_processed)):
        if i == 0:
            # If first sample of batch, initialize attribute tensors and reshape
            postId.append(batch_processed[i]["post_id"])
            titles.append(batch_processed[i]["title"])
            input_ids_tensor = batch_processed[i]["input_ids"].reshape(-1, 80)
            attention_mask_tensor = batch_processed[i]["attention_mask"].reshape(-1, 80)
            image_tensor = batch_processed[i]["image"].reshape(-1, 3, 224, 224)
            label_tensor = batch_processed[i]["label"].reshape(-1,)
            continue

        # Stack attributes of sample dictionary keys to generate correct tensor shape
        postId.append(batch_processed[i]["post_id"])
        titles.append(batch_processed[i]["title"])
        input_ids_tensor = torch.cat((input_ids_tensor, batch_processed[i]["input_ids"].reshape(-1, 80)))
        attention_mask_tensor = torch.cat((attention_mask_tensor, batch_processed[i]["attention_mask"].reshape(-1, 80)))
        image_tensor = torch.cat((image_tensor, batch_processed[i]["image"].reshape(-1, 3, 224, 224)))
        label_tensor = torch.cat((label_tensor, batch_processed[i]["label"].reshape(-1,)))
    
    # Returning batch list of sample dictionaries containing 16 processed samples
    return {
        "post_id": postId,
        "title": titles,
        "input_ids": input_ids_tensor,
        "attention_mask": attention_mask_tensor,
        "image": image_tensor,
        "label": label_tensor
    }

In [46]:
def val_test_create_data_loader(df, title_tokenizer, max_len, batch_size):
    
    # Initialization of PostTitleDataset and assignment
    # to dataset variable
    dataset = PostDataset(
                post_id = df["id"].to_numpy(),
                title = df["clean_title"].to_numpy(),
                label = df["6_way_label"].to_numpy(),
                title_tokenizer = title_tokenizer,
                max_len = max_len
              )
    
    # Forwarding dataset variable, batch_size and collate function
    # to Pytorch DataLoader module. DataLoader is returned, over which
    # can be iterated
    return DataLoader(dataset, batch_size=batch_size, collate_fn=collate_batch_val_test, num_workers=2, pin_memory=True, prefetch_factor=2)

In [47]:
%%time
# Defining batch size and maximum sequence length
# MAX_LEN is defined based on plotting of token length dsitribution
BATCH_SIZE = 16
MAX_LEN = 80

# Initializing Pytorch DataLoader for train, validate and test split dataframes
train_data_loader = create_data_loader(df_train, title_tokenizer, MAX_LEN, BATCH_SIZE)
validate_data_loader = val_test_create_data_loader(df_validate, title_tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = val_test_create_data_loader(df_test, title_tokenizer, MAX_LEN, BATCH_SIZE)

# Retrieving first batch from dataloaders via next() and iter() functions
train_data = next(iter(train_data_loader))
validate_data = next(iter(validate_data_loader))
test_data = next(iter(test_data_loader))

  self.pid = os.fork()
  self.pid = os.fork()


CPU times: user 74.6 ms, sys: 285 ms, total: 360 ms
Wall time: 1.11 s


# Title-Image DistilFND model definition

In [48]:
class FakeNewsDetector(nn.Module):
    
    def __init__(self, num_classes):
        super(FakeNewsDetector, self).__init__()
        
        # Initialize DistilBert model for title feature extraction
        self.title_module = DistilBertModel.from_pretrained("distilbert-base-uncased")
        
        # Initialize ResNet34 model for image feature extraction
        self.image_module = models.resnet34(weights=True)
        
        # Dropout layer to randomly nullify 30% of elements of output tensors (useful during training)
        self.drop = nn.Dropout(p=0.3)

        # Fully connected layers to reshape the output tensors
        self.fc_title = nn.Linear(in_features=self.title_module.config.hidden_size, out_features=num_classes, bias=True)
        self.fc_image = nn.Linear(in_features=1000, out_features=num_classes, bias=True)

        # Softmax layer for final class probability prediction
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, title_input_ids, title_attention_mask, image):
        # Extract features from the title using DistilBert
        title_last_hidden_states = self.title_module(
            input_ids=title_input_ids,
            attention_mask=title_attention_mask,
            return_dict=False
        )
        # Extract the pooled output from the hidden states (CLS token)
        title_pooled_output = title_last_hidden_states[0][:, 0, :]
        # Apply dropout to the pooled output
        title_pooled_output = self.drop(title_pooled_output)
        
        # Pass the title features through the fully connected layer
        title_output = self.fc_title(title_pooled_output)

        # Extract features from the image using ResNet34
        image_output = self.image_module(image)
        # Apply dropout to the image output
        image_output = self.drop(image_output)
        
        # Pass the image features through the fully connected layer
        image_output = self.fc_image(image_output)

        # Combine the title and image features using element-wise maximum
        fusion = torch.maximum(title_output, image_output)
        
        # Apply Softmax to the combined features to get class probabilities
        return self.softmax(fusion)

# Initialize the FakeNewsDetector with the number of classes
fn_detector = FakeNewsDetector(num_classes=len(CLASS_NAMES))
# Move the model to the specified device (e.g., GPU)




* This code defines a PyTorch neural network model named FakeNewsDetector designed to detect fake news by combining textual and visual information. 

Key Components:
* We are using the following models and layers in this function:
* DistilBert Model: Loaded with pre-trained weights to extract features from text (titles).
* ResNet34 Model: Loaded with pre-trained weights to extract features from images.
* Dropout Layer: Helps in preventing overfitting during training by randomly setting a fraction of input units to zero.
* Fully Connected Layers:
    * fc_title: Reduces the dimensionality of the text feature vector to match the number of classes.
    * fc_image: Reduces the dimensionality of the image feature vector to match the number of classes.
    * Softmax Layer: Converts the final output to a probability distribution over the classes.
        
* Forward Pass (forward method):

* Text Feature Extraction:
    * The model takes the title_input_ids and title_attention_mask as inputs and processes them using DistilBert.
    * Extracts the [CLS] token representation from the last hidden states of DistilBert, which serves as a summary of the input text.
    * Applies dropout to the extracted text features.
    * Passes the features through a fully connected layer to map them to the number of classes.
* Image Feature Extraction:
    * The model processes the input image using ResNet34 to extract features.
    * Applies dropout to the extracted image features.
    * Passes the features through a fully connected layer to map them to the number of classes.
* Feature Fusion:
    * Combines the text and image features using element-wise maximum operation.
    * Applies the softmax function to the combined features to obtain class probabilities.
    
* Model Initialization and Device Assignment:

    * Initializes the FakeNewsDetector with the specified number of classes.
    * Moves the model to the specified computing device (e.g., GPU) for efficient computation.
* Flow Summary:
* The model combines features extracted from text (using DistilBert) and images (using ResNet34).
These features are processed through fully connected layers and combined.
The combined features are then passed through a softmax layer to predict the probabilities of each class, which represent whether the news is fake or not.
The model is initialized and moved to the specified device for training or inference.

**get_class_weights() function calculates percentage values per class for a weighted CrossEntropy. Reasoning is an highly imbalanced Fakeddit benchmarks dataset. Classes with a high number of Reddit-Post samples are normalized and are considered less by percentage during loss calculation. Also, some classes have considerably more sample data, all classes are weighted and taken as input into the loss calculation according to their respective number of samples. High number of class samples yields lower percentage weights, and low number of class samples yields higher percentage weight.**

In [49]:
# Printing model architecture
print(fn_detector)

FakeNewsDetector(
  (title_module): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1)

In [50]:
def get_class_weights(dataframe):
    
    # Count labels per class / subtype of Fake News in training set split
    # in sorted order 0, 1, 2, 3, 4, 5 and put into label_count list
    # First, compute the value counts once
    value_counts_sorted = dataframe["6_way_label"].value_counts().sort_index()

# Then, access the counts directly
    label_count = [value_counts_sorted.get(i, 0) for i in range(6)]

    # Calculate weights per class by subtracting from 1 label_count per class divided
    # by sum of all label_counts
    class_weights = [1 - (x / sum(label_count)) for x in label_count]
    # Converting list of class_weights to float PyTorch tensor and assigning to device
    class_weights = torch.FloatTensor(class_weights)

    # Returns class_weights tensor of data type float
    return class_weights

In [51]:
# Calculate class weights on basis of training split dataframe and print weight tensor
class_weights = get_class_weights(df_train)
print(class_weights)

tensor([0.6070, 0.9407, 0.8100, 0.9791, 0.6998, 0.9633])


In [52]:
EPOCHS = 20

# AdamW optimizer with a linear learning rate scheduler
optimizer = AdamW(fn_detector.parameters(), lr=3e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_data_loader) * EPOCHS
)

# Weighted CrossEntropyLoss assigned to device
loss_function = nn.CrossEntropyLoss(weight=class_weights)




# Model training procedure

In [53]:
def train_model(model, data_loader, loss_function, optimizer,scheduler, num_examples):
    print("Training model in progress...")
    print("-" * 10)
    
    # Putting model in training condition including regularization layers
    model = model.train()
    
    # Saving training lossses per epoch and initalizing correct prediction count
    train_losses = []
    correct_preds = 0
    
    # Iteration over data (batches) contained in data split set DataLoader
    for data in tqdm(data_loader):

        # Initializing post title input_ids, attention_mask, 
        
        input_ids = data["input_ids"]
        attention_mask = data["attention_mask"]
        images = data["image"]
        labels = data["label"]

        # Feeding input data to Title-Image DistilFND
        outputs = model(
                title_input_ids = input_ids,
                title_attention_mask = attention_mask,
                image = images
        )

        # Final Softmax layer returns class predictions per sample in batch
        # Highest probability value resembles class prediction and is assigned to preds variable
        _, preds = torch.max(outputs, dim=1)

        # Training loss is calculated by applying weighted Cross Entropy Loss
        # on comparison between predicted label and ground truth label
        train_loss = loss_function(outputs, labels)

        # Counting correct model predictions and incrementing correct prediction count
        correct_preds += torch.sum(preds == labels)
        # Append training loss of current epoch to list of training losses
        train_losses.append(train_loss.item())
        # Initialize backpropagation to adjust model weights / parameters
        train_loss.backward()
        # Normalize gradient values to regularize parameter update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        # Perform parameter update based on current gradient value
        optimizer.step()
        # Moving scheduler to next step / iteration
        scheduler.step()
        # Zero out current gradients to initialize fresh optimizer state for next epoch
        optimizer.zero_grad()
            
    # Return train_acc and train_loss values
    return correct_preds.double() / num_examples, np.mean(train_losses)

In [54]:
def evaluate_model(model, data_loader, loss_function, num_examples):
    print("Validating model in progress...")
    print("-" * 10)
    
    # Switching off regularization and normalization layers for evaluation mode
    # Hence, no model parameters are adjusted. Model is evaluated in current state
    model = model.eval()
    
    # Saving validation loss per epoch
    val_losses = []
    correct_preds = 0
    
    # Skipping gradient calulation over weights --> Not needed, because
    # no parameters are updated and significantly speeds up iteration over samples batches
    with torch.no_grad():
        # Iteration over data (batches) contained in data split set DataLoader
        for data in tqdm(data_loader):
            
            # Initializing post title input_ids, attention_mask, 
            # image data and label per Reddit-Post and assigning to device
            input_ids = data["input_ids"]
            attention_mask = data["attention_mask"]
            images = data["image"]
            labels = data["label"]
            
            # Feeding input data to Title-Image DistilFND state in current epoch
            outputs = model(
                    title_input_ids = input_ids,
                    title_attention_mask = attention_mask,
                    image = images
            )
            
            # Final Softmax layer returns class predictions per sample in batch
            # Highest probability value resembles class prediction and is assigned to preds variable
            _, preds = torch.max(outputs, dim=1)
            
            # Validation loss is calculated by applying weighted Cross Entropy Loss
            # on comparison between predicted label and ground truth label
            val_loss = loss_function(outputs, labels)
            
            # Counting correct model predictions and incrementing correct prediction count
            correct_preds += torch.sum(preds == labels)
            
            # Appending current validation loss per batch
            # to list of validation losses per epoch
            val_losses.append(val_loss.item())
    
    # Returns val_acc and val_loss values
    return correct_preds.double() / num_examples, np.mean(val_losses)

In [None]:
%%time

# Initializing training history dictionary and best_accuracy variable
history = defaultdict(list)
best_accuracy = 0

# Iteration times the total number of epochs
for epoch in range(EPOCHS):

    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)

    # Calling train_model() function, returns train_acc and train_loss
    train_acc, train_loss = train_model(
        fn_detector,
        train_data_loader,
        loss_function,
        optimizer,
        scheduler,
        len(df_train)
    )

    # Print train_loss and train_acc values for current epoch
    print(f"Train loss {train_loss} | Accuracy {train_acc}")
    print()

    # Calling evaluate_model() function, returns val_acc and val_loss
    val_acc, val_loss = evaluate_model(
            fn_detector,
            validate_data_loader,
            loss_function,
            device,
            len(df_validate)
    )

    # Print val_loss and val_acc values for current epoch
    print(f"Val   loss {val_loss} | Accuracy {val_acc}")
    print()

    # Save current values of train_acc, val_acc, train_loss and val_loss
    # in respective keys of history dictionary for later analysis
    history["train_acc"].append(train_acc)
    history["train_loss"].append(train_loss)
    history["val_acc"].append(val_acc)
    history["val_loss"].append(val_loss)

   
# Print output when training procedure completed
print()
print("Completed Training!")
print("-" * 20)

Epoch 1/20
----------
Training model in progress...
----------


  0%|          | 0/594 [00:00<?, ?it/s]