### This Jupyter notebook is my submission to the Kaggle Twitter disaster prediction challenge, where we were given tweets and their locations and had to apply NLP to the tweets to predict whether each one describes a real disaster.

In [1]:
# Importing the libraries
import torch # Used for deep learning
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
# DistilBertTokenizer performs tokenization which breaks down input text into smaller units, such as words or subwords, and converts them into numerical representations
# DistilBertForSequenceClassification is designed to handle tasks where the model needs to classify the input text into different categories or classes
from torch.utils.data import DataLoader, Dataset 
# DataLoader can define how the dataset should be loaded, specify the batch size, enable shuffling of the data for randomness, utilize multi-process data loading for faster training
# Dataset acts like a blueprint for creating custom datasets in PyTorch.

In [2]:
# Reading the competition CSV files from the working directory into DataFrames
# train.csv is read first and test.csv second, so the unpacking order matches the tuple order
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Defining custom dataset class
class DisasterTweetsDataset(Dataset): # Custom dataset built on top of PyTorch's Dataset class
    """Map-style dataset that tokenizes one tweet per item.

    Parameters
    ----------
    data : table with a 'text' column (and a 'target' column unless is_test is True)
        Must support len(), column access by name and positional `.iloc` lookup,
        e.g. a pandas DataFrame.
    tokenizer : object exposing `encode_plus`, e.g. a DistilBertTokenizer
    max_length : int
        Every tweet is padded or truncated to exactly this many tokens.
    is_test : bool, default False
        When True no label is read or returned.

    Each item is a dict with 'input_ids' and 'attention_mask' tensors of shape
    (max_length,), plus a scalar int64 'label' tensor when is_test is False.
    """

    def __init__(self, data, tokenizer, max_length, is_test = False): # Constructor of the class
        self.data = data # The input dataset
        self.tokenizer = tokenizer # Tokenizer used to turn the tweet text into token ids
        self.max_length = max_length # Fixed token length of every returned sequence
        self.is_test = is_test # Whether labels are absent (test/inference data)

    def __len__(self): # Number of rows in the dataset
        return len(self.data)

    def __getitem__(self, index): # Tokenizes the tweet at position `index` and returns its tensors
        # Positional lookup: DataLoader samplers yield positions 0..len-1, which only
        # coincide with index labels when the frame has a default RangeIndex.
        # .iloc keeps this correct for filtered, split or shuffled frames too.
        tweet = self.data['text'].iloc[index]
        encoding = self.tokenizer.encode_plus( # Tokenize the tweet text
            tweet,
            add_special_tokens = True, # Adds special tokens like [CLS] (classification) and [SEP] (separator)
            max_length = self.max_length, # Target length used for both padding and truncation
            padding = 'max_length', # Pads shorter sequences up to max_length
            truncation = True, # Cuts longer sequences down to max_length
            return_tensors = 'pt' # Returns PyTorch tensors with a leading batch dimension of 1
        )

        # Drop only the leading batch dimension; a bare squeeze() would also remove the
        # sequence dimension when max_length == 1
        item = {
            'input_ids': encoding['input_ids'].squeeze(0), # Token ids of the tweet
            'attention_mask': encoding['attention_mask'].squeeze(0) # 1 for real tokens, 0 for padding
        }

        if not self.is_test: # Labels exist only for training data
            # The label is returned as an int64 tensor
            item['label'] = torch.tensor(self.data['target'].iloc[index], dtype = torch.long)

        return item

In [4]:
# Setting up the tokenizer and model from one shared pretrained checkpoint
checkpoint = 'distilbert-base-uncased' # Written once so the tokenizer and the model always come from the same checkpoint
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
# The tokenizer converts raw tweet text into the token ids the DistilBERT model consumes
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels = 2)
# Loads the pretrained DistilBERT weights and attaches a sequence-classification head
# The head (pre_classifier / classifier layers) is newly initialised, as the loading warning reports, so it must be trained before its predictions are meaningful
# num_labels = 2 configures the head for a two-class problem

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.

In [5]:
# Choosing the compute device and the training hyper-parameters
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# batch_size: number of samples per batch; max_length: token length every input sequence is padded/truncated to
batch_size, max_length = 16, 128
# num_epochs: number of full passes over the training data; learning_rate: step size of each weight update
# NOTE(review): 50 passes is far more than the 2-5 usually used when fine-tuning a pretrained transformer - confirm this is intended
num_epochs, learning_rate = 50, 2e-5

In [6]:
# Building the training dataset and its batch loader
# Wrap the training DataFrame so that each item is a tokenized tweet together with its label
train_dataset = DisasterTweetsDataset(
    data = train_data,
    tokenizer = tokenizer,
    max_length = max_length,
)
# DataLoader groups the items into batches of batch_size; shuffle=True draws the samples in a new random order every epoch
train_loader = DataLoader(
    dataset = train_dataset,
    batch_size = batch_size,
    shuffle = True,
)

In [7]:
# Sanity check of train loader
# Pull only the first batch from the loader and print it to inspect the structure
# (a dict of 'input_ids', 'attention_mask' and 'label' tensors); nothing is printed if the loader is empty
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    print(first_batch)

{'input_ids': tensor([[  101,  9152, 23033,  ...,     0,     0,     0],
        [  101,  1030,  2534,  ...,     0,     0,     0],
        [  101,  3796,  1996,  ...,     0,     0,     0],
        ...,
        [  101, 16129,  2629,  ...,     0,     0,     0],
        [  101,  2023,  8505,  ...,     0,     0,     0],
        [  101,  1030,  5553,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'label': tensor([0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0])}


In [8]:
# Moving the model to the device, enabling training mode, and creating the optimizer and loss function
# .to(device) returns the model itself, so the move and the switch to training mode (which enables dropout) can be chained
model.to(device).train()
# AdamW updates every learnable parameter of the model using the configured learning rate
optimizer = torch.optim.AdamW(params = model.parameters(), lr = learning_rate)
# Cross-entropy loss over class logits
# The training loop in this notebook reads the loss returned by the model itself (labels are passed to it) and does not call loss_fn
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# Fine-tuning the model for num_epochs epochs
for epoch in range(num_epochs):
    # Re-assert training mode at the start of every epoch: if this cell is re-run after the
    # model has been switched to eval mode, dropout would otherwise stay disabled during training
    model.train()
    total_loss = 0.0 # Sum of the per-batch losses for the current epoch
    for batch in train_loader:
        # Move the inputs and the labels of the current batch to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad() # Reset gradients so they do not accumulate across batches

        # Passing labels makes the model compute the classification loss itself
        outputs = model(input_ids, attention_mask = attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward() # Backpropagation: compute the gradient of the loss w.r.t. every parameter
        optimizer.step() # Update the parameters using those gradients

        total_loss += loss.item() # .item() extracts the loss as a plain Python float

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}')

Epoch 1/50, Loss: 206.2861
Epoch 2/50, Loss: 151.8810


In [None]:
# Creating test dataset and loader
# is_test=True makes the dataset return only the tokenized inputs (the test data has no labels)
test_dataset = DisasterTweetsDataset(test_data, tokenizer, max_length, is_test=True)
# Reuse the training batch size instead of 1: every sequence is already padded to max_length, so
# batching only reduces the number of forward passes. shuffle=False keeps the predictions in the
# same order as the rows of test_data
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Switching the model to evaluation mode and preparing the container for the results
model.eval() # Evaluation mode disables training-only behaviour such as dropout, so predictions are deterministic
predictions = list() # Will collect one predicted label per test tweet

In [None]:
# Generating predictions
from tqdm import tqdm # Progress bar for the loop, with an estimate of the remaining time

# Make this cell safe to re-run on its own: force evaluation mode and start from an empty list,
# otherwise a second run would append to the previous results and the list would no longer
# line up with the rows of the test data
model.eval()
predictions = []

with torch.no_grad(): # No gradients are needed for inference; disabling them saves memory and time
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask) # Forward pass without labels, so only logits are produced
        logits = outputs.logits # One row of class scores per sample in the batch
        predicted_labels = torch.argmax(logits, dim=1).cpu().numpy() # Index of the highest score per sample, moved to the CPU as a NumPy array
        predictions.extend(predicted_labels.tolist()) # Append the labels of the current batch as plain Python ints

In [None]:
# Building and saving the submission file
# Pair every test id with its predicted label; the predictions are expected to be in test_data row order
submission_data = dict(id = test_data['id'], target = predictions)
submission_df = pd.DataFrame(data = submission_data)
# index=False keeps the DataFrame index out of the CSV, so the file has only the id and target columns
submission_df.to_csv(path_or_buf = 'submission.csv', index = False)
submission_df