In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from scipy.stats import ttest_ind

import re
from transformers import ElectraForSequenceClassification, ElectraTokenizerFast
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


%matplotlib inline


In [2]:
# Loading the dataset
df = pd.read_csv('../data/clean/df_0.csv', sep=',')

In [3]:
# Preprocess the dataset: removing URLs, mentions and hashtags im 'text' column and convert to lowercase

# Define a function to clean the text
def clean_text(text):
    # Remove URLs, mentions, hashtags, and punctuation
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)     # Remove hashtags
    #text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()  # Convert to lowercase

# Apply the text cleaning function
df['cleaned_text'] = df['text'].apply(clean_text)

# Inspect the cleaned text column
df[['text', 'cleaned_text']].head()

Unnamed: 0,text,cleaned_text
0,"Cooking microwave pizzas, yummy","cooking microwave pizzas, yummy"
1,Any plans of allowing sub tasks to show up in ...,any plans of allowing sub tasks to show up in ...
2,"I love the humor, I just reworded it. Like sa...","i love the humor, i just reworded it. like sa..."
3,naw idk what ur talkin about,naw idk what ur talkin about
4,That sucks to hear. I hate days like that,that sucks to hear. i hate days like that


<br>
<br>

# Tokenize the data using ELECTRA's Tokenizer
Convert words to tokens that map to ELECTRA’s vocabulary.

In [4]:
# Load the pre-trained ELECTRA tokenizer
tokenizer = ElectraTokenizerFast.from_pretrained('google/electra-small-discriminator', clean_up_tokenization_spaces=True)


In [5]:
# Tokenize the cleaned data
tokenized_data = tokenizer(
    df['cleaned_text'].tolist(),  # List of sentences to tokenize
    padding=True,                # Pads sequences to the longest sequence in the batch
    truncation=True,             # Truncates longer sequences to the max_length
    max_length=128,              # Sets the maximum length for tokenized sequences
    return_tensors='pt'          # Return as PyTorch tensors
)

In [11]:
tokenized_data

{'input_ids': tensor([[  101,  8434, 18302,  ...,     0,     0,     0],
        [  101,  2151,  3488,  ...,     0,     0,     0],
        [  101,  1045,  2293,  ...,     0,     0,     0],
        ...,
        [  101,  2023, 10439,  ...,     0,     0,     0],
        [  101,  1024,  2200,  ...,     0,     0,     0],
        [  101,  2204, 10439,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [6]:
# Check the first tokenized example
print("Input IDs (first example):", tokenized_data['input_ids'][0])
print("Attention Mask (first example):", tokenized_data['attention_mask'][0])

# Check the length of the input_ids and attention_mask (should be 128 if max_length=128 was set)
print("Length of Input IDs:", len(tokenized_data['input_ids'][0]))
print("Length of Attention Mask:", len(tokenized_data['attention_mask'][0]))


Input IDs (first example): tensor([  101,  8434, 18302, 10733,  2015,  1010,  9805, 18879,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,  

<br>
<br>

# Prepare the Data for Training
It's required to pair the tokenized data (which contains *input_ids* and *attention_mask*) with the sentiment labels from the target column. Once the tokenized data and labels are prepared, I can package them into a PyTorch Dataset.

In [7]:
# Convert the 'target' column to a list of labels
labels = df['target'].tolist()  

In [8]:
# Define a custom PyTorch Dataset class to hold the tokenized inputs and labels
class SentimentDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)  # Return the size of the dataset

    def __getitem__(self, idx):
        # Return a dictionary containing input_ids, attention_mask, and labels for each example
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [9]:
# Create a dataset using the tokenized inputs and labels
sentiment_dataset = SentimentDataset(
    input_ids=tokenized_data['input_ids'],  # Use the tokenized input_ids
    attention_masks=tokenized_data['attention_mask'],  # Use the tokenized attention masks
    labels=labels  # Use the sentiment labels from the 'target' column
)

In [10]:
# Inspect the first item in the dataset to make sure it's correct
first_item = sentiment_dataset[0]
print("First dataset item:")
print("Input IDs:", first_item['input_ids'])
print("Attention Mask:", first_item['attention_mask'])
print("Label:", first_item['labels'])

First dataset item:
Input IDs: tensor([  101,  8434, 18302, 10733,  2015,  1010,  9805, 18879,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     

<br>
<br>

# Fine-tune ELECTRA for Sentiment Analysis
ELECTRA is pre-trained on general text but needs to be fine-tuned to classify sentiments based on my dataset. These are the summary of the stpes for the fine-tune process:
1. Load the ELECTRA model for sequence classification.
2. Create a DataLoader to handle the batching of the dataset: it's required a DataLoader to feed the data in batches during training. This is important for performance reasons (since training on all data at once would use too much memory).
3. Define the training arguments and use the Hugging Face Trainer API for fine-tuning. There is a class provided by Hugging Face for this step.
4. Start the training process: use Hugging Face’s Trainer API, which simplifies the training process by handling the training loop.

In [11]:
# 1. Load the pre-trained ELECTRA model for sequence classification
model = ElectraForSequenceClassification.from_pretrained(
    'google/electra-small-discriminator',  # Pre-trained model
    num_labels=3  # there are 3 classes: negative, neutral, positive
)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# 2. Create a DataLoader to iterate through the dataset in batches
batch_size = 16  # this is defined acccording to system's capacity

train_dataloader = DataLoader(
    sentiment_dataset,  # dataset with tokenized values
    batch_size=batch_size,  # Number of samples per batch
    shuffle=True  # Shuffle the data at the start of each epoch (This ensures the data is shuffled during training to avoid any order bias).
)


In [13]:
# 3. Define Training Arguments
# Hugging Face provides the TrainingArguments class to handle these settings easily.

from transformers import Trainer, TrainingArguments

batch_size = 16  # adjust this based on the computer capacity (typically, values like 16, 32, or 64 work well depending on the memory of the GPU (or CPU).)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',  # Directory where the model's checkpoints will be saved
    num_train_epochs=3,  # Number of times the model will go over the entire dataset.
    per_device_train_batch_size=batch_size,  # Batch size for training
    per_device_eval_batch_size=64,  # Batch size for evaluation
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Weight decay to prevent overfitting
    logging_dir='./logs',  # Directory to store logs
    logging_steps=10,  # Log after every 10 steps (How often to log during training)
    eval_strategy="steps",  # Evaluate during training at each logging step
    save_steps=200,  # Save checkpoint every 500 steps
)

Before the training, I will split the dataset to provide it for the evaluation

In [15]:
# Provide an Evaluation Dataset
from sklearn.model_selection import train_test_split

# Split the dataset into train and eval (80% train, 20% eval)
train_dataset, eval_dataset = train_test_split(sentiment_dataset, test_size=0.2)


In [None]:
# Check the size of the splits
print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")

In [16]:
# Initialize the Trainer with both train and eval datasets
trainer = Trainer(
    model=model,  # The pre-trained model
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=eval_dataset,  # Evaluation dataset
    data_collator=None  # Default to handling batches
)

# Start training the model
trainer.train()


Step,Training Loss,Validation Loss
10,1.0954,1.096877
20,1.0978,1.096698
30,1.0982,1.096421
40,1.1013,1.095968
50,1.0966,1.095548
60,1.0958,1.094869
70,1.0974,1.094095
80,1.0918,1.093229
90,1.0936,1.09223
100,1.093,1.091398


ValueError: You are trying to save a non contiguous tensor: `electra.embeddings_project.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.