****IMPLEMENTING BERT MODEL FOR SENTIMENTAL ANALYISIS****

Loading necessary libraries and dataset form kaggle using an api token

In [1]:
import os
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from kaggle.api.kaggle_api_extended import KaggleApi
import numpy as np
from sklearn.metrics import accuracy_score

def download_dataset():
    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files('kazanova/sentiment140', path='sentiment140', unzip=True)



Created a class to preprocess the data and sample 100000 instances

In [2]:
class Sentiment140Dataset(Dataset):
    def __init__(self, tokenizer, file_path, max_len=128, sample_size=100000):
        column_names=['sentiment','id','date','flag','user','text']
        self.data = pd.read_csv(file_path, encoding='ISO-8859-1', names=column_names)
        self.data['sentiment'] = self.data['sentiment'].replace(4, 1)  # Convert 4 to 1 for positive sentiment
        if sample_size:
            self.data = self.data.sample(n=sample_size, random_state=42)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['text']
        sentiment = self.data.iloc[index]['sentiment']
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        return {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'labels': torch.tensor(sentiment, dtype=torch.long)
        }


created a function to compute the metrics

In [3]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=-1)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc}



Defining a function to train the model on distil bert

In [4]:

def train_model(data_path):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    dataset = Sentiment140Dataset(tokenizer, os.path.join(data_path, 'training.1600000.processed.noemoticon.csv'), sample_size=1000)
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

    train_dataset, test_dataset = train_test_split(dataset, test_size=0.1,random_state=42)
    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=10,
        per_device_train_batch_size=128,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    train_result = trainer.train()
    eval_result = trainer.evaluate()


    print(f"Training results: {train_result.metrics}")
    print(f"Evaluation results: {eval_result}")
    
    model.save_pretrained('./saved_model')
    tokenizer.save_pretrained('./saved_model')


Using a main function to run the code

In [5]:
# Main execution
if __name__ == '__main__':
    download_dataset()
    train_model('sentiment140')



Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.6922
20,0.6829
30,0.6739
40,0.6156
50,0.4963
60,0.3275
70,0.2041
80,0.1085


Step,Training Loss
10,0.0788
20,0.0618
30,0.0489
40,0.1388
50,0.0597
60,0.0328
70,0.0183
80,0.022


Training results: {'train_runtime': 109.4008, 'train_samples_per_second': 82.266, 'train_steps_per_second': 0.731, 'total_flos': 298051646976000.0, 'train_loss': 0.057626415602862836, 'epoch': 10.0}
Evaluation results: {'eval_loss': 0.8881073594093323, 'eval_accuracy': 0.83, 'eval_runtime': 0.7235, 'eval_samples_per_second': 138.224, 'eval_steps_per_second': 17.969, 'epoch': 10.0}
