In [1]:
import os
import pandas as pd
import pyarrow.fs as fs
import s3fs
import tempfile
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data manipulation
import pandas as pd
import numpy as np

# Tokenization and model
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

# Torch core
import torch
from torch.utils.data import Dataset, DataLoader, random_split

# Metrics (optional)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# To ensure reproducibility
import random
import os


In [6]:
# Define column names (the CSV files have no header row)
column_names = ['tweet_id', 'entity', 'sentiment', 'text']

# Directory holding the two CSV files. Defaults to the original location, but can be
# redirected with the SENTIMENT_DATA_DIR environment variable so the notebook is not
# tied to one machine's absolute path.
DATA_DIR = os.environ.get("SENTIMENT_DATA_DIR", "/home/onyxia/work/Sentiment-analysis-MEPPDS/data")

# Single source of truth for the sentiment -> integer class id mapping.
SENTIMENT_TO_LABEL = {'negative': 0, 'neutral': 1, 'positive': 2}


def _load_sentiment_csv(csv_path):
    """Read one headerless tweet CSV and return a cleaned frame with an integer `label` column.

    Steps (identical for the training and validation splits):
      1. read the file using `column_names` as the header;
      2. drop rows whose `sentiment` or `text` is missing;
      3. normalise `sentiment` (strip whitespace, lower-case);
      4. keep only rows whose sentiment is a key of `SENTIMENT_TO_LABEL`;
      5. add `label`, the integer id of the sentiment.

    Each step builds a new frame via `.assign` / boolean indexing instead of writing into a
    filtered frame, which avoids pandas' chained-assignment (SettingWithCopy) warnings.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns `column_names` plus `label`; the original row index is preserved.
    """
    raw = pd.read_csv(csv_path, header=None, names=column_names)
    complete = raw.dropna(subset=['sentiment', 'text'])  # Drop rows with missing data
    normalised = complete.assign(sentiment=complete['sentiment'].str.strip().str.lower())
    known = normalised[normalised['sentiment'].isin(list(SENTIMENT_TO_LABEL))]
    return known.assign(label=known['sentiment'].map(SENTIMENT_TO_LABEL).astype(int))


# Load and clean training data
df_train = _load_sentiment_csv(os.path.join(DATA_DIR, "twitter_training.csv"))

# Load and clean validation data
df_validation = _load_sentiment_csv(os.path.join(DATA_DIR, "twitter_validation.csv"))


In [7]:
# Tokenizer belonging to the "bert-base-uncased" checkpoint.
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [8]:
# Peek at the first five training tweets (plain-text Series repr).
print(df_train['text'].head(n=5))
# Safety net before tokenizing: any missing tweet becomes an empty string
# (dropping such rows with dropna() would be the alternative).
df_train['text'] = df_train['text'].fillna(value='')


0    im getting on borderlands and i will murder yo...
1    I am coming to the borders and I will kill you...
2    im getting on borderlands and i will kill you ...
3    im coming on borderlands and i will murder you...
4    im getting on borderlands 2 and i will murder ...
Name: text, dtype: object


In [9]:
# Tokenizer options shared by both splits: truncation and padding enabled, 128-token cap.
tokenize_options = {"truncation": True, "padding": True, "max_length": 128}

# The tokenizer is given plain Python lists of strings, hence .tolist() on the 'text' column.
train_texts = df_train['text'].tolist()
encodings_train = tokenizer(train_texts, **tokenize_options)

validation_texts = df_validation['text'].tolist()
encodings_validation = tokenizer(validation_texts, **tokenize_options)


In [10]:
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    `encodings` is any mapping exposing `.items()` whose values can be indexed by example
    position; `labels` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given (no copy is made)."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example `idx`: one entry per encoding field plus 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # 'labels' is written last so it wins over any same-named encoding field.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its integer labels (as plain Python lists).
train_labels = df_train['label'].tolist()
train = TweetDataset(encodings=encodings_train, labels=train_labels)

validation_labels = df_validation['label'].tolist()
validation = TweetDataset(encodings=encodings_validation, labels=validation_labels)

In [12]:
# BERT with a 3-way classification head, one output per label id
# (0 = negative, 1 = neutral, 2 = positive, as assigned during data cleaning).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Freeze the whole BERT backbone: no parameter under model.bert will receive gradients.
for backbone_param in model.bert.parameters():
    backbone_param.requires_grad_(False)


In [16]:
# Fine-tune only the last two encoder layers (10 and 11) of the BERT backbone.
#
# The flag is assigned explicitly for EVERY backbone parameter: True for the two selected
# layers, False for everything else. The previous version only ever wrote False, so when
# it ran after the "freeze everything" cell it left layers 10 and 11 frozen as well and
# had no effect. Assigning both values makes the cell idempotent and independent of
# whatever requires_grad state earlier cells left behind.
TRAINABLE_PREFIXES = ("encoder.layer.10.", "encoder.layer.11.")

for name, param in model.bert.named_parameters():
    # str.startswith accepts a tuple and is True if any prefix matches.
    param.requires_grad = name.startswith(TRAINABLE_PREFIXES)


In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the run: 3 epochs, batches of 16 for training and 64 for evaluation,
# with outputs written under ./results.
training_config = {
    "output_dir": "./results",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 64,
}
training_args = TrainingArguments(**training_config)

# Bind the model to the two tweet datasets built earlier.
trainer = Trainer(model=model, args=training_args, train_dataset=train, eval_dataset=validation)

# Launch fine-tuning; left as the cell's last expression so its return value is displayed.
trainer.train()

Step,Training Loss


In [None]:
# Run inference on the validation split. The earlier call used `test_dataset`, a name that
# is not defined anywhere in this notebook and therefore raised NameError on a fresh
# kernel; `validation` is the evaluation dataset that is actually built.
# NOTE(review): if a separate held-out test set exists, build it as a TweetDataset and
# pass it here instead.
preds = trainer.predict(validation)

# Pick, for each example, the index with the highest score along the last axis;
# the indices follow the label mapping 0 = negative, 1 = neutral, 2 = positive.
predicted_labels = preds.predictions.argmax(-1)
