Install necessary libraries:

In [1]:
pip install transformers datasets evaluate accelerate

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/20/0a/739426a81f7635b422fbe6cb8d1d99d1235579a6ac8024c13d743efa6847/transformers-4.36.2-py3-none-any.whl.metadata
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
     ---------------------------------------- 0.0/126.8 kB ? eta -:--:--
     --- ------------------------------------ 10.2/126.8 kB ? eta -:--:--
     -------- ---------------------------- 30.7/126.8 kB 435.7 kB/s eta 0:00:01
     -------------------------------------- 126.8/126.8 kB 1.2 MB/s eta 0:00:00
Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/e2/cf/db41e572d7ed958e8679018f8190438ef700aeb501b62da9e1eed9e4d69a/datasets-2.15.0-py3-none-any.whl.metadata
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/


[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Import Libraries


In [2]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


Check GPU availability:

In [16]:
# Query CUDA availability once, then pick the device and report the choice.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

if has_gpu:
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("Using CPU")

Using CPU


Load Dataset

In [4]:
# Location of the training CSV -- change it if the file lives elsewhere.
file_path = 'train.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Quick sanity check: print the leading rows of the loaded frame.
print(data.head(n=5))


   Unnamed: 0                                               text  label
0           0  #anywere &#8220;@TheCooleyShow: LA = palm tree...      1
1           1  RT @OfficialA1King: The face you make when you...      1
2           2             bitch get off my twitter hoe &#128074;      1
3           3    I can taste loud n pussy on my tongue &#128541;      1
4           4  Diabetes galore &#128514;&#128514;&#128514;&#1...      2


Define a Dataset class for handling the Twitter comments. This involves tokenizing the text and preparing it in a format suitable for BERT.

In [5]:
class TwitterCommentsDataset(Dataset):
    """Map-style dataset that tokenizes one comment per lookup.

    Args:
        texts: Indexable collection of raw comments.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a BERT tokenizer in this notebook).
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return how many comments the dataset holds."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode the comment at position ``item``.

        Returns:
            dict with the comment as a string (``'text'``), the flattened
            ``'input_ids'`` and ``'attention_mask'`` tensors, and the class id
            as a ``torch.long`` tensor under ``'labels'``.
        """
        tweet = str(self.texts[item])  # coerce non-string entries to text
        target = self.labels[item]

        # Fixed-length encoding: pad short inputs and truncate long ones to max_len.
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'return_attention_mask': True,
            'return_tensors': 'pt',
            'truncation': True,
        }
        encoded = self.tokenizer.encode_plus(tweet, **tokenizer_options)

        sample = {'text': tweet}
        # Flatten each returned tensor to one dimension for the per-item sample.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample


In [9]:
# Hold out 10% of the rows for validation. A fixed random_state makes the
# shuffle -- and therefore the split -- identical on every re-run.
df_train, df_val = train_test_split(data, test_size=0.1, random_state=42)


In [11]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 256  # Every tweet is padded/truncated to this many tokens; tune to typical tweet length


def _build_split(frame):
    """Wrap the text/label columns of one DataFrame split in a TwitterCommentsDataset."""
    return TwitterCommentsDataset(
        texts=frame.text.to_numpy(),
        labels=frame.label.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )


train_dataset = _build_split(df_train)
val_dataset = _build_split(df_val)


Load BERT Model

In [12]:
# Size the classification head from the number of distinct values in the label column.
num_classes = data['label'].nunique(dropna=False)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_classes,
)

model.safetensors: 100%|██████████| 440M/440M [00:34<00:00, 12.6MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Training hyperparameters gathered in one mapping so they are easy to inspect or tweak.
training_config = {
    'output_dir': './results',            # where the Trainer writes its outputs
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 10,                  # emit a training log entry every 10 steps
}
training_args = TrainingArguments(**training_config)


Training:

In [14]:
def compute_metrics(eval_pred):
    """Compute accuracy so evaluation reports more than the loss.

    Args:
        eval_pred: The (predictions, label_ids) pair the Trainer hands to its
            metrics callback. Predictions are assumed to be the model's
            per-class logits as a 2-D array (samples x classes) -- confirm if
            the model head is changed.

    Returns:
        dict with a single ``'accuracy'`` entry (fraction of correct predictions).
    """
    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)  # highest-scoring class per sample
    return {'accuracy': float((predicted == labels).mean())}


# Fine-tune on the training split; the validation split is used by trainer.evaluate().
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


  0%|          | 10/3348 [01:56<10:42:57, 11.56s/it]

{'loss': 0.9403, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.01}


  1%|          | 20/3348 [04:02<11:29:29, 12.43s/it]

{'loss': 0.876, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.02}


  1%|          | 27/3348 [05:25<10:43:39, 11.63s/it]

KeyboardInterrupt: 

Evaluation

In [None]:
# Run evaluation and display whatever metrics the Trainer returns.
eval_metrics = trainer.evaluate()
eval_metrics
