In [1]:
# Mount Google Drive at /content/drive; the CSV files read later in this
# notebook are addressed under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import DistilBertModel, DistilBertTokenizerFast
from torch.utils.data import DataLoader
from transformers import AdamW

In [3]:
# Load the three CSV files from the mounted Drive folder.
# Going by the file names: df_1 = train, df_2 = test, df_3 = val.
# NOTE(review): the numbered names hide which split is which; consider
# df_train / df_test / df_val.
df_1 = pd.read_csv('/content/drive/MyDrive/BERT_DATA/text_labels_train.csv')
df_2 = pd.read_csv('/content/drive/MyDrive/BERT_DATA/text_labels_test.csv')
df_3 = pd.read_csv('/content/drive/MyDrive/BERT_DATA/text_labels_val.csv')

In [4]:
# Sanity check: (rows, columns) of each loaded frame.
print(df_1.shape, df_2.shape, df_3.shape)

(3711, 3) (464, 3) (464, 3)


In [5]:
!pip install text_hammer



In [6]:
import text_hammer as th

In [7]:
# Stack the three frames into one. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each input keeps its own row labels, so the
# combined frame carries duplicate index values and any label-based lookup
# (.loc, index-aligned assignment) becomes ambiguous.
df = pd.concat([df_1, df_2, df_3], axis=0, ignore_index=True)

In [8]:
# (rows, columns) of the combined frame.
print(df.shape)

(4639, 3)


In [9]:
# Display the combined frame as loaded, before any text cleaning is applied.
df

Unnamed: 0,ID,tweet,label
0,865,Wednesday High pressure remains in charge yet ...,neutral
1,1706,The Dragon watching ... Beach Green Isolated ...,positive
2,3363,Via Reddit Final Fantasy IX Mistaken Love ReM...,positive
3,1346,Prayers JesusSaves NoOtherName Overcomer TeamJ...,positive
4,3188,USER so I have the new single on repeat and yo...,positive
...,...,...,...
459,4099,Happy patriot vs The angry Black woman USER N...,positive
460,4938,VdoBuzz \n,neutral
461,660,fishing Willow Strike Spinnerbait Colorado Ble...,positive
462,4838,.USER USER luke u sent me this same photo I'm...,negative


In [10]:
%%time

# Register Series.progress_apply with pandas. Import from the public
# tqdm.notebook module: tqdm._tqdm_notebook is a deprecated private shim that
# only re-exports it and emits a deprecation warning.
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

def text_preprocessing(df, col_name):
  """Clean the text in ``df[col_name]`` and return ``df``.

  Six passes run in order, each through ``progress_apply`` so a progress bar
  is shown per pass: lowercase (every value is first coerced with ``str``),
  then text_hammer's ``cont_exp``, ``remove_emails``, ``remove_html_tags``,
  ``remove_special_chars`` and ``remove_accented_chars``.

  The column is reassigned on the frame that was passed in, so the caller's
  DataFrame is modified as well as returned.
  """
  cleaning_steps = (
      lambda text: str(text).lower(),
      th.cont_exp,  # you're -> you are; we'll be -> we will be
      th.remove_emails,
      th.remove_html_tags,
      th.remove_special_chars,
      th.remove_accented_chars,
  )

  for step in cleaning_steps:
    df[col_name] = df[col_name].progress_apply(step)

  return df

CPU times: user 762 µs, sys: 0 ns, total: 762 µs
Wall time: 769 µs


In [11]:
# Clean the 'tweet' column. text_preprocessing assigns the cleaned column back
# onto the frame it receives, so the frame is modified in place as well as
# returned.
df = text_preprocessing(df, 'tweet')

  0%|          | 0/4639 [00:00<?, ?it/s]

  0%|          | 0/4639 [00:00<?, ?it/s]

  0%|          | 0/4639 [00:00<?, ?it/s]

  0%|          | 0/4639 [00:00<?, ?it/s]

  0%|          | 0/4639 [00:00<?, ?it/s]

  0%|          | 0/4639 [00:00<?, ?it/s]

In [12]:
# Preview the first rows after text cleaning.
df.head()

Unnamed: 0,ID,tweet,label
0,865,wednesday high pressure remains in charge yet ...,neutral
1,1706,the dragon watching beach green isolated morni...,positive
2,3363,via reddit final fantasy ix mistaken love remi...,positive
3,1346,prayers jesussaves noothername overcomer teamj...,positive
4,3188,user so i have the new single on repeat and yo...,positive


In [13]:
# Keep only the text and label columns, in that order. Selecting by name
# (instead of slicing off the first column by position) makes the cell
# idempotent: re-running it no longer drops one more column each time.
df = df[['tweet', 'label']]

In [14]:
# Encode the string labels as integer class ids.
LABEL_TO_ID = {'positive': 0, 'negative': 1, 'neutral': 2}

# .get(value, value) leaves already-encoded ids untouched, so the cell can be
# re-run safely. The explicit astype('int64') fixes the dtype without relying
# on replace()'s implicit object->int downcast (deprecated in recent pandas),
# and raises immediately on missing values or non-numeric unknown labels
# instead of letting them reach the Dataset.
df = df.assign(
    label=df['label'].map(lambda value: LABEL_TO_ID.get(value, value)).astype('int64')
)

In [15]:
# Preview the first 10 rows after label encoding.
df.head(10)

Unnamed: 0,tweet,label
0,wednesday high pressure remains in charge yet ...,2
1,the dragon watching beach green isolated morni...,0
2,via reddit final fantasy ix mistaken love remi...,0
3,prayers jesussaves noothername overcomer teamj...,0
4,user so i have the new single on repeat and yo...,0
5,with help from user we break down where the fi...,2
6,you heard the lady anger,2
7,dave grohl and kurt cobain smirk while a flust...,0
8,just a casual subzero start to the day even if...,1
9,ana anorexic anorexia bulimia bulimic blades c...,1


In [16]:
# Class balance: number of rows per encoded label.
print(df['label'].value_counts())

label
2    1745
0    1693
1    1201
Name: count, dtype: int64


In [17]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes one text/label row per item.

    Args:
        dataset: pandas DataFrame read positionally -- column 0 holds the
            text and column 1 holds the integer class label.
        tokenizer: Hugging Face-style tokenizer; it is called directly and
            must return a mapping with 'input_ids' and 'attention_mask'
            tensors of shape (1, max_length).
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, dataset, tokenizer, max_length):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenized sample at positional index ``idx``.

        Returns:
            dict with keys 'review_text' (the raw text), 'input_ids' and
            'attention_mask' (1-D tensors of length ``max_length``) and
            'labels' (0-D long tensor).
        """
        review_text = self.dataset.iloc[idx, 0]
        labels = self.dataset.iloc[idx, 1]

        # Call the tokenizer directly: encode_plus is deprecated in favour of
        # __call__, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            review_text,
            add_special_tokens=True,  # Add [CLS] token at the start for classification
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'review_text': review_text,
            # return_tensors='pt' yields a leading batch axis of size 1;
            # flatten it away so the DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long),
        }

In [18]:
# Load the pretrained 'distilbert-base-uncased' tokenizer and wrap the frame
# in a ReviewDataset. 512 is passed as max_length, so every sample is padded
# or truncated to 512 tokens.
# NOTE(review): the first sample shown below uses only 19 real tokens; a
# smaller max_length would likely cut compute substantially -- check the
# longest tokenized tweet before changing it.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
review_dataset = ReviewDataset(df, tokenizer, 512)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [19]:
# Inspect one encoded sample: a dict with review_text, input_ids,
# attention_mask and labels.
review_dataset[0]

{'review_text': 'wednesday high pressure remains in charge yet again staying dry and cloudy in most places gloomy',
 'input_ids': tensor([  101,  9317,  2152,  3778,  3464,  1999,  3715,  2664,  2153,  6595,
          4318,  1998, 24706,  1999,  2087,  3182, 24067,  2100,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
  

In [20]:
# Decode the token ids back to text to see the [CLS]/[SEP] markers and the
# [PAD] tokens that fill the sequence up to max_length.
tokenizer.decode(review_dataset[0]['input_ids'])

'[CLS] wednesday high pressure remains in charge yet again staying dry and cloudy in most places gloomy [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [21]:
from torch.utils.data import DataLoader, random_split

# Split the dataset into a training set (80%) and a held-out test set (20%).
# A seeded generator makes the split -- and therefore the reported test
# accuracy -- reproducible across kernel restarts.
SPLIT_SEED = 42
train_size = int(0.8 * len(review_dataset))
val_size = len(review_dataset) - train_size  # size of the held-out test set
train_dataset, test_dataset = random_split(
    review_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Only the training batches are shuffled; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [22]:
# Number of batches in the training and test loaders.
len(train_loader), len(test_loader)

(232, 58)

In [23]:
class CustomDistilBertForSequenceClassification(nn.Module):
    """DistilBERT encoder followed by a two-layer classification head.

    Args:
        num_labels: Number of output classes (size of the logits' last axis).
        model_name: Checkpoint passed to ``DistilBertModel.from_pretrained``.
        dropout: Dropout probability applied before the final linear layer.
    """

    def __init__(self, num_labels=3, model_name='distilbert-base-uncased', dropout=0.3):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768,
        # so checkpoints with a different hidden size work unchanged.
        hidden_size = self.distilbert.config.dim
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits of shape (batch_size, num_labels)."""
        distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = distilbert_output[0]  # (batch_size, sequence_length, hidden_size)
        pooled_output = hidden_state[:, 0]  # representation of the [CLS] token (first token)
        pooled_output = self.pre_classifier(pooled_output)
        # Functional ReLU: no new module object is built on every forward pass.
        pooled_output = nn.functional.relu(pooled_output)
        pooled_output = self.dropout(pooled_output)  # regularization
        return self.classifier(pooled_output)


In [24]:
# Build the classifier with its default arguments (num_labels=3).
model = CustomDistilBertForSequenceClassification()

In [25]:
# Inspect DistilBERT: print the module tree of the pretrained encoder held in
# model.distilbert.
print(model.distilbert)

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [26]:
# Fine-tune the whole model (encoder + head) on the training loader.
NUM_EPOCHS = 20
LOG_EVERY = 100  # print the running batch loss every LOG_EVERY batches

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# torch.optim.AdamW replaces transformers.AdamW, which is deprecated (and
# removed from recent transformers releases). eps and weight_decay are set to
# the old class's defaults, since torch's own defaults (1e-8, 1e-2) differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Build the loss module once instead of once per batch.
criterion = nn.CrossEntropyLoss()

# NOTE(review): the logged training loss approaches zero within a few epochs
# while the reported test accuracy is ~0.71, which looks like overfitting --
# consider fewer epochs or early stopping on a validation split.
model.train()
for epoch in range(NUM_EPOCHS):
    for i, batch in enumerate(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        if (i + 1) % LOG_EVERY == 0:
            print(f"Epoch {epoch + 1}, Batch {i + 1}, Loss: {loss.item():.4f}")




Epoch 1, Batch 100, Loss: 0.7136
Epoch 1, Batch 200, Loss: 0.7453
Epoch 2, Batch 100, Loss: 0.2949
Epoch 2, Batch 200, Loss: 0.4575
Epoch 3, Batch 100, Loss: 0.0334
Epoch 3, Batch 200, Loss: 0.1435
Epoch 4, Batch 100, Loss: 0.1409
Epoch 4, Batch 200, Loss: 0.1816
Epoch 5, Batch 100, Loss: 0.0307
Epoch 5, Batch 200, Loss: 0.0264
Epoch 6, Batch 100, Loss: 0.0207
Epoch 6, Batch 200, Loss: 0.1245
Epoch 7, Batch 100, Loss: 0.0042
Epoch 7, Batch 200, Loss: 0.3267
Epoch 8, Batch 100, Loss: 0.0125
Epoch 8, Batch 200, Loss: 0.0918
Epoch 9, Batch 100, Loss: 0.0031
Epoch 9, Batch 200, Loss: 0.0166
Epoch 10, Batch 100, Loss: 0.0012
Epoch 10, Batch 200, Loss: 0.0563
Epoch 11, Batch 100, Loss: 0.0463
Epoch 11, Batch 200, Loss: 0.0292
Epoch 12, Batch 100, Loss: 0.0029
Epoch 12, Batch 200, Loss: 0.0431
Epoch 13, Batch 100, Loss: 0.0190
Epoch 13, Batch 200, Loss: 0.0026
Epoch 14, Batch 100, Loss: 0.0850
Epoch 14, Batch 200, Loss: 0.3619
Epoch 15, Batch 100, Loss: 0.1645
Epoch 15, Batch 200, Loss: 0.025

***EVALUATION***

In [27]:
# Measure accuracy on the held-out loader: the share of samples whose
# highest-scoring class equals the true label.
model.eval()
total_correct, total = 0, 0

# No gradients are needed anywhere in this loop, so the whole pass runs under
# inference mode.
with torch.inference_mode():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = logits.argmax(dim=1)

        total_correct += (predictions == labels).sum().item()
        total += predictions.size(0)

print(f'Test Accuracy: {total_correct / total:.4f}')


Test Accuracy: 0.7091
