**Importing Data**

In [None]:
import pandas as pd

df = pd.read_csv('/content/final_withaspects.csv')
print(df.head())

   class  id                                               text  text_length  \
0      0   1  thats true freedom speech doomed harassment su...           92   
1      0   2               neener neener time go playground yet           36   
2      0   3  like plastic gun fear armour piercing bullet f...           83   
3      0   4          geology religion werent see rock x formed           41   
4      0   5  well done monty mark first ever honest accurat...           52   

   words_per_sentence  sentiment  polarity  subjectivity  \
0                  13   0.350000  0.350000      0.650000   
1                   6   0.000000  0.000000      0.000000   
2                  13   0.000000  0.000000      0.000000   
3                   7   0.000000  0.000000      0.000000   
4                   9   0.416667  0.416667      0.622222   

                    aspects  
0                        []  
1  ['neener', 'playground']  
2                ['armour']  
3                  ['rock']  
4       

In [None]:
import ast
def convert_aspects(aspect_string):
    if aspect_string == '[]':
        return []
    else:
        return ast.literal_eval(aspect_string)

In [None]:
"""**Calling the Convert Aspect Function**"""

df['aspects'] = df['aspects'].apply(convert_aspects)


In [None]:
"""**Analyze the Classes**"""

class_with_aspects = df[df['aspects'].apply(lambda x: len(x) > 0)]['class'].value_counts()
class_without_aspects = df[df['aspects'].apply(lambda x: len(x) == 0)]['class'].value_counts()

In [None]:
"""**Aspect Summary**"""

print("Classes with aspects:")
print(class_with_aspects)

print("\nClasses without aspects:")
print(class_without_aspects)

Classes with aspects:
class
1    2371
0    1822
Name: count, dtype: int64

Classes without aspects:
class
0    2871
1    2322
Name: count, dtype: int64


In [None]:
"""**Final Checking of Text and Aspects Column, if there are any missing values.**"""

df = df.dropna(subset=['text', 'aspects'])

In [None]:
"""**Convert the Text and Aspects into String Type**"""

df['text'] = df['text'].astype(str)
df['aspects'] = df['aspects'].astype(str)

In [None]:
"""**Initialize the Tokenizer | Tokenization and Input Formatting using BERT Embedding**"""

import torch
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split

In [None]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization and input formatting
def preprocess_data(texts, aspects, tokenizer, max_len=128):
    input_ids, attention_masks = [], []

    for text, aspect in zip(texts, aspects):
        encoded = tokenizer.encode_plus(
            text,
            aspect,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
texts = df['text'].values
aspects = df['aspects'].values
labels = df['class'].values

In [None]:
"""**Split Data**"""

train_texts, val_texts, train_aspects, val_aspects, train_labels, val_labels = train_test_split(texts, aspects, labels, test_size=0.2, random_state=42)

train_input_ids, train_attention_masks = preprocess_data(train_texts, train_aspects, tokenizer)
val_input_ids, val_attention_masks = preprocess_data(val_texts, val_aspects, tokenizer)

train_labels = torch.tensor(train_labels, dtype=torch.float32)
val_labels = torch.tensor(val_labels, dtype=torch.float32)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)


In [None]:
"""**Define the Model**"""

import torch.nn as nn
from transformers import BertModel

class BERT_CNN(nn.Module):
    def __init__(self, bert, num_filters=256, filter_sizes=[2, 3, 4], output_dim=1, dropout=0.5):
        super(BERT_CNN, self).__init__()
        self.bert = bert
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num_filters, kernel_size=(fs, bert.config.hidden_size))
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * num_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()  # Sigmoid for binary classification

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        cls_output = outputs[0]  # BERT's output (sequence of hidden states)
        cls_output = cls_output.unsqueeze(1)  # Add channel dimension

        conved = [torch.relu(conv(cls_output)).squeeze(3) for conv in self.convs]
        pooled = [torch.max(conv, dim=2)[0] for conv in conved]

        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.sigmoid(self.fc(cat))

In [None]:
"""**Loading Model**"""

bert = BertModel.from_pretrained('bert-base-uncased')
model = BERT_CNN(bert, dropout=0.5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT_CNN(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [None]:
"""**Train Setup**"""

import torch.optim as optim

criterion = nn.BCELoss()  # Use BCELoss for binary classification
optimizer = optim.Adam(model.parameters(), lr=2e-5, weight_decay=1e-4)

best_val_loss = float('inf')
patience = 2
trials = 0

def train(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0

    for batch in dataloader:
        b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

        optimizer.zero_grad()

        outputs = model(b_input_ids, b_attention_mask)
        loss = criterion(outputs.view(-1), b_labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [None]:
"""**Evaluate Functions**"""

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            b_input_ids, b_attention_mask, b_labels = tuple(t.to(device) for t in batch)

            outputs = model(b_input_ids, b_attention_mask)
            loss = criterion(outputs.view(-1), b_labels)
            total_loss += loss.item()

            all_preds.append(outputs.cpu().numpy())
            all_labels.append(b_labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    return avg_loss, all_preds, all_labels

In [None]:
"""**Final Train**"""

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
epochs = 5
for epoch in range(epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    val_loss, val_preds, val_labels = evaluate(model, val_dataloader, criterion, device)

    print(f'Epoch {epoch+1}, Train Loss: {train_loss}')
    print(f'Epoch {epoch+1}, Validation Loss: {val_loss}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'bert_cnn_model_best.pth')
        trials = 0
    else:
        trials += 1
        if trials >= patience:
            print(f"Early stopping on epoch {epoch+1}")
            break

Epoch 1, Train Loss: 0.5871123121139851
Epoch 1, Validation Loss: 0.5182585064637459
Epoch 2, Train Loss: 0.4618118166289431
Epoch 2, Validation Loss: 0.5396520364082465
Epoch 3, Train Loss: 0.32257143585605824
Epoch 3, Validation Loss: 0.5462588314282693
Early stopping on epoch 3


In [None]:
"""**Save Final Model**"""

model_save_path = 'bert_cnn_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to bert_cnn_model.pth


In [None]:
"""**Evaluation**"""

def compute_metrics(preds, labels):
    pred_flat = np.round(preds).flatten()
    labels_flat = labels.flatten()
    accuracy = accuracy_score(labels_flat, pred_flat)
    precision = precision_score(labels_flat, pred_flat)
    recall = recall_score(labels_flat, pred_flat)
    f1 = f1_score(labels_flat, pred_flat)
    roc_auc = roc_auc_score(labels_flat, preds)  # AUC for binary classification
    conf_matrix = confusion_matrix(labels_flat, pred_flat)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'conf_matrix': conf_matrix
    }

metrics = compute_metrics(val_preds, val_labels)

print(f"Validation Loss: {val_loss}")
print(f"Accuracy: {metrics['accuracy']}")
print(f"Precision: {metrics['precision']}")
print(f"Recall: {metrics['recall']}")
print(f"F1 Score: {metrics['f1']}")
print(f"ROC AUC: {metrics['roc_auc']}")
print(f"Confusion Matrix:\n {metrics['conf_matrix']}")

Validation Loss: 0.5462588314282693
Accuracy: 0.7630457933972311
Precision: 0.746268656716418
Recall: 0.7978723404255319
F1 Score: 0.7712082262210795
ROC AUC: 0.8426926915574104
Confusion Matrix:
 [[683 255]
 [190 750]]
