In [27]:
import pandas as pd
import numpy as np
import contractions
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from transformers import BertTokenizer, BertModel

In [11]:
with open('rt-polaritydata/rt-polarity.neg', 'r', encoding='ISO-8859-1') as f:
    neg_reviews = f.readlines()

with open('rt-polaritydata/rt-polarity.pos', 'r', encoding='ISO-8859-1') as f:
    pos_reviews = f.readlines()

neg_reviews = pd.DataFrame(neg_reviews, columns=['text'])
pos_reviews = pd.DataFrame(pos_reviews, columns=['text'])
neg_reviews['label'] = 0
pos_reviews['label'] = 1
reviews = pd.concat([neg_reviews, pos_reviews], ignore_index=True)

In [15]:
def expand_contractions(text):
    return contractions.fix(text)

def pre_processing(rev):
    rev = expand_contractions(rev)
    return rev 

pos_reviews['preprocessed_review'] = pos_reviews['text'].apply(pre_processing)
neg_reviews['preprocessed_review'] = neg_reviews['text'].apply(pre_processing)

In [16]:
train_pos = pos_reviews.iloc[:4000]
train_neg = neg_reviews.iloc[:4000]
train_data = pd.concat([train_pos['preprocessed_review'], train_neg['preprocessed_review']], ignore_index=True)
train_labels = pd.concat([train_pos['label'], train_neg['label']], ignore_index=True)

val_pos = pos_reviews.iloc[4000:4500]
val_neg = neg_reviews.iloc[4000:4500]
val_data = pd.concat([val_pos['preprocessed_review'], val_neg['preprocessed_review']], ignore_index=True)
val_labels = pd.concat([val_pos['label'], val_neg['label']], ignore_index=True)

test_pos = pos_reviews.iloc[4500:]
test_neg = neg_reviews.iloc[4500:]
test_data = pd.concat([test_pos['preprocessed_review'], test_neg['preprocessed_review']], ignore_index=True)
test_labels = pd.concat([test_pos['label'], test_neg['label']], ignore_index=True)


In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, return_tensors="pt")

train_encodings = tokenize_function(train_data)
val_encodings = tokenize_function(val_data)
test_encodings = tokenize_function(test_data)

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels.values))
val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], torch.tensor(val_labels.values))
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_labels.values))

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [36]:
class BERTClassifier(nn.Module):
    def __init__(self):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 64)
        self.fc2 = nn.Linear(64, 1) 


    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return torch.sigmoid(self.fc2(self.fc1(pooled_output)))

In [37]:
model = BERTClassifier()
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [39]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_loader, desc=f'Training Epoch {epoch + 1}'):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels.unsqueeze(1).float())
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}')
        val_loss, val_f1 = evaluate_model(model, val_loader, criterion)
        print(f'Validation Loss: {val_loss:.4f}, Validation F1 Score: {val_f1:.4f}')

In [40]:
def evaluate_model(model, val_loader, criterion):
    model.eval()
    total_loss = 0
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Evaluating'):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels.unsqueeze(1).float())
            total_loss += loss.item()

            predicted = (outputs >= 0.5).int()
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())

    avg_loss = total_loss / len(val_loader)
    f1 = f1_score(all_labels, all_predictions, average='weighted')
    print("\nClassification Report:\n", classification_report(all_labels, all_predictions, target_names=["Negative", "Positive"]))

    return avg_loss, f1

In [41]:
train_model(model, train_loader, val_loader, criterion, optimizer)

Training Epoch 1: 100%|███████████████████████████████████████████████████████████████| 250/250 [00:40<00:00,  6.14it/s]


Epoch 1, Loss: 0.4190


Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 22.81it/s]



Classification Report:
               precision    recall  f1-score   support

    Negative       0.89      0.87      0.88       500
    Positive       0.88      0.89      0.88       500

    accuracy                           0.88      1000
   macro avg       0.88      0.88      0.88      1000
weighted avg       0.88      0.88      0.88      1000

Validation Loss: 0.2949, Validation F1 Score: 0.8830


Training Epoch 2: 100%|███████████████████████████████████████████████████████████████| 250/250 [00:39<00:00,  6.33it/s]


Epoch 2, Loss: 0.2036


Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 22.98it/s]



Classification Report:
               precision    recall  f1-score   support

    Negative       0.92      0.85      0.89       500
    Positive       0.86      0.93      0.89       500

    accuracy                           0.89      1000
   macro avg       0.89      0.89      0.89      1000
weighted avg       0.89      0.89      0.89      1000

Validation Loss: 0.3041, Validation F1 Score: 0.8899


Training Epoch 3: 100%|███████████████████████████████████████████████████████████████| 250/250 [00:39<00:00,  6.30it/s]


Epoch 3, Loss: 0.0719


Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 22.29it/s]


Classification Report:
               precision    recall  f1-score   support

    Negative       0.90      0.88      0.89       500
    Positive       0.88      0.90      0.89       500

    accuracy                           0.89      1000
   macro avg       0.89      0.89      0.89      1000
weighted avg       0.89      0.89      0.89      1000

Validation Loss: 0.3643, Validation F1 Score: 0.8910





In [42]:
evaluate_model(model,val_loader,criterion)

Evaluating: 100%|███████████████████████████████████████████████████████████████████████| 32/32 [00:01<00:00, 22.07it/s]


Classification Report:
               precision    recall  f1-score   support

    Negative       0.90      0.88      0.89       500
    Positive       0.88      0.90      0.89       500

    accuracy                           0.89      1000
   macro avg       0.89      0.89      0.89      1000
weighted avg       0.89      0.89      0.89      1000






(0.36430347454734147, 0.8909815758863249)