In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

# Paths to positive and negative files
pos_file = '/content/drive/MyDrive/DeepLearning/NLP/TextClassifier/rt-polarity-pos.txt'
neg_file = '/content/drive/MyDrive/DeepLearning/NLP/TextClassifier/rt-polarity-neg.txt'


In [2]:
# Load data from files
with open(pos_file, 'r', encoding='Windows-1252') as f:
    positive_data = f.readlines()
with open(neg_file, 'r', encoding='Windows-1252') as f:
    negative_data = f.readlines()

# Create labels for data
positive_labels = [1] * len(positive_data)
negative_labels = [0] * len(negative_data)


In [3]:
# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:

all_data = positive_data + negative_data
all_labels = positive_labels + negative_labels

train_data, val_data, train_labels, val_labels = train_test_split(
    all_data, all_labels, test_size=0.2, random_state=42)

# Tokenize the data
train_encodings = tokenizer(train_data, truncation=True, padding=True)
val_encodings = tokenizer(val_data, truncation=True, padding=True)

# Convert the tokenized data into torch tensors
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels)
)

val_dataset = torch.utils.data.TensorDataset(
    torch.tensor(val_encodings['input_ids']),
    torch.tensor(val_encodings['attention_mask']),
    torch.tensor(val_labels)
)

# Train the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

for epoch in range(3):
    train_loss = 0.0
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        
        optimizer.step()
        train_loss += loss.item()
        
    train_loss /= len(train_loader)
    
    val_loss = 0.0
    val_acc = 0.0
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = criterion(outputs.logits, labels)
            val_loss += loss.item()
            
            preds = outputs.logits.argmax(dim=1)
            val_acc += torch.sum(preds == labels).item()
    
    val_loss /= len(val_loader)
    val_acc /= len(val_dataset)
    
    print(f'Epoch {epoch+1}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, val_acc={val_acc:.4f}')


Epoch 1: train_loss=0.3976, val_loss=0.3463, val_acc=0.8584
Epoch 2: train_loss=0.2260, val_loss=0.3568, val_acc=0.8514
Epoch 3: train_loss=0.1264, val_loss=0.4202, val_acc=0.8594
