In [24]:
import pandas as pd

# Load the tab-separated training set.
train = pd.read_csv('train.tsv', sep='\t')

# Build the single text field used for classification: date, subject, title and
# text joined by single spaces. Missing values are replaced by '' and every part
# is cast to str first; otherwise one NaN in any of the four columns would turn
# the whole concatenation into NaN for that row, and a non-string column would
# make the `+ " "` raise.
text_parts = train[['date', 'subject', 'title', 'text']].fillna('').astype(str)
train["classification_text"] = (
    text_parts['date'] + " " + text_parts['subject'] + " "
    + text_parts['title'] + " " + text_parts['text']
)


In [25]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification

In [26]:
# Display the DataFrame's column labels (the derived 'classification_text'
# column should be listed alongside the columns read from the file).
train.columns

Index(['Unnamed: 0', 'title', 'text', 'subject', 'date', 'label',
       'classification_text'],
      dtype='object')

In [27]:

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
features = train['classification_text'].values
targets = train['label'].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    features, targets, test_size=0.2, random_state=42
)

In [28]:
# Shapes of the four split arrays: train texts, val texts, train labels, val labels.
tuple(split.shape for split in (train_texts, val_texts, train_labels, val_labels))

((24000,), (6000,), (24000,), (6000,))

In [38]:
class TextDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of raw texts; each item is passed through
            ``str()`` before it is tokenized.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` whose
            result provides ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``
            (used together with ``padding='max_length'`` and
            ``truncation=True``).
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` flattened to 1-D
            tensors, plus ``'labels'`` as a ``torch.long`` tensor.
        """
        sample_text = str(self.texts[idx])
        sample_label = self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(sample_text, **tokenizer_options)

        # return_tensors='pt' is requested above and the result is flattened
        # here so that each sample is handed back as a 1-D tensor.
        item = {
            field: torch.flatten(encoded[field])
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item
# Tokenizer loaded from the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap both splits so that samples are tokenized when they are indexed.
train_dataset = TextDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = TextDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)

In [43]:
# Show the first training text next to the token ids produced for it.
first_text = train_texts[0]
print("Original text:", first_text)
first_encoded = train_dataset[0]
print("\nTokenized input ids:", first_encoded['input_ids'])

Original text: September 26, 2017  worldnews Trump, Spanish PM Rajoy say they oppose Catalonia independence bid U.S. President Donald Trump and Spain s Prime Minister Mariano Rajoy told journalists on Tuesday they opposed a referendum in Spain s Catalonia region scheduled for this weekend, which the Spanish government has tried to thwart.  I m just for a united Spain,  said Trump, who cast doubt on polling data predicting a  yes  vote for independence will win.  I really think the people of Catalonia would stay with Spain. I think it would be foolish not to.

Tokenized input ids: tensor([  101,  2244,  2656,  1010,  2418,  2088,  2638,  9333,  8398,  1010,
         3009,  7610, 11948,  6977,  2360,  2027, 15391, 16711,  4336,  7226,
         1057,  1012,  1055,  1012,  2343,  6221,  8398,  1998,  3577,  1055,
         3539,  2704, 22695, 11948,  6977,  2409,  8845,  2006,  9857,  2027,
         4941,  1037,  9782,  1999,  3577,  1055, 16711,  2555,  5115,  2005,
         2023,  5353,  

In [47]:

# One output class per distinct value found in the label column.
label_values = train['label'].unique()
num_labels = len(label_values)

# Start from the 'bert-base-uncased' checkpoint with a classification head
# sized to num_labels.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [45]:
# Use CUDA when PyTorch reports it as available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# The optimizer is built from the parameters of whatever `model` is right now,
# so this cell has to be re-run whenever `model` is re-created.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

# Number of passes the training loop makes over the training data.
epochs = 3

In [1]:
def train_model():
    """Run one training pass over ``train_loader`` and return the mean batch loss.

    Operates on the notebook-level ``model``, ``optimizer``, ``train_loader``
    and ``device``. For every batch the gradients are reset, the loss is
    computed by the model from the batch labels, back-propagated, and one
    optimizer step is taken. A running average of the loss is printed on a
    single, carriage-return-refreshed line.

    Returns:
        float: Sum of the per-batch losses divided by ``len(train_loader)``.
    """
    model.train()
    total_loss = 0

    # enumerate() supplies the batch counter used for the running average
    # below; without it `batch_idx` is undefined and the first batch raises
    # NameError.
    for batch_idx, batch in enumerate(tqdm(train_loader, desc='Training')):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing `labels` makes the model return the loss alongside its outputs.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        # Print running average loss over the batches processed so far
        avg_loss = total_loss / (batch_idx + 1)
        print(f'\rRunning average loss: {avg_loss:.4f}', end='')

    return total_loss / len(train_loader)

def evaluate_model():
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    Puts the notebook-level ``model`` in eval mode and runs every validation
    batch under ``torch.no_grad()``; no optimizer step is taken.

    Returns:
        float: Sum of the per-batch losses divided by ``len(val_loader)``.
    """
    model.eval()
    loss_sum = 0
    tensor_keys = ('input_ids', 'attention_mask', 'labels')

    with torch.no_grad():
        for batch in val_loader:
            # Move the three tensors the model consumes onto the target device.
            model_inputs = {key: batch[key].to(device) for key in tensor_keys}
            loss_sum += model(**model_inputs).loss.item()

    return loss_sum / len(val_loader)

# Per-epoch loss history, consumed by the plot below
train_losses = []
val_losses = []

# Training loop: one training pass followed by one validation pass per epoch
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_loss = train_model()
    val_loss = evaluate_model()

    train_losses.append(train_loss)
    val_losses.append(val_loss)

    print(f'Average training loss: {train_loss:.4f}')
    print(f'Average validation loss: {val_loss:.4f}')

# Plot training and validation loss curves.
# `plt` is matplotlib.pyplot from the notebook's import cell; if it is unbound,
# this section raises NameError only after every epoch has already been trained.
epoch_axis = range(1, epochs + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(epoch_axis, train_losses, label='Training Loss')
ax.plot(epoch_axis, val_losses, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss Over Time')
ax.legend()
ax.grid(True)
plt.show()

NameError: name 'epochs' is not defined

In [None]:
def train_model():
    """Run one training pass over ``train_loader``, tracking loss and accuracy.

    Operates on the notebook-level ``model``, ``optimizer``, ``train_loader``
    and ``device``. Accuracy is counted from the argmax of the logits the
    model produced for each batch before that batch's optimizer step. A
    running loss/accuracy line is printed and refreshed with a carriage
    return.

    Returns:
        tuple: ``(mean_loss, accuracy)`` where ``mean_loss`` is the sum of the
        per-batch losses divided by ``len(train_loader)`` and ``accuracy`` is
        correct predictions divided by samples seen.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0, 0, 0
    tensor_keys = ('input_ids', 'attention_mask', 'labels')

    # `step` counts batches from 1, i.e. the number of batches processed so far.
    for step, batch in enumerate(tqdm(train_loader, desc='Training'), start=1):
        optimizer.zero_grad()

        model_inputs = {key: batch[key].to(device) for key in tensor_keys}
        outputs = model(**model_inputs)

        loss = outputs.loss
        loss_sum += loss.item()

        # Tally correct predictions for this batch
        targets = model_inputs['labels']
        predicted = outputs.logits.argmax(dim=1)
        n_correct += (predicted == targets).sum().item()
        n_seen += targets.size(0)

        loss.backward()
        optimizer.step()

        # Refresh the running loss/accuracy line
        avg_loss = loss_sum / step
        accuracy = n_correct / n_seen
        print(f'\rRunning average loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}', end='')

    return loss_sum / len(train_loader), n_correct / n_seen

def evaluate_model():
    """Compute mean batch loss and accuracy of ``model`` over ``val_loader``.

    Puts the notebook-level ``model`` in eval mode and runs every validation
    batch under ``torch.no_grad()``; no optimizer step is taken.

    Returns:
        tuple: ``(mean_loss, accuracy)`` where ``mean_loss`` is the sum of the
        per-batch losses divided by ``len(val_loader)`` and ``accuracy`` is
        correct predictions divided by samples seen.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0, 0, 0
    tensor_keys = ('input_ids', 'attention_mask', 'labels')

    with torch.no_grad():
        for batch in val_loader:
            model_inputs = {key: batch[key].to(device) for key in tensor_keys}
            outputs = model(**model_inputs)

            loss_sum += outputs.loss.item()

            # Tally correct predictions for this batch
            targets = model_inputs['labels']
            predicted = outputs.logits.argmax(dim=1)
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.size(0)

    return loss_sum / len(val_loader), n_correct / n_seen

# Per-epoch loss and accuracy history, consumed by the plots below
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

# Training loop: one training pass followed by one validation pass per epoch
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_loss, train_acc = train_model()
    val_loss, val_acc = evaluate_model()

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print(f'Average training loss: {train_loss:.4f}, Training accuracy: {train_acc:.4f}')
    print(f'Average validation loss: {val_loss:.4f}, Validation accuracy: {val_acc:.4f}')

# Plot loss (left) and accuracy (right) curves side by side.
# `plt` is matplotlib.pyplot from the notebook's import cell; if it is unbound,
# this section raises NameError only after every epoch has already been trained.
epoch_axis = range(1, epochs + 1)
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

# Loss subplot
loss_ax.plot(epoch_axis, train_losses, label='Training Loss')
loss_ax.plot(epoch_axis, val_losses, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training and Validation Loss Over Time')
loss_ax.legend()
loss_ax.grid(True)

# Accuracy subplot
acc_ax.plot(epoch_axis, train_accuracies, label='Training Accuracy')
acc_ax.plot(epoch_axis, val_accuracies, label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Training and Validation Accuracy Over Time')
acc_ax.legend()
acc_ax.grid(True)

fig.tight_layout()
plt.show()