In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

# Load the JSON data
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's locale default (which open() falls back to when none is given).
with open('training/data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract texts and labels
# Each item is indexed by the 'text' and 'label' keys; the two lists stay
# aligned position-by-position because they come from the same iteration order.
texts = [item['text'] for item in data]
labels = [item['label'] for item in data]

# Split the data
# 10% of the examples are held out for validation; the fixed random_state
# makes the split identical on every run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.1,
    random_state=42,
)

# Define custom dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts`` by position.
        tokenizer: Object exposing ``encode_plus``; it is called once per item.
        max_len: Value forwarded as ``max_length``; sequences are padded to it
            (``padding='max_length'``) and truncated to it (``truncation=True``).
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the example at ``idx``.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the
            tokenizer's tensors flattened to 1-D) and ``'labels'`` (a
            ``torch.long`` tensor built from the label).
        """
        text, label = self.texts[idx], self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **tokenizer_options)

        # The tokenizer is asked for PyTorch tensors; flatten each one so a
        # single example is 1-D before the DataLoader stacks examples.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

In [2]:
# Tunable settings for this cell, gathered in one place.
model_dir = 'training/model'   # tokenizer and model weights are both loaded from here
max_len = 60                   # per-example token length passed to CustomDataset
BATCH_SIZE = 500               # used for both the training and validation loaders
LEARNING_RATE = 2e-5

tokenizer = BertTokenizer.from_pretrained(model_dir)

# Create datasets
train_dataset = CustomDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=max_len
)
val_dataset = CustomDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=max_len
)

# Create dataloaders
# Only the training loader shuffles; validation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Initialize model and optimizer
# num_labels=2 configures a two-class classification head.
model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=2)
# NOTE(review): transformers.AdamW is reportedly deprecated in favour of
# torch.optim.AdamW, whose defaults may differ -- confirm before switching.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)




In [7]:
# Define evaluation function
import os

def evaluate(model, dataloader):
    """Compute classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: Callable module accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning an object with a ``logits``
            attribute. It is switched to eval mode and left in that mode.
        dataloader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.

    Returns:
        float: fraction of examples whose highest-scoring class (along
        dim 1 of the logits) equals the label. Returns ``0.0`` when the
        dataloader yields no examples, instead of failing on a division
        by zero.
    """
    model.eval()

    # Batches are moved to wherever the model's parameters live so the
    # function works unchanged whether the model sits on CPU or an accelerator.
    # A model without parameters leaves the batches where they are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds = outputs.logits.argmax(dim=1)

            # .item() keeps the running counters as plain Python ints.
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)

    if total_predictions == 0:
        return 0.0
    return correct_predictions / total_predictions

# Checkpoint output location: save_checkpoint writes one file per epoch here.
checkpoint_dir = 'training/checkpoints'
# exist_ok=True keeps this cell re-runnable when the directory already exists.
os.makedirs(checkpoint_dir, exist_ok=True)

# Function to save a checkpoint
def save_checkpoint(model, optimizer, epoch, loss, checkpoint_dir):
    """Write a training checkpoint for ``epoch`` into ``checkpoint_dir``.

    The file is named ``checkpoint_epoch_{epoch}.pth`` and holds a dict with
    the keys ``'epoch'``, ``'model_state_dict'``, ``'optimizer_state_dict'``
    and ``'loss'``.

    Args:
        model: Object whose ``state_dict()`` is stored.
        optimizer: Object whose ``state_dict()`` is stored.
        epoch: Epoch identifier; stored and used in the file name.
        loss: Loss value stored alongside the weights.
        checkpoint_dir: Directory to write into; created if it is missing.
    """
    # Create the target directory on demand so callers need not prepare it.
    os.makedirs(checkpoint_dir, exist_ok=True)

    checkpoint_path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch}.pth')
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }
    torch.save(checkpoint, checkpoint_path)
    print(f'Checkpoint saved at {checkpoint_path}')

# Training loop with checkpointing
NUM_EPOCHS = 50  # number of full passes over train_dataloader

# Batches are moved to the device holding the model's parameters, so the loop
# runs unchanged wherever the model has been placed.
device = next(model.parameters()).device

for epoch in range(NUM_EPOCHS):
    # evaluate() switches the model to eval mode and does not switch it back,
    # so training mode has to be re-enabled at the start of every epoch.
    model.train()

    running_loss = 0.0
    num_batches = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_labels so the loop does not overwrite the notebook-level
        # `labels` list that was built when the data was loaded.
        batch_labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss alongside its outputs.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        num_batches += 1
        print(f'Epoch: {epoch}, Loss: {batch_loss}')

    # Evaluate the model
    val_accuracy = evaluate(model, val_dataloader)
    print(f'Epoch: {epoch}, Validation Accuracy: {val_accuracy}')

    # Save a checkpoint at the end of each epoch.
    # The stored loss is the mean over the epoch's batches rather than the
    # last batch's value; NaN marks an epoch that produced no batches (the
    # previous code raised NameError on an undefined `loss` in that case).
    epoch_loss = running_loss / num_batches if num_batches else float('nan')
    save_checkpoint(model, optimizer, epoch, epoch_loss, checkpoint_dir)

Epoch: 0, Loss: 0.08587588369846344
Epoch: 0, Loss: 0.08258695900440216
Epoch: 0, Loss: 0.10346461832523346
Epoch: 0, Loss: 0.09246617555618286
Epoch: 0, Loss: 0.1039210706949234
Epoch: 0, Loss: 0.09753921627998352
Epoch: 0, Loss: 0.0888405591249466
Epoch: 0, Loss: 0.07822679728269577
Epoch: 0, Loss: 0.0903550311923027


In [6]:
# Save the fine-tuned model and tokenizer
# Both are written to one directory, mirroring how they were loaded from a
# single model_dir. The tokenizer call stays last so its return value (the
# written file paths) remains the cell's displayed output.
trained_model_dir = 'training/trained_model'
model.save_pretrained(trained_model_dir)
tokenizer.save_pretrained(trained_model_dir)

('training/trained_model/tokenizer_config.json',
 'training/trained_model/special_tokens_map.json',
 'training/trained_model/vocab.txt',
 'training/trained_model/added_tokens.json')