In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import random
import numpy as np














In [None]:
# Make runs repeatable by seeding every random number generator in use
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    The CUDA generators are seeded as well, but only when
    `torch.cuda.is_available()` reports a usable device.

    Args:
        seed: Value handed to every generator. Defaults to 42.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)


set_seed()

# Load data
df = pd.read_csv("preprocessed_df")

# Rows lacking text or category are unusable: the tokenizer needs strings, and
# pd.factorize encodes a missing category as -1, which is not a valid class
# index for the classification loss.
n_rows_loaded = len(df)
df = df.dropna(subset=['processed_log', 'category']).reset_index(drop=True)
n_rows_dropped = n_rows_loaded - len(df)
if n_rows_dropped:
    print(f"Dropped {n_rows_dropped} rows with missing 'processed_log' or 'category'")

# Guard against a text column that pandas parsed as a non-string dtype.
df['processed_log'] = df['processed_log'].astype(str)

# Encode labels: `labels` holds one integer code per row, `uniques` the
# distinct categories in order of first appearance.
labels, uniques = pd.factorize(df['category'])
df['label'] = labels

In [None]:
# Hold out 10% of the rows for validation; the fixed random_state keeps the
# split identical between runs.
all_texts = df['processed_log'].tolist()
all_labels = df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.1,
    random_state=42,
)

# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
class LogDataset(Dataset):
    """Map-style dataset that tokenizes every text once at construction time.

    Indexing returns a dict holding one tensor per tokenizer output field plus
    a 'labels' tensor for the same sample.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        """Tokenize `texts` and keep `labels` alongside the encodings.

        Args:
            texts: Sequence of input strings.
            labels: Sequence of labels, one per entry in `texts`.
            tokenizer: Callable invoked as
                `tokenizer(texts, truncation=True, padding=True, max_length=max_len)`;
                its result must expose `.items()` yielding per-field sequences
                indexable by sample position.
            max_len: Value forwarded to the tokenizer as `max_length`.

        Raises:
            ValueError: If `texts` and `labels` differ in length.
        """
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = labels

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors for sample `idx` as a dict, including 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


In [None]:
train_dataset = LogDataset(train_texts, train_labels, tokenizer)
val_dataset = LogDataset(val_texts, val_labels, tokenizer)

# Only the training batches are shuffled; validation order is irrelevant.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Store the category names in the model config so the checkpoint written by
# save_pretrained can translate predicted class indices back to categories.
id2label = {idx: str(name) for idx, name in enumerate(uniques)}
label2id = {name: idx for idx, name in id2label.items()}

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(uniques),
    id2label=id2label,
    label2id=label2id,
)
# Assigning the result keeps the cell from echoing the full module repr.
model = model.to(device)

In [None]:
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to that class's defaults (1e-6 and 0.0) because
# PyTorch's own defaults differ.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# One scheduler step per batch; the factor 3 must equal the number of epochs
# run by the training loop.
num_training_steps = len(train_loader) * 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Gradient scaling for mixed precision is only set up when CUDA is available.
scaler = torch.cuda.amp.GradScaler() if torch.cuda.is_available() else None

In [None]:
def train_epoch(model, data_loader, optimizer, scheduler, scaler=None):
    """Run one training pass over `data_loader` and return the mean batch loss.

    Batches are moved to the module-level `device` before the forward pass.

    Args:
        model: Model called as `model(input_ids, attention_mask=..., labels=...)`
            whose output exposes `.loss`.
        data_loader: Iterable of dicts with 'input_ids', 'attention_mask' and
            'labels' tensors.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler, advanced once per applied
            optimizer step.
        scaler: Optional gradient scaler. When truthy, the forward pass runs
            under `torch.cuda.amp.autocast()` and the scaler drives the
            backward pass and the optimizer step.

    Returns:
        float: Mean of the per-batch losses, or NaN when `data_loader` yields
        no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    loop = tqdm(data_loader, desc="Training", leave=False)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer_stepped = True
        if scaler:
            with torch.cuda.amp.autocast():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            scaler.scale(loss).backward()
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            # The scaler lowers its scale only after it found inf/NaN gradients
            # and skipped optimizer.step(); the schedule must not advance then.
            optimizer_stepped = scaler.get_scale() >= scale_before
        else:
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        if optimizer_stepped:
            scheduler.step()

        # Read the loss once so the value is fetched a single time per batch.
        batch_loss = loss.item()
        total_loss += batch_loss
        num_batches += 1
        loop.set_postfix(loss=batch_loss)

    if num_batches == 0:
        return float('nan')
    return total_loss / num_batches

In [None]:
def eval_model(model, data_loader):
    """Return the classification accuracy of `model` over `data_loader`.

    Batches are moved to the module-level `device`. The model is switched to
    eval mode and gradients are disabled for the whole pass.

    Args:
        model: Model called as `model(input_ids, attention_mask=...)` whose
            output exposes `.logits`; the prediction is the argmax over dim 1.
        data_loader: Iterable of dicts with 'input_ids', 'attention_mask' and
            'labels' tensors.

    Returns:
        float: Fraction of samples whose predicted class equals the label, or
        0.0 when `data_loader` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    loop = tqdm(data_loader, desc="Evaluating", leave=False)
    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    # An empty loader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return correct / total


In [None]:
# Three passes over the training data, each followed by a validation check.
for epoch in range(3):
    train_loss = train_epoch(
        model,
        train_loader,
        optimizer,
        scheduler,
        scaler,
    )
    val_acc = eval_model(model, val_loader)
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.4f} | Val Acc: {val_acc:.4f}")

# Write the fine-tuned weights first, then the tokenizer files, into the same
# directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./fine_tuned_bert_logs')