## library

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from torch.utils.data import Dataset, random_split, DataLoader
import pandas as pd
import torch
from torch.optim import Adam
import numpy as np
from sklearn.metrics import roc_auc_score

# dataset

In [None]:
# Load the training table once and preview its first five rows.
data = pd.read_csv('train.csv')
data.head(n=5)

In [None]:
# Count the missing (NaN) entries in each column of the training table.
data.isna().sum()

In [None]:
class MyDataset(Dataset):
    """Map-style dataset of (content, Label) pairs backed by a CSV file."""

    def __init__(self, csv_file='train.csv'):
        """Read `csv_file` with pandas and cast its 'Label' column to int."""
        super().__init__()
        frame = pd.read_csv(csv_file)
        frame['Label'] = frame['Label'].astype(int)
        self.data = frame

    def __len__(self):
        """Return the number of rows in the loaded table."""
        return self.data.shape[0]

    def __getitem__(self, index):
        """Return the 'content' and 'Label' values of the row at position `index`."""
        row = self.data.iloc[index]
        return row['content'], row['Label']

In [None]:
# Build the dataset and print its first five (content, Label) samples.
dataset = MyDataset(csv_file='train.csv')
for i in range(5):
    sample = dataset[i]
    print(sample)

# split

In [None]:
# Randomly partition the dataset: 80% for training, 20% for validation.
train_dataset, val_dataset = random_split(dataset, [0.8, 0.2])
(len(train_dataset), len(val_dataset))

# Dataloader

In [None]:
# Load the tokenizer for the 'roberta-base' checkpoint. It is kept at module
# level because the batching and prediction cells below refer to it by name.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
def collate_func(batch, tok=None):
    """Collate (text, label) pairs into one tokenized batch.

    Args:
        batch: Sequence of (text, label) pairs, as produced by
            ``MyDataset.__getitem__``.
        tok: Optional tokenizer callable. When omitted, the module-level
            ``tokenizer`` is used, so ``collate_func(batch)`` keeps working.
            Accepting it also makes ``collate_func(batch, tokenizer)`` valid,
            which is how the DataLoader cells in this file invoke it.

    Returns:
        The tokenizer output (every text padded/truncated to 128 tokens) with
        an extra ``'labels'`` entry holding the labels as a ``torch.long``
        tensor.
    """
    if tok is None:
        tok = tokenizer

    texts = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    inputs = tok(
        texts,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    inputs['labels'] = torch.tensor(labels, dtype=torch.long)
    return inputs

In [None]:
# Pass collate_func itself as collate_fn: the DataLoader calls it with the batch
# only, and collate_func looks up the module-level tokenizer on its own.
# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(
    train_dataset,
    batch_size=128,
    shuffle=True,
    collate_fn=collate_func,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=128,
    shuffle=False,
    collate_fn=collate_func,
)

In [None]:
# Pull a single batch from the training loader to inspect its contents.
next(iter(train_loader))

# model

In [None]:
# Size the classification head by the number of distinct values in 'Label'.
num_labels = len(pd.unique(data['Label']))

config = AutoConfig.from_pretrained('roberta-base')
config.num_labels = num_labels

model = AutoModelForSequenceClassification.from_pretrained('roberta-base', config=config)

# Move the model to the GPU when one is available; otherwise leave it as is.
model = model.cuda() if torch.cuda.is_available() else model

# optimizer

In [None]:
# Adam over all model parameters; train() uses this module-level optimizer.
optimizer = Adam(params=model.parameters(), lr=2e-5)

# evaluate

In [None]:
def evaluate_auc(model, validloader):
    """Compute ROC AUC and accuracy of `model` over `validloader`.

    Args:
        model: Model exposing ``eval()`` whose call ``model(**batch)`` returns
            an object with a ``.logits`` tensor of shape (batch, classes).
        validloader: Iterable of dict-like batches of tensors; each batch must
            contain a ``'labels'`` entry.

    Returns:
        Tuple ``(auc_score, accuracy)``. With two classes the AUC is the plain
        binary ROC AUC; with more it is the macro-averaged one-vs-rest AUC.

    Raises:
        ValueError: Propagated from ``roc_auc_score`` when the score cannot be
            computed for the collected labels.
    """
    model.eval()
    all_labels = []
    all_probabilities = []

    use_cuda = torch.cuda.is_available()

    with torch.inference_mode():
        for batch in validloader:
            if use_cuda:
                batch = {k: v.cuda() for k, v in batch.items()}

            output = model(**batch)
            probabilities = torch.softmax(output.logits, dim=1)

            all_labels.extend(batch['labels'].cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())

    all_labels = np.array(all_labels)
    all_probabilities = np.array(all_probabilities)

    if all_probabilities.ndim == 2 and all_probabilities.shape[1] == 2:
        # Binary case: roc_auc_score expects a 1-D score vector holding the
        # probability of the class with the greater label, not the full
        # (n_samples, 2) matrix.
        auc_score = roc_auc_score(all_labels, all_probabilities[:, 1])
    else:
        # Multiclass case: macro-averaged one-vs-rest AUC.
        auc_score = roc_auc_score(
            all_labels,
            all_probabilities,
            multi_class='ovr',
            average='macro',
        )

    predictions = np.argmax(all_probabilities, axis=1)
    accuracy = (predictions == all_labels).mean()

    return auc_score, accuracy

# train

In [None]:
def train(model, trainloader, validloader, epochs=5, log_step=100):
    """Fine-tune `model` and report validation metrics after every epoch.

    Parameter updates are applied with the module-level ``optimizer``;
    validation AUC and accuracy come from ``evaluate_auc``.

    Args:
        model: Model exposing ``train()`` whose call ``model(**batch)`` returns
            an object with a ``.loss`` tensor.
        trainloader: Sized iterable of dict-like batches of tensors.
        validloader: Validation batches, forwarded to ``evaluate_auc``.
        epochs: Number of passes over `trainloader`.
        log_step: The current batch loss is printed whenever the global step
            count is a multiple of this value.

    Returns:
        The same `model` object after training.
    """
    global_step = 0
    best_auc = 0
    use_cuda = torch.cuda.is_available()

    for ep in range(epochs):
        model.train()
        total_loss = 0

        for batch in trainloader:
            if use_cuda:
                batch = {k: v.cuda() for k, v in batch.items()}

            optimizer.zero_grad()
            output = model(**batch)
            loss = output.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if global_step % log_step == 0:
                print(f'Epoch: {ep+1}/{epochs}, Step: {global_step}, Loss: {loss.item():.4f}')

            global_step += 1

        # Guard the average against an empty loader.
        avg_loss = total_loss / max(len(trainloader), 1)
        # evaluate_auc takes only the model and the validation loader.
        auc_score, accuracy = evaluate_auc(model, validloader)

        print(f'Epoch {ep+1} completed - Avg Loss: {avg_loss:.4f}, AUC: {auc_score:.4f}, Accuracy: {accuracy:.4f}')

        if auc_score > best_auc:
            best_auc = auc_score
            print(f'best auc: {best_auc:.4f}')
            # NOTE: checkpointing is disabled; this is the place to call
            # model.save_pretrained / tokenizer.save_pretrained if needed.

    return model

In [None]:
# train() takes (model, trainloader, validloader, epochs, log_step). It has no
# dataset parameter, so only the two loaders are passed.
trained_model = train(model, train_loader, val_loader, epochs=3)

# predict

In [None]:
def predict(model, tokenizer, texts):
    """Return the predicted class index for each entry of `texts`.

    Each text is tokenized (padded/truncated to 128 tokens) and run through
    `model` on its own; the arg-max over the logits is the prediction.
    """
    model.eval()
    on_gpu = torch.cuda.is_available()

    def classify(text):
        # Tokenize one text and return the index of its highest logit.
        encoded = tokenizer(
            text,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        if on_gpu:
            encoded = {name: tensor.cuda() for name, tensor in encoded.items()}
        logits = model(**encoded).logits
        return torch.argmax(logits, dim=-1).cpu().item()

    with torch.inference_mode():
        return [classify(text) for text in texts]

In [None]:
def predict_test_set(model, tokenizer, test_file='test.csv', batch_size=32):
    """Predict a class index for every 'content' row of the CSV `test_file`.

    Rows are processed in consecutive chunks of `batch_size`; each chunk is
    tokenized (padded/truncated to 128 tokens) and classified by arg-max over
    the model's logits. Returns the predictions as a flat list in row order.
    """
    contents = pd.read_csv(test_file)['content']
    total_rows = len(contents)
    collected = []

    model.eval()
    with torch.inference_mode():
        for start in range(0, total_rows, batch_size):
            chunk = contents.iloc[start:start + batch_size].tolist()

            encoded = tokenizer(
                chunk,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
            )
            if torch.cuda.is_available():
                encoded = {name: tensor.cuda() for name, tensor in encoded.items()}

            logits = model(**encoded).logits
            chunk_predictions = torch.argmax(logits, dim=1)
            collected.extend(chunk_predictions.cpu().numpy())

    return collected

In [None]:
# Classify every row of test.csv with the fine-tuned model.
test_predictions = predict_test_set(trained_model, tokenizer, test_file='test.csv')

# save

In [None]:
test_data = pd.read_csv('test.csv')

# Pair each test row's ID with its predicted label.
submission_columns = {
    'ID': test_data['ID'],
    'Prediction': test_predictions,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv('result-bert.csv', index=False)

# Show how many rows were assigned to each predicted label.
prediction_counts = submission['Prediction'].value_counts().sort_index()
print(prediction_counts)