In [1]:
!pip install libauc

Collecting libauc
  Downloading libauc-1.3.3-py3-none-any.whl.metadata (4.3 kB)
Downloading libauc-1.3.3-py3-none-any.whl (120 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.6/120.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: libauc
Successfully installed libauc-1.3.3


In [2]:
from torch import cuda
import os

# Token length every example is padded/truncated to by the dataset.
MAX_LEN = 512
# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing on the first .to().
DEVICE = "cuda" if cuda.is_available() else "cpu"

In [3]:
# Single source of run settings for this notebook.
config = {
    # Checkpoint name used for both the tokenizer and the classifier.
    "selected_model": "distilroberta-base",
    # When True, the model and tokenizer are written to disk after evaluation.
    "save": True,
    # Leading part of the saved model's file name.
    "model_prefix": "scigen",
    # Directory the tokenizer is saved to.
    "save_tokenizer_path": "tokenizer_v1",
    # Seed handed to set_all_seeds and to the DualSampler.
    "seed": 0,
    # One entry per training CSV.
    "training": [
        {
            "file": "/kaggle/input/deepfake-subset/sci_gen_train.csv",
            "label_col": "label",
            "batch_size": 32,
            "shuffle": True,
            # Intended number of training epochs for this file.
            "epochs": 1,
        },
    ],
    # One entry per evaluation CSV.
    "evaluating": [
        {
            "file": "/kaggle/input/deepfake-subset/sci_gen_test.csv",
            "label_col": "label",
            "batch_size": 1,
            "shuffle": False,
        },
    ],
}

In [4]:
from torch.utils.data import DataLoader
from libauc.losses import MultiLabelAUCMLoss, AUCMLoss, CompositionalAUCLoss
from libauc.optimizers import PESG, PDSCA
from libauc.sampler import DualSampler
import json
import pandas as pd
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from libauc.utils import set_all_seeds
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from torch.utils.data import DataLoader

# Seed the run with the value from `config` before any model or loader is
# created (presumably seeds the Python/NumPy/PyTorch RNGs -- see libauc.utils).
set_all_seeds(config["seed"])

In [5]:
from torch.utils.data import Dataset
import torch

class Triage(Dataset):
    """Dataset that tokenizes the ``text`` column of a dataframe on demand.

    Args:
        dataframe: Frame with a ``text`` column (raw strings) and a ``target``
            column (labels). Its index may be arbitrary; rows are addressed
            by position.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``input_ids`` and ``attention_mask``.
        max_len: Length every example is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Not used by this class; kept so existing attribute access still works.
        self.ids = list()
        self.masks = list()
        # Labels exposed as an attribute (presumably read by the sampler the
        # dataset is handed to elsewhere in the notebook -- confirm before removing).
        self.targets = self.data["target"]

    def __getitem__(self, index):
        """Return the tokenized example at *position* ``index``.

        Returns:
            dict with ``ids`` and ``mask`` (long tensors of length ``max_len``),
            ``targets`` (long scalar tensor) and ``index`` (the position passed in).
        """
        # DataLoader/samplers hand out positions in [0, len), so look rows up
        # positionally; label-based lookup breaks on a frame whose index is not
        # 0..n-1 (e.g. after filtering without reset_index).
        title = str(self.data["text"].iloc[index])
        # Collapse all whitespace runs (newlines, tabs, repeats) to single spaces.
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            max_length=self.max_len,
            add_special_tokens=True,
            padding="max_length",
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data["target"].iloc[index], dtype=torch.long),
            "index": index,
        }

    def __len__(self):
        """Number of rows in the frame at construction time."""
        return self.len

In [6]:
def train(model, epochs, training_loader, optimizer, loss_fn, use_ap=False):
    """Run ``epochs`` passes over ``training_loader`` and update ``model``.

    Args:
        model: Callable as ``model(ids, attention_mask=mask)``; element 0 of
            its output is passed through a sigmoid to get the predictions.
        epochs: Number of passes over the loader.
        training_loader: Yields dict batches with ``ids``, ``mask``,
            ``targets`` and ``index`` entries.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        loss_fn: Called as ``loss_fn(pred, targets)``, or as
            ``loss_fn(pred, targets, indices, task_ids)`` when ``use_ap``.
        use_ap: Use the 4-argument loss call, which needs one-hot targets.

    Returns:
        list[float]: the loss value of every batch, in training order.

    Raises:
        ValueError: if ``use_ap`` is set but the targets are not 2-D, so no
            per-sample task id can be derived from them.
    """
    tr_loss = list()
    model.train()
    for _ in range(epochs):
        for data in tqdm(training_loader):
            ids = data['ids'].to(DEVICE, dtype=torch.long)
            mask = data['mask'].to(DEVICE, dtype=torch.long)
            targets = data['targets'].to(DEVICE, dtype=torch.float)

            optimizer.zero_grad()
            outputs = model(ids, attention_mask=mask)
            pred = torch.sigmoid(outputs[0])
            if not use_ap:
                loss = loss_fn(pred, targets)
            else:
                # The task id is the arg-max over the label axis, which only
                # exists for one-hot (2-D) targets.
                if targets.dim() < 2:
                    raise ValueError(
                        "use_ap=True requires one-hot (2-D) targets to derive task ids"
                    )
                indices = data["index"].to(DEVICE, dtype=torch.long)
                task_ids = torch.argmax(targets, dim=1)
                loss = loss_fn(pred, targets, indices, task_ids)
            tr_loss.append(loss.item())
            loss.backward()
            optimizer.step()
    # Hand the collected losses back instead of discarding them.
    return tr_loss

def encode_label(label, flip=False, no_encode=True):
    """Optionally turn a binary label into a two-element one-hot list.

    With ``no_encode`` (the default) the label is returned untouched.
    Otherwise ``0`` maps to ``[1, 0]`` and anything else to ``[0, 1]``
    (positions are ``[real, fake]``); ``flip`` reverses the pair.
    """
    if no_encode:
        return label
    one_hot = [1, 0] if label == 0 else [0, 1]
    return one_hot[::-1] if flip else one_hot

def get_dataloader(file_config, tokenizer, model_type,sampler=False):
    """Build a DataLoader over the CSV described by ``file_config``.

    Args:
        file_config: Mapping with ``file``, ``label_col``, ``batch_size`` and
            (when ``sampler`` is false) ``shuffle``.
        tokenizer: Tokenizer handed to the ``Triage`` dataset.
        model_type: ``'all'`` keeps every row; any other value keeps only rows
            whose ``model`` column is ``"human"`` or that value.
        sampler: When true, batches are drawn through a ``DualSampler``
            instead of the loader's own shuffling.

    Returns:
        ``(loader, number_of_examples)``.
    """
    frame = pd.read_csv(file_config["file"])
    if model_type != 'all':
        wanted = frame["model"].isin(["human", model_type])
        # reset_index() keeps the old row labels as an extra "index" column.
        frame = pd.DataFrame(frame[wanted]).reset_index()
    frame["target"] = frame[file_config["label_col"]].apply(encode_label)

    dataset = Triage(frame, tokenizer, MAX_LEN)
    batch_size = file_config["batch_size"]
    if sampler:
        dual_sampler = DualSampler(dataset, batch_size=batch_size, sampling_rate=0.5, random_seed=config['seed'])
        loader = DataLoader(dataset, batch_size=batch_size, sampler=dual_sampler)
    else:
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=file_config["shuffle"])
    return loader, len(dataset)

def evaluate(model, testing_loader):
    """Score every batch of ``testing_loader`` without tracking gradients.

    Returns:
        ``(true_labels, predictions)`` as NumPy arrays concatenated over all
        batches; predictions are the sigmoid of the model's ``logits``.
    """
    model.eval()
    batch_scores = []
    batch_labels = []
    with torch.no_grad():
        for batch in tqdm(testing_loader):
            input_ids = batch['ids'].to(DEVICE, dtype=torch.long)
            attention = batch['mask'].to(DEVICE, dtype=torch.long)
            labels = batch['targets'].to(DEVICE, dtype=torch.long)
            # Squash the raw logits into (0, 1) scores.
            scores = torch.sigmoid(model(input_ids, attention_mask=attention).logits)
            batch_scores.append(scores.cpu().detach().numpy())
            batch_labels.append(labels.cpu().numpy())
    predictions = np.concatenate(batch_scores)
    true_labels = np.concatenate(batch_labels)
    return true_labels, predictions


def get_auc_score(model, loader):
    """Evaluate ``model`` on ``loader`` and return the ROC AUC of its scores."""
    y_true, y_prob = evaluate(model, loader)
    return roc_auc_score(y_true, y_prob)

In [7]:
# "all" trains one detector on every generator in the file. To train one
# detector per generator instead, list the unique values of the training
# CSV's "model" column here.
model_types = ["all"]
for model_type in model_types:
    # "human" is the negative class, not a generator to train against.
    if model_type == "human":
        continue
    selected_model = config["selected_model"]
    tokenizer = AutoTokenizer.from_pretrained(selected_model)
    # Single output unit: the sigmoid of the logit is the score.
    model = AutoModelForSequenceClassification.from_pretrained(selected_model,num_labels=1)
    model = model.to(DEVICE)
    save = config["save"]
    loss_fn = CompositionalAUCLoss(version="v2")
    optimizer = PDSCA(params=model.parameters(), loss_fn=loss_fn, lr=0.02)

    # training
    training_files = config["training"]
    print("Starting Training on", model_type)
    for file_config in training_files:
        training_loader, _ = get_dataloader(file_config, tokenizer,model_type, True)
        # Honour the per-file epoch count from the config (defaults to one pass).
        train(model, file_config.get("epochs", 1), training_loader, optimizer, loss_fn)

    # evaluation
    for file_config in config["evaluating"]:
        eval_loader, _ = get_dataloader(file_config, tokenizer, model_type)
        print(file_config["file"],"AUC Score:", get_auc_score(model, eval_loader))

    if save:
        output_model_file = f"{config['model_prefix']}_distilroberta_{model_type}.pt"
        # NOTE(review): this pickles the whole module object, so loading it
        # later needs the same class definitions importable; saving only the
        # state_dict would be more portable but changes the file format.
        torch.save(model, output_model_file)
        tokenizer.save_pretrained(config["save_tokenizer_path"])

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting Training on all


100%|██████████| 1168/1168 [27:51<00:00,  1.43s/it]
100%|██████████| 4789/4789 [01:23<00:00, 57.22it/s]


/kaggle/input/deepfake-subset/sci_gen_test.csv AUC Score: 0.9734442515523264
