In [1]:
import json

import matplotlib.pyplot as plt
import pandas as pd
import torch 
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    RobertaForSequenceClassification,
)

In [None]:
def SubmitGenerator(prediction, sampleFile, filename="prediction.csv", public=True):
    """Write a submission CSV whose rows line up with a sample submission file.

    The ``order_id`` column is copied from ``sampleFile``. The six label columns
    are filled from the columns 0..5 of ``prediction`` and zero-padded so that
    they have as many rows as the sample file.

    Args:
        prediction: 2-D array-like supporting ``.shape`` and ``[:, i]`` indexing,
            with at least six columns ordered as BACKGROUND, OBJECTIVES,
            METHODS, RESULTS, CONCLUSIONS, OTHERS.
        sampleFile: path of the sample submission CSV; it must contain an
            ``order_id`` column.
        filename: path of the CSV file to write.
        public: when True the zero padding is appended after the prediction
            rows; when False it is placed before them.
    """
    label_columns = (
        "BACKGROUND",
        "OBJECTIVES",
        "METHODS",
        "RESULTS",
        "CONCLUSIONS",
        "OTHERS",
    )

    sample = pd.read_csv(sampleFile)
    # Rows present in the sample file but not covered by `prediction`.
    padding = [0] * (len(sample) - prediction.shape[0])

    submit = {"order_id": list(sample.order_id.values)}
    for column_index, label in enumerate(label_columns):
        scores = list(prediction[:, column_index])
        submit[label] = scores + padding if public else padding + scores

    pd.DataFrame.from_dict(submit).to_csv(filename, index=False)

In [None]:
def correct_all_zero_problems(origin_predict):
    """Binarise multi-label scores so that every row has at least one positive.

    Each score is thresholded at 0.5. A row in which no score exceeds 0.5 (a
    "zero problem") instead gets a 1 at every position equal to that row's
    maximum, so ties for the best score all become 1. The number of rows fixed
    this way is printed.

    Args:
        origin_predict: 2-D ``torch.Tensor`` of scores, one row per sample and
            one column per label. A tensor with zero rows is accepted.

    Returns:
        ``numpy.ndarray`` of dtype int32 holding 0/1 labels, with the same
        shape as ``origin_predict``.

    Raises:
        ValueError: if ``origin_predict`` is not 2-D.
    """
    if origin_predict.dim() != 2:
        raise ValueError(
            "origin_predict must be 2-D (samples x labels), got shape {}".format(
                tuple(origin_predict.shape)
            )
        )

    scores = origin_predict.detach()
    above_threshold = scores > 0.5

    # Rows where nothing passed the 0.5 threshold; kept as a column vector so
    # it broadcasts against the per-label masks below.
    zero_rows = ~above_threshold.any(dim=1, keepdim=True)

    # Promote the best score(s) of each zero row to 1.
    row_best = scores.max(dim=1, keepdim=True).values
    corrected = above_threshold | (zero_rows & (scores >= row_best))

    count_of_zps = int(zero_rows.sum())
    print("{} zero-problems is corrected".format(count_of_zps))
    return corrected.to(torch.int32).cpu().numpy()

In [None]:
class TestDataset(Dataset):
    """Dataset over a sequence of pre-tokenised test features.

    Every feature is a mapping with the keys ``abstract_id``, ``seq_id`` and
    ``net_inputs``; ``net_inputs`` in turn holds ``input_ids``,
    ``attention_mask`` and ``token_type_ids``.
    """

    _TENSOR_KEYS = ("input_ids", "attention_mask", "token_type_ids")

    def __init__(self, features):
        """Keep a reference to ``features`` (an indexable, sized collection)."""
        self.features = features

    def __len__(self):
        """Return the number of stored features."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return ``(abstract_id, seq_id, input_ids, attention_mask, token_type_ids)``.

        The two ids are passed through unchanged; the three network inputs are
        converted to tensors with ``torch.tensor``.
        """
        record = self.features[idx]
        abstract_id = record["abstract_id"]
        seq_id = record["seq_id"]
        net_inputs = record["net_inputs"]
        tensors = tuple(torch.tensor(net_inputs[key]) for key in self._TENSOR_KEYS)
        return (abstract_id, seq_id) + tensors

In [None]:
def predit(model, dataloader, use_cuda=True, use_fp16=True):
    """Run ``model`` over ``dataloader`` and return sigmoid scores on the CPU.

    Args:
        model: callable module accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning a sequence whose first element is
            the logits tensor. It must also provide ``half()``, ``cuda()`` and
            ``eval()``.
        dataloader: iterable of batches. Items 2 and 3 of every batch are the
            ``input_ids`` and ``attention_mask`` tensors; items 0 and 1 are
            ignored.
        use_cuda: move the model and the inputs to CUDA when True; otherwise
            the inputs are placed on the CPU and the model is left where it is.
        use_fp16: convert the model to half precision when True.

    Returns:
        ``torch.Tensor`` on the CPU: the per-batch ``sigmoid(logits)`` results
        concatenated along dimension 0, in iteration order.
    """
    if use_fp16:
        model.half()
    if use_cuda:
        model.cuda()
    # Inputs follow the same device choice as the model instead of always
    # being sent to CUDA.
    device = "cuda" if use_cuda else "cpu"

    model.eval()
    prediction = []
    with torch.no_grad():
        # Iterate the loader that was passed in, not a notebook-level global.
        for batch in tqdm(dataloader, desc="predict:"):
            # Only these two inputs are forwarded to the model, so the
            # remaining batch items are not transferred to the device.
            input_ids, attention_mask = (t.to(device) for t in batch[2:4])
            logits = model(input_ids=input_ids, attention_mask=attention_mask)[0]
            prediction.append(torch.sigmoid(logits).to("cpu"))
    return torch.cat(prediction).detach()

In [None]:
# Load the pre-tokenised private-test features and serve them in their stored
# order. NOTE(review): torch.load unpickles the file, so only load feature
# files from a trusted source.
test_features = torch.load("data_bin/private-test_roberta-large_v2.pt")
test_dataset = TestDataset(test_features)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=2048,  # 512 * 4
    shuffle=False,    # keep row order so predictions stay aligned with the inputs
    num_workers=16,
)

In [None]:
# Checkpoint directories whose predictions are combined into the ensemble.
# The blending step below reads exactly the first two entries' outputs.
ensemble_checkpoints = [
    "checkpoint/roberta-large-best",
    "checkpoint/roberta-large-kaverage",
]

In [None]:
# Run every checkpoint over the test set, keeping one score tensor per checkpoint
# in the same order as `ensemble_checkpoints`.
# NOTE: despite its name, `total_logits` holds sigmoid outputs, because `predit`
# applies torch.sigmoid before returning. `predit` is called with its default
# use_cuda=True here.
total_logits = []
for path in ensemble_checkpoints:
    model = RobertaForSequenceClassification.from_pretrained(path)
    model = nn.DataParallel(model)
    soft_logits = predit(model, test_dataloader, use_fp16=False)
    total_logits.append(soft_logits)

In [None]:
# Weighted blend of the first two checkpoints' scores: `alpha` weights entry 0 and
# (1 - alpha) weights entry 1, so alpha = 0.5 is a plain average. Any further
# entries in `total_logits` are not used.
alpha = 0.5
final_predict = alpha * total_logits[0] + (1-alpha) * total_logits[1]

In [None]:
# Binarise the blended scores at 0.5; a row with nothing above 0.5 gets its
# top-scoring label(s) set to 1. The result is a NumPy array.
prediction = correct_all_zero_problems(final_predict)

In [None]:
# Write the submission file. With public=False, SubmitGenerator places the zero
# padding before the prediction rows.
SubmitGenerator(
    prediction,
    sampleFile="datas/task1_sample_submission.csv",
    filename="results/mix_best-kaverage_fixzp_private.csv",
    public=False,
)