In [1]:
import pandas as pd
import argparse
import importlib
import os
from os import environ
import pickle

import torch
from torch.utils.data import DataLoader

from datasets import Dataset
from transformers import AutoTokenizer

from utils.args import get_generate_config
from utils.tools import load_vocab, save2file
from utils.makeModel import make_model
from utils.checkpoint import process_state_dict, load_model
from utils.eval import Eval
from utils import constants
from utils.lang import translate2word

from tqdm.auto import tqdm

In [2]:
class generator:
    def __init__(self, *args, **kwargs):
        self.data = kwargs['data']
        self.model = kwargs['model']
        self.cfg = kwargs['cfg']
        self.tokenizer = kwargs['tokenizer']

    def _batch(self, st, ed):
        try:
            output = self.model(source=self.source[st:ed],
                                graph=self.graph[st:ed],
                                mode='test',
                                max_length=self.cfg.target_max_length)
            output = output.tolist()
            for i in range(len(output)):
                output[i] = output[i][1:]
                if self.cfg.EOS_index in output[i]:
                    end_index = output[i].index(self.cfg.EOS_index)
                    output[i] = output[i][:end_index]
                print(len(output[i]))

        except RuntimeError:
            if ed - st == 1:
                raise RuntimeError
            print('==>Reduce Batch Size')
            torch.cuda.empty_cache()
            output = []
            length = max(int((ed - st) / 4), 1)
            while st < ed:
                _ed = min(st + length, ed)
                output.extend(self._batch(st, _ed))
                st = _ed
        return output

    @torch.no_grad()
    def __call__(self):
        outputs = []
        self.model.eval()
        print('===>Start Generate.')
        for batch in tqdm(self.data):
            self.source = batch['source_input_ids'].to(self.cfg.device)
            self.graph = batch['graph'].to(self.cfg.device)
            outs = self._batch(0, self.source.size(0))
            dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
            outputs.extend(dec)
        return outputs

In [3]:
class CFG:
    beam = 5
    source_max_segment = 40
    source_max_length = 50
    target_max_length = 300
    pretrained_model_name_or_path = 'facebook/bart-base'
    model_path = "model/checkpoint5.pkl"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    batch_size = 16
    output_path = "data"


cfg = CFG()

In [4]:
test_df = pd.read_csv("data/test.csv")

test_df = test_df[:20]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(cfg.pretrained_model_name_or_path)

In [6]:
cfg.PAD_index = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
cfg.BOS_index = tokenizer.convert_tokens_to_ids(tokenizer.bos_token)
cfg.EOS_index = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
cfg.vocab_size = tokenizer.vocab_size

In [7]:
def get_test_dataset(df, cfg):
    D = []
    for index, row in tqdm(df.iterrows(), total=len(df)):
        node = eval(row['node'])
        edge = eval(row['edge'])
        docstring = row['docstring']
        source_input_ids = []
        source_attention_mask = []
        target_input_ids = []
        target_attention_mask = []
        for segment in node[:cfg.source_max_segment]:
            segment_tokens = tokenizer(segment, truncation=True, padding="max_length", max_length=cfg.source_max_length)
            source_input_ids.append(segment_tokens['input_ids'])
            source_attention_mask.append(segment_tokens['attention_mask'])
        if len(source_input_ids) < cfg.source_max_segment:
            for i in range(len(source_input_ids), cfg.source_max_segment):
                segment_tokens = tokenizer("", truncation=True, padding="max_length", max_length=cfg.source_max_length)
                source_input_ids.append(segment_tokens['input_ids'])
                source_attention_mask.append(segment_tokens['attention_mask'])
        graph = [[0] * cfg.source_max_segment for _ in range(cfg.source_max_segment)]

        for l, r in edge:
            if l < cfg.source_max_segment and r < cfg.source_max_segment:
                graph[l][r] = 1

        D.append({
            "source_input_ids": torch.LongTensor(source_input_ids),
            "graph": torch.LongTensor(graph)
        })

    dataset = Dataset.from_list(D)
    dataset.set_format(type="torch")

    return dataset

In [8]:
test_dataset = get_test_dataset(test_df, cfg)

  0%|          | 0/20 [00:00<?, ?it/s]

In [9]:
model_state_dict = load_model(cfg.model_path)
model = make_model(cfg)
model.load_state_dict(model_state_dict)
model = model.to(cfg.device)

In [10]:
test_loader = DataLoader(dataset=test_dataset, batch_size=cfg.batch_size, shuffle=False, num_workers=0)

In [11]:
generate = generator(data=test_loader, model=model, tokenizer=tokenizer, cfg=cfg)

In [12]:
outputs = generate()

===>Start Generate.


  0%|          | 0/2 [00:00<?, ?it/s]

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2


In [13]:
if not os.path.exists(cfg.output_path):
    os.makedirs(cfg.output_path)
save_file = os.path.join(cfg.output_path, 'result.txt')
save2file(outputs, save_file)