# Inference and Scoring

In [None]:
import pandas as pd
import torch
from transformers import BartForConditionalGeneration
from kobart import get_pytorch_kobart_model, get_kobart_tokenizer
from tqdm import tqdm
from torchtext.data.metrics import bleu_score

## load tokenizer & model
  - 표준어 -> 제주어 (standard Korean -> Jeju dialect) : s2d
  - 제주어 -> 표준어 (Jeju dialect -> standard Korean) : d2s

In [None]:
# Tokenizer supplied by the kobart package; later cells use it both to encode the
# input sentences and to decode the generated ids.
tokenizer = get_kobart_tokenizer()

In [None]:
# Load the seq2seq weights saved under the s2d (standard -> Jeju dialect) results
# directory. NOTE(review): presumably a d2s checkpoint lives under a sibling path — confirm.
model = BartForConditionalGeneration.from_pretrained('model_results/s2d/model/0522')

In [None]:
# Put the model in evaluation mode and place it on the GPU; the two calls are chained
# because each one returns the module itself.
model.eval().to('cuda')
print('>> model set')

>> model set


## load test data

In [None]:
# Tab-separated evaluation set; later cells read its 'standard' and 'dialect' columns.
test_df = pd.read_csv(
    'data/test_cleaned.tsv',
    sep='\t',
)

In [None]:
# Display three randomly chosen rows as a quick sanity check of the loaded data.
test_df.sample(3)

### inference test

In [None]:
# Spot-check a single test sentence: print the source, the reference translation
# and the model's beam-search output.
idx = 1212
sent = test_df['standard'][idx]
print('input: ', sent)
print('gold: ', test_df['dialect'][idx])

# Only the input ids are passed on to generate() below.
inputs = tokenizer(sent, return_tensors='pt')

# Send the ids to whatever device the model currently lives on instead of assuming
# CUDA, so this cell does not break if the model was placed elsewhere.
outputs = model.generate(
    inputs['input_ids'].to(model.device),
    eos_token_id=1,
    max_length=64,
    num_beams=5,
)
# skip_special_tokens removes the start/end markers so the printed text can be
# compared directly with the gold sentence above.
print('generation: ', tokenizer.decode(outputs[0], skip_special_tokens=True))

### scoring

In [None]:
# Translate the first 100 standard-language sentences with beam search and collect
# the decoded strings for BLEU scoring.
preds = []
for sent in tqdm(test_df['standard'][:100]):
    input_ids = tokenizer(sent, return_tensors='pt')['input_ids']
    outputs = model.generate(
        input_ids.to(model.device),  # follow the model rather than hard-coding 'cuda'
        eos_token_id=1,
        max_length=64,
        num_beams=5,
    )
    # Remove special tokens by identity rather than slicing [1:-1]: the positional
    # slice throws away a real token whenever generation stops at max_length
    # without having emitted the end-of-sequence token.
    preds.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Whitespace-tokenise for BLEU. The isinstance guard keeps this cell re-runnable:
# after one execution `preds` already holds token lists, which have no .split().
preds = [p.split() if isinstance(p, str) else p for p in preds]
# One reference per prediction. Sizing the slice by len(preds) keeps both corpora
# aligned with however many sentences the generation loop actually produced.
targets = [[d.split()] for d in test_df['dialect'][:len(preds)]]

In [None]:
# BLEU over the tokenised predictions (candidate corpus) against `targets`, which
# holds a single whitespace-tokenised reference per sentence.
bleu_score(preds, targets)