# Fill-Mask Testing

In this notebook we'll set up a fill-mask pipeline so we can test our new model qualitatively.

In [None]:
from transformers import pipeline
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np

In [None]:
# Which trained model to evaluate: the fingerprint file name and the
# training-set size label are combined into the model directory name.
filename = 'ecfp0'
samples_count = '10M'
model_name = 'molberto_{}_{}'.format(filename, samples_count)

In [None]:
# Build the fill-mask pipeline from the trained model. The tokenizer is given
# as an (identifier, kwargs) pair so it is loaded from the same location with
# truncation/padding to 512 tokens.
# NOTE(review): device=5 is a hard-coded device index — presumably a specific
# GPU on the original machine; adjust before running elsewhere.
fill = pipeline(
    'fill-mask',
    model=model_name,
    tokenizer=(
        model_name,
        {'truncation': True, 'max_length': 512, 'padding': 'max_length'},
    ),
    device=5,
)

In [None]:
# Smoke test: ask the pipeline to fill a single mask placed between two
# known tokens and display the candidates it returns.
fill('3218693969 {} 2245277810'.format(fill.tokenizer.mask_token))

In [None]:
# Sanity check on a line taken from the dataset: its second token
# ('2246699815') has been replaced by the mask token.
fill(
    f'864942730 {fill.tokenizer.mask_token} '
    '3217380708 3218693969 3217380708 864662311 2041434490 '
    '3217380708 3218693969 3218693969 3218693969 3218693969 '
    '3217380708 2092489639 2968968094 2968968094 2092489639 '
    '3217380708 3218693969 3218693969 3218693969 3217380708 '
    '2245277810 882399112 882399112 882399112 3218693969 '
    '2968968094 2968968094'
)

In [None]:
# Accumulators filled by the evaluation loop: the predicted token and the
# token that was actually masked, appended pairwise.
y_pred = []
y_true = []

In [None]:
# Masked-token evaluation over the dataset files.
# For each line with at least two tokens, one randomly chosen token is replaced
# by the tokenizer's mask token; the pipeline's top-1 prediction is appended to
# y_pred and the original token to y_true.
paths = [str(x) for x in Path(f'data/ecfps_full/{filename}').glob('*.txt')]

# NOTE(review): glob() order is filesystem-dependent and 900 is a magic offset —
# presumably the files before it were used for training; confirm against the
# training notebook before sorting or changing it. No RNG seed is set, so the
# masked positions differ between runs.
for path in tqdm(paths[900:]):
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    for line in tqdm(lines):
        # split() (no argument) ignores leading/trailing/repeated whitespace, so
        # an empty string can never be picked as the token to mask.
        numbers = line.split()
        if len(numbers) <= 1:
            # Blank lines and single-token lines leave no context to predict from.
            continue
        # np.random.randint excludes the upper bound, so it must be len(numbers)
        # for the last token to be eligible for masking as well.
        random_index = np.random.randint(0, len(numbers))
        true_number = numbers[random_index]
        numbers[random_index] = fill.tokenizer.mask_token
        try:
            predicted_values = fill(" ".join(numbers))
        except Exception as exc:
            # Best-effort: skip lines the pipeline cannot process, but report why.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            print(len(numbers), repr(exc))
            continue
        top_token = predicted_values[0]['token_str']
        # The predicted token usually carries a one-character word-boundary
        # prefix ('\u0120' or a space, depending on the transformers version).
        # Strip it only when present so un-prefixed tokens keep their first digit.
        if top_token[:1] in ('\u0120', ' '):
            top_token = top_token[1:]
        y_pred.append(top_token)
        y_true.append(true_number)

In [None]:
# Labels and predictions are appended pairwise, so both counts should match.
print(*map(len, (y_true, y_pred)))

In [None]:
# Write the true and predicted tokens into the model directory, one token per
# line. The encoding is explicit (UTF-8, matching how the dataset is read) so
# the files do not depend on the platform default.
for out_name, values in (('y_true.txt', y_true), ('y_pred.txt', y_pred)):
    with open(f'{model_name}/{out_name}', 'w', encoding='utf-8') as f:
        f.writelines(f'{value}\n' for value in values)

In [None]:
# Score the top-1 predictions against the masked tokens. Precision, recall and
# F1 use the 'weighted' average across labels.
import sklearn.metrics  # importing the submodule also binds the `sklearn` name

accuracy_score = sklearn.metrics.accuracy_score(y_true, y_pred)
precision_score, recall_score, f1_score = (
    metric(y_true, y_pred, average='weighted')
    for metric in (
        sklearn.metrics.precision_score,
        sklearn.metrics.recall_score,
        sklearn.metrics.f1_score,
    )
)

In [None]:
# Display the accuracy of the top-1 predictions (y_pred) against y_true.
accuracy_score

In [None]:
# Display the precision computed with average='weighted'.
precision_score

In [None]:
# Display the recall computed with average='weighted'.
recall_score

In [None]:
# Display the F1 score computed with average='weighted'.
f1_score