# File-Mask Testing

In this notebook we'll setup a fill-mask pipeline so we can test our new model qualitatively.

In [3]:
from transformers import pipeline
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np

2024-02-19 17:59:16.200859: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  torch.utils._pytree._register_pytree_node(


In [4]:
filename = 'ecfp0'
samples_count = '10M'
model_name = f'molberto_{filename}_{samples_count}'

In [5]:
fill = pipeline('fill-mask', model=model_name, tokenizer=(model_name, {'truncation':True, 'max_length':512, 'padding':'max_length'}), device=5)

In [23]:
# check if it works
fill(f'3218693969 {fill.tokenizer.mask_token} 2245277810')

[{'score': 0.37682217359542847,
  'token': 273,
  'token_str': ' 3218693969',
  'sequence': '3218693969 3218693969 2245277810'},
 {'score': 0.10960368812084198,
  'token': 397,
  'token_str': ' 3351556771',
  'sequence': '3218693969 3351556771 2245277810'},
 {'score': 0.054680805653333664,
  'token': 225,
  'token_str': ' ',
  'sequence': '3218693969  2245277810'},
 {'score': 0.028991278260946274,
  'token': 615,
  'token_str': ' 2998938176',
  'sequence': '3218693969 2998938176 2245277810'},
 {'score': 0.022372597828507423,
  'token': 336,
  'token_str': ' 2041434490',
  'sequence': '3218693969 2041434490 2245277810'}]

In [24]:
# check on example from dataset ('2246699815' is masked number)
fill(f'864942730 {fill.tokenizer.mask_token} 3217380708 3218693969 3217380708 864662311 2041434490 3217380708 3218693969 3218693969 3218693969 3218693969 3217380708 2092489639 2968968094 2968968094 2092489639 3217380708 3218693969 3218693969 3218693969 3217380708 2245277810 882399112 882399112 882399112 3218693969 2968968094 2968968094')

[{'score': 0.9997820258140564,
  'token': 316,
  'token_str': ' 2246699815',
  'sequence': '864942730 2246699815 3217380708 3218693969 3217380708 864662311 2041434490 3217380708 3218693969 3218693969 3218693969 3218693969 3217380708 2092489639 2968968094 2968968094 2092489639 3217380708 3218693969 3218693969 3218693969 3217380708 2245277810 882399112 882399112 882399112 3218693969 2968968094 2968968094'},
 {'score': 0.00011070397886214778,
  'token': 464,
  'token_str': ' 848127915',
  'sequence': '864942730 848127915 3217380708 3218693969 3217380708 864662311 2041434490 3217380708 3218693969 3218693969 3218693969 3218693969 3217380708 2092489639 2968968094 2968968094 2092489639 3217380708 3218693969 3218693969 3218693969 3217380708 2245277810 882399112 882399112 882399112 3218693969 2968968094 2968968094'},
 {'score': 2.57902720477432e-05,
  'token': 402,
  'token_str': ' 2246703798',
  'sequence': '864942730 2246703798 3217380708 3218693969 3217380708 864662311 2041434490 3217380708 

In [6]:
y_pred, y_true = list(), list()

In [7]:
paths = [str(x) for x in Path(f'data/ecfps_full/{filename}').glob('*.txt')]

for path in tqdm(paths[900:]):
    with open(path, 'r', encoding='utf-8') as fp:
        lines = fp.read().split('\n')
    for line in tqdm(lines):
        numbers = line.split(' ')
        if len(numbers) <= 1:
            continue
        random_index = np.random.randint(0, len(numbers) - 1)
        true_number = numbers[random_index]
        numbers[random_index] = fill.tokenizer.mask_token
        try:
            predicted_values = fill(" ".join(numbers))
        except:
            print(len(numbers))
            continue
        y_pred.append(predicted_values[0]['token_str'][1:])
        y_true.append(true_number)

  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]



  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

530


  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

530


  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

820


  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

558


  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [8]:
print(len(y_true), len(y_pred))

999996 999996


In [9]:
with open(f'{model_name}/y_true.txt', 'w+') as f:
    for line in y_true:
        f.write(line)
        f.write('\n')
with open(f'{model_name}/y_pred.txt', 'w+') as f:
    for line in y_pred:
        f.write(line)
        f.write('\n')

In [10]:
import sklearn
import sklearn.metrics

accuracy_score = sklearn.metrics.accuracy_score(y_true, y_pred)
precision_score = sklearn.metrics.precision_score(y_true, y_pred, average='weighted')
recall_score = sklearn.metrics.recall_score(y_true, y_pred, average='weighted')
f1_score = sklearn.metrics.f1_score(y_true, y_pred, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
accuracy_score

0.8636234544938179

In [12]:
precision_score

0.8974600493915068

In [13]:
recall_score

0.8636234544938179

In [14]:
f1_score

0.874769040553478