# Тестирование модели

In [None]:
import torch

from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Use the CUDA device when torch reports one is available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
def define_spaces(spaced_string):
    """Return the space positions of a string, expressed in space-free coordinates.

    For every " " in ``spaced_string`` the result holds the number of
    non-space characters that precede it, i.e. the index the space would be
    inserted at in the same text with all spaces removed.

    Args:
        spaced_string: Text that may contain " " characters.

    Returns:
        A list of ints, one per space, in order of appearance.
    """
    # The k-th space (0-based) has exactly k spaces before it, so subtracting
    # k from its raw index leaves only the non-space characters in the count.
    raw_positions = (pos for pos, char in enumerate(spaced_string) if char == " ")
    return [pos - seen for seen, pos in enumerate(raw_positions)]


def f1_metrics(pred_spaces, target_spaces):
    """Compute precision, recall and F1 of predicted space positions.

    Both arguments are iterated in lockstep with ``zip``; each pair of items
    is converted to a set of space positions via ``define_spaces`` and the
    true-positive / false-positive / false-negative counts are accumulated
    over all pairs before the metrics are computed.

    Args:
        pred_spaces: Iterable of predicted strings (containing spaces).
        target_spaces: Iterable of reference strings (containing spaces).

    Returns:
        A dict with the keys "precision", "recall" and "f1". A metric whose
        denominator is zero is reported as 0.0.
    """
    hits = 0      # positions present in both prediction and reference
    spurious = 0  # positions only in the prediction
    missed = 0    # positions only in the reference

    for predicted, reference in zip(pred_spaces, target_spaces):
        predicted_positions = set(define_spaces(predicted))
        reference_positions = set(define_spaces(reference))
        shared = len(predicted_positions & reference_positions)

        hits += shared
        spurious += len(predicted_positions) - shared
        missed += len(reference_positions) - shared

    predicted_total = hits + spurious
    if predicted_total > 0:
        prec = hits / predicted_total
    else:
        prec = 0.0

    reference_total = hits + missed
    if reference_total > 0:
        rec = hits / reference_total
    else:
        rec = 0.0

    if (prec + rec) > 0:
        f1 = 2 * prec * rec / (prec + rec)
    else:
        f1 = 0.0

    return {"precision": prec, "recall": rec, "f1": f1}

In [None]:
def predict_with_spaces(text, model, tokenizer, device=device, max_len=40):
    """Generate the model's output for ``text`` and locate the spaces in it.

    The model is switched to eval mode, the input is tokenized (truncated to
    ``max_len``), moved to ``device`` and passed to ``model.generate`` with
    ``num_beams=1`` and ``no_repeat_ngram_size=0``. Only the first decoded
    sequence is used.

    Args:
        text: Input passed to the tokenizer.
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer that is callable and exposes ``batch_decode()``.
        device: Device the encoded tensors are moved to. The model itself is
            not moved here.
        max_len: Used both as the tokenizer ``max_length`` and as the
            generation ``max_length``.

    Returns:
        A tuple ``(predict, spaced_predict)``: the first decoded string (or
        "" when nothing was decoded) and the result of ``define_spaces`` on it.
    """
    model.eval()

    with torch.no_grad():
        encoded = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_len,
        )
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        generated = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs.get("attention_mask"),
            max_length=max_len,
            num_beams=1,
            no_repeat_ngram_size=0,
        )

        # Decoding happens on CPU copies of the generated ids.
        texts = tokenizer.batch_decode(generated.cpu(), skip_special_tokens=True)

        first_text = texts[0] if texts else ""
        return first_text, define_spaces(first_text)

In [None]:
# Load the tokenizer and the model saved under `model_path`, then place the
# model on the selected device.
model_path = "best_model"

tokenizer_saved = T5Tokenizer.from_pretrained(model_path)
model_saved = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# Sample pair: the reference text with spaces and the same text without them.
test_str_spased = "это тестовая 777 строка для proverki результата"
test_str = "этотестовая777строкадляproverkiрезультата"

pred, spaced_pred = predict_with_spaces(
    test_str,
    model=model_saved,
    tokenizer=tokenizer_saved,
)

# Show the texts first, then the space positions, reference above prediction.
print(f"target:  {test_str_spased}")
print(f"predict: {pred}")
print(f"target:  {define_spaces(test_str_spased)}")
print(f"predict: {spaced_pred}")