In [1]:
import os
import sys
import torch
import numpy as np

In [2]:
# Set the TOKENIZERS_PARALLELISM switch to "false" (presumably read by the
# HuggingFace tokenizers library -- confirm) before any model code runs.
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Use the first CUDA device when one is available, otherwise run on the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [3]:
# Make the repository root importable so the project-local `src` package resolves
# without changing the kernel's working directory. Hopping directories with `%cd`
# leaves the kernel in the wrong folder whenever an import raises, which then breaks
# every relative path used later in the notebook and makes a re-run climb one level
# further up.
repo_root = os.path.abspath("..")  # assumes the kernel runs from <repo>/notebooks
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

from src.utils import load_text, write_text, iterative_prediction
from src.models.seq2labels import PretrainedEncoder, Seq2Labels, Seq2LabelsDeeper

/home/rajk/Machine_Learning/DRL-GEC
/home/rajk/Machine_Learning/DRL-GEC/notebooks


# Define Parameters

In [4]:
# When True, predictions are regenerated even if an output file already exists.
force = True
# Forwarded as `num_iter` to iterative_prediction.
max_iter = 10

# Checkpoint to benchmark (resolved against the current working directory).
# Alternative checkpoint that used to be assigned here and was immediately overwritten:
#   "sl_logs/pretrain_synthetic_23:10:2022_18:26/model-best.pt"
model_path = os.path.abspath("pg_logs/finetune_rl_30_10_2022_14:27/model-best.pt")
pytorch_model_name = os.path.basename(model_path)
model_dir = os.path.dirname(model_path)

# Benchmark outputs are stored in a "jfleg" folder beside the checkpoint.
benchmark_dir = os.path.join(model_dir, "jfleg")
os.makedirs(benchmark_dir, exist_ok=True)

In [5]:
# Absolute path of the JFLEG repository checkout, resolved from the working directory.
jfleg_repo_path = os.path.abspath("../../jfleg/")

# Label vocabulary file (relative to the working directory).
label_path = "../data/vocabs/labels.txt"

# Load Labels

In [6]:
# Read the label file and wrap the result in a numpy character array.
_raw_labels = load_text(label_path)
label_vocab = np.char.array(_raw_labels)

num_labels_loaded = len(label_vocab)
print(f"Number of labels: {num_labels_loaded}")

Number of labels: 5000


# Load Model

In [7]:
# Build the encoder and the labelling policy, then restore the trained weights.
model_name = "roberta-base"
encoder = PretrainedEncoder(model_name).to(device)
policy = Seq2Labels(encoder_model=encoder, num_labels=len(label_vocab)).to(device)
if model_path:
    # map_location remaps tensors that were saved on a GPU onto the device selected
    # earlier, so the checkpoint also loads on a CPU-only machine instead of raising.
    policy.load_state_dict(torch.load(model_path, map_location=device))
# Switch to evaluation mode; assigning to `_` keeps the module repr out of the cell output.
_ = policy.eval()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Make predictions on the test and dev datasets

In [8]:
# Generate corrected sentences for each JFLEG split and write them under benchmark_dir.
for benchmark_type in ("dev", "test"):
    print(f"Benchmarking JFLEG {benchmark_type.title()}")

    source_file = f"{benchmark_type}/{benchmark_type}.spellchecked.src"
    data_path = os.path.join(jfleg_repo_path, source_file)
    output_name = pytorch_model_name.replace(".pt", f"_{benchmark_type}.out")
    output_path = os.path.join(benchmark_dir, output_name)

    # Reuse an existing output file unless `force` requests a fresh run.
    if not force and os.path.exists(output_path):
        print(f"Output file '{output_path}' already exists!")
    else:
        benchmark_sentences = load_text(data_path)
        print(f"Number of benchmark sentences: {len(benchmark_sentences)}")
        corrected_sentences = iterative_prediction(
            policy,
            label_vocab,
            benchmark_sentences,
            num_iter=max_iter,
            filter_labels=False,
            insert_start=True,
        )
        write_text(corrected_sentences, output_path)
    print()

Benchmarking JFLEG Dev
Number of benchmark sentences: 754


Iteration 1:   0%|          | 0/754 [00:00<?, ?it/s]

Iteration 2:   0%|          | 0/424 [00:00<?, ?it/s]

Iteration 3:   0%|          | 0/72 [00:00<?, ?it/s]

Iteration 4:   0%|          | 0/13 [00:00<?, ?it/s]

Iteration 5:   0%|          | 0/1 [00:00<?, ?it/s]


Benchmarking JFLEG Test
Number of benchmark sentences: 747


Iteration 1:   0%|          | 0/747 [00:00<?, ?it/s]

Iteration 2:   0%|          | 0/691 [00:00<?, ?it/s]

Iteration 3:   0%|          | 0/120 [00:00<?, ?it/s]

Iteration 4:   0%|          | 0/19 [00:00<?, ?it/s]

Iteration 5:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration 6:   0%|          | 0/1 [00:00<?, ?it/s]




# Evaluate GLEU score on the dev and test datasets

In [9]:
# Score the saved predictions of each split with the gleu.py script from the JFLEG repository.
# The working directory is remembered here and restored by the final `%cd $cwd`, because the
# relative paths below ("./dev/...", "./eval/gleu.py") are resolved from inside the JFLEG repo.
cwd = os.getcwd()
%cd $jfleg_repo_path
for benchmark_type in ("dev", "test"):
    # NOTE(review): scoring uses "<split>.src", while the prediction cell reads
    # "<split>.spellchecked.src" -- confirm this difference is intended.
    src_path = f"./{benchmark_type}/{benchmark_type}.src"
    # Glob pattern rather than a single file; it is passed unquoted to the shell command
    # below, so the shell presumably expands it to the .ref0 ... .ref3 files -- TODO confirm.
    ref_path = f"./{benchmark_type}/{benchmark_type}.ref[0-3]"
    # Same output file name that the prediction cell writes to.
    output_path = os.path.join(benchmark_dir, pytorch_model_name.replace(".pt", f"_{benchmark_type}.out"))
    score_path = os.path.join(benchmark_dir, pytorch_model_name.replace(".pt", f"_{benchmark_type}.score"))
    !echo Evaluating $benchmark_type
    # Run gleu.py with the kernel's own interpreter; its stdout is redirected into score_path,
    # which is then echoed into the cell output.
    !{sys.executable} ./eval/gleu.py --ref $ref_path --src $src_path --hyp $output_path > $score_path
    !cat $score_path
    print()
%cd $cwd

/home/rajk/Machine_Learning/jfleg
Evaluating dev
Running GLEU...
/home/rajk/Machine_Learning/DRL-GEC/notebooks/pg_logs/finetune_rl_30_10_2022_14:27/jfleg/model-best_dev.out
[['0.511313', '0.009029', '(0.494,0.529)']]

Evaluating test
Running GLEU...
/home/rajk/Machine_Learning/DRL-GEC/notebooks/pg_logs/finetune_rl_30_10_2022_14:27/jfleg/model-best_test.out
[['0.536822', '0.008207', '(0.521,0.553)']]

/home/rajk/Machine_Learning/DRL-GEC/notebooks
