In [1]:
import os
import sys
import torch
import numpy as np

In [2]:
# Set the tokenizers parallelism switch to "false" before any model code runs.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use the first CUDA device when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
# Step up to the parent directory so the `src` package resolves for the
# imports below, then return to the notebooks directory afterwards.
%cd ..
from src.models.seq2labels import PretrainedEncoder, Seq2Labels
from src.utils import clean_text, load_text, write_text, iterative_prediction
%cd notebooks

/home/rajk/Machine_Learning/DRL-GEC
/home/rajk/Machine_Learning/DRL-GEC/notebooks


In [4]:
def read_m2(data_path):
    """Return the cleaned source sentences found in an M2 file.

    Only lines that begin with ``"S "`` are kept. The two-character prefix is
    dropped and the remainder is passed through ``clean_text``; every other
    line is ignored.

    Args:
        data_path: Path to the M2 file to read.

    Returns:
        A list with one ``clean_text`` result per ``S`` line, in file order.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default.
    with open(data_path, "r", encoding="utf-8") as fp:
        sentences = [clean_text(line[2:]) for line in fp if line.startswith("S ")]
    return sentences

# Define Parameters

In [5]:
# `force` re-runs prediction even when an output file already exists;
# `max_iter` caps the number of iterative prediction passes.
force = True
max_iter = 10
benchmark = "conll"

# Candidate checkpoints, resolved against the current working directory.
# Only the one assigned to `model_path` is evaluated; switch the assignment
# to benchmark the other model.
sl_model_path = os.path.abspath("sl_logs/finetune_wi+locness_27:10:2022_11:45/model-best.pt")
rl_model_path = os.path.abspath("pg_logs/finetune_rl_30_10_2022_14:27/model-best.pt")
model_path = rl_model_path

model_dir = os.path.dirname(model_path)
pytorch_model_name = os.path.basename(model_path)
# Strip only the trailing extension; str.replace would rewrite every ".pt"
# substring in the file name and leave other extensions untouched.
model_stem = os.path.splitext(pytorch_model_name)[0]

# Predictions and scores live next to the checkpoint, in a per-benchmark folder.
benchmark_dir = os.path.join(model_dir, benchmark)
output_path = os.path.join(benchmark_dir, model_stem + ".out")
score_path = os.path.join(benchmark_dir, model_stem + ".score")
os.makedirs(benchmark_dir, exist_ok=True)

The parameters below are benchmark-related and stay the same for all models.

In [6]:
# File holding the label vocabulary.
label_path = "../data/vocabs/labels.txt"
# Directory of the m2scorer checkout; the scorer is executed from here.
conll_repo_path = os.path.abspath("../../m2scorer/")
# M2 file used both as the source of benchmark sentences and as the scorer's
# gold annotations. It is a plain string literal because it has no placeholders.
data_path = os.path.join(conll_repo_path, "conll14st-test-data/alt/official-2014.combined-withalt.m2")

# Load Labels

In [7]:
# Load the label entries from disk, then wrap them in a NumPy character array
# that the prediction step receives as its vocabulary.
label_list = load_text(label_path)
label_vocab = np.char.array(label_list)
print(f"Number of labels: {len(label_vocab)}")

Number of labels: 5000


# Load Model

In [8]:
# Build the encoder and the tagging head, sized to the label vocabulary,
# on the selected device.
model_name = "roberta-base"
encoder = PretrainedEncoder(model_name).to(device)
policy = Seq2Labels(encoder_model=encoder, num_labels=len(label_vocab)).to(device)
if model_path:
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # saved on a GPU still loads when this notebook falls back to the CPU.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    state_dict = torch.load(model_path, map_location=device)
    policy.load_state_dict(state_dict)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Make predictions on the CoNLL-2014 test dataset

In [9]:
%%time
policy.eval()
print(f"Benchmarking CONLL-2014 Test")
if force or not os.path.exists(output_path):
    benchmark_sentences = read_m2(data_path)
    print(f"Number of benchmark sentences: {len(benchmark_sentences)}")
    corrected_sentences = iterative_prediction(policy, label_vocab, benchmark_sentences, num_iter=max_iter, filter_labels=False, insert_start=True)
    write_text(corrected_sentences, output_path)
else:
    print(f"Output file '{output_path}' already exists!")
print()

Benchmarking CONLL-2014 Test
Number of benchmark sentences: 1312


Iteration 1:   0%|          | 0/1312 [00:00<?, ?it/s]

Iteration 2:   0%|          | 0/572 [00:00<?, ?it/s]

Iteration 3:   0%|          | 0/53 [00:00<?, ?it/s]

Iteration 4:   0%|          | 0/3 [00:00<?, ?it/s]


CPU times: user 12 s, sys: 787 ms, total: 12.8 s
Wall time: 12.8 s


# Evaluate CONLL score on the test dataset

In [10]:
# Remember the current directory so it can be restored after scoring.
cwd = os.getcwd()
# The scorer is invoked through a relative path, so run it from its own directory.
%cd $conll_repo_path
# Score the predictions against the M2 annotations and save the verbose report.
!./m2scorer -v $output_path $data_path > $score_path
# The last three lines of the report hold Precision, Recall and F_0.5.
!tail -3 $score_path
%cd $cwd

/home/rajk/Machine_Learning/m2scorer
Precision   : 0.7200
Recall      : 0.2611
F_0.5       : 0.5327
/home/rajk/Machine_Learning/DRL-GEC/notebooks
