In [1]:
import os
import sys
import torch
import numpy as np

In [2]:
# Set TOKENIZERS_PARALLELISM to "false" for this process
# (presumably to keep the HuggingFace tokenizers library from warning about forks -- confirm).
os.environ.update(TOKENIZERS_PARALLELISM="false")

# Pick the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
# Make the repository root importable instead of changing the working directory.
# A `%cd ..` / `%cd notebooks` pair around the import is not idempotent: if the
# import raises, the kernel stays in the parent directory, and every re-run of
# the cell climbs one level higher, silently breaking the relative paths that
# the later cells depend on.
repo_root = os.path.abspath("..")  # assumes the kernel's cwd is <repo>/notebooks
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
from src.utils import load_labels, read_m2, load_model, write_text, iterative_prediction

/home/rajk/Machine_Learning/DRL-GEC
/home/rajk/Machine_Learning/DRL-GEC/notebooks


# Define Parameters

In [5]:
# --- Run configuration -------------------------------------------------------
force = True         # when True, predictions are regenerated even if the output file exists
max_iter = 10        # passed as num_iter to iterative_prediction
benchmark = "conll"  # name of the sub-directory (next to the checkpoint) holding the results

# Checkpoint to benchmark. Exactly one checkpoint is evaluated per run; switch
# the path below to score a different model, e.g. the supervised checkpoint
#   "sl_logs/finetune_wi+locness_03:11:2022_00:32/model-best.pt"
# (It was previously assigned here and immediately overwritten, which made it
# look as if that model was the one being scored.)
model_path = os.path.abspath("pg_logs/finetune_rl_09_11_2022_08:20/model-last.pt")

# --- Derived locations ---------------------------------------------------------
model_dir = os.path.dirname(model_path)
pytorch_model_name = os.path.basename(model_path)
benchmark_dir = os.path.join(model_dir, benchmark)

# Swap only the file extension. str.replace(".pt", ...) would rewrite every
# ".pt" substring in the name and would produce identical output/score paths
# for a checkpoint that does not end in ".pt".
model_stem = os.path.splitext(pytorch_model_name)[0]
output_path = os.path.join(benchmark_dir, model_stem + ".out")
score_path = os.path.join(benchmark_dir, model_stem + ".score")

# Results are written next to the checkpoint, so make sure the directory exists.
os.makedirs(benchmark_dir, exist_ok=True)

The parameters below are benchmark-related and stay the same for all models.

In [6]:
# Label vocabulary file, relative to the notebook's working directory.
label_path = "../data/vocabs/labels.txt"

# Absolute location of the m2scorer checkout, resolved from the current working directory.
conll_repo_path = os.path.abspath("../m2scorer/")

# M2 file inside the scorer checkout that is used as the benchmark data.
data_path = os.path.join(
    conll_repo_path,
    "conll14st-test-data/alt/official-2014.combined-withalt.m2",
)

# Load Labels

In [7]:
# Load the label vocabulary from label_path. The recorded output shows a label
# count, presumably printed because verbose=True -- confirm in src.utils.
label_vocab = load_labels(label_path, verbose=True)

Number of labels: 5001


# Load Model

In [8]:
# Build the policy from the checkpoint at model_path, using "roberta-base" as
# the model name and the size of the label vocabulary as num_labels.
# NOTE(review): device placement is not visible here -- presumably handled
# inside load_model; confirm.
policy = load_model(model_path, model_name="roberta-base", num_labels=len(label_vocab))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Make predictions on the CoNLL-2014 test dataset

In [9]:
%%time
policy.eval()
print(f"Benchmarking CONLL-2014 Test")
if force or not os.path.exists(output_path):
    benchmark_sentences = read_m2(data_path)
    print(f"Number of benchmark sentences: {len(benchmark_sentences)}")
    corrected_sentences = iterative_prediction(policy, label_vocab, benchmark_sentences, num_iter=max_iter, filter_labels=False, insert_start=True)
    write_text(corrected_sentences, output_path)
else:
    print(f"Output file '{output_path}' already exists!")
print()

Benchmarking CONLL-2014 Test
Number of benchmark sentences: 1312


Iteration 1:   0%|          | 0/1312 [00:00<?, ?it/s]

Iteration 2:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration 3:   0%|          | 0/3 [00:00<?, ?it/s]


CPU times: user 8.5 s, sys: 351 ms, total: 8.85 s
Wall time: 9.19 s


# Evaluate CONLL score on the test dataset

In [10]:
cwd = os.getcwd()
%cd $conll_repo_path
!./m2scorer -v $output_path $data_path > $score_path
!tail -3 $score_path
%cd $cwd

/home/rajk/Machine_Learning/m2scorer
Precision   : 0.4091
Recall      : 0.0285
F_0.5       : 0.1114
/home/rajk/Machine_Learning/DRL-GEC/notebooks
