In [1]:
import ast
import os

import gym
import numpy as np
import pandas as pd
import torch
from torch.distributions import Categorical

In [2]:
%cd ..
import src.envs
from src.utils import load_text, apply_labels
from src.models.seq2labels import PretrainedEncoder, Seq2Labels
%cd notebooks

/home/rajk/Machine_Learning/DRL-GEC
/home/rajk/Machine_Learning/DRL-GEC/notebooks


In [3]:
# Set TOKENIZERS_PARALLELISM to "false" for this process (presumably read by
# the tokenizer library behind the pretrained encoder — TODO confirm).
os.environ.update(TOKENIZERS_PARALLELISM="false")
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
@torch.no_grad()
def greedy_action(policy, state, all_labels, verbose=True, top_k=3):
    """Pick the highest-scoring label for every token of ``state``.

    Args:
        policy: Callable that takes a list of states and returns one logits
            tensor per state; it is called with the single-element list
            ``[state]``.
        state: Sequence of tokens. The logits are assumed to have shape
            ``(len(state), num_labels)`` — one row per token.
        all_labels: Label vocabulary; must support indexing with an integer
            numpy array (e.g. a numpy array of strings).
        verbose: When true, print per-token entropy and the ``top_k`` labels
            with their probability and logit.
        top_k: Number of candidate labels to print per token. It is clamped to
            the number of available labels so small label sets do not raise.

    Returns:
        numpy array holding the argmax label index for every token.
    """
    [logits] = policy([state])
    if verbose:
        # Diagnostics are only needed for printing, so skip them otherwise.
        k = min(top_k, logits.shape[-1])
        top_logits, top_idx = logits.topk(k)
        dist = Categorical(logits=logits)
        # gather keeps the lookup on the logits' device and avoids mixing
        # numpy / CPU index tensors with a possibly CUDA-resident tensor.
        top_probs = dist.probs.gather(-1, top_idx).cpu().numpy()
        entropy = dist.entropy().cpu().numpy()
        top_logits = top_logits.cpu().numpy()
        top_idx = top_idx.cpu().numpy()
        rows = zip(state, entropy, all_labels[top_idx], top_logits, top_probs)
        for token, token_entropy, labs, logs, probs in rows:
            ranked = " -- ".join(
                f"{lab} [{prob:3.2f}, {log:5.2f}]"
                for lab, log, prob in zip(labs, logs, probs)
            )
            print(f"Entropy: {token_entropy:4f} | Label: {token:15}  |", ranked)
        print()
    action = logits.argmax(dim=-1)
    return action.cpu().numpy()

In [5]:
def load_model(model_path, output_size, model_name="roberta-base"):
    """Build a Seq2Labels policy and restore its weights from a checkpoint.

    Args:
        model_path: Path to a file readable by ``torch.load`` that contains the
            policy's state dict.
        output_size: Number of labels the policy predicts (passed to
            ``Seq2Labels`` as ``num_labels``).
        model_name: Name handed to ``PretrainedEncoder``; defaults to the
            previously hard-coded ``"roberta-base"``.

    Returns:
        The policy on the module-level ``device``, switched to eval mode.
    """
    encoder = PretrainedEncoder(model_name).to(device)
    policy = Seq2Labels(encoder_model=encoder, num_labels=output_size).to(device)
    # map_location lets a checkpoint saved on a GPU load when `device` has
    # fallen back to CPU (or points at a different GPU).
    state_dict = torch.load(model_path, map_location=device)
    policy.load_state_dict(state_dict)
    policy.eval()
    return policy

# Load Environment and Labels

In [9]:
# Create the environment used by every cell below; its label vocabulary is
# read through `env.labels`.
env = gym.make(
    "wi_locness_gec_lev_dist-v1",
    new_step_api=True,
    correct_examples_percent=[0.0],
)

Original number of data in wi+locness: 26815
Number of data without correct sentences: 17494


# Load model

In [12]:
# Resolve both checkpoint locations to absolute paths (RL first, then SL).
rl_model_path, sl_model_path = (
    os.path.abspath(checkpoint)
    for checkpoint in (
        "pg_logs/finetune_rl_12_11_2022_11:40/model-last.pt",
        "sl_logs/finetune_wi+locness_02:11:2022_23:06/model-best.pt",
    )
)
# Both policies share the environment's label set as their output size.
rl_model, sl_model = (
    load_model(checkpoint_path, output_size=len(env.labels))
    for checkpoint_path in (rl_model_path, sl_model_path)
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 

# Test Model

# SL model

In [10]:
# Source sentence and its reference corrections, in the form env.reset takes.
data_dict = {
    "text": "he said in other words that the more fluoride may create damage in human body , specifically the bone .",
    "references": [
        "He said in other words that the more fluoride may create damage in the human body , specifically the bone .",
        "He said , in other words , that more fluoride may create damage in the human body , specifically the bone .",
        "He said , in other words , that more fluoride may create damage to the human body , specifically the bones .",
        "In other words , he said that more fluoride may damage the human body , specifically the bones .",
    ],
}
state = env.reset(data_dict=data_dict)
# Act greedily with `sl_model` until the environment ends the episode,
# printing the rendered step after every action.
while True:
    action = greedy_action(sl_model, state, env.labels, verbose=True)
    state, reward, terminated, truncated, info = env.step(action)
    for rendered in env.render():
        print(rendered)
    if terminated or truncated:
        break

Entropy: 0.533590 | Label: $START           | $KEEP [0.92, 10.03] -- $APPEND_But [0.02,  6.12] -- $APPEND_And [0.01,  5.13]
Entropy: 0.337851 | Label: he               | $TRANSFORM_CASE_CAPITAL [0.93, 11.58] -- $KEEP [0.06,  8.82] -- $REPLACE_He [0.00,  6.32]
Entropy: 0.871657 | Label: said             | $APPEND_, [0.64, 10.05] -- $KEEP [0.32,  9.38] -- $REPLACE_, [0.01,  5.69]
Entropy: 0.779609 | Label: in               | $KEEP [0.81,  9.01] -- $TRANSFORM_CASE_CAPITAL [0.10,  6.89] -- $DELETE [0.07,  6.58]
Entropy: 0.595660 | Label: other            | $KEEP [0.87,  9.40] -- $DELETE [0.10,  7.21] -- $MERGE_SPACE [0.00,  4.15]
Entropy: 0.896537 | Label: words            | $KEEP [0.64,  9.58] -- $APPEND_, [0.30,  8.84] -- $DELETE [0.05,  6.94]
Entropy: 0.789471 | Label: that             | $KEEP [0.77,  9.21] -- $DELETE [0.17,  7.70] -- $REPLACE_, [0.03,  5.89]
Entropy: 0.998475 | Label: the              | $DELETE [0.51,  8.31] -- $KEEP [0.45,  8.19] -- $UNKNOWN [0.01,  4.09]
Entropy: 0.8

If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
  logger.warn(


# RL model

In [13]:
# Source sentence and its reference corrections, in the form env.reset takes.
data_dict = {
    "text": "he said in other words that the more fluoride may create damage in human body , specifically the bone .",
    "references": [
        "He said in other words that the more fluoride may create damage in the human body , specifically the bone .",
        "He said , in other words , that more fluoride may create damage in the human body , specifically the bone .",
        "He said , in other words , that more fluoride may create damage to the human body , specifically the bones .",
        "In other words , he said that more fluoride may damage the human body , specifically the bones .",
    ],
}
state = env.reset(data_dict=data_dict)
# Act greedily with `rl_model` until the environment ends the episode,
# printing the rendered step after every action.
while True:
    action = greedy_action(rl_model, state, env.labels, verbose=True)
    state, reward, terminated, truncated, info = env.step(action)
    for rendered in env.render():
        print(rendered)
    if terminated or truncated:
        break

Entropy: 0.645639 | Label: $START           | $KEEP [0.91,  9.74] -- $APPEND_But [0.02,  5.96] -- $APPEND_The [0.01,  5.17]
Entropy: 1.304728 | Label: he               | $TRANSFORM_CASE_CAPITAL [0.60, 10.71] -- $REPLACE_He [0.23,  9.77] -- $KEEP [0.06,  8.48]
Entropy: 1.088627 | Label: said             | $KEEP [0.60,  9.10] -- $APPEND_, [0.32,  8.48] -- $DELETE [0.02,  5.81]
Entropy: 1.227594 | Label: in               | $KEEP [0.72,  8.87] -- $TRANSFORM_CASE_CAPITAL [0.08,  6.72] -- $REPLACE_In [0.08,  6.67]
Entropy: 0.718042 | Label: other            | $KEEP [0.85,  9.28] -- $DELETE [0.09,  7.05] -- $APPEND_other [0.01,  4.63]
Entropy: 0.825542 | Label: words            | $KEEP [0.69,  9.62] -- $APPEND_, [0.27,  8.69] -- $DELETE [0.02,  6.14]
Entropy: 0.767806 | Label: that             | $KEEP [0.81,  9.25] -- $DELETE [0.12,  7.31] -- $APPEND_, [0.03,  5.93]
Entropy: 0.995892 | Label: the              | $KEEP [0.58,  8.18] -- $DELETE [0.39,  7.78] -- $APPEND_more [0.00,  2.72]
Entropy

In [12]:
text = "Tigers is cold blooded animals ."
references = [
    "Tigers are cold-blooded animals .",
    "Tigers is a cold-blooded animal .",
]
# Build the starting state from this cell's own sentence. Previously `text`
# was never used and `state` was whatever an earlier cell left behind, so the
# cell was not reproducible on a fresh kernel.
state = env.reset(data_dict=dict(text=text, references=references))
# Apply three greedy edit rounds with `sl_model`, scoring each round against
# the references and printing the rendered transition.
for i in range(3):
    action = greedy_action(sl_model, state, env.labels, verbose=False)
    labels = env.labels[action]
    new_state = apply_labels(state, labels)
    reward = env.compute_reward(state, new_state, references)
    output = env.render_text(state, labels, reward, new_state, i)
    state = new_state
    print(output)

[37;1mTimestep:[0m 0  
[37;1mRewards:[0m 0.348  
[37;1mSource:[0m $START Tigers [32;1mis[0m [[31;1m$REPLACE_are[0m] cold blooded animals .  
[37;1mOutput:[0m $START Tigers are cold blooded animals .  

[37;1mTimestep:[0m 1  
[37;1mRewards:[0m -0.007  
[37;1mSource:[0m $START Tigers are [32;1mcold[0m [[31;1m$APPEND_-[0m] blooded animals .  
[37;1mOutput:[0m $START Tigers are cold - blooded animals .  

[37;1mTimestep:[0m 2  
[37;1mRewards:[0m -0.100  
[37;1mSource:[0m $START Tigers are cold - blooded animals .  
[37;1mOutput:[0m $START Tigers are cold - blooded animals .  



In [13]:
text = "Tigers is cold blooded animals ."
references = [
    "Tigers are cold-blooded animals .",
    "Tigers is a cold-blooded animal .",
]
# Build the starting state from this cell's own sentence. Previously `text`
# was never used and `state` was whatever an earlier cell left behind, so the
# cell was not reproducible on a fresh kernel.
state = env.reset(data_dict=dict(text=text, references=references))
# Apply three greedy edit rounds with `rl_model`, scoring each round against
# the references and printing the rendered transition.
for i in range(3):
    action = greedy_action(rl_model, state, env.labels, verbose=False)
    labels = env.labels[action]
    new_state = apply_labels(state, labels)
    reward = env.compute_reward(state, new_state, references)
    output = env.render_text(state, labels, reward, new_state, i)
    state = new_state
    print(output)

[37;1mTimestep:[0m 0  
[37;1mRewards:[0m 0.139  
[37;1mSource:[0m $START Tigers [32;1mis[0m [[31;1m$REPLACE_are[0m] [32;1mcold[0m [[31;1m$APPEND_-[0m] blooded animals .  
[37;1mOutput:[0m $START Tigers are cold - blooded animals .  

[37;1mTimestep:[0m 1  
[37;1mRewards:[0m -0.100  
[37;1mSource:[0m $START Tigers are cold - blooded animals .  
[37;1mOutput:[0m $START Tigers are cold - blooded animals .  

[37;1mTimestep:[0m 2  
[37;1mRewards:[0m -0.100  
[37;1mSource:[0m $START Tigers are cold - blooded animals .  
[37;1mOutput:[0m $START Tigers are cold - blooded animals .  



# Model scores

In [14]:
# Display name -> run directory holding that model's evaluation outputs.
model_dict = {
    "Pretrain": os.path.abspath("sl_logs/pretrain_synthetic_18:10:2022_13:59/"),
    "Pretrain + SL Fine-Tune": os.path.abspath("sl_logs/finetune_wi+locness_18:10:2022_21:42"),
    "Pretrain + RL Fine-Tune": os.path.abspath("pg_logs/finetune_rl_22_10_2022_01:15"),
    "Pretrain + SL Fine-Tune + RL Fine-Tune": os.path.abspath("pg_logs/finetune_rl_23_10_2022_00:33"),
}
score_columns = ["Precision", "Recall", "F-0.5 Score"]

results = []
for model_name, model_path in model_dict.items():
    dataset_path = os.path.join(model_path, "conll", "conll_test.score")
    data = load_text(dataset_path)
    # The last three lines are read as precision, recall and F-0.5, each in
    # "<name>: <value>" form. Parse them to floats so the max-highlighting
    # compares numbers rather than strings.
    p, r, f = (float(line.split(": ")[1]) for line in data[-3:])
    results.append({"Model": model_name, "Precision": p, "Recall": r, "F-0.5 Score": f})
conll_df = pd.DataFrame(results)
conll_df.style.format("{:.4f}", subset=score_columns).highlight_max(subset=score_columns, color='lightgreen', axis=0)

Unnamed: 0,Model,Precision,Recall,F-0.5 Score
0,Pretrain,0.6074,0.2958,0.5017
1,Pretrain + SL Fine-Tune,0.6561,0.4372,0.5964
2,Pretrain + RL Fine-Tune,0.689,0.3784,0.5918
3,Pretrain + SL Fine-Tune + RL Fine-Tune,0.6842,0.3593,0.5794


In [15]:
# Display name -> run directory holding that model's evaluation outputs.
model_dict = {
    "Pretrain": os.path.abspath("sl_logs/pretrain_synthetic_18:10:2022_13:59/"),
    "Pretrain + SL Fine-Tune": os.path.abspath("sl_logs/finetune_wi+locness_18:10:2022_21:42"),
    "Pretrain + RL Fine-Tune": os.path.abspath("pg_logs/finetune_rl_22_10_2022_01:15"),
    "Pretrain + SL Fine-Tune + RL Fine-Tune": os.path.abspath("pg_logs/finetune_rl_23_10_2022_00:33"),
}

results = []
for model_name, model_path in model_dict.items():
    # Use a separate name for the row: rebinding `model_dict` here would
    # replace the path mapping with the last row once the cell finishes.
    row = {"Model": model_name}
    for score_type in ("dev", "test"):
        dataset_path = os.path.join(model_path, "jfleg", f"jfleg_{score_type}.score")
        data = load_text(dataset_path)
        # The last line is parsed as a Python literal (a nested list whose
        # first inner element is the score). literal_eval only accepts
        # literals, so file contents are never executed as code.
        score_list = ast.literal_eval(data[-1])
        # Store a float so the max-highlighting compares numbers.
        row[f"{score_type.title()} Score"] = float(score_list[0][0])
    results.append(row)
jfleg_df = pd.DataFrame(results)
jfleg_df.style.highlight_max(subset=["Dev Score", "Test Score"], color='lightgreen', axis=0)

Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain,0.51141,0.538118
1,Pretrain + SL Fine-Tune,0.543455,0.59069
2,Pretrain + RL Fine-Tune,0.532699,0.576475
3,Pretrain + SL Fine-Tune + RL Fine-Tune,0.528306,0.575681
