In [1]:
import os
import gym
import torch
import numpy as np
import pandas as pd
from torch.distributions import Categorical

In [2]:
%cd ..
import src.envs
from src.utils import load_text, decode
from src.models.seq2labels import PretrainedEncoder, Seq2Labels
%cd notebooks

/home/rajk/Machine_Learning/DRL-GEC
/home/rajk/Machine_Learning/DRL-GEC/notebooks


In [3]:
# Set the tokenizers parallelism switch to "false" before any model is built
# (presumably to silence the Hugging Face tokenizers fork warning — TODO confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
@torch.no_grad()
def greedy_action(policy, state, all_labels, verbose=True, top_k=3):
    """Pick the highest-scoring label index for every token of ``state``.

    Args:
        policy: Callable invoked as ``policy([state])``; it must return exactly
            one logits tensor (one row per token, one column per label).
        state: Sequence of tokens forming the current sentence.
        all_labels: Array of label strings that supports integer-array
            indexing (e.g. ``env.labels``); only used for the printout.
        verbose: When true, print each token's entropy and its ``top_k``
            candidate labels as ``label [probability, logit]``.
        top_k: Number of candidates to print per token. Clamped to the number
            of labels so a small label vocabulary does not make ``topk`` raise.

    Returns:
        numpy array with the arg-max label index of every token.
    """
    [logits] = policy([state])
    if verbose:
        # Diagnostics are only needed for printing, so skip them otherwise.
        k = min(top_k, logits.shape[-1])
        top_logits, top_idx = logits.topk(k)
        dist = Categorical(logits=logits)
        # Probabilities of the same top-k entries, row by row.
        top_probs = dist.probs.gather(-1, top_idx).cpu().numpy()
        entropy = dist.entropy().cpu().numpy()
        top_logits = top_logits.cpu().numpy()
        top_idx = top_idx.cpu().numpy()
        for a, e, label_logit_prob in zip(state, entropy, zip(all_labels[top_idx], top_logits, top_probs)):
            print(f"Entropy: {e:4f} | Label: {a:15}  |", " -- ".join(f"{lab} [{prob:3.2f}, {log:5.2f}]" for (lab, log, prob) in zip(*label_logit_prob)))
        print()
    action = logits.argmax(axis=-1)
    return action.cpu().numpy()

In [5]:
def load_model(model_path, output_size):
    """Build a Seq2Labels policy on ``device`` and load trained weights into it.

    Args:
        model_path: Path to a checkpoint file containing the policy's
            ``state_dict``.
        output_size: Number of output labels, passed to ``Seq2Labels`` as
            ``num_labels``.

    Returns:
        The policy with the loaded weights, switched to evaluation mode.
    """
    model_name = "roberta-base"
    encoder = PretrainedEncoder(model_name).to(device)
    policy = Seq2Labels(encoder_model=encoder, num_labels=output_size).to(device)
    # map_location remaps the stored tensors onto the active device, so a
    # checkpoint saved on a GPU still loads when this session runs on the CPU.
    state_dict = torch.load(model_path, map_location=device)
    policy.load_state_dict(state_dict)
    policy.eval()
    return policy

# Load Environment and Labels

In [6]:
# Build the GEC environment; its `env.labels` array is the label vocabulary used
# by the cells below. The environment id is presumably registered as a side
# effect of `import src.envs` — TODO confirm.
env = gym.make("wi_locness_gec-v0")

Original number of data in wi+locness: 24932
Number of data without correct sentences: 24932


  deprecation(
  deprecation(


# Load model

In [7]:
# Checkpoint paths are relative and resolved against the current working directory.
# Alternative RL checkpoint from a different run, kept for quick switching:
# rl_model_path = os.path.abspath("pg_logs/finetune_rl_22_10_2022_01:15/model-best.pt")
rl_model_path = os.path.abspath("pg_logs/finetune_rl_29_10_2022_11:41/model-last.pt")
sl_model_path = os.path.abspath("sl_logs/finetune_wi+locness_18:10:2022_21:42/model-best.pt")
# Both policies get one output per entry of the environment's label vocabulary.
rl_model = load_model(rl_model_path, output_size=len(env.labels))
sl_model = load_model(sl_model_path, output_size=len(env.labels))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaMod

# Test Model

# SL model

In [8]:
# Tokenised source sentence the SL policy starts from.
state = "$START he said in other words that the more fluoride may create damage in human body , specifically the bone .".split()
# Tokenised reference corrections the reward is computed against.
references = [
    sentence.split()
    for sentence in (
        "$START He said in other words that the more fluoride may create damage in the human body , specifically the bone .",
        "$START He said , in other words , that more fluoride may create damage in the human body , specifically the bone .",
        "$START He said , in other words , that more fluoride may create damage to the human body , specifically the bones .",
        "$START In other words , he said that more fluoride may damage the human body , specifically the bones .",
    )
]
# Greedy rollout: predict one label per token, apply the edits, score and show the step.
for i in range(1):
    action = greedy_action(sl_model, state, env.labels, verbose=True)
    labels = env.labels[action]
    new_state = decode(state, labels)
    reward = env.compute_reward(state, new_state, references)
    output = env.render_text(state, labels, reward, new_state, i)
    print(output)
    # The edited sentence becomes the input of the next step.
    state = new_state

Entropy: 0.617051 | Label: $START           | $KEEP [0.91,  9.35] -- $APPEND_The [0.01,  5.02] -- $APPEND_But [0.01,  4.53]
Entropy: 0.080073 | Label: he               | $TRANSFORM_CASE_CAPITAL [0.99, 12.58] -- $KEEP [0.01,  8.13] -- $REPLACE_He [0.00,  4.81]
Entropy: 0.892204 | Label: said             | $APPEND_, [0.71,  8.61] -- $KEEP [0.23,  7.46] -- $APPEND_. [0.02,  4.96]
Entropy: 1.111054 | Label: in               | $KEEP [0.62,  7.50] -- $TRANSFORM_CASE_CAPITAL [0.23,  6.50] -- $DELETE [0.12,  5.84]
Entropy: 0.635756 | Label: other            | $KEEP [0.88,  7.89] -- $DELETE [0.09,  5.59] -- $TRANSFORM_CASE_CAPITAL [0.01,  2.96]
Entropy: 1.050573 | Label: words            | $KEEP [0.74,  7.67] -- $APPEND_, [0.14,  6.00] -- $DELETE [0.05,  4.95]
Entropy: 0.642304 | Label: that             | $KEEP [0.86,  8.26] -- $DELETE [0.07,  5.82] -- $APPEND_, [0.03,  5.06]
Entropy: 1.241009 | Label: the              | $DELETE [0.47,  6.61] -- $KEEP [0.46,  6.58] -- $REPLACE_a [0.01,  2.16]
E

# RL model

In [9]:
# Tokenised source sentence the RL policy starts from.
state = "$START he said in other words that the more fluoride may create damage in human body , specifically the bone .".split()
# Tokenised reference corrections the reward is computed against.
references = [
    sentence.split()
    for sentence in (
        "$START He said in other words that the more fluoride may create damage in the human body , specifically the bone .",
        "$START He said , in other words , that more fluoride may create damage in the human body , specifically the bone .",
        "$START He said , in other words , that more fluoride may create damage to the human body , specifically the bones .",
        "$START In other words , he said that more fluoride may damage the human body , specifically the bones .",
    )
]
# Greedy rollout: predict one label per token, apply the edits, score and show the step.
for i in range(5):
    action = greedy_action(rl_model, state, env.labels, verbose=True)
    labels = env.labels[action]
    new_state = decode(state, labels)
    reward = env.compute_reward(state, new_state, references)
    output = env.render_text(state, labels, reward, new_state, i)
    print(output)
    # The edited sentence becomes the input of the next step.
    state = new_state

Entropy: 1.412778 | Label: $START           | $KEEP [0.76,  9.87] -- $APPEND_But [0.04,  6.80] -- $APPEND_That [0.02,  6.18]
Entropy: 1.558759 | Label: he               | $TRANSFORM_CASE_CAPITAL [0.64, 11.63] -- $DELETE [0.09,  9.67] -- $REPLACE_It [0.08,  9.60]
Entropy: 1.582957 | Label: said             | $KEEP [0.54,  9.86] -- $APPEND_, [0.20,  8.89] -- $DELETE [0.09,  8.07]
Entropy: 1.889009 | Label: in               | $KEEP [0.38,  9.15] -- $REPLACE_In [0.19,  8.44] -- $DELETE [0.17,  8.34]
Entropy: 0.918801 | Label: other            | $KEEP [0.82,  9.18] -- $DELETE [0.11,  7.15] -- $REPLACE_in [0.02,  5.35]
Entropy: 1.079268 | Label: words            | $KEEP [0.76,  9.22] -- $APPEND_, [0.12,  7.36] -- $DELETE [0.06,  6.74]
Entropy: 0.628331 | Label: that             | $KEEP [0.87, 10.59] -- $DELETE [0.10,  8.40] -- $APPEND_, [0.01,  6.45]
Entropy: 2.106235 | Label: the              | $DELETE [0.59,  8.52] -- $KEEP [0.24,  7.64] -- $APPEND_. [0.01,  3.88]
Entropy: 0.945120 | Label

In [10]:
# Inspect the labels chosen at the last rollout step: `action` is the value left
# over from the final iteration of the loop in the previous cell.
labels = env.labels[action]
labels

chararray(['$KEEP', '$KEEP', '$KEEP', '$KEEP', '$KEEP', '$KEEP', '$KEEP',
           '$KEEP', '$KEEP', '$KEEP', '$KEEP', '$KEEP', '$KEEP', '$KEEP',
           '$KEEP', '$KEEP', '$KEEP', '$KEEP', '$KEEP', '$KEEP', '$KEEP',
           '$KEEP'], dtype='<U29')

In [11]:
# True when every token of the last step was left unchanged.
bool(np.all(labels == "$KEEP"))

True

In [12]:
# Tokenised source sentence the SL policy starts from.
state = "$START Tigers is cold blooded animals .".split()
# Tokenised reference corrections the reward is computed against.
references = [
    sentence.split()
    for sentence in (
        "$START Tigers are cold-blooded animals .",
        "$START Tigers is a cold-blooded animal .",
    )
]
# Greedy rollout without the per-token diagnostics; only the rendered steps are shown.
for i in range(3):
    action = greedy_action(sl_model, state, env.labels, verbose=False)
    labels = env.labels[action]
    new_state = decode(state, labels)
    reward = env.compute_reward(state, new_state, references)
    output = env.render_text(state, labels, reward, new_state, i)
    print(output)
    # The edited sentence becomes the input of the next step.
    state = new_state

[37;1mTimestep:[0m 0  
[37;1mRewards:[0m 0.348  
[37;1mSource:[0m $START Tigers [32;1mis[0m [[31;1m$REPLACE_are[0m] cold blooded animals .  
[37;1mOutput:[0m $START Tigers are cold blooded animals .  

[37;1mTimestep:[0m 1  
[37;1mRewards:[0m -0.007  
[37;1mSource:[0m $START Tigers are [32;1mcold[0m [[31;1m$APPEND_-[0m] blooded animals .  
[37;1mOutput:[0m $START Tigers are cold - blooded animals .  

[37;1mTimestep:[0m 2  
[37;1mRewards:[0m -0.100  
[37;1mSource:[0m $START Tigers are cold - blooded animals .  
[37;1mOutput:[0m $START Tigers are cold - blooded animals .  



In [13]:
# Tokenised source sentence the RL policy starts from.
state = "$START Tigers is cold blooded animals .".split()
# Tokenised reference corrections the reward is computed against.
references = [
    sentence.split()
    for sentence in (
        "$START Tigers are cold-blooded animals .",
        "$START Tigers is a cold-blooded animal .",
    )
]
# Greedy rollout without the per-token diagnostics; only the rendered steps are shown.
for i in range(3):
    action = greedy_action(rl_model, state, env.labels, verbose=False)
    labels = env.labels[action]
    new_state = decode(state, labels)
    reward = env.compute_reward(state, new_state, references)
    output = env.render_text(state, labels, reward, new_state, i)
    print(output)
    # The edited sentence becomes the input of the next step.
    state = new_state

[37;1mTimestep:[0m 0  
[37;1mRewards:[0m 0.139  
[37;1mSource:[0m $START Tigers [32;1mis[0m [[31;1m$REPLACE_are[0m] [32;1mcold[0m [[31;1m$APPEND_-[0m] blooded animals .  
[37;1mOutput:[0m $START Tigers are cold - blooded animals .  

[37;1mTimestep:[0m 1  
[37;1mRewards:[0m -0.100  
[37;1mSource:[0m $START Tigers are cold - blooded animals .  
[37;1mOutput:[0m $START Tigers are cold - blooded animals .  

[37;1mTimestep:[0m 2  
[37;1mRewards:[0m -0.100  
[37;1mSource:[0m $START Tigers are cold - blooded animals .  
[37;1mOutput:[0m $START Tigers are cold - blooded animals .  



# Model scores

In [14]:
# Run directory of every training setup being compared (resolved against the
# current working directory).
model_dict = {
    "Pretrain": os.path.abspath("sl_logs/pretrain_synthetic_18:10:2022_13:59/"),
    "Pretrain + SL Fine-Tune": os.path.abspath("sl_logs/finetune_wi+locness_18:10:2022_21:42"),
    "Pretrain + RL Fine-Tune": os.path.abspath("pg_logs/finetune_rl_22_10_2022_01:15"),
    "Pretrain + SL Fine-Tune + RL Fine-Tune": os.path.abspath("pg_logs/finetune_rl_23_10_2022_00:33"),
}

metric_columns = ["Precision", "Recall", "F-0.5 Score"]
results = []
for model_name, model_path in model_dict.items():
    dataset_path = os.path.join(model_path, "conll", "conll_test.score")
    data = load_text(dataset_path)
    # The last three lines are read as "<name>: <value>" and unpacked as
    # precision, recall, F0.5. Parse them to floats so the max-highlight below
    # compares numbers instead of strings (float() also drops stray whitespace).
    p, r, f = (float(line.split(": ")[1]) for line in data[-3:])
    results.append({"Model": model_name, "Precision": p, "Recall": r, "F-0.5 Score": f})
conll_df = pd.DataFrame(results)
# Fixed 4-decimal display keeps the table compact now that the metrics are floats.
conll_df.style.format("{:.4f}", subset=metric_columns).highlight_max(subset=metric_columns, color='lightgreen', axis=0)

Unnamed: 0,Model,Precision,Recall,F-0.5 Score
0,Pretrain,0.6074,0.2958,0.5017
1,Pretrain + SL Fine-Tune,0.6561,0.4372,0.5964
2,Pretrain + RL Fine-Tune,0.689,0.3784,0.5918
3,Pretrain + SL Fine-Tune + RL Fine-Tune,0.6842,0.3593,0.5794


In [15]:
# Run directory of every training setup being compared (resolved against the
# current working directory).
model_dict = {
    "Pretrain": os.path.abspath("sl_logs/pretrain_synthetic_18:10:2022_13:59/"),
    "Pretrain + SL Fine-Tune": os.path.abspath("sl_logs/finetune_wi+locness_18:10:2022_21:42"),
    "Pretrain + RL Fine-Tune": os.path.abspath("pg_logs/finetune_rl_22_10_2022_01:15"),
    "Pretrain + SL Fine-Tune + RL Fine-Tune": os.path.abspath("pg_logs/finetune_rl_23_10_2022_00:33"),
}

results = []
for model_name, model_path in model_dict.items():
    # Per-model table row. It has its own name so the `model_dict` path mapping
    # being iterated is not rebound inside the loop and stays intact afterwards.
    row = {"Model": model_name}
    for score_type in ("dev", "test"):
        dataset_path = os.path.join(model_path, "jfleg", f"jfleg_{score_type}.score")
        data = load_text(dataset_path)
        # NOTE(review): eval() executes whatever the last line of the score file
        # contains. Acceptable only for locally produced files; consider
        # ast.literal_eval if that line is always a plain Python literal.
        score_list = eval(data[-1])
        # The first element of the first entry is taken as the score.
        row[f"{score_type.title()} Score"] = score_list[0][0]
    results.append(row)
jfleg_df = pd.DataFrame(results)
jfleg_df.style.highlight_max(subset=["Dev Score", "Test Score"], color='lightgreen', axis=0)

Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain,0.51141,0.538118
1,Pretrain + SL Fine-Tune,0.543455,0.59069
2,Pretrain + RL Fine-Tune,0.532699,0.576475
3,Pretrain + SL Fine-Tune + RL Fine-Tune,0.528306,0.575681
