In [1]:
import os
import pandas as pd
from IPython.display import display

In [2]:
%cd ..
from src.utils import load_text
%cd notebooks

/home/rajk/Machine_Learning/DRL-GEC
/home/rajk/Machine_Learning/DRL-GEC/notebooks


# Local function

In [3]:
def gen_conll_report(model_info):
    results = []
    for model_name, (model_path, model_type) in model_info.items():
        dataset_path = os.path.join(model_path, "conll", f"{model_type}.score")
        data = load_text(dataset_path)
        correct, predicted, total = (eval(line.split(": ")[1]) for line in data[-9:-6])
        p, r, f = (eval(line.split(": ")[1]) for line in data[-3:])
        results.append({
            "Model": model_name, 
            "Correct": correct,  
            "Predicted": predicted,  
            "Total Error": total, 
            "Precision": p, 
            "Recall": r, 
            "F-0.5 Score": f,
        })
    conll_df = pd.DataFrame(results)
    highlight_columns = conll_df.columns.to_list()
    highlight_columns.remove("Model")
    display(conll_df.style.highlight_max(subset=highlight_columns, color='lightgreen', axis=0))
    return conll_df

def gen_jfleg_report(model_info):
    results = []
    for model_name, (model_path, model_type) in model_info.items():
        model_results = {"Model": model_name}
        for score_type in ("dev", "test"):
            dataset_path = os.path.join(model_path, "jfleg", f"{model_type}_{score_type}.score")
            data = load_text(dataset_path)
            score_list = eval(data[-1])
            model_results[f"{score_type.title()} Score"] = score_list[0][0]
        results.append(model_results)
    jfleg_df = pd.DataFrame(results)
    display(jfleg_df.style.highlight_max(subset=["Dev Score", "Test Score"], color='lightgreen', axis=0))
    return jfleg_df

## Model finetuned via SL and RL

In [4]:
model_dict = {
    "Pretrain": (os.path.abspath("sl_logs/pretrain_synthetic_31:10:2022_09:39/"), "model-best"),
    "Pretrain + SL Fine-Tune": (os.path.abspath("sl_logs/finetune_wi+locness_01:11:2022_12:09/"), "model-best"),
    "Pretrain + RL Fine-Tune": (os.path.abspath("pg_logs/finetune_rl_01_11_2022_13:41/"), "model-best"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain,907,1651,2533,0.5494,0.3581,0.4963
1,Pretrain + SL Fine-Tune,1247,1870,2752,0.6668,0.4531,0.6094
2,Pretrain + RL Fine-Tune,929,1369,2500,0.6786,0.3716,0.5824


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain,0.512599,0.537839
1,Pretrain + SL Fine-Tune,0.545552,0.591531
2,Pretrain + RL Fine-Tune,0.530688,0.572043


## Model pretained on different amount of synthetic data 

In [5]:
model_dict = {
    "Pretrain (0.5K)": (os.path.abspath("sl_logs/pretrain_synthetic_31:10:2022_09:39/"), "model-best"),
    "Pretrain (2M)": (os.path.abspath("sl_logs/pretrain_synthetic_31:10:2022_19:25/"), "model-best"),
    "Pretrain (0.5K) + SL Fine-Tune": (os.path.abspath("sl_logs/finetune_wi+locness_03:11:2022_00:32/"), "model-best"),
    "Pretrain (2M) + SL Fine-Tune": (os.path.abspath("sl_logs/finetune_wi+locness_02:11:2022_23:06/"), "model-best"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain (0.5K),907,1651,2533,0.5494,0.3581,0.4963
1,Pretrain (2M),1036,1790,2619,0.5788,0.3956,0.5297
2,Pretrain (0.5K) + SL Fine-Tune,1279,1979,2793,0.6463,0.4579,0.5972
3,Pretrain (2M) + SL Fine-Tune,1352,2088,2829,0.6475,0.4779,0.6046


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain (0.5K),0.512599,0.537839
1,Pretrain (2M),0.520805,0.556833
2,Pretrain (0.5K) + SL Fine-Tune,0.541262,0.586046
3,Pretrain (2M) + SL Fine-Tune,0.546813,0.594995


## Model with different success reward

- 0.9 explore coefficient

In [6]:
model_dict = {
    "Pretrain + SL Fine-Tune": (os.path.abspath("sl_logs/finetune_wi+locness_01:11:2022_12:09/"), "model-best"),
    "Pretrain + RL Fine-Tune (100 interval + 0.1 Success)": (os.path.abspath("pg_logs/finetune_rl_02_11_2022_13:10/"), "model-last"),
    "Pretrain + RL Fine-Tune (100 interval + 0.5 Success)": (os.path.abspath("pg_logs/finetune_rl_02_11_2022_17:24/"), "model-last"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain + SL Fine-Tune,1247,1870,2752,0.6668,0.4531,0.6094
1,Pretrain + RL Fine-Tune (100 interval + 0.1 Success),1167,1868,2694,0.6247,0.4332,0.574
2,Pretrain + RL Fine-Tune (100 interval + 0.5 Success),924,1364,2512,0.6774,0.3678,0.5798


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain + SL Fine-Tune,0.545552,0.591531
1,Pretrain + RL Fine-Tune (100 interval + 0.1 Success),0.538892,0.582487
2,Pretrain + RL Fine-Tune (100 interval + 0.5 Success),0.530445,0.569613


# Model with different explore coefficient

- +0.1 for success

### Model-Best

In [7]:
model_dict = {
    "Pretrain + SL Fine-Tune": (os.path.abspath("sl_logs/finetune_wi+locness_01:11:2022_12:09/"), "model-best"),
    "Pretrain + RL Fine-Tune (0.1 explore)": (os.path.abspath("pg_logs/finetune_rl_03_11_2022_14:31/"), "model-best"),
    "Pretrain + RL Fine-Tune (0.5 explore)": (os.path.abspath("pg_logs/finetune_rl_03_11_2022_10:20/"), "model-best"),
    "Pretrain + RL Fine-Tune (0.9 explore)": (os.path.abspath("pg_logs/finetune_rl_02_11_2022_13:10/"), "model-best"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain + SL Fine-Tune,1247,1870,2752,0.6668,0.4531,0.6094
1,Pretrain + RL Fine-Tune (0.1 explore),1127,1709,2678,0.6594,0.4208,0.5923
2,Pretrain + RL Fine-Tune (0.5 explore),1122,1707,2636,0.6573,0.4256,0.5928
3,Pretrain + RL Fine-Tune (0.9 explore),1060,1614,2651,0.6568,0.3998,0.582


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain + SL Fine-Tune,0.545552,0.591531
1,Pretrain + RL Fine-Tune (0.1 explore),0.540174,0.582753
2,Pretrain + RL Fine-Tune (0.5 explore),0.538418,0.582714
3,Pretrain + RL Fine-Tune (0.9 explore),0.534798,0.578188


### Model-Last

In [8]:
model_dict = {
    "Pretrain + SL Fine-Tune": (os.path.abspath("sl_logs/finetune_wi+locness_01:11:2022_12:09/"), "model-best"),
    "Pretrain + RL Fine-Tune (0.1 explore)": (os.path.abspath("pg_logs/finetune_rl_03_11_2022_14:31/"), "model-last"),
    "Pretrain + RL Fine-Tune (0.5 explore)": (os.path.abspath("pg_logs/finetune_rl_03_11_2022_10:20/"), "model-last"),
    "Pretrain + RL Fine-Tune (0.9 explore)": (os.path.abspath("pg_logs/finetune_rl_02_11_2022_13:10/"), "model-last"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain + SL Fine-Tune,1247,1870,2752,0.6668,0.4531,0.6094
1,Pretrain + RL Fine-Tune (0.1 explore),1258,2016,2759,0.624,0.456,0.5812
2,Pretrain + RL Fine-Tune (0.5 explore),1177,1846,2701,0.6376,0.4358,0.5835
3,Pretrain + RL Fine-Tune (0.9 explore),1167,1868,2694,0.6247,0.4332,0.574


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain + SL Fine-Tune,0.545552,0.591531
1,Pretrain + RL Fine-Tune (0.1 explore),0.545541,0.589471
2,Pretrain + RL Fine-Tune (0.5 explore),0.541086,0.587938
3,Pretrain + RL Fine-Tune (0.9 explore),0.538892,0.582487


# Model with different explore coefficient (new vs old methods)

- +0.1 for success

| Version | Description |
| ------- | ----------- |
| v0      | Exploit = Sample from Candidate Labels with their logits as probs |
| v1      | Exploit = Use logits.argmax() |
| v2      | Exploit = Use candidate_logits.argmax(). Also insert `$KEEP` in the labels |
| v3      | Exploit = Sample from mean(candidate_probs + lev_dist_probs). Also insert `$KEEP` in the labels |


### Model-Best

In [9]:
model_dict = {
    "Pretrain + SL Fine-Tune": (os.path.abspath("sl_logs/finetune_wi+locness_01:11:2022_12:09/"), "model-best"),
    "Pretrain + RL Fine-Tune (v0 0.5 explore)": (os.path.abspath("pg_logs/finetune_rl_03_11_2022_10:20/"), "model-best"),
    "Pretrain + RL Fine-Tune (v1 0.5 explore)": (os.path.abspath("pg_logs/finetune_rl_06_11_2022_12:24/"), "model-best"),
    "Pretrain + RL Fine-Tune (v2 0.5 explore)": (os.path.abspath("pg_logs/finetune_rl_06_11_2022_15:43/"), "model-best"),
    "Pretrain + RL Fine-Tune (v3 0.5 explore)": (os.path.abspath("pg_logs/finetune_rl_07_11_2022_18:49/"), "model-best"),
    "Pretrain + RL Fine-Tune (v0 0.9 explore)": (os.path.abspath("pg_logs/finetune_rl_02_11_2022_13:10/"), "model-best"),
    "Pretrain + RL Fine-Tune (v1 0.9 explore)": (os.path.abspath("pg_logs/finetune_rl_05_11_2022_14:39/"), "model-best"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain + SL Fine-Tune,1247,1870,2752,0.6668,0.4531,0.6094
1,Pretrain + RL Fine-Tune (v0 0.5 explore),1122,1707,2636,0.6573,0.4256,0.5928
2,Pretrain + RL Fine-Tune (v1 0.5 explore),809,1256,2444,0.6441,0.331,0.5416
3,Pretrain + RL Fine-Tune (v2 0.5 explore),821,1271,2466,0.6459,0.3329,0.5437
4,Pretrain + RL Fine-Tune (v3 0.5 explore),895,1337,2552,0.6694,0.3507,0.5665
5,Pretrain + RL Fine-Tune (v0 0.9 explore),1060,1614,2651,0.6568,0.3998,0.582
6,Pretrain + RL Fine-Tune (v1 0.9 explore),761,1165,2426,0.6532,0.3137,0.537


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain + SL Fine-Tune,0.545552,0.591531
1,Pretrain + RL Fine-Tune (v0 0.5 explore),0.538418,0.582714
2,Pretrain + RL Fine-Tune (v1 0.5 explore),0.523478,0.559272
3,Pretrain + RL Fine-Tune (v2 0.5 explore),0.52307,0.55848
4,Pretrain + RL Fine-Tune (v3 0.5 explore),0.519709,0.548155
5,Pretrain + RL Fine-Tune (v0 0.9 explore),0.534798,0.578188
6,Pretrain + RL Fine-Tune (v1 0.9 explore),0.520957,0.553254


### Model-Last

In [10]:
model_dict = {
    "Pretrain + SL Fine-Tune": (os.path.abspath("sl_logs/finetune_wi+locness_01:11:2022_12:09/"), "model-best"),
    "Pretrain + RL Fine-Tune (v0 0.5 explore)": (os.path.abspath("pg_logs/finetune_rl_03_11_2022_10:20/"), "model-last"),
    "Pretrain + RL Fine-Tune (v1 0.5 explore)": (os.path.abspath("pg_logs/finetune_rl_06_11_2022_12:24/"), "model-last"),
    "Pretrain + RL Fine-Tune (v2 0.5 explore)": (os.path.abspath("pg_logs/finetune_rl_06_11_2022_15:43/"), "model-last"),
    "Pretrain + RL Fine-Tune (v3 0.5 explore)": (os.path.abspath("pg_logs/finetune_rl_07_11_2022_18:49/"), "model-last"),
    "Pretrain + RL Fine-Tune (v0 0.9 explore)": (os.path.abspath("pg_logs/finetune_rl_02_11_2022_13:10/"), "model-last"),
    "Pretrain + RL Fine-Tune (v1 0.9 explore)": (os.path.abspath("pg_logs/finetune_rl_05_11_2022_14:39/"), "model-last"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain + SL Fine-Tune,1247,1870,2752,0.6668,0.4531,0.6094
1,Pretrain + RL Fine-Tune (v0 0.5 explore),1177,1846,2701,0.6376,0.4358,0.5835
2,Pretrain + RL Fine-Tune (v1 0.5 explore),813,1273,2451,0.6386,0.3317,0.5389
3,Pretrain + RL Fine-Tune (v2 0.5 explore),821,1271,2466,0.6459,0.3329,0.5437
4,Pretrain + RL Fine-Tune (v3 0.5 explore),929,1393,2554,0.6669,0.3637,0.5716
5,Pretrain + RL Fine-Tune (v0 0.9 explore),1167,1868,2694,0.6247,0.4332,0.574
6,Pretrain + RL Fine-Tune (v1 0.9 explore),804,1237,2450,0.65,0.3282,0.5434


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain + SL Fine-Tune,0.545552,0.591531
1,Pretrain + RL Fine-Tune (v0 0.5 explore),0.541086,0.587938
2,Pretrain + RL Fine-Tune (v1 0.5 explore),0.523813,0.55812
3,Pretrain + RL Fine-Tune (v2 0.5 explore),0.52307,0.55848
4,Pretrain + RL Fine-Tune (v3 0.5 explore),0.520464,0.557786
5,Pretrain + RL Fine-Tune (v0 0.9 explore),0.538892,0.582487
6,Pretrain + RL Fine-Tune (v1 0.9 explore),0.522644,0.55513


# Model with different false negative rewards

- 0.5 explore coefficient

### Model-Best

In [11]:
model_dict = {
    "Pretrain + SL Fine-Tune": (os.path.abspath("sl_logs/finetune_wi+locness_01:11:2022_12:09/"), "model-best"),
    "Pretrain + RL Fine-Tune (-0.1 fn reward)": (os.path.abspath("pg_logs/finetune_rl_03_11_2022_10:20/"), "model-best"),
    "Pretrain + RL Fine-Tune (0.0 fn reward)": (os.path.abspath("pg_logs/finetune_rl_07_11_2022_13:51/"), "model-best"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain + SL Fine-Tune,1247,1870,2752,0.6668,0.4531,0.6094
1,Pretrain + RL Fine-Tune (-0.1 fn reward),1122,1707,2636,0.6573,0.4256,0.5928
2,Pretrain + RL Fine-Tune (0.0 fn reward),784,1119,2465,0.7006,0.3181,0.5648


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain + SL Fine-Tune,0.545552,0.591531
1,Pretrain + RL Fine-Tune (-0.1 fn reward),0.538418,0.582714
2,Pretrain + RL Fine-Tune (0.0 fn reward),0.512721,0.547989


### Model-Last

In [12]:
model_dict = {
    "Pretrain + SL Fine-Tune": (os.path.abspath("sl_logs/finetune_wi+locness_01:11:2022_12:09/"), "model-best"),
    "Pretrain + RL Fine-Tune (-0.1 fn reward)": (os.path.abspath("pg_logs/finetune_rl_03_11_2022_10:20/"), "model-last"),
    "Pretrain + RL Fine-Tune (0.0 fn reward)": (os.path.abspath("pg_logs/finetune_rl_07_11_2022_13:51/"), "model-last"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain + SL Fine-Tune,1247,1870,2752,0.6668,0.4531,0.6094
1,Pretrain + RL Fine-Tune (-0.1 fn reward),1177,1846,2701,0.6376,0.4358,0.5835
2,Pretrain + RL Fine-Tune (0.0 fn reward),970,1439,2575,0.6741,0.3767,0.5822


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain + SL Fine-Tune,0.545552,0.591531
1,Pretrain + RL Fine-Tune (-0.1 fn reward),0.541086,0.587938
2,Pretrain + RL Fine-Tune (0.0 fn reward),0.521098,0.557999


# Model with different update interval

- 0.5 explore coefficient
- 0.0 fn reward

### Model-Best

In [16]:
model_dict = {
    "Pretrain + RL Fine-Tune (50 interval)": (os.path.abspath("pg_logs/finetune_rl_08_11_2022_12:47/"), "model-best"),
    "Pretrain + RL Fine-Tune (100 interval)": (os.path.abspath("pg_logs/finetune_rl_07_11_2022_13:51/"), "model-best"),
    "Pretrain + RL Fine-Tune (500 interval)": (os.path.abspath("pg_logs/finetune_rl_08_11_2022_16:39/"), "model-best"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain + RL Fine-Tune (50 interval),696,997,2385,0.6981,0.2918,0.5461
1,Pretrain + RL Fine-Tune (100 interval),784,1119,2465,0.7006,0.3181,0.5648
2,Pretrain + RL Fine-Tune (500 interval),804,1168,2473,0.6884,0.3251,0.5626


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain + RL Fine-Tune (50 interval),0.508417,0.527804
1,Pretrain + RL Fine-Tune (100 interval),0.512721,0.547989
2,Pretrain + RL Fine-Tune (500 interval),0.513188,0.545174


### Model-Last

In [15]:
model_dict = {
    "Pretrain + RL Fine-Tune (50 interval)": (os.path.abspath("pg_logs/finetune_rl_08_11_2022_12:47/"), "model-last"),
    "Pretrain + RL Fine-Tune (100 interval)": (os.path.abspath("pg_logs/finetune_rl_07_11_2022_13:51/"), "model-last"),
    "Pretrain + RL Fine-Tune (500 interval)": (os.path.abspath("pg_logs/finetune_rl_08_11_2022_16:39/"), "model-last"),
}

conll_df = gen_conll_report(model_dict)
jfleg_df = gen_jfleg_report(model_dict)

Unnamed: 0,Model,Correct,Predicted,Total Error,Precision,Recall,F-0.5 Score
0,Pretrain + RL Fine-Tune (50 interval),929,1413,2587,0.6575,0.3591,0.5638
1,Pretrain + RL Fine-Tune (100 interval),970,1439,2575,0.6741,0.3767,0.5822
2,Pretrain + RL Fine-Tune (500 interval),926,1367,2551,0.6774,0.363,0.5774


Unnamed: 0,Model,Dev Score,Test Score
0,Pretrain + RL Fine-Tune (50 interval),0.521022,0.558627
1,Pretrain + RL Fine-Tune (100 interval),0.521098,0.557999
2,Pretrain + RL Fine-Tune (500 interval),0.521452,0.550333
