In [1]:
import os
from tqdm.auto import tqdm 
from typing import Union, Dict
from collections import defaultdict

In [2]:
%cd ..
import m2_to_json
from m2_to_json import Speller
from src.utils import write_json
%cd notebooks

/home/rajk/Machine_Learning/DRL-GEC
/home/rajk/Machine_Learning/DRL-GEC/notebooks


In [3]:
def read_parallel(path_a, path_b, encoding="utf-8", strict=False):
    """
    Lazily iterate over two text files in lockstep.

    Args:
        path_a: Path of the first file.
        path_b: Path of the second file.
        encoding: Text encoding used to open both files. Defaults to UTF-8 so
            that decoding does not depend on the platform locale; pass ``None``
            to fall back to the interpreter's default.
        strict: When ``False`` (default) iteration stops silently at the end of
            the shorter file. When ``True`` a ``ValueError`` is raised if the
            files do not contain the same number of lines.

    Yields:
        ``(line_a, line_b)`` tuples; the lines keep their trailing newline.

    Raises:
        ValueError: Only when ``strict`` is true and the line counts differ.
    """
    with open(path_a, "r", encoding=encoding) as fp_a, \
            open(path_b, "r", encoding=encoding) as fp_b:
        if not strict:
            yield from zip(fp_a, fp_b)
            return
        # Manual pairing: zip() would swallow one extra line of fp_a before
        # noticing that fp_b is exhausted, which makes a mismatch undetectable.
        for line_a in fp_a:
            line_b = fp_b.readline()
            if not line_b:
                raise ValueError(
                    f"'{path_b}' has fewer lines than '{path_a}'"
                )
            yield line_a, line_b
        if fp_b.readline():
            raise ValueError(
                f"'{path_a}' has fewer lines than '{path_b}'"
            )

In [4]:
def process_sent(
        text: str,
        reference: str,
        checker: Speller,
        min_len: int,
        max_len: int,
        min_sim: float,
        only_proper_sent: bool,
        spell_check: bool = False,
) -> Union[str, Dict[str, str]]:
    """
    Clean one source/reference sentence pair and decide whether to keep it.

    Args:
        text: Raw source (possibly erroneous) sentence.
        reference: Raw reference sentence paired with ``text``.
        checker: Spell checker handed to ``m2_to_json.correct_spelling``; only
            used when ``spell_check`` is true.
        min_len: Minimum number of whitespace-separated tokens the cleaned
            source must have.
        max_len: Maximum number of whitespace-separated tokens the cleaned
            source may have.
        min_sim: Lowest accepted ``m2_to_json.similar_ratio`` between the
            cleaned source and reference when they differ.
        only_proper_sent: If true, pairs whose cleaned reference fails
            ``m2_to_json.check_proper_sent`` are rejected.
        spell_check: If true, the cleaned source is passed through
            ``m2_to_json.correct_spelling``.

    Returns:
        A dict ``{"text": ..., "references": [...]}`` when the pair is kept,
        otherwise a string naming the filter that rejected it ("Ellipsis",
        "Less Tokens", "More Tokens", "Improper Sentence" or
        "Source-Reference Similarity").
    """
    # The ellipsis check runs on the raw source, before any cleaning.
    if m2_to_json.check_ellipsis(text):
        return "Ellipsis"

    # Clean the source side.
    text = m2_to_json.remove_parenthetical_text(
        m2_to_json.clean_text(text, is_ref=False)
    )

    # Length gate on the cleaned source (applied before optional spell fixing).
    token_count = len(text.split())
    if token_count < min_len:
        return "Less Tokens"
    if token_count > max_len:
        return "More Tokens"

    if spell_check:
        text = m2_to_json.correct_spelling(checker, text)

    # Clean the reference side, then optionally require a proper sentence.
    reference = m2_to_json.remove_parenthetical_text(
        m2_to_json.clean_text(reference, is_ref=True)
    )
    if only_proper_sent and not m2_to_json.check_proper_sent(reference):
        return "Improper Sentence"

    # Identical pairs skip the similarity computation altogether.
    if text != reference and m2_to_json.similar_ratio(text, reference) < min_sim:
        return "Source-Reference Similarity"

    return {"text": text, "references": [reference]}

In [6]:
# --- Run configuration -------------------------------------------------------
# Parallel input files: the incorrect (source) and correct (target) sentences
# are paired line by line by read_parallel.
src_path = r"../data/raw/synthetic/a1/a1_train_incorr_sentences.txt"
trg_path = r"../data/raw/synthetic/a1/a1_train_corr_sentences.txt"
# Output files for the train and dev splits.
train_json_path = r"../data/processed/synthetic/data.json"
dev_json_path = r"../data/processed/synthetic/dev.json"
# Filter thresholds forwarded to process_sent.
min_len = 5
max_len = 50
min_sim = 0.8
only_proper_sent = True
spell_check = False
# NOTE(review): this flag is only written to params.json by the processing
# cell; process_sent as shown applies its ellipsis filter unconditionally.
remove_ellipsis = True
# Processing stops once this many sentence pairs have been accepted.
data_limit = 2_000_000
# Size of the dev split; derived from data_limit, not from the number of pairs
# actually collected.
dev_percent = 0.02
dev_data_num = round(dev_percent*data_limit)

# All outputs live next to the train JSON; create that directory up front.
json_dir = os.path.dirname(train_json_path)
os.makedirs(json_dir, exist_ok=True)

In [7]:
# Run every sentence pair through process_sent, collecting accepted pairs in
# `json_data` and counting rejections per reason in `stats`.
json_data = []
stats = defaultdict(int)
checker = Speller(lang="en", fast=False, threshold=0)
sent_pairs = read_parallel(src_path, trg_path)
for (incorrect_sent, correct_sent) in tqdm(sent_pairs, desc="Processing", unit_scale=True):
    result = process_sent(incorrect_sent, correct_sent, checker, min_len, max_len, min_sim, only_proper_sent, spell_check)
    if isinstance(result, dict):
        json_data.append(result)
        if len(json_data) >= data_limit:
            break
    else:
        # A string result names the filter that rejected the pair.
        stats[result] += 1
# Breaking out early leaves the generator suspended with both input files
# open; closing it releases them now (no-op if it already finished).
sent_pairs.close()
num_data = len(json_data)

print(f"Number of sentences: {num_data}")
print("Report of filtered sentences.")
for key, value in stats.items():
    print(f"{key:>30}: {value}")

# Persist the settings of this run next to the data.
# NOTE(review): `remove_ellipsis` is recorded here, but process_sent as shown
# always applies its ellipsis filter regardless of this flag.
params = {
    "min_len": min_len,
    "max_len": max_len,
    "min_sim": min_sim,
    "spell_check": spell_check,
    "remove_ellipsis": remove_ellipsis,
    "only_proper_sent": only_proper_sent,
}
write_json(os.path.join(json_dir, "params.json"), params)
write_json(os.path.join(json_dir, "metadata.json"), stats)

# Train/dev split: the last `num_dev` accepted pairs form the dev set.
# The dev size is scaled to the number of pairs actually collected (capped by
# the configured dev_data_num) so that a corpus smaller than data_limit does
# not yield an oversized dev set or an empty train set. Splitting on an
# explicit index also avoids the `[:-0]` / `[-0:]` pitfall, which would put
# everything in dev and nothing in train when the dev size is zero.
num_dev = min(dev_data_num, round(dev_percent * num_data))
split_idx = num_data - num_dev
write_json(train_json_path, json_data[:split_idx])
write_json(dev_json_path, json_data[split_idx:])