In [None]:
import re
from pathlib import Path

import pandas as pd
import spacy

In [None]:
# Define file system
# input_dir:  folder that is scanned for the corpus "*.csv" files.
# output_dir: folder where the generated .txt files are written
#             (the labeled file for the seq2seq step is also read from here).

input_dir = "COWS-L2H-CSV"
output_dir = "."

In [None]:
# Load the "essay" / "corrected1" columns of every CSV in input_dir into one frame.
essay_columns = ["essay", "corrected1"]

# Read all files first and concatenate once: calling pd.concat inside the loop
# re-copies the accumulated frame on every iteration.
# sorted() makes the row order independent of the file system's listing order.
csv_frames = [
    pd.read_csv(csv_path)[essay_columns]
    for csv_path in sorted(Path(input_dir).glob("*.csv"))
]

if csv_frames:
    # Drop rows where either text is missing, then renumber so that index
    # labels are unique across the source files.
    df = pd.concat(csv_frames, ignore_index=True).dropna().reset_index(drop=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the
    # expected columns, as the original loop produced when no CSV was found.
    df = pd.DataFrame(columns=essay_columns)

In [None]:
# Display the combined essay / corrected1 table (rows with missing values already dropped).
df

In [None]:
def seems_like(text1, text2):
    '''
    Naive and quick check of whether two sentences are versions of each other.

    Each argument is either an iterable of tokens exposing ``lemma_`` or a
    plain string, which is first parsed with the module-level ``nlp`` pipeline.

    Returns True when
      * both sentences have fewer than 4 distinct lemmas, or
      * at least 50% of the lemmas of the shorter sentence (by distinct lemma
        count) also occur in the other one.
    Returns False otherwise, including when one sentence has no lemmas at all
    and the other one is not short.
    '''
    def lemma_set(text):
        # Plain strings have no tokens yet, so run them through the pipeline first.
        if isinstance(text, str):
            text = nlp(text)
        return {token.lemma_ for token in text}

    words1 = lemma_set(text1)
    words2 = lemma_set(text2)

    # Very short sentences are always accepted; testing this first also keeps
    # an empty sentence from reaching the division below.
    if len(words1) < 4 and len(words2) < 4:
        return True

    shorter = min(len(words1), len(words2))
    if shorter == 0:
        # Nothing to overlap with: the ratio is undefined, treat as "no match".
        return False
    return len(words1 & words2) / shorter >= 0.5
    
def row_to_sentence_pairs(row):
    '''
    Split one essay and its correction into aligned (errorful, corrected) sentence pairs.

    row: mapping with the string fields "essay" and "corrected1".

    Both texts are whitespace-normalised, the anonymisation placeholders
    (*AGE*, *CITY*, ...) are replaced by Spanish nouns, and the texts are
    split into sentences with the module-level ``nlp`` pipeline. The two
    sentence lists are then walked in parallel:
      * sentences that ``seems_like`` each other (or the very first pair) are paired;
      * otherwise a sentence that matches the other side of the previous pair is
        merged into that pair (handles one sentence split into two on one side);
      * if neither works, both candidates are dropped and reported.
    Leftover sentences on one side are merged into the last pair.

    Returns a list of 2-tuples; an element is a spaCy sentence span when it was
    paired as-is and a str when sentences were merged.

    Side effects: increments the module-level ``skipped_sentences`` counter
    (which must exist before the call) and prints diagnostics.
    '''
    global skipped_sentences

    # Turn every whitespace character (newlines, tabs, ...) into a plain space.
    clean_errorful = re.sub(r"\s", ' ', row["essay"]).strip()
    clean_corrected = re.sub(r"\s", ' ', row["corrected1"]).strip()

    # The placeholders are literal text, so plain str.replace is sufficient.
    replacements = [("*AGE*", "edad"), ("*CITY*", "ciudad"), ("*STATE*", "estado"),
                    ("*BIRTH_DATE*", "fecha de nacimiento"), ("*UNIVERSITY*", "universidad"),
                    ("*PLACE*", "lugar"), ("*FIRST_NAME*", "nombre"), ("*LAST_NAME*", "apellido"),
                    ("*NUMBER*", "número")]
    for placeholder, substitute in replacements:
        clean_errorful = clean_errorful.replace(placeholder, substitute)
        clean_corrected = clean_corrected.replace(placeholder, substitute)
    if "*" in clean_errorful:  # Check if there are any remaining special tokens
        print(clean_errorful)

    # Whitespace-only "sentences" carry no content; dropping them up front keeps
    # them out of the alignment and out of the merged strings.
    errorful_sentences = [sent for sent in nlp(clean_errorful).sents if str(sent).strip()]
    corrected_sentences = [sent for sent in nlp(clean_corrected).sents if str(sent).strip()]

    paired = []
    i = 0
    j = 0
    while i < len(errorful_sentences) or j < len(corrected_sentences):
        if i >= len(errorful_sentences):
            # Errorful side exhausted: attach the remaining corrected text to the last pair.
            if paired:
                paired[-1] = (paired[-1][0], f"{paired[-1][1]} {corrected_sentences[j]}")
            else:
                # No pair exists to attach to (the other text had no sentences at all).
                skipped_sentences += 1
            j += 1
        elif j >= len(corrected_sentences):
            # Corrected side exhausted: attach the remaining errorful text to the last pair.
            if paired:
                paired[-1] = (f"{paired[-1][0]} {errorful_sentences[i]}", paired[-1][1])
            else:
                skipped_sentences += 1
            i += 1
        elif not paired or seems_like(errorful_sentences[i], corrected_sentences[j]):
            # The first pair is always accepted so later sentences have an anchor.
            paired.append((errorful_sentences[i], corrected_sentences[j]))
            i += 1
            j += 1
        elif seems_like(errorful_sentences[i], paired[-1][1]):
            # The errorful candidate continues the previous corrected sentence.
            paired[-1] = (f"{paired[-1][0]} {errorful_sentences[i]}", paired[-1][1])
            i += 1
        elif seems_like(corrected_sentences[j], paired[-1][0]):
            # The corrected candidate continues the previous errorful sentence.
            paired[-1] = (paired[-1][0], f"{paired[-1][1]} {corrected_sentences[j]}")
            j += 1
        else:
            print("====================\nFailure to resolve sentence position. Ignoring it.")
            print(f"Current Errorful Candidate: {errorful_sentences[i]}")
            print(f"Current Corrected Candidate: {corrected_sentences[j]}")
            print(f"Last Errorful Sentence: {paired[-1][0]}")
            print(f"Last Corrected Sentence: {paired[-1][1]}")
            skipped_sentences += 1
            i += 1
            j += 1
    print(f"Skipped sentences: {skipped_sentences}")
    return paired

In [None]:
# Load the Spanish pipeline used by seems_like / row_to_sentence_pairs and
# reset the global counter those functions update.
nlp = spacy.load("es_dep_news_trf")
skipped_sentences = 0

# One list of (errorful, corrected) pairs per essay.
# A plain loop is used instead of df.apply(axis=1): on an empty frame apply
# returns a DataFrame, and iterating that yields column names, not pair lists.
sentences = [row_to_sentence_pairs(row) for _, row in df.iterrows()]
print(f"Skipped sentences: {skipped_sentences}")

In [None]:
# Flatten the per-essay lists into one list of (errorful, corrected) pairs.
sentence_pairs = [sentence_pair for group in sentences for sentence_pair in group]

# The text contains accented characters, so the encoding is set explicitly
# instead of relying on the platform default.
# Pairs file: errorful line, corrected line, blank separator line.
with open(Path(output_dir) / "COWS-L2H-sentence-pairs.txt", 'w', encoding="utf-8") as f:
    for sentence_pair in sentence_pairs:
        f.write(f"{sentence_pair[0]}\n{sentence_pair[1]}\n\n")
# Corrected-only file: one corrected sentence per line.
with open(Path(output_dir) / "COWS-L2H-only-corrected.txt", 'w', encoding="utf-8") as f:
    for sentence_pair in sentence_pairs:
        f.write(f"{sentence_pair[1]}\n")

In [None]:
# For seq2seq model
# Strip the token-label line from the labeled file, keeping only the
# errorful / corrected sentence of every record.
# The input is read in records of 4 lines: errorful sentence, token labels,
# corrected sentence, and a 4th line that is ignored (presumably a blank
# separator - verify against the tool that writes the labeled file).

labeled_path = Path(output_dir) / "COWS-L2H-labeled-STRICT.txt"
unlabeled_path = Path(output_dir) / "COWS-L2H-unlabeled-STRICT.txt"

# Read the input before opening the output, so a missing input file does not
# leave an empty, truncated output file behind.
with open(labeled_path, 'r', encoding="utf-8") as in_f:
    lines = [line.rstrip("\n") for line in in_f]

with open(unlabeled_path, 'w', encoding="utf-8") as out_f:
    # Stop at the last record that still has its corrected sentence, so a
    # trailing partial record cannot raise IndexError.
    for i in range(0, len(lines) - 2, 4):
        errorful_sentence = lines[i]
        correct_sentence = lines[i + 2]

        # Newlines are written explicitly, so a missing final newline in the
        # input cannot glue the last record together.
        out_f.write(f"{errorful_sentence}\n{correct_sentence}\n\n")