In [1]:
import pandas as pd
import os
from tqdm import tqdm

def verify_no_overlap(df1, df2):
    """Count the row pairs of two frames that share the same 'original' value.

    Despite the name, nothing is raised when an overlap exists: the caller
    is expected to compare the returned count with 0.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain an 'original' column.

    Returns
    -------
    int
        Number of rows in the inner join on 'original'. A value that occurs
        m times in ``df1`` and n times in ``df2`` contributes m * n.

    Raises
    ------
    KeyError
        If either frame has no 'original' column.
    """
    # Only the key column is needed to count matches; joining on it once
    # (instead of listing the key twice and dragging every other column
    # through the merge) gives the same row count with less work.
    inner = pd.merge(df1[['original']], df2[['original']], how='inner', on='original')
    return len(inner)

In [2]:
# Root data folder handed to the directory-wide checks below, which walk
# each of its sub-folders (one per language, judging by the outputs shown).
directory = "../data/"

In [3]:
def check_overlap_all_data(directory):
    """Print a pairwise overlap report for the CSV files of every sub-folder.

    For each sub-folder of ``directory`` a header line is printed, then every
    unordered pair of ``.csv`` files directly inside it is compared with
    ``verify_no_overlap``. A pair prints "OKAY" when the count is 0 and a
    "BAD (...)" line carrying the count otherwise.

    Parameters
    ----------
    directory : str
        Folder whose immediate sub-folders hold the CSV files to compare.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    for entry in os.listdir(directory):
        folder = os.path.join(directory, entry)
        if not os.path.isdir(folder):
            continue

        print("--------------" + entry + "--------------")
        csv_paths = []
        for name in os.listdir(folder):
            candidate = os.path.join(folder, name)
            if name.endswith(".csv") and os.path.isfile(candidate):
                csv_paths.append(candidate)

        # Compare each file with the files listed after it, so every
        # unordered pair is tested exactly once.
        for position, test_file in enumerate(csv_paths[:-1]):
            reference = pd.read_csv(test_file)
            for data_file in csv_paths[position + 1:]:
                print("TESTING " + test_file + " vs " + data_file + ": ", end="")
                overlap = verify_no_overlap(reference, pd.read_csv(data_file))
                if overlap != 0:
                    print("BAD (" + str(overlap) + " overlap original sentences) [MUST FIX!!!]")
                else:
                    print("OKAY")

In [4]:
def select_swap(from_counts, from_df, disallow):
    """Pick one random row of ``from_df`` whose sentence is safe to swap.

    A row is eligible when its 'original' value is listed in ``from_counts``
    with a count of exactly 1 and differs from ``disallow``.

    Parameters
    ----------
    from_counts : pandas.Series
        Occurrence counts indexed by 'original' value (as produced by
        ``value_counts()``). It may describe an earlier state of ``from_df``;
        values missing from it are never eligible.
    from_df : pandas.DataFrame
        Frame to draw from; must contain an 'original' column.
    disallow : object
        'original' value that must not be selected.

    Returns
    -------
    pandas.DataFrame
        A one-row frame that keeps the row's index label from ``from_df``.

    Raises
    ------
    ValueError
        If no row is eligible. The previous rejection-sampling loop would
        never terminate in that situation.
    """
    originals = from_df['original']
    singleton_keys = from_counts[from_counts == 1].index
    # Filtering first and sampling once draws uniformly from the eligible
    # rows, which is what re-sampling until a row qualifies converged to.
    candidates = from_df[originals.isin(singleton_keys) & (originals != disallow)]
    if candidates.empty:
        raise ValueError("select_swap: no row with a unique 'original' value is available to swap")
    return candidates.sample()

def fix_overlap(path1, path2):
    """Remove shared 'original' sentences between two CSV files, in place.

    For every row of the inner join on 'original', one row carrying that
    sentence is moved to the file that holds more copies of it (ties go to
    the second file). A random row chosen by ``select_swap`` is moved the
    other way, so both files keep their row counts. Both files are then
    overwritten.

    Parameters
    ----------
    path1, path2 : str
        Paths of the two CSV files; each must have an 'original' column.

    Raises
    ------
    AssertionError
        If either frame ends up with a different number of rows.
    """
    def _trade(giver, receiver, receiver_counts, sentence):
        # Move one `sentence` row giver -> receiver, then one random
        # replacement row receiver -> giver; returns (giver, receiver).
        hit = giver.loc[giver['original'] == sentence].iloc[:1]
        if hit.empty:
            return giver, receiver
        receiver = pd.concat([receiver, hit], ignore_index=True)
        giver = giver.drop(hit.index)
        filler = select_swap(receiver_counts, receiver, sentence)
        giver = pd.concat([giver, filler], ignore_index=True)
        receiver = receiver.drop(filler.index)
        return giver, receiver

    first = pd.read_csv(path1)
    second = pd.read_csv(path2)
    first_rows, second_rows = first.shape[0], second.shape[0]
    first_counts = first['original'].value_counts()
    second_counts = second['original'].value_counts()
    shared = pd.merge(first, second, how='inner', on=['original', 'original'])

    for sentence in tqdm(shared['original']):
        if sentence not in first_counts or sentence not in second_counts:
            # Already resolved by an earlier iteration.
            continue
        if first_counts[sentence] > second_counts[sentence]:
            second, first = _trade(second, first, first_counts, sentence)
        else:
            first, second = _trade(first, second, second_counts, sentence)
        first_counts = first['original'].value_counts()
        second_counts = second['original'].value_counts()

    assert first.shape[0] == first_rows
    assert second.shape[0] == second_rows

    first.to_csv(path1, index=False)
    second.to_csv(path2, index=False)



            

In [5]:
def fix_overlap_no_replace(path1_from, path2_to):
    """Move overlapping rows from one CSV file into another, in place.

    For every row of the inner join on 'original', one row carrying that
    sentence is moved from the first file to the second. No row is sent
    back, so the first file shrinks and the second grows by the number of
    moved rows. Both files are overwritten.

    Parameters
    ----------
    path1_from : str
        CSV file that gives up its overlapping rows.
    path2_to : str
        CSV file that receives them.

    Raises
    ------
    AssertionError
        If the final row counts do not match the number of moved rows.
    """
    giver = pd.read_csv(path1_from)
    receiver = pd.read_csv(path2_to)
    giver_rows_before = giver.shape[0]
    receiver_rows_before = receiver.shape[0]
    giver_counts = giver['original'].value_counts()
    receiver_counts = receiver['original'].value_counts()
    shared = pd.merge(giver, receiver, how='inner', on=['original', 'original'])

    moved = 0
    for sentence in tqdm(shared['original']):
        if sentence not in giver_counts or sentence not in receiver_counts:
            # Every copy of this sentence has already left the first file.
            continue
        hit = giver.loc[giver['original'] == sentence].iloc[:1]
        if not hit.empty:
            moved += 1
            receiver = pd.concat([receiver, hit], ignore_index=True)
            giver = giver.drop(hit.index)
        giver_counts = giver['original'].value_counts()
        receiver_counts = receiver['original'].value_counts()

    assert giver.shape[0] + moved == giver_rows_before
    assert receiver.shape[0] - moved == receiver_rows_before

    giver.to_csv(path1_from, index=False)
    receiver.to_csv(path2_to, index=False)

In [177]:
# Rewrites both Easy Japanese Corpus CSVs in place: rows are swapped between
# train and test until the two files stop sharing 'original' sentences.
fix_overlap("../data/Japanese/Easy Japanese Corpus_train.csv", "../data/Japanese/Easy Japanese Corpus_test.csv")

100%|██████████| 1/1 [00:00<00:00, 44.77it/s]


In [6]:
def swap_row(df_from, df_to, df_to_counts, sentence):
    """Trade one row carrying ``sentence`` for one random row.

    The first row of ``df_from`` whose 'original' equals ``sentence`` is
    appended to ``df_to``. A replacement row picked by ``select_swap`` from
    ``df_to`` is then appended to ``df_from``, so both frames keep their
    length. When ``df_from`` has no such row, both frames come back as is.

    Parameters
    ----------
    df_from : pandas.DataFrame
        Frame that gives up the ``sentence`` row.
    df_to : pandas.DataFrame
        Frame that receives it and supplies the replacement row.
    df_to_counts : pandas.Series
        'original' value counts of ``df_to``, forwarded to ``select_swap``.
    sentence : object
        'original' value to move.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_from, df_to)`` after the exchange.
    """
    matches = df_from.loc[df_from['original'] == sentence]
    overlap_row = matches.iloc[:1]
    if overlap_row.empty:
        return df_from, df_to

    # Hand the overlapping row over to the receiving frame.
    df_to = pd.concat([df_to, overlap_row], ignore_index=True)
    df_from = df_from.drop(overlap_row.index)

    # Send a random row back so both frames keep their size.
    replacement = select_swap(df_to_counts, df_to, sentence)
    df_from = pd.concat([df_from, replacement], ignore_index=True)
    df_to = df_to.drop(replacement.index)
    return df_from, df_to

In [7]:
def align_splits(train1_path, test1_path, train2_path, test2_path):
    """Reduce cross-dataset train/test overlap between two datasets, in place.

    Reads the four CSV files, swaps rows within each dataset (train <-> test)
    so that train1 shares fewer 'original' sentences with test2 and test1
    shares fewer with train2, prints any overlap that remains, and overwrites
    all four files.

    Parameters
    ----------
    train1_path, test1_path : str
        Train/test CSV files of the first dataset.
    train2_path, test2_path : str
        Train/test CSV files of the second dataset.

    Raises
    ------
    AssertionError
        If either dataset's own train/test pair already overlaps at the
        start, or if any frame's row count changed by the end.
    """
    tr1, te1, tr2, te2 = [pd.read_csv(path) for path in [train1_path, test1_path, train2_path, test2_path]]
    tr1_counts, te1_counts, tr2_counts, te2_counts = [df['original'].value_counts() for df in [tr1, te1, tr2, te2]]
    # Row counts before any swap; checked again before the files are written.
    tr1_olen, te1_olen, tr2_olen, te2_olen = [df.shape[0] for df in [tr1,te1,tr2,te2]]

    assert verify_no_overlap(tr1, te1) == 0, "train/test dataframe 1 do not have zero overlap to begin.  Please use fix_overlap."
    assert verify_no_overlap(tr2, te2) == 0, "train/test dataframe 2 do not have zero overlap to begin.  Please use fix_overlap."

    # First guarantee no overlap in train1 and test2
    print("Align TRAIN1 to TEST2 (reduce overlap)")
    inner = pd.merge(tr1, te2, how ='inner', on =['original', 'original'])
    for sentence in tqdm(inner['original']):
        # Skip sentences that an earlier swap already removed from either side.
        if sentence in tr1_counts and sentence in te2_counts:
            if tr1_counts[sentence] > te2_counts[sentence]:
                # train1 holds more copies: move the test2 row into train2.
                te2, tr2 = swap_row(te2, tr2, tr2_counts, sentence)
            else:
                # Otherwise move the train1 row into test1.
                tr1, te1= swap_row(tr1, te1, te1_counts, sentence)
            tr1_counts, te1_counts, tr2_counts, te2_counts = [df['original'].value_counts() for df in [tr1, te1, tr2, te2]]

    # Second guarantee no overlap in test1 and train2
    print("Align TEST1 to TRAIN2 (reduce overlap)")
    inner = pd.merge(te1, tr2, how ='inner', on =['original', 'original'])
    for sentence in tqdm(inner['original']):
        if sentence in te1_counts and sentence in tr2_counts:
            # NOTE(review): this pass compares with >= while the first pass
            # uses >, so ties are resolved differently — confirm intended.
            if te1_counts[sentence] >= tr2_counts[sentence]:
                tr2, te2 = swap_row(tr2, te2, te2_counts, sentence)
            else:
                te1, tr1= swap_row(te1, tr1, tr1_counts, sentence)
            tr1_counts, te1_counts, tr2_counts, te2_counts = [df['original'].value_counts() for df in [tr1, te1, tr2, te2]]
    
    assert tr1_olen == tr1.shape[0], "ERROR: Number of rows in train1 changed"
    assert te1_olen == te1.shape[0], "ERROR: Number of rows in test1 changed"
    assert tr2_olen == tr2.shape[0], "ERROR: Number of rows in train2 changed"
    assert te2_olen == te2.shape[0], "ERROR: Number of rows in test2 changed"

    # Report any overlap that is still present; the random replacement rows
    # can reintroduce overlap, hence the "rerun" hints in the messages.
    overlap = verify_no_overlap(tr1, te1)
    if overlap > 0:
        print("TRAIN1 to TEST1: BAD (" + str(overlap) + " overlap) rerun fix_overlap")
    overlap = verify_no_overlap(tr2, te2)
    if overlap > 0:
        print("TRAIN2 to TEST2: BAD (" + str(overlap) + " overlap) rerun fix_overlap")
    overlap = verify_no_overlap(tr1, te2)
    if overlap > 0:
        print("TRAIN1 to TEST2: BAD (" + str(overlap) + " overlap) rerun align_splits")
    overlap = verify_no_overlap(tr2, te1)
    if overlap > 0:
        print("TRAIN2 to TEST1: BAD (" + str(overlap) + " overlap) rerun align_splits")
    
    # All four files are overwritten, even when overlap was reported above.
    tr1.to_csv(train1_path, index=False)
    te1.to_csv(test1_path, index=False)
    tr2.to_csv(train2_path, index=False)
    te2.to_csv(test2_path, index=False)

    

In [8]:
# Align the splits without ever modifying train2 or test2
def align_splits_oneside(train1_path, test1_path, train2_path_lock, test2_path_lock):
    """Reduce cross-dataset overlap by rearranging only the first dataset.

    Rows are swapped between train1 and test1 so that train1 shares fewer
    'original' sentences with test2 and test1 shares fewer with train2. The
    second dataset is only read. Any overlap left is printed, then the
    train1 and test1 files are overwritten.

    Parameters
    ----------
    train1_path, test1_path : str
        Train/test CSV files that may be rewritten.
    train2_path_lock, test2_path_lock : str
        Train/test CSV files used as a fixed reference.

    Raises
    ------
    AssertionError
        If either dataset's own train/test pair overlaps at the start, or
        if any frame's row count changed by the end.
    """
    train1 = pd.read_csv(train1_path)
    test1 = pd.read_csv(test1_path)
    train2 = pd.read_csv(train2_path_lock)
    test2 = pd.read_csv(test2_path_lock)
    frames = (train1, test1, train2, test2)
    # Counts are taken for all four frames; only the first two are used below.
    train1_counts, test1_counts, _, _ = [frame['original'].value_counts() for frame in frames]
    train1_rows, test1_rows, train2_rows, test2_rows = [frame.shape[0] for frame in frames]

    assert verify_no_overlap(train1, test1) == 0, "train/test dataframe 1 do not have zero overlap to begin.  Please use fix_overlap."
    assert verify_no_overlap(train2, test2) == 0, "train/test dataframe 2 do not have zero overlap to begin.  Please use fix_overlap."

    # Pass 1: push train1 rows that also appear in test2 over to test1.
    print("Align TRAIN1 to TEST2 (reduce overlap)")
    shared = pd.merge(train1, test2, how='inner', on=['original', 'original'])
    for sentence in tqdm(shared['original']):
        if sentence not in train1_counts:
            continue
        train1, test1 = swap_row(train1, test1, test1_counts, sentence)
        train1_counts = train1['original'].value_counts()
        test1_counts = test1['original'].value_counts()

    # Pass 2: push test1 rows that also appear in train2 over to train1.
    print("Align TEST1 to TRAIN2 (reduce overlap)")
    shared = pd.merge(test1, train2, how='inner', on=['original', 'original'])
    for sentence in tqdm(shared['original']):
        if sentence not in test1_counts:
            continue
        test1, train1 = swap_row(test1, train1, train1_counts, sentence)
        train1_counts = train1['original'].value_counts()
        test1_counts = test1['original'].value_counts()

    assert train1_rows == train1.shape[0], "ERROR: Number of rows in train1 changed"
    assert test1_rows == test1.shape[0], "ERROR: Number of rows in test1 changed"
    assert train2_rows == train2.shape[0], "ERROR: Number of rows in train2 changed"
    assert test2_rows == test2.shape[0], "ERROR: Number of rows in test2 changed"

    # (label, left frame, right frame, function to rerun if overlap remains)
    remaining_checks = (
        ("TRAIN1 to TEST1", train1, test1, "fix_overlap"),
        ("TRAIN2 to TEST2", train2, test2, "fix_overlap"),
        ("TRAIN1 to TEST2", train1, test2, "align_splits"),
        ("TRAIN2 to TEST1", train2, test1, "align_splits"),
    )
    for label, left, right, remedy in remaining_checks:
        overlap = verify_no_overlap(left, right)
        if overlap > 0:
            print(label + ": BAD (" + str(overlap) + " overlap) rerun " + remedy)

    train1.to_csv(train1_path, index=False)
    test1.to_csv(test1_path, index=False)

In [45]:
# Only the Easy Japanese Extended train/val files can be rewritten here; the
# Easy Japanese Corpus train/val files are passed as the locked reference.
align_splits_oneside("../data/Japanese/Easy Japanese Extended_train.csv", "../data/Japanese/Easy Japanese Extended_val.csv", "../data/Japanese/Easy Japanese Corpus_train.csv", "../data/Japanese/Easy Japanese Corpus_val.csv")

Align TRAIN1 to TEST2 (reduce overlap)


0it [00:00, ?it/s]


Align TEST1 to TRAIN2 (reduce overlap)


0it [00:00, ?it/s]


In [54]:
# Print the pairwise overlap report for every sub-folder of the data root.
check_overlap_all_data(directory)

--------------Japanese--------------
TESTING ../data/Japanese/Easy Japanese Extended_train.csv vs ../data/Japanese/Easy Japanese Corpus_train.csv: BAD (8377 overlap original sentences) [MUST FIX!!!]
TESTING ../data/Japanese/Easy Japanese Extended_train.csv vs ../data/Japanese/Easy Japanese Extended_test.csv: OKAY
TESTING ../data/Japanese/Easy Japanese Extended_train.csv vs ../data/Japanese/Easy Japanese Corpus_val.csv: OKAY
TESTING ../data/Japanese/Easy Japanese Extended_train.csv vs ../data/Japanese/Easy Japanese Corpus_test.csv: OKAY
TESTING ../data/Japanese/Easy Japanese Extended_train.csv vs ../data/Japanese/Easy Japanese Extended_val.csv: OKAY
TESTING ../data/Japanese/Easy Japanese Corpus_train.csv vs ../data/Japanese/Easy Japanese Extended_test.csv: OKAY
TESTING ../data/Japanese/Easy Japanese Corpus_train.csv vs ../data/Japanese/Easy Japanese Corpus_val.csv: OKAY
TESTING ../data/Japanese/Easy Japanese Corpus_train.csv vs ../data/Japanese/Easy Japanese Corpus_test.csv: OKAY
TESTIN

In [58]:
def assertFrameEqual(df1, df2, **kwds ):
    """Assert that two dataframes are equal, ignoring ordering of columns.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare; their columns are sorted by label first.
    **kwds
        Extra keyword arguments forwarded to
        ``pandas.testing.assert_frame_equal``.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If the frames differ.
    """
    # pandas.testing is the public location of assert_frame_equal; the old
    # pandas.util.testing module is deprecated and emits a warning on import.
    return pd.testing.assert_frame_equal(df1.sort_index(axis=1), df2.sort_index(axis=1), check_names=True, **kwds)

def check_equivalent(new_train_path, new_test_path, new_val_path, old_train_path, old_test_path):
    """Tell whether a new train/test(/val) split holds the same rows as an old split.

    The new files are concatenated, as are the old ones; both sides are
    sorted by ('original', 'simple') and compared with ``assertFrameEqual``.

    Parameters
    ----------
    new_train_path, new_test_path : str
        CSV files of the new split.
    new_val_path : str
        Optional validation CSV of the new split. It is skipped when the
        string is empty or the file does not exist.
    old_train_path, old_test_path : str
        CSV files of the old split.

    Returns
    -------
    bool
        True when both sides contain equal rows, False otherwise.
    """
    tr1, te1, tr2, te2 = [pd.read_csv(path) for path in [new_train_path, new_test_path, old_train_path, old_test_path]]
    new_list = [tr1, te1]
    if new_val_path != "" and os.path.exists(new_val_path):
        new_list.append(pd.read_csv(new_val_path))
    new = pd.concat(new_list, ignore_index=True).sort_values(by=['original', 'simple']).reset_index(drop=True)
    old = pd.concat([tr2, te2], ignore_index=True).sort_values(by=['original', 'simple']).reset_index(drop=True)
    try:
        assertFrameEqual(new, old)
    except AssertionError:
        # Only a failed comparison means "not equivalent". Any other error
        # (a failed import, a typo, Ctrl-C) must surface instead of being
        # reported as a data mismatch.
        return False
    return True

In [59]:
# Check that nothing is broken
def check_equivalent_all_data(directory, old_directory):
    """Print, per dataset, whether the current split matches an older copy.

    Every file ending in "train.csv" inside each sub-folder of ``directory``
    is paired with its sibling "test.csv" and "val.csv" files. The same
    train/test file names are looked up under ``old_directory``, and the
    two sides are compared with ``check_equivalent``.

    Parameters
    ----------
    directory : str
        Root folder of the current data.
    old_directory : str
        Root folder of the reference copy, with the same sub-folder layout.

    Returns
    -------
    None
        One "OKAY" or "BAD" line per dataset is written to standard output.
    """
    print("Many will say bad once the references have been grouped.  Run this on ungrouped references for accurate results")
    train_suffix = "train.csv"
    cut = len(train_suffix)
    for dir_name in os.listdir(directory):
        folder = os.path.join(directory, dir_name)
        if not os.path.isdir(folder):
            continue

        print("--------------" + dir_name + "--------------")
        for file_name in os.listdir(folder):
            new_train = os.path.join(folder, file_name)
            if not (os.path.isfile(new_train) and new_train.endswith(train_suffix)):
                continue

            print("Checking " + new_train.split("/")[-1][:-cut] + ": ", end="")
            # Sibling files share the train file's name up to the suffix.
            stem = file_name[:-cut]
            test_name = stem + "test.csv"
            val_name = stem + "val.csv"
            equivalent = check_equivalent(
                new_train,
                os.path.join(directory, dir_name, test_name),
                os.path.join(directory, dir_name, val_name),
                os.path.join(old_directory, dir_name, file_name),
                os.path.join(old_directory, dir_name, test_name),
            )
            print("OKAY" if equivalent else "BAD!!!!!!!!!!!!!")



In [172]:
def find_difference(new_train_path,new_test_path,new_val_path,old_train_path,old_test_path):
    """Print the rows that differ between a new split and an old split.

    The new train/test (and optional val) files are concatenated, as are the
    old train/test files. Both sides are sorted by ('original', 'simple'),
    stacked, and every row that occurs more than once in the stack is
    dropped; what remains is printed.

    Parameters
    ----------
    new_train_path, new_test_path : str
        CSV files of the new split.
    new_val_path : str
        Optional validation CSV of the new split. It is skipped when the
        string is empty or the file does not exist.
    old_train_path, old_test_path : str
        CSV files of the old split.

    Returns
    -------
    None
    """
    sort_keys = ['original', 'simple']

    new_train = pd.read_csv(new_train_path)
    new_test = pd.read_csv(new_test_path)
    old_train = pd.read_csv(old_train_path)
    old_test = pd.read_csv(old_test_path)

    new_parts = [new_train, new_test]
    if new_val_path != "" and os.path.exists(new_val_path):
        new_parts.append(pd.read_csv(new_val_path))

    new = pd.concat(new_parts, ignore_index=True)
    new = new.sort_values(by=sort_keys).reset_index(drop=True)
    old = pd.concat([old_train, old_test], ignore_index=True)
    old = old.sort_values(by=sort_keys).reset_index(drop=True)

    # keep=False removes every copy of a duplicated row, leaving only rows
    # that occur exactly once in the stacked frame.
    stacked = pd.concat([new, old])
    dif = stacked.drop_duplicates(keep=False)
    print(dif)

In [173]:
# Print the rows that differ between the current Newsela ES split and the
# copy under ../data-old; an empty string is passed as the validation path.
find_difference("../data/Spanish/Newsela ES_train.csv","../data/Spanish/Newsela ES_test.csv","","../data-old/Spanish/Newsela ES_train.csv","../data-old/Spanish/Newsela ES_test.csv")

                                                original  \
1018          "Es significativo, y es un gran problema".   
1019        "Es significativo, y es un gran problema". !   
17690  Incluso los hogares cuyos ocupantes están bien...   
21783  Los doctores dicen que la deforestación está c...   
22517  Los legisladores buscan la forma de regular nu...   
22891  Los pobladores de Borneo de Malasia se están e...   
25151                                PASCAGOULA, Miss. ?   
28947  Sin embargo, de acuerdo a un nuevo informe del...   
31838  Varios bomberos tuvieron que permanecer en tie...   
1019          "Es significativo, y es un gran problema".   
17690  Incluso los hogares cuyos ocupantes están bien...   
21783  Los doctores dicen que la deforestación está c...   
22517  Los legisladores buscan la forma de regular nu...   
22891  Los pobladores de Borneo de Malasia se están e...   
25151                                  PASCAGOULA, Miss.   
28947  Sin embargo, de acuerdo a un nuev

In [60]:
# Compare every dataset under the data root with the copy kept in
# ../data-checkpoint/ and print OKAY/BAD for each.
check_equivalent_all_data(directory, "../data-checkpoint/")

Many will say bad once the references have been grouped.  Run this on ungrouped references for accurate results
--------------Japanese--------------
Checking Easy Japanese Extended_: 

  from pandas.util.testing import assert_frame_equal


OKAY
Checking Easy Japanese Corpus_: BAD!!!!!!!!!!!!!
--------------German--------------
Checking GEOLino Corpus_: BAD!!!!!!!!!!!!!
Checking German News_: BAD!!!!!!!!!!!!!
Checking TextComplexityDE Parallel Corpus_: BAD!!!!!!!!!!!!!
--------------Slovene--------------
Checking Text Simplification Slovene_: BAD!!!!!!!!!!!!!
--------------Brazilian Portuguese--------------
Checking PorSimples_: BAD!!!!!!!!!!!!!
--------------Urdu--------------
Checking SimplifyUR_: BAD!!!!!!!!!!!!!
--------------Russian--------------
Checking RuWikiLarge_: BAD!!!!!!!!!!!!!
Checking RuAdapt Ency_: BAD!!!!!!!!!!!!!
Checking RSSE Corpus_: OKAY
Checking RuAdapt Fairytales_: BAD!!!!!!!!!!!!!
Checking RuAdapt Literature_: BAD!!!!!!!!!!!!!
--------------Italian--------------
Checking Teacher_: BAD!!!!!!!!!!!!!
Checking Simpitiki Italian Wikipedia_: BAD!!!!!!!!!!!!!
Checking AdminIT_: BAD!!!!!!!!!!!!!
Checking PaCCSS-IT Corpus_: BAD!!!!!!!!!!!!!
Checking Terence_: BAD!!!!!!!!!!!!!
--------------English----------

In [32]:
def split_train_to_val(train_df, val_size):
    """Carve a validation set out of a training frame, one sentence group at a time.

    A random row is drawn, and every row sharing its 'original' value is
    moved to the validation set together, so a sentence never ends up on
    both sides. This repeats until at least ``val_size`` rows have moved;
    the last group may push the validation set past ``val_size``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with an 'original' column. The caller's frame is not modified.
    val_size : int
        Minimum number of rows wanted in the validation set.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df)``: the remaining training rows (original index
        labels kept) and the validation rows (index reset). ``val_df`` is an
        empty ``DataFrame()`` when ``val_size`` is not positive.

    Raises
    ------
    ValueError
        If the rows with a non-missing 'original' run out before
        ``val_size`` rows have been collected.
    """
    val_parts = []
    val_rows = 0
    while val_rows < val_size:
        # Rows with a missing 'original' can never be matched by the
        # equality test below, so they are excluded from the draw. Drawing
        # one would otherwise spin the loop without making progress.
        eligible = train_df[train_df["original"].notna()]
        if eligible.empty:
            raise ValueError(
                "split_train_to_val: only " + str(val_rows) + " rows could be moved, "
                + str(val_size) + " were requested"
            )
        chosen = eligible.sample().iloc[0]["original"]
        samples = train_df.loc[train_df["original"] == chosen]
        val_parts.append(samples)
        val_rows += len(samples)
        train_df = train_df.drop(samples.index)

    # Concatenate once at the end instead of growing a frame inside the loop.
    val_df = pd.concat(val_parts, ignore_index=True) if val_parts else pd.DataFrame()
    return train_df, val_df

In [38]:
# Split at least 1000 rows off the Easy Japanese Extended training file and
# show the resulting sizes; nothing is written to disk in this cell.
train_df, val_df = split_train_to_val(pd.read_csv("../data/Japanese/Easy Japanese Extended_train.csv"), 1000)
print(len(train_df))
print(len(val_df))

33269
1000


In [39]:
# train_df.to_csv("../data/Japanese/Easy Japanese Extended_train2.csv", index=False)
# val_df.to_csv("../data/Japanese/Easy Japanese Extended_val2.csv", index=False)

In [50]:
def select_swap_multiref(from_counts, from_df, disallow, count):
    """Pick a random sentence group of a given size from ``from_df``.

    A sentence is eligible when ``from_counts`` lists it with exactly
    ``count`` occurrences and it differs from ``disallow``.

    Parameters
    ----------
    from_counts : pandas.Series
        Occurrence counts indexed by 'original' value (as produced by
        ``value_counts()``). It may describe an earlier state of ``from_df``;
        values missing from it are never eligible.
    from_df : pandas.DataFrame
        Frame to draw from; must contain an 'original' column.
    disallow : object
        'original' value that must not be selected.
    count : int
        Required number of occurrences of the selected sentence.

    Returns
    -------
    pandas.DataFrame
        All rows of ``from_df`` that carry the chosen sentence, with their
        index labels kept.

    Raises
    ------
    ValueError
        If no sentence is eligible. The previous rejection-sampling loop
        would never terminate in that situation.
    """
    originals = from_df['original']
    sized_keys = from_counts[from_counts == count].index
    # Drawing one row from the eligible rows selects a sentence with the
    # same probabilities as re-sampling until a row qualifies.
    candidates = from_df[originals.isin(sized_keys) & (originals != disallow)]
    if candidates.empty:
        raise ValueError(
            "select_swap_multiref: no sentence with exactly " + str(count)
            + " rows is available to swap"
        )
    chosen = candidates.sample().iloc[0]['original']
    return from_df.loc[originals == chosen]

def swap_row_multiref(df_from, df_to, df_to_counts, sentence):
    """Trade every row carrying ``sentence`` for an equally sized group.

    All rows of ``df_from`` whose 'original' equals ``sentence`` are
    appended to ``df_to``. ``select_swap_multiref`` then picks a group of
    the same number of rows from ``df_to``, and that group is appended to
    ``df_from``. When ``df_from`` has no such row, both frames come back
    as is.

    Parameters
    ----------
    df_from : pandas.DataFrame
        Frame that gives up the ``sentence`` rows.
    df_to : pandas.DataFrame
        Frame that receives them and supplies the replacement group.
    df_to_counts : pandas.Series
        'original' value counts of ``df_to``, forwarded to
        ``select_swap_multiref``.
    sentence : object
        'original' value to move.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_from, df_to)`` after the exchange.
    """
    group = df_from.loc[df_from['original'] == sentence]
    if group.empty:
        return df_from, df_to

    # Hand the whole sentence group over to the receiving frame.
    df_to = pd.concat([df_to, group], ignore_index=True)
    df_from = df_from.drop(group.index)

    # Send back a group with the same number of rows.
    replacement = select_swap_multiref(df_to_counts, df_to, sentence, len(group))
    df_from = pd.concat([df_from, replacement], ignore_index=True)
    df_to = df_to.drop(replacement.index)
    return df_from, df_to

# Align the splits without ever modifying train2 or test2
def align_splits_oneside_multiref(train1_path, test1_path, train2_path_lock, test2_path_lock):
    """Reduce cross-dataset overlap by rearranging only the first dataset, moving whole sentence groups.

    This follows the same procedure as a one-sided alignment, but every
    exchange goes through ``swap_row_multiref``: all rows of a sentence move
    together and are traded for a group with the same number of rows. The
    second dataset is only read. Any overlap left is printed, then the
    train1 and test1 files are overwritten.

    Parameters
    ----------
    train1_path, test1_path : str
        Train/test CSV files that may be rewritten.
    train2_path_lock, test2_path_lock : str
        Train/test CSV files used as a fixed reference.

    Raises
    ------
    AssertionError
        If either dataset's own train/test pair overlaps at the start, or
        if any frame's row count changed by the end.
    """
    train1 = pd.read_csv(train1_path)
    test1 = pd.read_csv(test1_path)
    train2 = pd.read_csv(train2_path_lock)
    test2 = pd.read_csv(test2_path_lock)
    frames = (train1, test1, train2, test2)
    # Counts are taken for all four frames; only the first two are used below.
    train1_counts, test1_counts, _, _ = [frame['original'].value_counts() for frame in frames]
    train1_rows, test1_rows, train2_rows, test2_rows = [frame.shape[0] for frame in frames]

    assert verify_no_overlap(train1, test1) == 0, "train/test dataframe 1 do not have zero overlap to begin.  Please use fix_overlap."
    assert verify_no_overlap(train2, test2) == 0, "train/test dataframe 2 do not have zero overlap to begin.  Please use fix_overlap."

    # Pass 1: push train1 groups that also appear in test2 over to test1.
    print("Align TRAIN1 to TEST2 (reduce overlap)")
    shared = pd.merge(train1, test2, how='inner', on=['original', 'original'])
    for sentence in tqdm(shared['original']):
        if sentence not in train1_counts:
            continue
        train1, test1 = swap_row_multiref(train1, test1, test1_counts, sentence)
        train1_counts = train1['original'].value_counts()
        test1_counts = test1['original'].value_counts()

    # Pass 2: push test1 groups that also appear in train2 over to train1.
    print("Align TEST1 to TRAIN2 (reduce overlap)")
    shared = pd.merge(test1, train2, how='inner', on=['original', 'original'])
    for sentence in tqdm(shared['original']):
        if sentence not in test1_counts:
            continue
        test1, train1 = swap_row_multiref(test1, train1, train1_counts, sentence)
        train1_counts = train1['original'].value_counts()
        test1_counts = test1['original'].value_counts()

    assert train1_rows == train1.shape[0], "ERROR: Number of rows in train1 changed"
    assert test1_rows == test1.shape[0], "ERROR: Number of rows in test1 changed"
    assert train2_rows == train2.shape[0], "ERROR: Number of rows in train2 changed"
    assert test2_rows == test2.shape[0], "ERROR: Number of rows in test2 changed"

    # (label, left frame, right frame, function to rerun if overlap remains)
    remaining_checks = (
        ("TRAIN1 to TEST1", train1, test1, "fix_overlap"),
        ("TRAIN2 to TEST2", train2, test2, "fix_overlap"),
        ("TRAIN1 to TEST2", train1, test2, "align_splits"),
        ("TRAIN2 to TEST1", train2, test1, "align_splits"),
    )
    for label, left, right, remedy in remaining_checks:
        overlap = verify_no_overlap(left, right)
        if overlap > 0:
            print(label + ": BAD (" + str(overlap) + " overlap) rerun " + remedy)

    train1.to_csv(train1_path, index=False)
    test1.to_csv(test1_path, index=False)

In [53]:
# Only the ASSET train/val files can be rewritten here; the WikiAuto
# train/val files are passed as the locked reference.
align_splits_oneside_multiref("../data/English/ASSET_train.csv", "../data/English/ASSET_val.csv", "../data/English/WikiAuto_train.csv", "../data/English/WikiAuto_val.csv")

Align TRAIN1 to TEST2 (reduce overlap)


0it [00:00, ?it/s]


Align TEST1 to TRAIN2 (reduce overlap)


0it [00:00, ?it/s]
