In [1]:
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import os
import subprocess
from sklearn.neighbors import NearestNeighbors
from numpy.random import default_rng
from easse.sari import corpus_sari
from sacrebleu import corpus_bleu
import json
import requests
from collections import defaultdict
from tqdm import tqdm

In [2]:
# Load variables from a local .env file (if any) into the process environment,
# then read the Hugging Face API token; `token` is None when the variable is unset.
load_dotenv()
token = os.getenv("HUGGING_FACE_API_TOKEN")

In [3]:
# Export the LASER variable for this kernel; the value is the same relative
# path to the LASER checkout that laser_embed() uses to locate embed.sh.
# NOTE(review): presumably embed.sh itself reads $LASER — confirm against the LASER repo.
%env LASER=../../../LASER

env: LASER=../../../LASER


In [4]:
def laser_embed(df, name, split, laser_version=""):
    """Embed the ``original`` column of ``df`` with LASER's ``embed.sh``.

    Each entry of ``df['original']`` is written (with embedded newlines
    removed) as one line of ``./laser_embeddings/<name>_<split>.txt``. The
    LASER script is then run on that file to produce
    ``./laser_embeddings/<name>_<split>.bin`` and the temporary text file is
    deleted, even when the script fails.

    Args:
        df: DataFrame with an ``original`` column of strings.
        name: Dataset name used in the file names.
        split: Split name used in the file names (e.g. ``"train"``).
        laser_version: Third argument forwarded to ``embed.sh``; an empty
            string is passed through unchanged.

    Raises:
        subprocess.CalledProcessError: If ``embed.sh`` exits with a non-zero
            status (previously the failure was silently ignored and only
            surfaced later as a missing ``.bin`` file).
    """
    out_dir = "./laser_embeddings/"
    os.makedirs(out_dir, exist_ok=True)
    txt_path = out_dir + name + "_" + split + ".txt"
    bin_path = out_dir + name + "_" + split + ".bin"

    # Honour the LASER environment variable when it is set; fall back to the
    # relative checkout location that was previously hard-coded.
    laser_root = os.environ.get("LASER", "../../../LASER")
    embed_script = laser_root + "/tasks/embed/embed.sh"

    try:
        # Explicit UTF-8 so non-Latin scripts are written identically on every platform.
        with open(txt_path, 'w', encoding="utf-8") as f:
            for txt in df['original']:
                f.write(txt.replace('\n','') + '\n')
        subprocess.run(["bash", embed_script, txt_path, bin_path, laser_version], check=True)
    finally:
        # The text file is only an intermediate artefact; never leave it behind.
        if os.path.exists(txt_path):
            os.remove(txt_path)

def load_laser_embeddings(name, split):
    """Read ``./laser_embeddings/<name>_<split>.bin`` as a float32 matrix.

    The file is interpreted as a flat sequence of float32 values and returned
    as an array with 1024 columns; any trailing values that do not fill a
    complete row are discarded.
    """
    dim = 1024
    stem = name + "_" + split
    bin_path = "./laser_embeddings/" + stem + ".bin"

    flat = np.fromfile(bin_path, dtype=np.float32, count=-1)
    n_rows = flat.shape[0] // dim

    # Keep only whole rows, then view them as (n_rows, dim).
    return flat[: n_rows * dim].reshape(n_rows, dim)

def calc_distances_to_neighbors(train_emb, eval_emb, neighbors):
    """Cosine distance from every eval vector to each of its listed neighbours.

    Args:
        train_emb: 2-D array of training embeddings, one row per example.
        eval_emb: 2-D array of evaluation embeddings, one row per example.
        neighbors: Integer array of shape ``(n_eval, K)``; row ``i`` holds the
            row indices into ``train_emb`` of the neighbours of ``eval_emb[i]``.

    Returns:
        Array of shape ``(n_eval, K)`` where entry ``[i, j]`` is
        ``1 - cos(eval_emb[i], train_emb[neighbors[i, j]])``.
    """
    # (n_eval, K, dim): the embedding of every selected neighbour.
    neighbor_emb = train_emb[neighbors, :]

    # Row-wise dot products only. The previous np.dot(A, B.T).diagonal(...)
    # built the full (n_eval, K, n_eval) tensor just to keep its diagonal.
    dot_product = np.einsum('nkd,nd->nk', neighbor_emb, eval_emb)

    # L2 norms of the neighbour vectors (n_eval, K) and eval vectors (n_eval,).
    norm_neighbors = np.linalg.norm(neighbor_emb, axis=2)
    norm_eval = np.linalg.norm(eval_emb, axis=1)

    # Broadcast the eval norms across the K neighbours of each row.
    cosine_distances = 1 - dot_product / (norm_neighbors * norm_eval[:, np.newaxis])

    return cosine_distances

def generate_preprocessing_sim(name, train_emb, eval_emb, split="test"):
    """Write the 20 nearest training neighbours of every eval embedding.

    A brute-force cosine nearest-neighbour index is fitted on ``train_emb`` and
    queried with ``eval_emb``. Two CSVs are written under
    ``./few_shot_preprocessing/``: ``<name>_<split>_similarity.csv`` with the
    neighbour indices and ``<name>_<split>_similarity_dist.csv`` with the
    matching cosine distances.
    """
    K = 20

    knn = NearestNeighbors(
        n_neighbors=K,
        metric='cosine',
        algorithm='brute',
        n_jobs=-1,
    ).fit(train_emb)

    closest_neighbors = knn.kneighbors(eval_emb, return_distance=False)
    cosine_distances = calc_distances_to_neighbors(train_emb, eval_emb, closest_neighbors)

    out_prefix = "./few_shot_preprocessing/" + name + "_" + split
    tables = (
        ("_similarity.csv", closest_neighbors),
        ("_similarity_dist.csv", cosine_distances),
    )
    for suffix, table in tables:
        pd.DataFrame(table).to_csv(out_prefix + suffix)

def generate_preprocessing_rand(name, train_emb, eval_emb, split="test"):
    """Write 20 randomly drawn training indices for every eval embedding.

    Indices are drawn uniformly from ``[0, len(train_emb))`` with a generator
    seeded with 3600, so repeated runs give the same table. Two CSVs are
    written under ``./few_shot_preprocessing/``: ``<name>_<split>_random.csv``
    with the indices and ``<name>_<split>_random_dist.csv`` with the cosine
    distances to them.
    """
    K = 20
    n_train = train_emb.shape[0]
    n_eval = eval_emb.shape[0]

    sampler = np.random.default_rng(3600)
    random_neighbors = sampler.integers(low=0, high=n_train, size=(n_eval, K))
    cosine_distances = calc_distances_to_neighbors(train_emb, eval_emb, random_neighbors)

    out_prefix = "./few_shot_preprocessing/" + name + "_" + split
    pd.DataFrame(random_neighbors).to_csv(out_prefix + "_random.csv")
    pd.DataFrame(cosine_distances).to_csv(out_prefix + "_random_dist.csv")


def preprocess_dataset(train_path, test_path, name, split="test"):
    """Embed a dataset with LASER and write its few-shot neighbour tables.

    Both CSVs are read, their ``original`` columns are embedded, and the
    similarity-based and random neighbour tables are generated for the
    evaluation set.

    Args:
        train_path: CSV with the training examples.
        test_path: CSV with the evaluation examples.
        name: Dataset name used in every generated file name. The value
            ``"SimplifyUR"`` selects the ``"urd_Arab"`` LASER version; any
            other name passes an empty version string.
        split: Name of the evaluation split. It is now used for the embedding
            files as well as for the neighbour tables. Previously the
            embeddings were always stored as ``<name>_test.bin``, so a
            non-default split silently overwrote the test embeddings.
    """
    train = pd.read_csv(train_path)
    eval_df = pd.read_csv(test_path)

    laser_version = "urd_Arab" if name == "SimplifyUR" else ""

    laser_embed(train, name, "train", laser_version)
    laser_embed(eval_df, name, split, laser_version)

    train_embeddings = load_laser_embeddings(name, "train")
    eval_embeddings = load_laser_embeddings(name, split)

    generate_preprocessing_sim(name, train_embeddings, eval_embeddings, split)
    generate_preprocessing_rand(name, train_embeddings, eval_embeddings, split)

In [14]:
# Embed the Slovene train/test CSVs and write the similarity and random
# neighbour tables under the dataset name "TSSlovene" (default split "test").
preprocess_dataset("../data/Slovene/Text Simplification Slovene_train.csv", "../data/Slovene/Text Simplification Slovene_test.csv", "TSSlovene")

2022-12-29 12:24:05,866 | INFO | embed | spm_model: /Users/michaelryan/Documents/School/GeorgiaTech/Research/LASER/nllb/laser2.spm
2022-12-29 12:24:05,866 | INFO | embed | spm_cvocab: /Users/michaelryan/Documents/School/GeorgiaTech/Research/LASER/nllb/laser2.cvocab
2022-12-29 12:24:05,866 | INFO | embed | loading encoder: /Users/michaelryan/Documents/School/GeorgiaTech/Research/LASER/nllb/laser2.pt
2022-12-29 12:24:06,198 | INFO | preprocess | SPM processing TSSlovene_train.txt  
2022-12-29 12:24:06,305 | INFO | embed | encoding /var/folders/zy/zlsw34jx4zn2cv4tn33_02nh0000gn/T/tmp44dy4sj6/spm to ./laser_embeddings/TSSlovene_train.bin
2022-12-29 12:24:11,244 | INFO | embed | encoded 749 sentences in 4s
2022-12-29 12:24:12,347 | INFO | embed | spm_model: /Users/michaelryan/Documents/School/GeorgiaTech/Research/LASER/nllb/laser2.spm
2022-12-29 12:24:12,347 | INFO | embed | spm_cvocab: /Users/michaelryan/Documents/School/GeorgiaTech/Research/LASER/nllb/laser2.cvocab
2022-12-29 12:24:12,347

In [12]:
def calc_bleu_sari(df_ref, sentences):
    """Score system outputs against references with corpus BLEU and SARI.

    Rows may carry different numbers of references, so they are grouped by how
    many references they have. Each group is scored separately and the group
    scores are averaged, weighted by group size.

    Args:
        df_ref: DataFrame with an ``original`` column; every other column is
            treated as a reference, and float cells (NaN) count as missing.
        sentences: System outputs, one per row of ``df_ref`` in the same order.

    Returns:
        Tuple ``(bleu, sari)`` of weighted-average scores.

    Raises:
        AssertionError: If ``sentences`` and ``df_ref`` differ in length.
        ValueError: If a row has no reference at all. Such rows used to be
            filed under the largest reference count via index ``-1``, which
            corrupted that group's reference lists.
    """
    num_refs = df_ref.shape[1]-1

    assert df_ref.shape[0] == len(sentences)

    bleu_scores = np.zeros((num_refs))
    sari_scores = np.zeros((num_refs))

    # examples[n-1] collects every row that has exactly n references.
    examples = [{"original": [], "sentences": [], "references": []} for _ in range(num_refs)]

    for (index, row), sentence in zip(df_ref.iterrows(), sentences):
        # isinstance also recognises numpy float NaNs, unlike an exact type comparison.
        ref_list = [
            row[col]
            for col in row.index
            if col != 'original' and not isinstance(row[col], float)
        ]
        if not ref_list:
            raise ValueError("row " + str(index) + " has no reference sentence")
        bucket = examples[len(ref_list) - 1]
        bucket['original'].append(row['original'])
        bucket['sentences'].append(sentence)
        bucket['references'].append(ref_list)

    counts = np.array([len(e['original']) for e in examples])
    total = sum(counts)
    weights = np.divide(counts, total)

    for i, bucket in enumerate(examples):
        if counts[i] > 0:
            # Transpose per-sentence reference lists into per-reference streams,
            # the layout both metrics expect.
            references = [list(stream) for stream in zip(*bucket['references'])]
            bleu_scores[i] = corpus_bleu(
                                bucket['sentences'],
                                references,
                                force = True,
                                tokenize = '13a',
                                lowercase = True
                            ).score
            sari_scores[i] = corpus_sari(
                                orig_sents = bucket['original'],
                                sys_sents = bucket['sentences'],
                                refs_sents = references,
                                tokenizer = '13a'
                            )

    bleu = np.dot(bleu_scores, weights)
    sari = np.dot(sari_scores, weights)

    return bleu, sari

In [6]:
API_URL = "https://api-inference.huggingface.co/models/bigscience/bloom"
headers = {"Authorization": f"Bearer {token}"}
# Upper bound (seconds) on one inference request, so a stalled connection
# cannot hang the notebook forever.
QUERY_TIMEOUT_SECONDS = 300

def query(payload):
    """POST ``payload`` as JSON to the BLOOM inference endpoint.

    Args:
        payload: JSON-serialisable request body (callers pass the prompt string).

    Returns:
        The response body decoded as UTF-8 and parsed as JSON. The HTTP status
        is not inspected, so an error payload is returned as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``QUERY_TIMEOUT_SECONDS``.
    """
    data = json.dumps(payload)
    response = requests.request(
        "POST", API_URL, headers=headers, data=data, timeout=QUERY_TIMEOUT_SECONDS
    )
    return json.loads(response.content.decode("utf-8"))

def load_fewshot_examples(train, test, mapping, offset=0):
    """Pair every test sentence with its demonstration examples.

    Row ``j`` of ``mapping`` lists positional indices into ``train``. Column 0
    of that row and every column up to and including ``offset`` are skipped;
    the remaining columns become the demonstrations, numbered from 0.

    Returns:
        DataFrame with columns ``original``, ``ref`` and, per demonstration
        ``n``, ``ex<n>_orig`` / ``ex<n>_simp``.
    """
    columns = defaultdict(list)
    pairs = zip(test['original'], test['simple'])
    for row_pos, (source, reference) in enumerate(pairs):
        columns['original'].append(source)
        columns['ref'].append(reference)
        slot = 0
        for col_pos, train_idx in enumerate(mapping.iloc[row_pos]):
            if col_pos == 0 or col_pos <= offset:
                continue
            columns["ex" + str(slot) + "_orig"].append(train.iloc[train_idx]["original"])
            columns["ex" + str(slot) + "_simp"].append(train.iloc[train_idx]["simple"])
            slot += 1

    return pd.DataFrame(columns)
    
def construct_example(example_row, k=3):
    """Build the few-shot prompt for one example row.

    The prompt consists of ``k`` demonstration pairs taken from the
    ``ex<i>_orig`` / ``ex<i>_simp`` fields, followed by the row's own
    ``original`` sentence and an opening quote after ``Simple:`` for the model
    to continue.
    """
    demonstrations = [
        'Original: "' + example_row["ex" + str(n) + "_orig"] + '"\n'
        + 'Simple: "' + example_row["ex" + str(n) + "_simp"] + '"\n\n'
        for n in range(k)
    ]
    target = 'Original: "' + example_row["original"] + '"\nSimple: "'
    return "".join(demonstrations) + target

# Maximum number of requests issued while waiting for the model to finish one sentence.
REQUERY_LIMIT = 5
def generate_fewshot(example_row, k=3):
    """Generate a simplification for one example with a ``k``-shot prompt.

    The prompt is sent to ``query``. Whatever the model appended is collected,
    and the request is repeated with the extended prompt (at most
    ``REQUERY_LIMIT`` times) until the collected text contains either a closing
    quote followed by a newline or the marker ``Original:``. The text before
    that terminator is returned; if no terminator appears, everything
    collected so far is returned.
    """
    prompt = construct_example(example_row, k=k)
    closing = "\"\n"

    generated = ""
    for _ in range(REQUERY_LIMIT):
        completion = query(prompt)[0]['generated_text']
        # The response text is sliced at len(prompt), keeping only the continuation.
        continuation = completion[len(prompt):]
        generated += continuation
        if closing in generated:
            return generated.split(closing)[0]
        if "Original:" in generated:
            return generated.split("Original:")[0]
        prompt += continuation
    return generated

def fewshot_eval(train_path, test_path, preprocessed_path, k=3, output_csv="", checkpoint=""):
    """Run ``k``-shot generation over the test set and score it.

    Args:
        train_path: CSV with the demonstration pool.
        test_path: CSV with the sentences to simplify and their references.
        preprocessed_path: CSV mapping each test row to training indices.
        k: Number of demonstrations per prompt.
        output_csv: If non-empty, path where the ``original`` /
            ``fewshot output`` table is written.
        checkpoint: If non-empty and the file exists, its ``fewshot output``
            column is reused; only rows whose cell is missing (read back as a
            float NaN) are generated again.

    Returns:
        Tuple ``(bleu, sari)`` from ``calc_bleu_sari``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    preprocessed = pd.read_csv(preprocessed_path)
    examples = load_fewshot_examples(train, test, preprocessed)

    previous = []
    if checkpoint != "" and os.path.exists(checkpoint):
        previous = list(pd.read_csv(checkpoint)['fewshot output'])

    sentences = []
    for i in tqdm(range(len(examples))):
        # Reuse any output already present in the checkpoint.
        if i < len(previous) and not isinstance(previous[i], float):
            sentences.append(previous[i])
            continue
        try:
            sentences.append(generate_fewshot(examples.iloc[i], k))
        except Exception:
            # Best effort: record a blank so rows stay aligned with `test`.
            # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
            sentences.append("")

    if output_csv != "":
        output = {"original": list(test['original']), "fewshot output": sentences}
        pd.DataFrame(output).to_csv(output_csv, index=False)
    bleu, sari = calc_bleu_sari(test, sentences)
    return bleu, sari

# Try k-shots to fill in blanks, but if the error persists try k-=1
def few_shot_backoff(train_path, test_path, preprocessed_path, k=3, output_csv="", checkpoint=""):
    """Fill missing outputs, backing off to fewer demonstrations on failure.

    For every row without an output in the checkpoint, generation is attempted
    with ``k`` demonstrations, then ``k-1``, and so on down to 0. If every
    attempt fails, the input is reported and a blank output is recorded.

    Two defects of the earlier version are fixed. The function used to return
    ``None`` when no checkpoint existed; now every row is generated in that
    case. A row whose attempts all failed used to be dropped, which left the
    outputs shorter than, and misaligned with, the test set.

    Args:
        train_path: CSV with the demonstration pool.
        test_path: CSV with the sentences to simplify and their references.
        preprocessed_path: CSV mapping each test row to training indices.
        k: Largest number of demonstrations to try.
        output_csv: If non-empty, path where the ``original`` /
            ``fewshot output`` table is written.
        checkpoint: If non-empty and the file exists, its ``fewshot output``
            column is reused; cells read back as float NaN count as missing.

    Returns:
        Tuple ``(bleu, sari)`` from ``calc_bleu_sari``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    preprocessed = pd.read_csv(preprocessed_path)
    examples = load_fewshot_examples(train, test, preprocessed)

    previous = []
    if checkpoint != "" and os.path.exists(checkpoint):
        previous = list(pd.read_csv(checkpoint)['fewshot output'])

    sentences = []
    for i in tqdm(range(len(examples))):
        if i < len(previous) and not isinstance(previous[i], float):
            sentences.append(previous[i])
            continue

        generated = ""
        for curr_k in range(k, -1, -1):
            try:
                generated = generate_fewshot(examples.iloc[i], curr_k)
                break
            except Exception:
                # Retry with one demonstration fewer.
                continue
        else:
            # No break: every k down to 0 failed.
            print("ERROR ON INPUT: " + examples.iloc[i]['original'])
        sentences.append(generated)

    if output_csv != "":
        output = {"original": list(test['original']), "fewshot output": sentences}
        pd.DataFrame(output).to_csv(output_csv, index=False)
    bleu, sari = calc_bleu_sari(test, sentences)
    return bleu, sari


In [9]:
# Zero-shot evaluation on the Basque CBST test set; the output CSV doubles as
# the checkpoint, so outputs already stored there are reused.
# The two commented-out calls are the equivalent ASSET and SimplifyUR runs.
# bleu, sari = fewshot_eval("../data/English/ASSET_train.csv", "../data/English/ASSET_test.csv", "./few_shot_preprocessing/ASSET_test_random.csv", k=3, output_csv="../../fewshot-outputs/ASSET/3.rand.csv", checkpoint="../../fewshot-outputs/ASSET/3.rand.csv")
# bleu, sari = fewshot_eval("../data/Urdu/SimplifyUR_train.csv", "../data/Urdu/SimplifyUR_test.csv", "./few_shot_preprocessing/SimplifyUR_test_similarity.csv", k=0, output_csv="../../fewshot-outputs/SimplifyUR/0.sim.csv", checkpoint="../../fewshot-outputs/SimplifyUR/0.sim.csv")
bleu, sari = fewshot_eval("../data/Basque/CBST_train.csv", "../data/Basque/CBST_test.csv", "./few_shot_preprocessing/CBST_test_similarity.csv", k=0, output_csv="../../fewshot-outputs/CBST/0.sim.csv", checkpoint="../../fewshot-outputs/CBST/0.sim.csv")
print("BLEU", bleu)
print("SARI", sari)

100%|██████████| 23/23 [00:00<00:00, 694021.53it/s]

BLEU 29.938036356557735
SARI 34.4325805637174





In [15]:
# k-shot sweep on Easy Japanese Extended with similarity-selected demonstrations.
train_set = "../data/Japanese/Easy Japanese Extended_train.csv"
test_set = "../data/Japanese/Easy Japanese Extended_test.csv"
name = "EasyJAExt"

split = "test"
demonstration = "similarity" # "similarity" or "random"

k_shots = [0,1,2,3,5,10,20]

for k in k_shots:
    print(f"TESTING {k}-SHOT:")
    mapping = f"./few_shot_preprocessing/{name}_{split}_{demonstration}.csv"
    # Short tag used in the output file name.
    dem = {"similarity": "sim", "random": "rand"}.get(demonstration, "unk")
    output = f"../../fewshot-outputs/{name}/{k}.{dem}.csv"
    # First pass generates with k shots; the back-off pass then fills any blanks
    # left in the same CSV, and its scores are the ones reported.
    bleu, sari = fewshot_eval(
        train_set, test_set, mapping, k=k, output_csv=output, checkpoint=output
    )
    bleu, sari = few_shot_backoff(
        train_set, test_set, mapping, k=k, output_csv=output, checkpoint=output
    )

    print("BLEU", bleu)
    print("SARI", sari)



TESTING 0-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2853268.03it/s]
100it [00:00, 1823610.43it/s]


BLEU 11.689742698596966
SARI 30.529485143245168
TESTING 1-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2267191.35it/s]
100it [00:00, 1959955.14it/s]


BLEU 17.091509721695978
SARI 46.08253360482048
TESTING 2-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2219208.47it/s]
100it [00:00, 1923992.66it/s]


BLEU 19.35726793526695
SARI 49.39268449407829
TESTING 3-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2621440.00it/s]
100it [00:00, 1664406.35it/s]


BLEU 25.344205034474086
SARI 47.00969273624047
TESTING 5-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2219208.47it/s]
100it [00:00, 2267191.35it/s]


BLEU 20.316525103528605
SARI 49.30464091007859
TESTING 10-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2046001.95it/s]
100it [00:00, 2129088.32it/s]


BLEU 27.308727335578766
SARI 47.503824026412374
TESTING 20-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2452809.36it/s]
100it [00:00, 2076388.12it/s]

BLEU 28.250291891552635
SARI 46.55381480754754





In [16]:
# Same sweep with randomly chosen demonstrations. `name`, `split`, `train_set`
# and `test_set` are not defined in this cell and must already exist in the kernel.
demonstration = "random" # "similarity" or "random"

k_shots = [1,2,3,5,10,20]

for k in k_shots:
    print(f"TESTING {k}-SHOT:")
    mapping = f"./few_shot_preprocessing/{name}_{split}_{demonstration}.csv"
    # Short tag used in the output file name.
    dem = {"similarity": "sim", "random": "rand"}.get(demonstration, "unk")
    output = f"../../fewshot-outputs/{name}/{k}.{dem}.csv"
    # First pass generates with k shots; the back-off pass then fills any blanks
    # left in the same CSV, and its scores are the ones reported.
    bleu, sari = fewshot_eval(
        train_set, test_set, mapping, k=k, output_csv=output, checkpoint=output
    )
    bleu, sari = few_shot_backoff(
        train_set, test_set, mapping, k=k, output_csv=output, checkpoint=output
    )

    print("BLEU", bleu)
    print("SARI", sari)

TESTING 1-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2076388.12it/s]
100it [00:00, 2343186.59it/s]


BLEU 12.717909340095272
SARI 40.70364494524691
TESTING 2-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2796202.67it/s]
100it [00:00, 1657827.67it/s]


BLEU 16.327023748425336
SARI 41.381299000171616
TESTING 3-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2330168.89it/s]
100it [00:00, 2006844.02it/s]


BLEU 17.700870844368012
SARI 40.037339935348065
TESTING 5-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2086718.41it/s]
100it [00:00, 2330168.89it/s]


BLEU 17.117299016458176
SARI 42.55154266686833
TESTING 10-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2511559.28it/s]
100it [00:00, 2231012.77it/s]


BLEU 22.609779504090277
SARI 42.49635550070905
TESTING 20-SHOT:


100%|██████████| 100/100 [00:00<00:00, 2330168.89it/s]
100it [00:00, 2036069.90it/s]

BLEU 19.26750449984209
SARI 42.110943041781155





In [7]:
# Fill the blanks in the Danish 20-shot random-demonstration outputs, backing
# off to fewer shots when a request fails; the output CSV is also the checkpoint.
few_shot_backoff("../data/Danish/DSim Corpus_train.csv", "../data/Danish/DSim Corpus_test.csv", "./few_shot_preprocessing/DSim_test_random.csv", k=20, output_csv="../../fewshot-outputs/DSim/20.rand.csv", checkpoint="../../fewshot-outputs/DSim/20.rand.csv")

9it [00:10,  1.11s/it]

ERROR ON INPUT: DMI lover en lidt vejrmæssig kedelig weekend med gråvejr , byger og blæst .


10it [00:15,  1.64s/it]

ERROR ON INPUT: Men nu er folk pludselig begyndt at tænke på , at det i virkeligheden kan have en pris kun at fokusere på sig selv , siger generalsekretæren til avisen . 


11it [00:19,  2.15s/it]

ERROR ON INPUT: Den NUM . februar .


14it [00:23,  1.82s/it]

ERROR ON INPUT: Beslutningen er taget i fællesskab , lyder det .


15it [00:27,  2.27s/it]

ERROR ON INPUT: Vagn Jelsøe forklarer , at hvis fire piger køber fire par sko på én gang , fordi portoen så bliver billigere , skal de muligvis bevise , at skoene kun er til eget brug .


17it [00:32,  2.22s/it]

ERROR ON INPUT: Kammerater og sanitetspersonel ydede førstehjælp på stedet .


18it [00:36,  2.59s/it]

ERROR ON INPUT: Rækken af koncerter nåede aldrig at blive til noget , da Michael Jackson blev fundet død i sit hjem i Los Angeles den NUM . juni .


22it [00:40,  1.75s/it]

ERROR ON INPUT: Piloten blev dræbt , da et passagerfly tirsdag kørte af landingsbanen under et uvejr på den thailandske ferieø Koh Samui .


24it [00:44,  1.88s/it]

ERROR ON INPUT: En kendelse , der blev kæret på stedet .


30it [00:48,  1.24s/it]

ERROR ON INPUT: - Der bliver simpelthen brugt for mange penge i Nato på ting , der ikke er målrettet missioner .


In [8]:
def save_dump(csv_path, test, sentences):
    """Write partial outputs to ``csv_path``, padded to the length of ``test``.

    ``sentences`` is extended in place with empty strings until it has one
    entry per row of ``test``. It is then written next to ``test['original']``
    under the column name ``fewshot output``, without an index column.
    """
    missing = len(test) - len(sentences)
    # A non-positive count extends by nothing, so longer lists are left untouched.
    sentences.extend([""] * missing)
    pd.DataFrame(
        {"original": list(test['original']), "fewshot output": sentences}
    ).to_csv(csv_path, index=False)

In [9]:
# Write a CSV with one row per ASSET test sentence and a blank output for each
# (`sentences` starts empty, so save_dump pads every row).
# NOTE(review): to_csv replaces whatever is already stored at csv_path — confirm
# this cell should be part of a normal Run All.
sentences = []

csv_path = "../../fewshot-outputs/ASSET/10.sim.csv"
test = pd.read_csv("../data/English/ASSET_test.csv")

save_dump(csv_path, test, sentences)

In [9]:
def find_average_distance(distances, k=3, offset=0):
    """Mean distance over columns ``offset`` to ``offset + k`` (exclusive).

    The selected columns are averaged within each row first, and the row means
    are then averaged.
    """
    window = distances[:, offset:offset + k]
    per_row = np.average(window, axis=1)
    return np.average(per_row)

In [10]:
def similarity_experiment(train_path, test_path, preprocessed_path, preprocessed_dists, k=3, offset=0, output_csv="", checkpoint=""):
    """Run ``k``-shot generation with demonstrations taken from a rank offset.

    Works like ``fewshot_eval`` but skips the first ``offset`` neighbours of
    every test sentence, and also reports the average distance to the ``k``
    demonstrations that were used.

    Args:
        train_path: CSV with the demonstration pool.
        test_path: CSV with the sentences to simplify and their references.
        preprocessed_path: CSV mapping each test row to training indices.
        preprocessed_dists: CSV with the distances matching ``preprocessed_path``;
            its first column is dropped as the saved index.
        k: Number of demonstrations per prompt.
        offset: Number of leading neighbours to skip.
        output_csv: If non-empty, path where the ``original`` /
            ``fewshot output`` table is written.
        checkpoint: If non-empty and the file exists, its ``fewshot output``
            column is reused; cells read back as float NaN are regenerated.

    Returns:
        Tuple ``(bleu, sari, dist)``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    preprocessed = pd.read_csv(preprocessed_path)
    dists_df = pd.read_csv(preprocessed_dists)
    examples = load_fewshot_examples(train, test, preprocessed, offset=offset)

    previous = []
    if checkpoint != "" and os.path.exists(checkpoint):
        previous = list(pd.read_csv(checkpoint)['fewshot output'])

    sentences = []
    for i in tqdm(range(len(examples))):
        # Reuse any output already present in the checkpoint.
        if i < len(previous) and not isinstance(previous[i], float):
            sentences.append(previous[i])
            continue
        row = examples.iloc[i]
        try:
            sentences.append(generate_fewshot(row, k))
        except Exception:
            # Catching Exception (not a bare except) lets Ctrl-C interrupt the run.
            print("---")
            print("ERROR:  DUMPING GENERATED SENTENCES!")
            print()
            print(sentences)
            print()
            print("ERROR ON " + row['original'])
            print("---")
            sentences.append("")

    if output_csv != "":
        output = {"original": list(test['original']), "fewshot output": sentences}
        pd.DataFrame(output).to_csv(output_csv, index=False)
    bleu, sari = calc_bleu_sari(test, sentences)
    # read_csv has already consumed the header line, so only the index column
    # is dropped. The former [1:, 1:] slice also discarded the first test row.
    dist = find_average_distance(dists_df.to_numpy()[:, 1:], k, offset)
    return bleu, sari, dist

In [None]:
# 5-shot runs on SimplifyUR where the demonstrations start at increasing
# neighbour-rank offsets.
train_set = "../data/Urdu/SimplifyUR_train.csv"
test_set = "../data/Urdu/SimplifyUR_test.csv"
name = "SimplifyUR"

offsets = [0, 95, 195, 295, 395, 494]
k = 5

for offset in offsets:
    print(f"TESTING {k}-SHOT OFFSET {offset}:")
    mapping = f"./few_shot_preprocessing/{name}_sim_experiment.csv"
    # `dist` is the distance-table path here and is rebound to the returned
    # average distance by the call below.
    dist = f"./few_shot_preprocessing/{name}_sim_experiment_dist.csv"
    output = f"../../fewshot-sim-experiments/{name}/{offset}-{offset + k}.csv"
    bleu, sari, dist = similarity_experiment(
        train_set, test_set, mapping, dist,
        k=k, offset=offset, output_csv=output, checkpoint=output,
    )

    print("BLEU", bleu)
    print("SARI", sari)
    print("DIST", dist)


In [None]:
# Fill the blanks in the Urdu 5-shot similarity outputs, backing off to fewer
# shots when a request fails; the output CSV is also the checkpoint.
few_shot_backoff("../data/Urdu/SimplifyUR_train.csv", "../data/Urdu/SimplifyUR_test.csv", "./few_shot_preprocessing/SimplifyUR_test_similarity.csv", k=5, output_csv="../../fewshot-outputs/SimplifyUR/5.sim.csv", checkpoint="../../fewshot-outputs/SimplifyUR/5.sim.csv")