In [2]:
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source https://github.com/facebookresearch/access/tree/main/access
#

from functools import lru_cache
import re
from string import punctuation

from nltk.tokenize.nist import NISTTokenizer
from nltk.corpus import stopwords as nltk_stopwords
import spacy

# TODO: #language_specific
# Module-level set of NLTK's English stopwords, consulted by remove_stopwords().
# NOTE(review): the stopword list is English while get_spacy_model() loads the
# Japanese 'ja_core_news_sm' pipeline -- confirm this mismatch is intended.
stopwords = set(nltk_stopwords.words('english'))


@lru_cache(maxsize=100)  # To speed up subsequent calls
def word_tokenize(sentence):
    """Tokenize ``sentence`` with NLTK's NISTTokenizer and return one space-joined string.

    The tokenizer also splits placeholder tokens such as ``<PERSON_1>`` into
    ``< PERSON _ 1 >``; those are glued back together before returning.
    """
    spaced = ' '.join(NISTTokenizer().tokenize(sentence))

    def _glue(match):
        # Drop the whitespace that was inserted inside the placeholder.
        return ''.join(match.group().split())

    # Rejoin special tokens that were tokenized by error: e.g. "< PERSON _ 1 >" -> "<PERSON_1>"
    return re.sub(r'< (?:[A-Z]+ _ )+\d+ >', _glue, spaced)


def to_words(sentence):
    """Split ``sentence`` on runs of whitespace and return the list of words."""
    words = sentence.split()
    return words


def remove_punctuation_characters(text):
    """Return ``text`` with every character listed in ``string.punctuation`` removed."""
    kept = filter(lambda char: char not in punctuation, text)
    return ''.join(kept)


@lru_cache(maxsize=1000)
def is_punctuation(word):
    """Return True when nothing is left of ``word`` after stripping punctuation characters.

    An empty ``word`` therefore also counts as punctuation.
    """
    stripped = remove_punctuation_characters(word)
    return not stripped


@lru_cache(maxsize=100)
def remove_punctuation_tokens(text):
    """Drop the whitespace-separated tokens of ``text`` that consist only of punctuation.

    The surviving tokens are returned joined by single spaces.
    """
    kept = []
    for token in to_words(text):
        if is_punctuation(token):
            continue
        kept.append(token)
    return ' '.join(kept)


def remove_stopwords(text):
    """Remove tokens of ``text`` whose lowercase form is in the module-level ``stopwords`` set.

    The remaining tokens are returned joined by single spaces.
    """
    kept = []
    for token in to_words(text):
        if token.lower() in stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)


@lru_cache(maxsize=1)
def get_spacy_model():
    """Load the spaCy pipeline used for dependency parsing, downloading it if missing.

    The loaded pipeline is cached, so the model is only loaded once per process.

    Returns:
        The object returned by ``spacy.load('ja_core_news_sm')``.
    """
    model = 'ja_core_news_sm'
    if not spacy.util.is_package(model):
        # Same as running: python -m spacy download ja_core_news_sm
        spacy.cli.download(model)
        # `spacy.cli.link` only exists in spaCy 2.x. It was removed in 3.0, where an
        # installed model package is loaded by name, so only link when it is available.
        link = getattr(spacy.cli, 'link', None)
        if link is not None:
            link(model, model, force=True, model_path=spacy.util.get_package_path(model))
    return spacy.load(model)


@lru_cache(maxsize=10**6)
def spacy_process(text):
    """Run the cached spaCy pipeline on ``str(text)`` and return the result.

    Results are memoized per ``text`` value.
    """
    nlp = get_spacy_model()
    return nlp(str(text))

In [4]:
import math
import Levenshtein
from pathlib import Path
import fasttext


# Location of the fastText binary model loaded by get_word2rank().
# NOTE(review): this is an absolute path on one developer's machine; the notebook
# will not run elsewhere until it is pointed at a local copy of the model.
FASTTEXT_EMBEDDINGS_PATH = "/Users/michaelryan/Documents/School/GeorgiaTech/Research/MultilingualSimplification/fastText/cc.ja.300.bin"


def get_char_ratio(complex, simple):
	"""Return the length of ``simple`` divided by the length of ``complex``.

	Returns 0 when ``complex`` is empty, matching how get_depth_ratio and
	get_word_rank treat a zero denominator, instead of raising
	ZeroDivisionError.
	"""
	denom = len(complex)
	if denom == 0:
		return 0
	return len(simple) / denom


def get_levenshtein_similarity(complex_sentence, simple_sentence):
	"""Return ``Levenshtein.ratio`` computed between the two sentences."""
	similarity = Levenshtein.ratio(complex_sentence, simple_sentence)
	return similarity


def get_bucket(ratio, feat="NC"):
	"""Turn ``ratio`` into a control token such as ``"<NC_0.55> "``.

	The ratio is rounded up to the next multiple of 0.05 and capped at
	40 steps (2.0). The returned token ends with a single trailing space.
	"""
	steps = min(math.ceil(ratio / 0.05), 40)
	bucket = round(steps * 0.05, 2)
	tag = feat + "_" + str(bucket)
	return "<" + tag + "> "


def get_depth_ratio(complex, simple):
	"""Return the dependency-tree depth of ``simple`` divided by that of ``complex``.

	Returns 0 when the depth of ``complex`` is 0.
	"""
	complex_depth = get_dependency_tree_depth(complex)
	if complex_depth != 0:
		return get_dependency_tree_depth(simple) * 1.0 / complex_depth
	return 0


def get_dependency_tree_depth(sentence):
	"""Return the maximum dependency-tree depth over the sentences spaCy finds in ``sentence``.

	A root with no children has depth 0. If spaCy yields no sentences, the
	result is 0.
	"""
	def subtree_depth(token):
		# Depth counts edges below ``token``: a leaf is 0.
		kids = list(token.children)
		if not kids:
			return 0
		return 1 + max(subtree_depth(kid) for kid in kids)

	parsed = spacy_process(sentence)
	return max((subtree_depth(span.root) for span in parsed.sents), default=0)


import numpy as np
from functools import lru_cache


def yield_lines(filepath, n_lines=float('inf'), prop=1):
	"""Yield the lines of ``filepath`` with the trailing newline stripped.

	At most ``n_lines`` lines are produced. When ``prop`` is below 1,
	``n_lines`` must be left at its default and the limit becomes
	``int(prop * count_lines(filepath))``.
	"""
	limit = n_lines
	if prop < 1:
		assert limit == float('inf')
		limit = int(prop * count_lines(filepath))
	with open(filepath, 'r') as handle:
		for index, raw_line in enumerate(handle):
			if index >= limit:
				return
			yield raw_line.rstrip('\n')


def count_lines(filepath):
	"""Return the number of lines in the file at ``filepath``."""
	with Path(filepath).open() as handle:
		return sum(1 for _ in handle)


@lru_cache(maxsize=1)
def get_word2rank(vocab_size=np.inf):
	"""Build a dict mapping each word to its position in the fastText model's word list.

	Only the first ``vocab_size`` words are kept. The model is loaded from
	FASTTEXT_EMBEDDINGS_PATH and the result is cached.
	NOTE(review): presumably the word list is ordered by frequency, which is
	what makes the position usable as a rank -- confirm against fastText docs.
	"""
	# TODO: Decrease vocab size or load from smaller file
	vocabulary = fasttext.load_model(FASTTEXT_EMBEDDINGS_PATH).get_words(on_unicode_error='replace')
	ranks = {}
	for position, token in enumerate(vocabulary):
		if (position + 1) > vocab_size:
			break
		ranks[token] = position
	return ranks


def get_rank(word):
	"""Return the rank of ``word``, or the vocabulary size when the word is unknown."""
	word2rank = get_word2rank()
	return word2rank.get(word, len(word2rank))


def get_log_rank(word):
	"""Return ``log(1 + rank)`` for ``word``, using the natural logarithm."""
	rank = get_rank(word)
	return np.log(1 + rank)


def get_lexical_complexity_score(sentence):
	"""Return the 0.75 quantile of the log-ranks of the in-vocabulary words of ``sentence``.

	Punctuation-only tokens and stopwords are removed first. If no word is
	left in the vocabulary, ``log(1 + vocabulary size)`` is returned.
	"""
	content_words = to_words(remove_stopwords(remove_punctuation_tokens(sentence)))
	word2rank = get_word2rank()
	log_ranks = [get_log_rank(word) for word in content_words if word in word2rank]
	if not log_ranks:
		return np.log(1 + len(word2rank))  # TODO: This is completely arbitrary
	return np.quantile(log_ranks, 0.75)


def get_word_rank(complex, simple):
	"""Return the lexical complexity score of ``simple`` divided by that of ``complex``.

	Returns 0 when the score of ``complex`` is 0.
	"""
	complex_score = get_lexical_complexity_score(complex)
	if complex_score != 0:
		return get_lexical_complexity_score(simple) * 1.0 / complex_score
	return 0


def get_prefix(complex, simple):
	"""Compute the four control tokens (NC, LS, DR, WR) for a sentence pair.

	Each feature is computed from ``(complex, simple)``, bucketed with
	get_bucket, and the tokens are joined with a space. Because every token
	already ends with a space, consecutive tokens are separated by two
	spaces and the result ends with one.
	"""
	features = (
		("NC", get_char_ratio),
		("LS", get_levenshtein_similarity),
		("DR", get_depth_ratio),
		("WR", get_word_rank),
	)
	return " ".join([get_bucket(measure(complex, simple), feat) for feat, measure in features])

def get_prefix_precomputed(chr_ratio, lev_sim, dep_ratio, word_rank):
	"""Build the control-token prefix from already computed feature values.

	The values are bucketed in the order NC, LS, DR, WR and joined with a
	space. Each token from get_bucket already ends with a space.
	"""
	tagged = [
		(chr_ratio, "NC"),
		(lev_sim, "LS"),
		(dep_ratio, "DR"),
		(word_rank, "WR"),
	]
	return " ".join([get_bucket(value, feat) for value, feat in tagged])

# Quick manual checks of the helpers defined above; results are printed below the cell.
# A ratio above the cap is clamped to the highest bucket.
print(get_bucket(2.81))
print(get_levenshtein_similarity("asdf", "asbf"))
print(get_depth_ratio("This is a test.", "This is."))
print(get_word_rank("This is a test.", "This is."))
# Full prefixes for English, French and Russian sentence pairs.
# NOTE(review): get_spacy_model() and FASTTEXT_EMBEDDINGS_PATH are configured for
# Japanese, so the DR/WR values for these inputs may not be meaningful.
print(get_prefix("This is a test.", "This is."))
print(get_prefix("C'est un test.","C'est un."))
print(get_prefix("Это проверка.", "Это."))

<NC_2.0> 
0.75
1.0
1.0
<NC_0.55>  <LS_0.7>  <DR_1.0>  <WR_1.0> 
<NC_0.65>  <LS_0.8>  <DR_1.0>  <WR_1.45> 
<NC_0.35>  <LS_0.5>  <DR_1.0>  <WR_1.15> 


In [5]:
import pandas as pd
from tqdm import tqdm
import os

In [6]:
def add_control_tokens_to_df(df, override_dr=None):
    """Prefix every ``original`` sentence of ``df`` with its control tokens.

    Args:
        df: DataFrame with ``original`` and ``simple`` columns.
        override_dr: If not ``None``, this value is used as the depth ratio of
            every row instead of calling get_depth_ratio.

    Returns:
        A tuple ``(new_df, avg_prefix)``. ``new_df`` has the columns
        ``original`` (prefix + original sentence) and ``simple`` (unchanged).
        ``avg_prefix`` is the prefix built from the mean of each feature over
        all rows.
    """
    original = []
    simple = []

    char_ratios = []
    lev_sims = []
    depth_ratios = []
    word_ranks = []

    # One progress bar only: wrapping the iterator in tqdm AND updating a second
    # bar by hand printed two interleaved bars.
    for _, content in tqdm(df.iterrows(), total=len(df)):
        chr_ratio = get_char_ratio(content['original'], content['simple'])
        lev_sim = get_levenshtein_similarity(content['original'], content['simple'])
        # Compare against None rather than truthiness so that an explicit
        # override of 0 is honoured instead of being silently recomputed.
        if override_dr is None:
            dep_ratio = get_depth_ratio(content['original'], content['simple'])
        else:
            dep_ratio = override_dr

        word_rank = get_word_rank(content['original'], content['simple'])

        prefix = get_prefix_precomputed(chr_ratio, lev_sim, dep_ratio, word_rank)

        char_ratios.append(chr_ratio)
        lev_sims.append(lev_sim)
        depth_ratios.append(dep_ratio)
        word_ranks.append(word_rank)

        original.append(prefix + content['original'])
        simple.append(content['simple'])

    avg_prefix = get_prefix_precomputed(
        np.average(np.array(char_ratios)),
        np.average(np.array(lev_sims)),
        np.average(np.array(depth_ratios)),
        np.average(np.array(word_ranks)),
    )

    return pd.DataFrame({"original": original, "simple": simple}), avg_prefix

In [7]:
# Add control tokens to the Easy Japanese Corpus training split; `prefix` is the
# averaged prefix over the whole split.
df, prefix = add_control_tokens_to_df(pd.read_csv("../data/Japanese/Easy Japanese Corpus_train.csv"))

27600it [02:19, 198.04it/s]0 [02:19<00:00, 202.72it/s]
100%|██████████| 27600/27600 [02:19<00:00, 198.04it/s]


In [8]:
# Preview the first five prefixed rows.
df.head(5)

Unnamed: 0,original,simple
0,<NC_1.0> <LS_0.9> <DR_1.0> <WR_1.0> 彼は私を見て危...,彼は私を見て危険だといいました。
1,<NC_0.95> <LS_0.8> <DR_1.0> <WR_1.0> もう授業中に...,もう授業中には決して勝手に話しません。
2,<NC_1.1> <LS_0.75> <DR_1.5> <WR_1.0> 午前中は在宅...,午前中は家にいる予定です。
3,<NC_1.1> <LS_0.95> <DR_1.0> <WR_1.0> 電話を切らず...,電話を切らずにちょっとお待ちください。
4,<NC_1.0> <LS_0.95> <DR_1.0> <WR_1.0> 彼の妻はもち...,彼の妻はもちろん子供たちもそのパーティーに呼ばれた。


In [9]:
# Show the averaged control-token prefix for the split.
print(prefix)

<NC_1.1>  <LS_0.8>  <DR_1.15>  <WR_1.0> 


In [10]:
def add_ctrl_tokens(path_to_modify, store_averages="../misc/average_train_prefix.txt", override_dr=None):
    """Add control tokens to the CSV at ``path_to_modify`` and overwrite it in place.

    The averaged prefix is appended to ``store_averages`` as one line of the
    form ``<file name>: <prefix>``. ``override_dr`` is passed through to
    add_control_tokens_to_df. Because the input file is overwritten, calling
    this twice on the same path prefixes the sentences twice.
    """
    source_df = pd.read_csv(path_to_modify)
    tagged_df, avg_prefix = add_control_tokens_to_df(source_df, override_dr)
    with open(store_averages, 'a') as averages_file:
        summary_line = os.path.basename(path_to_modify) + ": " + avg_prefix + "\n"
        averages_file.write(summary_line)
    tagged_df.to_csv(path_to_modify, index=False)

In [12]:
# Rewrites the Extended training CSV in place with control-token prefixes and
# appends its averaged prefix to the default averages file. Re-running this
# cell on the already-prefixed file would prefix the sentences again.
add_ctrl_tokens("../data/Japanese/Easy Japanese Extended_train.csv")

32248it [02:33, 210.58it/s]8 [02:33<00:00, 467.93it/s]
100%|██████████| 32248/32248 [02:33<00:00, 210.58it/s]
