From c35bde9b937985f00ac72c2d3bbc4719e24a2ee9 Mon Sep 17 00:00:00 2001 From: Jack Morris Date: Fri, 12 Aug 2022 15:54:23 -0400 Subject: [PATCH 1/4] updating transformers, datasets, gensim --- requirements.txt | 4 ++-- ...multilingual_universal_sentence_encoder.py | 3 +-- .../models/tokenizers/glove_tokenizer.py | 2 +- textattack/shared/word_embeddings.py | 22 +++++++++---------- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/requirements.txt b/requirements.txt index a791a32d6..4befebef6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,13 +5,13 @@ filelock language_tool_python lemminflect lru-dict -datasets==2.2.2 +datasets==2.4.0 nltk numpy>=1.21.0 pandas>=1.0.1 scipy>=1.4.1 torch>=1.7.0,!=1.8 -transformers>=3.3.0 +transformers>=4.21.0 terminaltables tqdm word2number diff --git a/textattack/constraints/semantics/sentence_encoders/universal_sentence_encoder/multilingual_universal_sentence_encoder.py b/textattack/constraints/semantics/sentence_encoders/universal_sentence_encoder/multilingual_universal_sentence_encoder.py index 717a3944b..3b2da0379 100644 --- a/textattack/constraints/semantics/sentence_encoders/universal_sentence_encoder/multilingual_universal_sentence_encoder.py +++ b/textattack/constraints/semantics/sentence_encoders/universal_sentence_encoder/multilingual_universal_sentence_encoder.py @@ -2,13 +2,12 @@ multilingual universal sentence encoder ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ """ -import tensorflow_text # noqa: F401 from textattack.constraints.semantics.sentence_encoders import SentenceEncoder from textattack.shared.utils import LazyLoader hub = LazyLoader("tensorflow_hub", globals(), "tensorflow_hub") - +tensorflow_text = LazyLoader("tensorflow_text", globals(), "tensorflow_text") class MultilingualUniversalSentenceEncoder(SentenceEncoder): """Constraint using similarity between sentence encodings of x and x_adv diff --git a/textattack/models/tokenizers/glove_tokenizer.py b/textattack/models/tokenizers/glove_tokenizer.py index 847b9fc89..6deb616fa 100644 --- a/textattack/models/tokenizers/glove_tokenizer.py +++ b/textattack/models/tokenizers/glove_tokenizer.py @@ -46,7 +46,7 @@ def __init__( word_list_file = tempfile.NamedTemporaryFile() word_list_file.write(json.dumps(word_id_map).encode()) - word_level = hf_tokenizers.models.WordLevel( + word_level = hf_tokenizers.models.WordLevel.from_file( word_list_file.name, unk_token=str(unk_token) ) tokenizer = hf_tokenizers.Tokenizer(word_level) diff --git a/textattack/shared/word_embeddings.py b/textattack/shared/word_embeddings.py index 76b5bb960..49b6977e0 100644 --- a/textattack/shared/word_embeddings.py +++ b/textattack/shared/word_embeddings.py @@ -313,7 +313,7 @@ def __init__(self, keyed_vectors): gensim = utils.LazyLoader("gensim", globals(), "gensim") if isinstance( - keyed_vectors, gensim.models.keyedvectors.WordEmbeddingsKeyedVectors + keyed_vectors, gensim.models.KeyedVectors ): self.keyed_vectors = keyed_vectors else: @@ -335,11 +335,11 @@ def __getitem__(self, index): """ if isinstance(index, str): try: - index = self.keyed_vectors.vocab.get(index).index + index = self.keyed_vectors.key_to_index.get(index) except KeyError: return None try: - return self.keyed_vectors.vectors_norm[index] + return self.keyed_vectors.get_normed_vectors()[index] except IndexError: # word embedding ID out of bounds return None @@ -352,10 +352,10 @@ def word2index(self, word): Returns: index (int) """ - vocab = self.keyed_vectors.vocab.get(word) + vocab = self.keyed_vectors.key_to_index.get(word) if vocab is None: raise KeyError(word) - return vocab.index + return vocab def index2word(self, index): """ @@ -368,7 +368,7 @@ def index2word(self, index): """ try: # this is a list, so the error would be IndexError - return self.keyed_vectors.index2word[index] + return self.keyed_vectors.index_to_key[index] except IndexError: raise KeyError(index) @@ -386,8 +386,8 @@ def get_mse_dist(self, a, b): try: mse_dist = self._mse_dist_mat[a][b] except KeyError: - e1 = self.keyed_vectors.vectors_norm[a] - e2 = self.keyed_vectors.vectors_norm[b] + e1 = self.keyed_vectors.get_normed_vectors()[a] + e2 = self.keyed_vectors.get_normed_vectors()[b] e1 = torch.tensor(e1).to(utils.device) e2 = torch.tensor(e2).to(utils.device) mse_dist = torch.sum((e1 - e2) ** 2).item() @@ -406,9 +406,9 @@ def get_cos_sim(self, a, b): distance (float): cosine similarity """ if not isinstance(a, str): - a = self.keyed_vectors.index2word[a] + a = self.keyed_vectors.index_to_key[a] if not isinstance(b, str): - b = self.keyed_vectors.index2word[b] + b = self.keyed_vectors.index_to_key[b] cos_sim = self.keyed_vectors.similarity(a, b) return cos_sim @@ -421,7 +421,7 @@ def nearest_neighbours(self, index, topn, return_words=True): Returns: neighbours (list[int]): List of indices of the nearest neighbours """ - word = self.keyed_vectors.index2word[index] + word = self.keyed_vectors.index_to_key[index] return [ self.word2index(i[0]) for i in self.keyed_vectors.similar_by_word(word, topn) From 4e0abaea12f6c7241820080c33efcbbb9acda2c5 Mon Sep 17 00:00:00 2001 From: Jack Morris Date: Fri, 12 Aug 2022 16:15:41 -0400 Subject: [PATCH 2/4] make format --- .../multilingual_universal_sentence_encoder.py | 1 + textattack/shared/utils/tensor.py | 2 ++ textattack/shared/word_embeddings.py | 4 +--- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/textattack/constraints/semantics/sentence_encoders/universal_sentence_encoder/multilingual_universal_sentence_encoder.py b/textattack/constraints/semantics/sentence_encoders/universal_sentence_encoder/multilingual_universal_sentence_encoder.py index 3b2da0379..578e9b892 100644 --- a/textattack/constraints/semantics/sentence_encoders/universal_sentence_encoder/multilingual_universal_sentence_encoder.py +++ b/textattack/constraints/semantics/sentence_encoders/universal_sentence_encoder/multilingual_universal_sentence_encoder.py @@ -9,6 +9,7 @@ hub = LazyLoader("tensorflow_hub", globals(), "tensorflow_hub") tensorflow_text = LazyLoader("tensorflow_text", globals(), "tensorflow_text") + class MultilingualUniversalSentenceEncoder(SentenceEncoder): """Constraint using similarity between sentence encodings of x and x_adv where the text embeddings are created using the Multilingual Universal diff --git a/textattack/shared/utils/tensor.py b/textattack/shared/utils/tensor.py index 4f946675e..8b6a3a028 100644 --- a/textattack/shared/utils/tensor.py +++ b/textattack/shared/utils/tensor.py @@ -9,6 +9,8 @@ def batch_model_predict(model_predict, inputs, batch_size=32): """ outputs = [] i = 0 + # print("batch_model_predict", inputs.shape) + # print("inputs:", inputs) while i < len(inputs): batch = inputs[i : i + batch_size] batch_preds = model_predict(batch) diff --git a/textattack/shared/word_embeddings.py b/textattack/shared/word_embeddings.py index 49b6977e0..d9f2f3d3c 100644 --- a/textattack/shared/word_embeddings.py +++ b/textattack/shared/word_embeddings.py @@ -312,9 +312,7 @@ class GensimWordEmbedding(AbstractWordEmbedding): def __init__(self, keyed_vectors): gensim = utils.LazyLoader("gensim", globals(), "gensim") - if isinstance( - keyed_vectors, gensim.models.KeyedVectors - ): + if isinstance(keyed_vectors, gensim.models.KeyedVectors): self.keyed_vectors = keyed_vectors else: raise ValueError( From ed979ca666cae9f9e26316e24b775df1fabfc333 Mon Sep 17 00:00:00 2001 From: Jack Morris Date: Sat, 13 Aug 2022 09:29:31 -0400 Subject: [PATCH 3/4] update gensim --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 99164b064..6053e30bd 100644 --- a/setup.py +++ b/setup.py @@ -35,11 +35,11 @@ ] extras["optional"] = [ - "sentence_transformers>0.2.6", + "sentence_transformers==2.2.0", "stanza", "visdom", "wandb", - "gensim==3.8.3", + "gensim==4.1.2", ] # For developers, install development tools along with all optional dependencies. From 0f83bd618ae7ba615bfdf849100c2f172618fabd Mon Sep 17 00:00:00 2001 From: Jack Morris Date: Sun, 14 Aug 2022 12:17:10 -0400 Subject: [PATCH 4/4] v0.3.7 --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 7daaa27aa..f789e2760 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,7 +21,7 @@ author = "UVA QData Lab" # The full version, including alpha/beta/rc tags -release = "0.3.6" +release = "0.3.7" # Set master doc to `index.rst`. master_doc = "index"