Merge pull request #686 from QData/updates
[Draft] Updating transformers, datasets, gensim
jxmorris12 committed Aug 14, 2022
2 parents 2ac1bcb + 0f83bd6 commit f5817fc
Showing 7 changed files with 20 additions and 20 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -21,7 +21,7 @@
 author = "UVA QData Lab"
 
 # The full version, including alpha/beta/rc tags
-release = "0.3.6"
+release = "0.3.7"
 
 # Set master doc to `index.rst`.
 master_doc = "index"
4 changes: 2 additions & 2 deletions requirements.txt
@@ -5,13 +5,13 @@ filelock
 language_tool_python
 lemminflect
 lru-dict
-datasets==2.2.2
+datasets==2.4.0
 nltk
 numpy>=1.21.0
 pandas>=1.0.1
 scipy>=1.4.1
 torch>=1.7.0,!=1.8
-transformers>=3.3.0
+transformers>=4.21.0
 terminaltables
 tqdm
 word2number
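
As a quick sanity check outside the diff, one can confirm an environment picked up the new pins; this snippet is illustrative and not part of the commit:

import datasets
import transformers

print(datasets.__version__)      # expected: 2.4.0, per the new pin
print(transformers.__version__)  # expected: >= 4.21.0
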
4 changes: 2 additions & 2 deletions setup.py
@@ -35,11 +35,11 @@
 ]
 
 extras["optional"] = [
-    "sentence_transformers>0.2.6",
+    "sentence_transformers==2.2.0",
     "stanza",
     "visdom",
     "wandb",
-    "gensim==3.8.3",
+    "gensim==4.1.2",
 ]
 
 # For developers, install development tools along with all optional dependencies.
2 changes: 1 addition & 1 deletion textattack/constraints/semantics/sentence_encoders/universal_sentence_encoder/multilingual_universal_sentence_encoder.py
@@ -2,12 +2,12 @@
 multilingual universal sentence encoder
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 """
-import tensorflow_text  # noqa: F401
 
 from textattack.constraints.semantics.sentence_encoders import SentenceEncoder
 from textattack.shared.utils import LazyLoader
 
 hub = LazyLoader("tensorflow_hub", globals(), "tensorflow_hub")
+tensorflow_text = LazyLoader("tensorflow_text", globals(), "tensorflow_text")
 
 
 class MultilingualUniversalSentenceEncoder(SentenceEncoder):
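
This change replaces the eager module-level import of tensorflow_text with the same LazyLoader pattern already used for tensorflow_hub, so the TensorFlow stack is only imported when the encoder is first used. A minimal sketch of the deferred-import idea follows; the real textattack.shared.utils.LazyLoader may differ in details:

import importlib
import types


class LazyLoader(types.ModuleType):
    # Stand-in sketch of textattack.shared.utils.LazyLoader: defers the
    # import until an attribute on the module is first accessed.

    def __init__(self, local_name, parent_globals, name):
        self._local_name = local_name
        self._parent_globals = parent_globals
        super().__init__(name)

    def _load(self):
        # Import for real, then swap ourselves out of the caller's
        # namespace so later lookups hit the actual module directly.
        module = importlib.import_module(self.__name__)
        self._parent_globals[self._local_name] = module
        self.__dict__.update(module.__dict__)
        return module

    def __getattr__(self, item):
        return getattr(self._load(), item)


# Mirrors the diff: nothing is imported until first attribute access.
tensorflow_text = LazyLoader("tensorflow_text", globals(), "tensorflow_text")
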
2 changes: 1 addition & 1 deletion textattack/models/tokenizers/glove_tokenizer.py
@@ -46,7 +46,7 @@ def __init__(
         word_list_file = tempfile.NamedTemporaryFile()
         word_list_file.write(json.dumps(word_id_map).encode())
 
-        word_level = hf_tokenizers.models.WordLevel(
+        word_level = hf_tokenizers.models.WordLevel.from_file(
             word_list_file.name, unk_token=str(unk_token)
         )
         tokenizer = hf_tokenizers.Tokenizer(word_level)
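
In newer releases of the Hugging Face tokenizers library, the WordLevel constructor takes an in-memory vocab dict rather than a filename, so a vocab serialized to disk is loaded through the from_file classmethod instead, which is what the change above adopts. A standalone sketch with a toy vocab (names here are illustrative, not from the commit):

import json
import tempfile

import tokenizers as hf_tokenizers

# Toy word-to-id map, serialized the same way glove_tokenizer.py does it.
word_id_map = {"[UNK]": 0, "hello": 1, "world": 2}
word_list_file = tempfile.NamedTemporaryFile(suffix=".json")
word_list_file.write(json.dumps(word_id_map).encode())
word_list_file.flush()  # make sure the JSON is on disk before reading

word_level = hf_tokenizers.models.WordLevel.from_file(
    word_list_file.name, unk_token="[UNK]"
)
tokenizer = hf_tokenizers.Tokenizer(word_level)
tokenizer.pre_tokenizer = hf_tokenizers.pre_tokenizers.Whitespace()
print(tokenizer.encode("hello world").ids)  # [1, 2]
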
2 changes: 2 additions & 0 deletions textattack/shared/utils/tensor.py
@@ -9,6 +9,8 @@ def batch_model_predict(model_predict, inputs, batch_size=32):
     """
     outputs = []
     i = 0
+    # print("batch_model_predict", inputs.shape)
+    # print("inputs:", inputs)
     while i < len(inputs):
         batch = inputs[i : i + batch_size]
         batch_preds = model_predict(batch)
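
For context, the function these commented-out debug lines were added to slices inputs into batch_size chunks and concatenates the per-batch predictions. A self-contained sketch of that pattern, using NumPy in place of the real torch-aware helper:

import numpy as np


def batch_predict_sketch(model_predict, inputs, batch_size=32):
    # Run the model over fixed-size slices of inputs, then join the results.
    outputs = []
    i = 0
    while i < len(inputs):
        batch = inputs[i : i + batch_size]
        outputs.append(model_predict(batch))
        i += batch_size
    return np.concatenate(outputs, axis=0)


# Toy usage: an identity "model" over 100 rows runs in batches of 32.
preds = batch_predict_sketch(lambda batch: batch, np.arange(100).reshape(100, 1))
assert preds.shape == (100, 1)
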
24 changes: 11 additions & 13 deletions textattack/shared/word_embeddings.py
@@ -312,9 +312,7 @@ class GensimWordEmbedding(AbstractWordEmbedding):
     def __init__(self, keyed_vectors):
         gensim = utils.LazyLoader("gensim", globals(), "gensim")
 
-        if isinstance(
-            keyed_vectors, gensim.models.keyedvectors.WordEmbeddingsKeyedVectors
-        ):
+        if isinstance(keyed_vectors, gensim.models.KeyedVectors):
             self.keyed_vectors = keyed_vectors
         else:
             raise ValueError(
@@ -335,11 +333,11 @@ def __getitem__(self, index):
         """
         if isinstance(index, str):
             try:
-                index = self.keyed_vectors.vocab.get(index).index
+                index = self.keyed_vectors.key_to_index.get(index)
             except KeyError:
                 return None
         try:
-            return self.keyed_vectors.vectors_norm[index]
+            return self.keyed_vectors.get_normed_vectors()[index]
         except IndexError:
             # word embedding ID out of bounds
             return None
@@ -352,10 +350,10 @@ def word2index(self, word):
         Returns:
             index (int)
         """
-        vocab = self.keyed_vectors.vocab.get(word)
+        vocab = self.keyed_vectors.key_to_index.get(word)
         if vocab is None:
             raise KeyError(word)
-        return vocab.index
+        return vocab
 
     def index2word(self, index):
         """
@@ -368,7 +366,7 @@ def index2word(self, index):
         """
         try:
             # this is a list, so the error would be IndexError
-            return self.keyed_vectors.index2word[index]
+            return self.keyed_vectors.index_to_key[index]
         except IndexError:
             raise KeyError(index)
 
@@ -386,8 +384,8 @@ def get_mse_dist(self, a, b):
         try:
             mse_dist = self._mse_dist_mat[a][b]
         except KeyError:
-            e1 = self.keyed_vectors.vectors_norm[a]
-            e2 = self.keyed_vectors.vectors_norm[b]
+            e1 = self.keyed_vectors.get_normed_vectors()[a]
+            e2 = self.keyed_vectors.get_normed_vectors()[b]
             e1 = torch.tensor(e1).to(utils.device)
             e2 = torch.tensor(e2).to(utils.device)
             mse_dist = torch.sum((e1 - e2) ** 2).item()
@@ -406,9 +404,9 @@ def get_cos_sim(self, a, b):
             distance (float): cosine similarity
         """
         if not isinstance(a, str):
-            a = self.keyed_vectors.index2word[a]
+            a = self.keyed_vectors.index_to_key[a]
         if not isinstance(b, str):
-            b = self.keyed_vectors.index2word[b]
+            b = self.keyed_vectors.index_to_key[b]
         cos_sim = self.keyed_vectors.similarity(a, b)
         return cos_sim
 
@@ -421,7 +419,7 @@ def nearest_neighbours(self, index, topn, return_words=True):
         Returns:
             neighbours (list[int]): List of indices of the nearest neighbours
         """
-        word = self.keyed_vectors.index2word[index]
+        word = self.keyed_vectors.index_to_key[index]
         return [
             self.word2index(i[0])
             for i in self.keyed_vectors.similar_by_word(word, topn)
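
All hunks in this file track renames from gensim 3 to gensim 4: WordEmbeddingsKeyedVectors was folded into gensim.models.KeyedVectors, vocab[word].index became key_to_index[word], index2word became index_to_key, and the vectors_norm attribute was replaced by the get_normed_vectors() method. A minimal sketch of the gensim 4 API, assuming gensim >= 4.0 and a toy corpus trained on the spot:

from gensim.models import Word2Vec

# Tiny illustrative corpus; real use would load pretrained vectors instead.
sentences = [["hello", "world"], ["hello", "gensim"]]
kv = Word2Vec(sentences, vector_size=8, min_count=1).wv  # a KeyedVectors

idx = kv.key_to_index["hello"]      # gensim 3: kv.vocab["hello"].index
word = kv.index_to_key[idx]         # gensim 3: kv.index2word[idx]
vec = kv.get_normed_vectors()[idx]  # gensim 3: kv.vectors_norm[idx], after init_sims()
print(word, vec.shape)              # hello (8,)
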
