From f36b2b5305f6be66b0c6f4248cabcd35f38dc96f Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 27 Feb 2020 16:26:41 -0600 Subject: [PATCH 01/89] add base class and restructure encoding extractor --- pliers/extractors/text.py | 200 ++++++++++++++++++++++---------------- 1 file changed, 118 insertions(+), 82 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 418e1858..a3481024 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -402,7 +402,112 @@ def _extract(self, stim): features=self.features, orders=order_list) -class PretrainedBertEncodingExtractor(ComplexTextExtractor): +class PretrainedBertExtractor(ComplexTextExtractor): + + ''' Base class for all Extractors based on pretrained BERT. + + Args: + pretrained_model_or_path (str): A string specifying which BERT + model to use. Can be one of pretrained BERT models listed at + https://huggingface.co/transformers/pretrained_models.html + (valid values include all the models with 'bert' prefix) + or path to custom model. + tokenizer (str): Type of tokenization used in the tokenization step. + If different from model, out-of-vocabulary tokens may be treated + as unknown tokens. + model_class (str): Specifies class of Bert model. Must be one of + 'BertModel' or 'BertForLM'. + framework (str): name deep learning framework to use. Must be 'pt' + (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. + tokenizer_kwargs (dict): Named arguments for tokenizer. + See https://huggingface.co/transformers/main_classes/tokenizer.html + for further info. + ''' + + _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', + 'model_class', 'model_kwargs', 'tokenizer_kwargs') + + def __init__(self, + pretrained_model_or_path='bert-base-uncased', + tokenizer='bert-base-uncased', + model_class='BertModel', + framework='pt', + model_kwargs=None, + tokenizer_kwargs=None): + + verify_dependencies(['transformers']) + + if framework not in ['pt', 'tf']: + raise(ValueError('''Invalid framework; + must be one of 'pt' (pytorch) or 'tf' (tensorflow)''')) + + if model_kwargs is None: + model_kwargs = {} + if tokenizer_kwargs is None: + tokenizer_kwargs = {} + + self.pretrained_model = pretrained_model_or_path + self.tokenizer_type = tokenizer + self.framework = framework + self.tokenizer_kwargs = tokenizer_kwargs + + model = model_class if self.framework == 'pt' else 'TF' + model_class + self.model = getattr(transformers, model).from_pretrained( + pretrained_model_or_path, **model_kwargs) + self.tokenizer = transformers.BertTokenizer.from_pretrained( + tokenizer, **tokenizer_kwargs) + super(PretrainedBertExtractor, self).__init__() + + def _mask(self, wds): + return wds + + def _preprocess(self, stims): + txt, ons, dur = zip(*[(s.text, s.onset, s.duration) + for s in stims.elements]) + wds = self._mask(txt) + t_tok = [self.tokenizer.tokenize(w) for w in wds] + t_nr = [len(t) for t in t_tok] + t_tok = list(flatten(t_tok)) + t_wds, t_ons, t_dur = map(lambda x: np.repeat(x, t_nr), [wds, ons, dur]) + t_idx = self.tokenizer.encode(t_tok, return_tensors=self.framework) + return t_wds, t_ons, t_dur, t_tok, t_idx + + def _postprocess(self, preds, t_tok, t_wds, t_ons, t_dur): + out = preds[0].squeeze() + t_tok = ['CLS'] + t_tok + ['SEP'] + t_wds = ['CLS'] + t_wds + ['SEP'] + return out, t_tok, t_wds, t_ons, t_dur + + def _get_feature_names(self): #needed? 
+ return ['encoding', 'token', 'word'] + + def _extract(self, stims): + t_wds, t_ons, t_dur, t_tok, t_idx = self._preprocess(stims) + preds = self.model(t_idx) + preds = [p.detach().numpy() if self.framework == 'pt' else + p.numpy() for p in preds] + out, t_tok, t_wds, t_ons, t_dur = self._postprocess(preds, t_tok, t_wds, t_ons, t_dur) + data = [out.tolist(), t_tok, t_wds] + features = self._get_feature_names() + return ExtractorResult(data, stims, self, features=features, + onsets=t_ons, durations=t_dur) + + def _get_model_attributes(self): #needed? + return ['pretrained_model', 'framework', 'model_class', + 'tokenizer_type'] + + def _to_df(self, result, include_attributes=True): + df_dict = dict(zip(result.features, result._data)) + if include_attributes: + log_dict = {attr: getattr(result.extractor, attr) for + attr in self._get_model_attributes()} + df_dict.update(log_dict) + result_df = pd.DataFrame(df_dict) + result_df['object_id'] = range(result_df.shape[0]) + return result_df + + +class PretrainedBertEncodingExtractor(PretrainedBertExtractor): ''' Uses transformers library to extract contextualized encodings for words or text sequences using pre-trained BERT models. @@ -447,103 +552,35 @@ def __init__(self, pooling=None, model_kwargs=None, tokenizer_kwargs=None): - - verify_dependencies(['transformers']) - - if framework not in ['pt', 'tf']: - raise(ValueError('''Invalid framework; - must be one of 'pt' (pytorch) or 'tf' (tensorflow)''')) - if encoding_level not in ['token', 'sequence']: raise(ValueError('''Invalid encoding_level; must be one of 'token' or 'sequence'.''')) - - if model_kwargs is None: - model_kwargs = {} - - if tokenizer_kwargs is None: - tokenizer_kwargs = {} - - self.pretrained_model = pretrained_model_or_path - self.tokenizer_type = tokenizer - self.framework = framework self.encoding_level = encoding_level self.pooling = pooling - self.model_kwargs = model_kwargs - self.tokenizer_kwargs = tokenizer_kwargs - model_name = 'BertModel' if self.framework == 'pt' else 'TFBertModel' - - self.tokenizer = transformers.BertTokenizer.from_pretrained( - tokenizer, **tokenizer_kwargs) - self.model = getattr(transformers, model_name).from_pretrained( - pretrained_model_or_path, **model_kwargs) - - super(PretrainedBertEncodingExtractor, self).__init__() - - def _extract(self, stims): - if stims.name == '': - stims.name = ' '.join([s.text for s in stims.elements]) - - text, onsets, durations = zip( - *((s.text, s.onset, s.duration) for s in stims.elements)) - tokens = [self.tokenizer.tokenize(t) for t in text] - tokens_flat = list(flatten(tokens)) - - def cast_to_token_level(word_level_list): - token_level_list = [itertools.repeat(word_level_list[i], len(tok)) - for i, tok in enumerate(tokens)] - token_level_list = list(flatten(token_level_list)) - return token_level_list - - t_text, t_ons, t_dur = map(cast_to_token_level, - [text, onsets, durations]) - tensor_tokens = self.tokenizer.encode(tokens_flat, - return_tensors=self.framework) - output = self.model(tensor_tokens) - output = [out.detach().numpy() if self.framework == 'pt' else - out.numpy() for out in output] + super(PretrainedBertEncodingExtractor, self).__init__(pretrained_model_or_path, tokenizer, + framework, model_class='BertModel') + def _postprocess(self, preds, t_tok, t_wds, t_ons, t_dur): if self.encoding_level == 'token': - encoded_tokens = tokens_flat - encodings = output[0][:, 1:-1, :].squeeze() - + out = preds[0][:, 1:-1, :].squeeze() elif self.encoding_level == 'sequence': - encoded_tokens = [' 
'.join(text)] - t_text = ['None'] + t_tok = [' '.join(t_wds)] + t_wds = ['None'] if not any(val is None for val in [t_ons[-1],t_dur[-1],t_ons[0]]): t_dur = t_ons[-1] + t_dur[-1] - t_ons[0] else: t_dur = None t_ons = t_ons[0] - if self.pooling: pooling_function = getattr(np, self.pooling) - encodings = pooling_function(output[0][:, 1:-1, :], - axis=1, keepdims=True) + out = pooling_function(preds[0][:, 1:-1, :], axis=1, keepdims=True) else: - encodings = output[1] - - data = [encodings.tolist(), encoded_tokens, t_text] - features = ['encoding', 'token', 'word'] - - return ExtractorResult(data, stims, self, - features=features, - onsets=t_ons, durations=t_dur) - - def _to_df(self, result, model_attributes=True): - df_dict = dict(zip(result.features, result._data)) - - if model_attributes: - log_dict = {attr: getattr(result.extractor, attr) for - attr in ['pretrained_model', 'encoding_level', - 'pooling', 'framework', 'tokenizer_type']} - df_dict.update(log_dict) - - result_df = pd.DataFrame(df_dict) - result_df['object_id'] = range(result_df.shape[0]) - - return result_df + out = preds[1] + return out, t_tok, t_wds, t_ons, t_dur + def _get_model_attributes(self): + return ['pretrained_model', 'encoding_level', + 'pooling', 'framework', 'tokenizer_type'] class WordCounterExtractor(ComplexTextExtractor): @@ -575,4 +612,3 @@ def _extract(self, stims): return ExtractorResult(word_counter, stims, self, features=self.features, onsets=onsets, durations=durations) - From 1ddc26b3cfc16daae7cfdde9a730be33b8b690c0 Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 2 Mar 2020 16:15:12 -0600 Subject: [PATCH 02/89] fix structure and add LM extractor --- pliers/extractors/text.py | 282 ++++++++++++++++++++++++++------------ 1 file changed, 196 insertions(+), 86 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index a3481024..2c69959a 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -402,12 +402,13 @@ def _extract(self, stim): features=self.features, orders=order_list) -class PretrainedBertExtractor(ComplexTextExtractor): +class BertExtractor(ComplexTextExtractor): ''' Base class for all Extractors based on pretrained BERT. + This model returns the last hidden layer (including special tokens) Args: - pretrained_model_or_path (str): A string specifying which BERT + pretrained_model (str): A string specifying which BERT model to use. Can be one of pretrained BERT models listed at https://huggingface.co/transformers/pretrained_models.html (valid values include all the models with 'bert' prefix) @@ -419,16 +420,17 @@ class PretrainedBertExtractor(ComplexTextExtractor): 'BertModel' or 'BertForLM'. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. + model_kwargs (dict): Named arguments for transformer model. + See https://huggingface.co/transformers/main_classes/model.html tokenizer_kwargs (dict): Named arguments for tokenizer. See https://huggingface.co/transformers/main_classes/tokenizer.html - for further info. 
''' - _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'model_class', 'model_kwargs', 'tokenizer_kwargs') + _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', + 'model_class', 'model_kwargs', 'tokenizer_kwargs') def __init__(self, - pretrained_model_or_path='bert-base-uncased', + pretrained_model='bert-base-uncased', tokenizer='bert-base-uncased', model_class='BertModel', framework='pt', @@ -446,74 +448,68 @@ def __init__(self, if tokenizer_kwargs is None: tokenizer_kwargs = {} - self.pretrained_model = pretrained_model_or_path + self.pretrained_model = pretrained_model self.tokenizer_type = tokenizer self.framework = framework self.tokenizer_kwargs = tokenizer_kwargs model = model_class if self.framework == 'pt' else 'TF' + model_class self.model = getattr(transformers, model).from_pretrained( - pretrained_model_or_path, **model_kwargs) + pretrained_model, **model_kwargs) self.tokenizer = transformers.BertTokenizer.from_pretrained( tokenizer, **tokenizer_kwargs) - super(PretrainedBertExtractor, self).__init__() + super(BertExtractor, self).__init__() def _mask(self, wds): return wds def _preprocess(self, stims): - txt, ons, dur = zip(*[(s.text, s.onset, s.duration) - for s in stims.elements]) - wds = self._mask(txt) - t_tok = [self.tokenizer.tokenize(w) for w in wds] - t_nr = [len(t) for t in t_tok] - t_tok = list(flatten(t_tok)) - t_wds, t_ons, t_dur = map(lambda x: np.repeat(x, t_nr), [wds, ons, dur]) - t_idx = self.tokenizer.encode(t_tok, return_tensors=self.framework) - return t_wds, t_ons, t_dur, t_tok, t_idx - - def _postprocess(self, preds, t_tok, t_wds, t_ons, t_dur): - out = preds[0].squeeze() - t_tok = ['CLS'] + t_tok + ['SEP'] - t_wds = ['CLS'] + t_wds + ['SEP'] - return out, t_tok, t_wds, t_ons, t_dur - - def _get_feature_names(self): #needed? + wds, ons, dur = zip(*[(s.text, s.onset, s.duration) for s in stims.elements]) + tok = [self.tokenizer.tokenize(w) for w in self._mask(wds)] + n_tok = [len(t) for t in tok] + wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) + tok = list(flatten(tok)) + idx = self.tokenizer.encode(tok, return_tensors=self.framework) + return wds, ons, dur, tok, idx + + def _postprocess(self, preds, tok, wds, ons, dur): + out = preds[0].numpy().squeeze() + data = [out.tolist(), tok, wds] + return data, ons, dur + + def _get_feature_names(self): return ['encoding', 'token', 'word'] def _extract(self, stims): - t_wds, t_ons, t_dur, t_tok, t_idx = self._preprocess(stims) - preds = self.model(t_idx) - preds = [p.detach().numpy() if self.framework == 'pt' else - p.numpy() for p in preds] - out, t_tok, t_wds, t_ons, t_dur = self._postprocess(preds, t_tok, t_wds, t_ons, t_dur) - data = [out.tolist(), t_tok, t_wds] + wds, ons, dur, tok, idx = self._preprocess(stims) + preds = self.model(idx) + preds = [p.detach() if self.framework == 'pt' else p for p in preds] + preds = [p.numpy().squeeze() for p in preds] + data, ons, dur = self._postprocess(preds, tok, wds, ons, dur) features = self._get_feature_names() - return ExtractorResult(data, stims, self, features=features, - onsets=t_ons, durations=t_dur) + return ExtractorResult(data, stims, self, + features=features, onsets=ons, durations=dur) - def _get_model_attributes(self): #needed? 
+ def _get_model_attributes(self): return ['pretrained_model', 'framework', 'model_class', - 'tokenizer_type'] + 'tokenizer_type'] def _to_df(self, result, include_attributes=True): - df_dict = dict(zip(result.features, result._data)) + res_dict = dict(zip(result.features, result._data)) if include_attributes: log_dict = {attr: getattr(result.extractor, attr) for attr in self._get_model_attributes()} - df_dict.update(log_dict) - result_df = pd.DataFrame(df_dict) - result_df['object_id'] = range(result_df.shape[0]) - return result_df - + res_df = pd.DataFrame(res_dict.update(log_dict)) + res_df['object_id'] = range(res_df.shape[0]) + return res_df -class PretrainedBertEncodingExtractor(PretrainedBertExtractor): - ''' Uses transformers library to extract contextualized encodings for words - or text sequences using pre-trained BERT models. +class BertSequenceEncodingExtractor(BertExtractor): + ''' Extract contextualized encodings for words or sequences using + pretrained BertModel. Args: - pretrained_model_or_path (str): A string specifying which BERT + pretrained_model (str): A string specifying which BERT model to use. Can be one of pretrained BERT models listed at https://huggingface.co/transformers/pretrained_models.html (valid values include all the models with 'bert' prefix) @@ -523,14 +519,11 @@ class PretrainedBertEncodingExtractor(PretrainedBertExtractor): unknown tokens. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. - encoding_level (str): A string specifying whether one encoding per token - ('token') or a single encoding for the input sequence ('sequence') - are to be returned. - pooling (str): Optional argument, relevant for sequence-level - embeddings only. If None and encoding_level='sequence', encodings - for [CLS] tokens are returned. If encoding_level='sequence' and - numpy function is specified, token-level embeddings are pooled - according to specified method (e.g. 'mean', 'max', 'min'). + pooling (str): defines whether to return encoding for [CLS] token + (None, default), or the numpy function to use to pool token-level + encodings. + return_sep (bool): defines whether to return encoding for the [SEP] + token. model_kwargs (dict): Named arguments for pretrained model. See: https://huggingface.co/transformers/main_classes/model.html and https://huggingface.co/transformers/model_doc/bert.html for @@ -540,48 +533,166 @@ class PretrainedBertEncodingExtractor(PretrainedBertExtractor): for further info. 
''' - _log_attributes = ('pretrained_model', 'encoding_level', - 'pooling', 'framework', 'tokenizer_type', - 'model_kwargs', 'tokenizer_kwargs') + _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', + 'pooling', 'return_sep', 'model_class', 'model_kwargs', + 'tokenizer_kwargs') def __init__(self, - pretrained_model_or_path='bert-base-uncased', + pretrained_model='bert-base-uncased', tokenizer='bert-base-uncased', framework='pt', - encoding_level='token', pooling=None, + return_sep=False, model_kwargs=None, tokenizer_kwargs=None): - if encoding_level not in ['token', 'sequence']: - raise(ValueError('''Invalid encoding_level; - must be one of 'token' or 'sequence'.''')) - self.encoding_level = encoding_level + + if pooling: + if return_sep: + raise(ValueError('Pooling and return_seq argument are ' + 'mutually exclusive.')) + try: + getattr(np, pooling) + except: + raise(ValueError('Pooling must be a valid numpy function.')) + + self.return_sep = return_sep self.pooling = pooling - super(PretrainedBertEncodingExtractor, self).__init__(pretrained_model_or_path, tokenizer, - framework, model_class='BertModel') - - def _postprocess(self, preds, t_tok, t_wds, t_ons, t_dur): - if self.encoding_level == 'token': - out = preds[0][:, 1:-1, :].squeeze() - elif self.encoding_level == 'sequence': - t_tok = [' '.join(t_wds)] - t_wds = ['None'] - if not any(val is None for val in [t_ons[-1],t_dur[-1],t_ons[0]]): - t_dur = t_ons[-1] + t_dur[-1] - t_ons[0] - else: - t_dur = None - t_ons = t_ons[0] - if self.pooling: - pooling_function = getattr(np, self.pooling) - out = pooling_function(preds[0][:, 1:-1, :], axis=1, keepdims=True) + super(BertExtractor, self).__init__(pretrained_model, + tokenizer, framework, + pooling, return_sep, + model_class='BertModel') + + def _postprocess(preds, tok, wds, ons, dur): + tok = [' '.join(wds)] + try: + dur = ons[-1] + dur[-1] - ons[0] + except: + dur = None + ons = ons[0] + if self.return_sep: + out = preds[0][:,-1,:] + elif self.pooling: + pool_func = getattr(np, self.pooling) + out = pool_func(preds[0][:, 1:-1, :], axis=1, keepdims=True) + else: + out = preds[1] + data = [out.tolist(), tok] + return data, ons, dur + + def _get_feature_names(self): + return ['encoding', 'sequence'] + + def _get_model_attributes(self): + return ['pretrained_model', 'framework', 'model_class', + 'pooling', 'return_sep', 'tokenizer_type'] + +class BertLMExtractor(BertExtractor): + + ''' Use BERT for masked language model task. + + Args: + pretrained_model (str): A string specifying which BERT + model to use. Can be one of pretrained BERT models listed at + https://huggingface.co/transformers/pretrained_models.html + (valid values include all the models with 'bert' prefix) + or path to custom model. + tokenizer (str): Type of tokenization used in the tokenization step. + If different from model, out-of-vocabulary tokens may be treated as + unknown tokens. + framework (str): name deep learning framework to use. Must be 'pt' + (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. + top_n (int): TBD + mask (int or str or list): TBD + target (str or list): TBD + model_kwargs (dict): Named arguments for pretrained model. + See: https://huggingface.co/transformers/main_classes/model.html + and https://huggingface.co/transformers/model_doc/bert.html for + further info. + tokenizer_kwargs (dict): Named arguments for tokenizer. + See https://huggingface.co/transformers/main_classes/tokenizer.html + for further info. 
+ ''' + + _log_attributes = ('pretrained_model', 'framework', 'top_n', 'mask', 'target', + 'tokenizer_type', 'model_class', 'model_kwargs', 'tokenizer_kwargs') + + def __init__(self, + pretrained_model='bert-base-uncased', + tokenizer='bert-base-uncased', + framework='pt', + top_n=None, + mask=None, + target=None, + model_kwargs=None, + tokenizer_kwargs=None): + self.top_n = top_n + self.mask = listify(mask) + self.target = listify(target) + super(BertExtractor, self).__init__(pretrained_model, + tokenizer, framework, + model_class='BertForMaskedLM') + + def _mask(self, wds): + for m in self.mask: + if type(m) == int: + wds[m] = '[MASK]' + elif type(m) == str: + wds = ['[MASK]' if w == m else w for i, w in enumerate(wds)] else: - out = preds[1] - return out, t_tok, t_wds, t_ons, t_dur + logging.warning(f'{m} is not a valid mask index or string.') + if '[MASK]' not in wds: + raise ValueError('No valid mask tokens.') + return wds + + def _postprocess(preds, tok, wds, ons, dur): + preds_softmax = scipy.special.softmax(preds, axis=-1) + + m_idx = [idx for idx, tok in enumerate(tok) if tok == '[MASK]'] + m_wds, m_ons, m_dur = ([] for i in range(3)) + top_wd, top_score, top_softmax = ([] for i in range(3)) + gold_score, gold_softmax, gold_rank = ([] for i in range(3)) + + top_n = self.top_n or preds.shape[2] + + for i in m_idx: + top_pred = np.argsort(preds[0,i,:], axis=-1)[-top_n:] + g_idx = self.tokenizer.convert_tokens_to_ids(wds[i]) + g_rank = len(np.where(preds[0,i,:] >= preds[0,i,g_idx])[0]) + 1 + + m_wds.append(wds[i]) + m_ons.append(ons[i]) + m_dur.append(dur[i]) + + top_wd.append(self.tokenizer.convert_ids_to_tokens(top_pred)) + top_score.append(preds[0,i,top_pred]) + top_softmax.append(preds_softmax[0,i,top_pred]) + gold_score.append(preds[0,i,g_idx]) + gold_softmax.append(preds_softmax[0,i,g_idx]) + gold_rank.append(g_rank) + + # add target words routine here + + data = [top_wd, top_score, top_softmax, + gold_score, gold_softmax, gold_rank, m_wds] + return data, m_ons, m_dur + + def _get_feature_names(self): + return ['top_wd', 'top_score', 'top_softmax', + 'gold_score', 'gold_softmax', 'gold_rank', 'masked_word'] + def _get_model_attributes(self): - return ['pretrained_model', 'encoding_level', - 'pooling', 'framework', 'tokenizer_type'] - + return ['pretrained_model', 'framework', 'top_n', 'mask', 'target', + 'tokenizer_type', 'model_class'] + +# TO DOs: +# Add routine for tracking target words +# Add option to extract other layers / attention heads from encoding extractor +# Add option to return encodings to LM? +# Softmax-ed or raw scores? +# Move to models? 
+ + class WordCounterExtractor(ComplexTextExtractor): ''' Extracts number of times each unique word has occurred within text @@ -594,7 +705,6 @@ class WordCounterExtractor(ComplexTextExtractor): _log_attributes = ('case_sensitive', 'log_scale') def __init__(self, case_sensitive=False, log_scale=False): - self.log_scale = log_scale self.case_sensitive = case_sensitive self.features = ['log_word_count'] if self.log_scale else ['word_count'] From 05e7670a29fb8355487599f1826e72c1611d333e Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 2 Mar 2020 16:15:43 -0600 Subject: [PATCH 03/89] add one more annotation --- pliers/extractors/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 2c69959a..66c2c9fb 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -691,7 +691,7 @@ def _get_model_attributes(self): # Add option to return encodings to LM? # Softmax-ed or raw scores? # Move to models? - +# Select subset of metrics class WordCounterExtractor(ComplexTextExtractor): From a89ed651c08a92ef7408008fbb9a7179727cbd4f Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 2 Mar 2020 16:38:34 -0600 Subject: [PATCH 04/89] add --- pliers/extractors/text.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 66c2c9fb..b4cd577e 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -692,6 +692,7 @@ def _get_model_attributes(self): # Softmax-ed or raw scores? # Move to models? # Select subset of metrics +# Softmax over whole distribution? class WordCounterExtractor(ComplexTextExtractor): From 20eef0a3074c22cda5ab3bea118803cef6cd5143 Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 2 Mar 2020 17:45:04 -0600 Subject: [PATCH 05/89] fix prediction shape --- pliers/extractors/__init__.py | 7 +++-- pliers/extractors/text.py | 53 +++++++++++++++++++---------------- 2 files changed, 34 insertions(+), 26 deletions(-) diff --git a/pliers/extractors/__init__.py b/pliers/extractors/__init__.py index 18134d71..dd82db8e 100644 --- a/pliers/extractors/__init__.py +++ b/pliers/extractors/__init__.py @@ -63,7 +63,8 @@ NumUniqueWordsExtractor, PartOfSpeechExtractor, WordEmbeddingExtractor, TextVectorizerExtractor, VADERSentimentExtractor, SpaCyExtractor, - WordCounterExtractor, PretrainedBertEncodingExtractor) + WordCounterExtractor, BertExtractor, + BertSequenceEncodingExtractor, BertLMExtractor) from .video import (FarnebackOpticalFlowExtractor) __all__ = [ @@ -138,6 +139,8 @@ 'BeatTrackExtractor', 'HarmonicExtractor', 'PercussiveExtractor', - 'PretrainedBertEncodingExtractor', + 'BertExtractor', + 'BertSequenceEncodingExtractor', + 'BertLMExtractor', 'WordCounterExtractor' ] diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index b4cd577e..f28e2bc3 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -14,6 +14,7 @@ import itertools import numpy as np import pandas as pd +import scipy import nltk from nltk.sentiment.vader import SentimentIntensityAnalyzer import logging @@ -465,6 +466,7 @@ def _mask(self, wds): def _preprocess(self, stims): wds, ons, dur = zip(*[(s.text, s.onset, s.duration) for s in stims.elements]) + wds, ons, dur = map(list, [wds, ons, dur]) tok = [self.tokenizer.tokenize(w) for w in self._mask(wds)] n_tok = [len(t) for t in tok] wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) @@ -484,7 +486,6 @@ def _extract(self, stims): wds, ons, dur, tok, idx = self._preprocess(stims) preds = 
self.model(idx) preds = [p.detach() if self.framework == 'pt' else p for p in preds] - preds = [p.numpy().squeeze() for p in preds] data, ons, dur = self._postprocess(preds, tok, wds, ons, dur) features = self._get_feature_names() return ExtractorResult(data, stims, self, @@ -546,6 +547,10 @@ def __init__(self, model_kwargs=None, tokenizer_kwargs=None): + super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, + tokenizer, framework, model_kwargs, tokenizer_kwargs, + model_class='BertModel') + if pooling: if return_sep: raise(ValueError('Pooling and return_seq argument are ' @@ -557,12 +562,9 @@ def __init__(self, self.return_sep = return_sep self.pooling = pooling - super(BertExtractor, self).__init__(pretrained_model, - tokenizer, framework, - pooling, return_sep, - model_class='BertModel') def _postprocess(preds, tok, wds, ons, dur): + preds = [p.numpy().squeeze() for p in preds] tok = [' '.join(wds)] try: dur = ons[-1] + dur[-1] - ons[0] @@ -614,7 +616,7 @@ class BertLMExtractor(BertExtractor): ''' _log_attributes = ('pretrained_model', 'framework', 'top_n', 'mask', 'target', - 'tokenizer_type', 'model_class', 'model_kwargs', 'tokenizer_kwargs') + 'tokenizer_type') def __init__(self, pretrained_model='bert-base-uncased', @@ -625,12 +627,16 @@ def __init__(self, target=None, model_kwargs=None, tokenizer_kwargs=None): + + super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, + tokenizer=tokenizer, + framework=framework, + model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, + model_class='BertForMaskedLM') self.top_n = top_n self.mask = listify(mask) self.target = listify(target) - super(BertExtractor, self).__init__(pretrained_model, - tokenizer, framework, - model_class='BertForMaskedLM') def _mask(self, wds): for m in self.mask: @@ -644,13 +650,14 @@ def _mask(self, wds): raise ValueError('No valid mask tokens.') return wds - def _postprocess(preds, tok, wds, ons, dur): + def _postprocess(self, preds, tok, wds, ons, dur): + preds = preds[0].numpy() preds_softmax = scipy.special.softmax(preds, axis=-1) m_idx = [idx for idx, tok in enumerate(tok) if tok == '[MASK]'] m_wds, m_ons, m_dur = ([] for i in range(3)) - top_wd, top_score, top_softmax = ([] for i in range(3)) - gold_score, gold_softmax, gold_rank = ([] for i in range(3)) + top_wd, top_softmax = ([] for i in range(2)) + gold_softmax, gold_rank = ([] for i in range(2)) top_n = self.top_n or preds.shape[2] @@ -664,35 +671,33 @@ def _postprocess(preds, tok, wds, ons, dur): m_dur.append(dur[i]) top_wd.append(self.tokenizer.convert_ids_to_tokens(top_pred)) - top_score.append(preds[0,i,top_pred]) top_softmax.append(preds_softmax[0,i,top_pred]) - gold_score.append(preds[0,i,g_idx]) gold_softmax.append(preds_softmax[0,i,g_idx]) gold_rank.append(g_rank) # add target words routine here + # probability + # rank - data = [top_wd, top_score, top_softmax, - gold_score, gold_softmax, gold_rank, m_wds] + data = [top_wd, top_softmax, gold_softmax, gold_rank, m_wds] return data, m_ons, m_dur def _get_feature_names(self): - return ['top_wd', 'top_score', 'top_softmax', - 'gold_score', 'gold_softmax', 'gold_rank', 'masked_word'] + return ['top_wd', 'top_softmax', 'gold_softmax', 'gold_rank', 'masked_word'] def _get_model_attributes(self): return ['pretrained_model', 'framework', 'top_n', 'mask', 'target', - 'tokenizer_type', 'model_class'] + 'tokenizer_type'] # TO DOs: +# fix _to_df # Add routine for tracking target words # Add option to extract other layers / attention heads from encoding extractor -# Add 
option to return encodings to LM? -# Softmax-ed or raw scores? -# Move to models? -# Select subset of metrics -# Softmax over whole distribution? +# Add option to return encodings for LM? +# Softmax-ed or raw scores (softmax over whole dist?), which metrics? +# Fix init +# Move to models class WordCounterExtractor(ComplexTextExtractor): From a0fbb8ffa36711dec3020f4afc436ca09d3d499c Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 3 Mar 2020 12:48:17 -0600 Subject: [PATCH 06/89] start implementing target routine + other fixes --- pliers/extractors/text.py | 77 +++++++++++++++++++++++++-------------- 1 file changed, 50 insertions(+), 27 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index f28e2bc3..bee4951f 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -453,6 +453,9 @@ def __init__(self, self.tokenizer_type = tokenizer self.framework = framework self.tokenizer_kwargs = tokenizer_kwargs + self.model_class = model_class + self.model_kwargs = model_kwargs + self.tokenizer_kwargs = tokenizer_kwargs model = model_class if self.framework == 'pt' else 'TF' + model_class self.model = getattr(transformers, model).from_pretrained( @@ -475,7 +478,7 @@ def _preprocess(self, stims): return wds, ons, dur, tok, idx def _postprocess(self, preds, tok, wds, ons, dur): - out = preds[0].numpy().squeeze() + out = preds[0][:, 1:-1, :].numpy().squeeze() data = [out.tolist(), tok, wds] return data, ons, dur @@ -500,11 +503,11 @@ def _to_df(self, result, include_attributes=True): if include_attributes: log_dict = {attr: getattr(result.extractor, attr) for attr in self._get_model_attributes()} - res_df = pd.DataFrame(res_dict.update(log_dict)) + res_dict.update(log_dict) + res_df = pd.DataFrame(res_dict) res_df['object_id'] = range(res_df.shape[0]) return res_df - class BertSequenceEncodingExtractor(BertExtractor): ''' Extract contextualized encodings for words or sequences using @@ -559,12 +562,11 @@ def __init__(self, getattr(np, pooling) except: raise(ValueError('Pooling must be a valid numpy function.')) - self.return_sep = return_sep self.pooling = pooling - def _postprocess(preds, tok, wds, ons, dur): - preds = [p.numpy().squeeze() for p in preds] + def _postprocess(self, preds, tok, wds, ons, dur): + preds = [p.numpy().squeeze() for p in preds] #check tok = [' '.join(wds)] try: dur = ons[-1] + dur[-1] - ons[0] @@ -590,7 +592,7 @@ def _get_model_attributes(self): class BertLMExtractor(BertExtractor): - ''' Use BERT for masked language model task. + ''' Use BERT for masked words prediction. Args: pretrained_model (str): A string specifying which BERT @@ -603,9 +605,14 @@ class BertLMExtractor(BertExtractor): unknown tokens. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. - top_n (int): TBD - mask (int or str or list): TBD - target (str or list): TBD + top_n (int): Specifies how many of the highest-probability tokens are + to be returned. + mask (int or str or list): Words to be masked (string) or indices of + words in the sequence to be masked (indexing starts at 0). Can + be either a single word/index or a list of words/indices. + target (str or list): Vocabulary token(s) for which probability is to + be returned. Tokens defined in the vocabulary change across + tokenizers. model_kwargs (dict): Named arguments for pretrained model. 
See: https://huggingface.co/transformers/main_classes/model.html and https://huggingface.co/transformers/model_doc/bert.html for @@ -634,35 +641,49 @@ def __init__(self, model_kwargs=model_kwargs, tokenizer_kwargs=tokenizer_kwargs, model_class='BertForMaskedLM') + if top_n and target: + raise ValueError('top_n and target are mutually exclusive arguments') self.top_n = top_n self.mask = listify(mask) self.target = listify(target) + if self.target: + for t in self.target: + if t not in list(self.tokenizer.vocab.keys()): + logging.warning(f'{t} is not in vocabulary. Dropping.') + self.target.remove(t) + if self.target == []: + raise ValueError('No valid target tokens provided. Import ' + 'transformers and run transformers.BertTokenizer.' + f'from_pretrained(\'{tokenizer}\').vocab.keys() to see' + 'which tokens are part of the tokenizer vocabulary.') def _mask(self, wds): + mwds = wds.copy() for m in self.mask: if type(m) == int: - wds[m] = '[MASK]' + mwds[m] = '[MASK]' elif type(m) == str: - wds = ['[MASK]' if w == m else w for i, w in enumerate(wds)] + mwds = ['[MASK]' if w==m else w for i, w in enumerate(mwds)] else: logging.warning(f'{m} is not a valid mask index or string.') - if '[MASK]' not in wds: + if '[MASK]' not in mwds: raise ValueError('No valid mask tokens.') - return wds + return mwds def _postprocess(self, preds, tok, wds, ons, dur): - preds = preds[0].numpy() + preds = preds[0].numpy()[:,1:-1,:] preds_softmax = scipy.special.softmax(preds, axis=-1) - m_idx = [idx for idx, tok in enumerate(tok) if tok == '[MASK]'] m_wds, m_ons, m_dur = ([] for i in range(3)) top_wd, top_softmax = ([] for i in range(2)) gold_softmax, gold_rank = ([] for i in range(2)) top_n = self.top_n or preds.shape[2] - + for i in m_idx: - top_pred = np.argsort(preds[0,i,:], axis=-1)[-top_n:] + sorted_idx = preds[0,i,:].argsort(axis=-1) + + top_idx = np.flip(sorted_idx[-top_n:]) g_idx = self.tokenizer.convert_tokens_to_ids(wds[i]) g_rank = len(np.where(preds[0,i,:] >= preds[0,i,g_idx])[0]) + 1 @@ -670,32 +691,34 @@ def _postprocess(self, preds, tok, wds, ons, dur): m_ons.append(ons[i]) m_dur.append(dur[i]) - top_wd.append(self.tokenizer.convert_ids_to_tokens(top_pred)) - top_softmax.append(preds_softmax[0,i,top_pred]) - + top_wd.append(self.tokenizer.convert_ids_to_tokens(top_idx)) + top_softmax.append([preds_softmax[0,i,t] for t in top_idx]) gold_softmax.append(preds_softmax[0,i,g_idx]) gold_rank.append(g_rank) + + # add target words routine here # probability # rank - data = [top_wd, top_softmax, gold_softmax, gold_rank, m_wds] + seq = ' '.join(tok) + data = [top_wd, top_softmax, gold_softmax, gold_rank, m_wds, seq] return data, m_ons, m_dur def _get_feature_names(self): - return ['top_wd', 'top_softmax', 'gold_softmax', 'gold_rank', 'masked_word'] + return ['top_wd', 'top_softmax', 'gold_softmax', 'gold_rank', + 'masked_word', 'sequence'] def _get_model_attributes(self): return ['pretrained_model', 'framework', 'top_n', 'mask', 'target', 'tokenizer_type'] # TO DOs: -# fix _to_df # Add routine for tracking target words -# Add option to extract other layers / attention heads from encoding extractor -# Add option to return encodings for LM? -# Softmax-ed or raw scores (softmax over whole dist?), which metrics? +# Add option to extract other layers/attention heads from BertExtractor +# Add option to return encodings? +# Softmax-ed or raw scores (softmax over whole dist?), which metrics? Rank over vocab size? 
# Fix init # Move to models From 27f996a2f72faf8cc5a83890b74494cbade04752 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 3 Mar 2020 14:19:04 -0600 Subject: [PATCH 07/89] add softmax as option --- pliers/extractors/text.py | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index bee4951f..dbc5c59d 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -613,6 +613,8 @@ class BertLMExtractor(BertExtractor): target (str or list): Vocabulary token(s) for which probability is to be returned. Tokens defined in the vocabulary change across tokenizers. + return_softmax (bool): if True, returns probability scores instead of + raw predictions scores for language modeling. model_kwargs (dict): Named arguments for pretrained model. See: https://huggingface.co/transformers/main_classes/model.html and https://huggingface.co/transformers/model_doc/bert.html for @@ -643,6 +645,7 @@ def __init__(self, model_class='BertForMaskedLM') if top_n and target: raise ValueError('top_n and target are mutually exclusive arguments') + self.return_softmax = return_softmax self.top_n = top_n self.mask = listify(mask) self.target = listify(target) @@ -672,42 +675,44 @@ def _mask(self, wds): def _postprocess(self, preds, tok, wds, ons, dur): preds = preds[0].numpy()[:,1:-1,:] - preds_softmax = scipy.special.softmax(preds, axis=-1) - m_idx = [idx for idx, tok in enumerate(tok) if tok == '[MASK]'] + if self.return_softmax: + preds = scipy.special.softmax(preds, axis=-1) + if self.target: + target_ids = self.tokenizer.convert_tokens_to_ids(self.target) + preds = preds[:,:,target_ids] + m_idx = [idx for idx, tok in enumerate(tok) if tok == '[MASK]'] # edit so to inherit index from elsewhere m_wds, m_ons, m_dur = ([] for i in range(3)) - top_wd, top_softmax = ([] for i in range(2)) - gold_softmax, gold_rank = ([] for i in range(2)) - + top_wd, top_scores = ([] for i in range(2)) + gold_scores, gold_rank = ([] for i in range(2)) top_n = self.top_n or preds.shape[2] for i in m_idx: - sorted_idx = preds[0,i,:].argsort(axis=-1) + sorted_idx = preds[0,i,:].argsort(axis=-1) top_idx = np.flip(sorted_idx[-top_n:]) - g_idx = self.tokenizer.convert_tokens_to_ids(wds[i]) - g_rank = len(np.where(preds[0,i,:] >= preds[0,i,g_idx])[0]) + 1 m_wds.append(wds[i]) m_ons.append(ons[i]) m_dur.append(dur[i]) + + g_idx = self.tokenizer.convert_tokens_to_ids(wds[i]) + g_rank = len(np.where(preds[0,i,:] >= preds[0,i,g_idx])[0]) + 1 top_wd.append(self.tokenizer.convert_ids_to_tokens(top_idx)) - top_softmax.append([preds_softmax[0,i,t] for t in top_idx]) - gold_softmax.append(preds_softmax[0,i,g_idx]) + top_scores.append([preds[0,i,t] for t in top_idx]) + gold_scores.append(preds[0,i,g_idx]) gold_rank.append(g_rank) - - # add target words routine here # probability # rank seq = ' '.join(tok) - data = [top_wd, top_softmax, gold_softmax, gold_rank, m_wds, seq] + data = [top_wd, top_scores, gold_scores, gold_rank, m_wds, seq] return data, m_ons, m_dur def _get_feature_names(self): - return ['top_wd', 'top_softmax', 'gold_softmax', 'gold_rank', + return ['top_wd', 'top_scores', 'gold_scores', 'gold_rank', 'masked_word', 'sequence'] def _get_model_attributes(self): @@ -716,11 +721,11 @@ def _get_model_attributes(self): # TO DOs: # Add routine for tracking target words +# Output as array or columns? # Add option to extract other layers/attention heads from BertExtractor # Add option to return encodings? 
-# Softmax-ed or raw scores (softmax over whole dist?), which metrics? Rank over vocab size? +# Which metrics? Rank over vocab size? # Fix init -# Move to models class WordCounterExtractor(ComplexTextExtractor): From ac01e154f49924817ac851577aed4b34304e22b2 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 3 Mar 2020 14:49:39 -0600 Subject: [PATCH 08/89] only allow one mask --- pliers/extractors/text.py | 66 +++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index dbc5c59d..74f20356 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -607,9 +607,11 @@ class BertLMExtractor(BertExtractor): (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. top_n (int): Specifies how many of the highest-probability tokens are to be returned. - mask (int or str or list): Words to be masked (string) or indices of + mask (int or str): Words to be masked (string) or indices of words in the sequence to be masked (indexing starts at 0). Can be either a single word/index or a list of words/indices. + If str is passed and more than one word in the input matches the + string, only the first one is masked. target (str or list): Vocabulary token(s) for which probability is to be returned. Tokens defined in the vocabulary change across tokenizers. @@ -625,15 +627,16 @@ class BertLMExtractor(BertExtractor): ''' _log_attributes = ('pretrained_model', 'framework', 'top_n', 'mask', 'target', - 'tokenizer_type') + 'tokenizer_type', 'return_softmax') def __init__(self, pretrained_model='bert-base-uncased', tokenizer='bert-base-uncased', framework='pt', top_n=None, - mask=None, + mask='[MASK]', target=None, + return_softmax=False, model_kwargs=None, tokenizer_kwargs=None): @@ -644,10 +647,13 @@ def __init__(self, tokenizer_kwargs=tokenizer_kwargs, model_class='BertForMaskedLM') if top_n and target: - raise ValueError('top_n and target are mutually exclusive arguments') + raise ValueError('top_n and target are mutually exclusive') self.return_softmax = return_softmax self.top_n = top_n - self.mask = listify(mask) + self.mask = mask + if type(self.mask) not in [int, str]: + raise ValueError('mask arguments must be an integer or a string' + f' ({type(self.mask)} passed).') self.target = listify(target) if self.target: for t in self.target: @@ -662,13 +668,13 @@ def __init__(self, def _mask(self, wds): mwds = wds.copy() - for m in self.mask: - if type(m) == int: - mwds[m] = '[MASK]' - elif type(m) == str: - mwds = ['[MASK]' if w==m else w for i, w in enumerate(mwds)] - else: - logging.warning(f'{m} is not a valid mask index or string.') + if type(self.mask) == int: + mwds[self.mask] = '[MASK]' + elif type(self.mask) == str: + w_idx = np.where(np.array(mwds)==self.mask)[0][0] + mwds[w_idx] = '[MASK]' + else: + logging.warning(f'{self.mask} is not a valid mask value') if '[MASK]' not in mwds: raise ValueError('No valid mask tokens.') return mwds @@ -680,40 +686,26 @@ def _postprocess(self, preds, tok, wds, ons, dur): if self.target: target_ids = self.tokenizer.convert_tokens_to_ids(self.target) preds = preds[:,:,target_ids] - m_idx = [idx for idx, tok in enumerate(tok) if tok == '[MASK]'] # edit so to inherit index from elsewhere + m_wds, m_ons, m_dur = ([] for i in range(3)) top_wd, top_scores = ([] for i in range(2)) - gold_scores, gold_rank = ([] for i in range(2)) top_n = self.top_n or preds.shape[2] - - for i in m_idx: - - sorted_idx = preds[0,i,:].argsort(axis=-1) - top_idx = np.flip(sorted_idx[-top_n:]) 
- - m_wds.append(wds[i]) - m_ons.append(ons[i]) - m_dur.append(dur[i]) - - - g_idx = self.tokenizer.convert_tokens_to_ids(wds[i]) - g_rank = len(np.where(preds[0,i,:] >= preds[0,i,g_idx])[0]) + 1 - top_wd.append(self.tokenizer.convert_ids_to_tokens(top_idx)) - top_scores.append([preds[0,i,t] for t in top_idx]) - gold_scores.append(preds[0,i,g_idx]) - gold_rank.append(g_rank) - # add target words routine here - # probability - # rank + m_idx = [i for i, t in enumerate(tok) if t=='[MASK]'] + sorted_idx = preds[0,m_idx,:].argsort(axis=-1) + top_idx = np.flip(sorted_idx[-top_n:]) + m_wds.append(wds[m_idx]) + m_ons.append(ons[m_idx]) + m_dur.append(dur[m_idx]) + top_wd.append(self.tokenizer.convert_ids_to_tokens(top_idx)) + top_scores.append([preds[0,m_idx,t] for t in top_idx]) seq = ' '.join(tok) - data = [top_wd, top_scores, gold_scores, gold_rank, m_wds, seq] + data = [top_wd, top_scores, m_wds, seq] return data, m_ons, m_dur def _get_feature_names(self): - return ['top_wd', 'top_scores', 'gold_scores', 'gold_rank', - 'masked_word', 'sequence'] + return ['top_wd', 'top_scores', 'masked_word', 'sequence'] def _get_model_attributes(self): return ['pretrained_model', 'framework', 'top_n', 'mask', 'target', From 597835c0f2109ddde91c0e9befe7be6b7cc073d6 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 3 Mar 2020 16:58:12 -0600 Subject: [PATCH 09/89] add threshold option, refine target tokens option (both mutually exclusive w/ top_n) --- pliers/extractors/text.py | 117 +++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 58 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 74f20356..8bc42bfa 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -479,20 +479,17 @@ def _preprocess(self, stims): def _postprocess(self, preds, tok, wds, ons, dur): out = preds[0][:, 1:-1, :].numpy().squeeze() - data = [out.tolist(), tok, wds] - return data, ons, dur - - def _get_feature_names(self): - return ['encoding', 'token', 'word'] + data = [out.tolist(), tok, wds] + feat = ['encoding', 'token', 'word'] + return data, feat, ons, dur def _extract(self, stims): wds, ons, dur, tok, idx = self._preprocess(stims) preds = self.model(idx) preds = [p.detach() if self.framework == 'pt' else p for p in preds] - data, ons, dur = self._postprocess(preds, tok, wds, ons, dur) - features = self._get_feature_names() + data, feat, ons, dur = self._postprocess(preds, tok, wds, ons, dur) return ExtractorResult(data, stims, self, - features=features, onsets=ons, durations=dur) + features=feat, onsets=ons, durations=dur) def _get_model_attributes(self): return ['pretrained_model', 'framework', 'model_class', @@ -504,6 +501,7 @@ def _to_df(self, result, include_attributes=True): log_dict = {attr: getattr(result.extractor, attr) for attr in self._get_model_attributes()} res_dict.update(log_dict) + # include sequence? res_df = pd.DataFrame(res_dict) res_df['object_id'] = range(res_df.shape[0]) return res_df @@ -580,11 +578,9 @@ def _postprocess(self, preds, tok, wds, ons, dur): out = pool_func(preds[0][:, 1:-1, :], axis=1, keepdims=True) else: out = preds[1] - data = [out.tolist(), tok] - return data, ons, dur - - def _get_feature_names(self): - return ['encoding', 'sequence'] + data = [out.tolist(), tok] + feat = ['encoding', 'sequence'] + return data, feat, ons, dur def _get_model_attributes(self): return ['pretrained_model', 'framework', 'model_class', @@ -626,16 +622,17 @@ class BertLMExtractor(BertExtractor): for further info. 
''' - _log_attributes = ('pretrained_model', 'framework', 'top_n', 'mask', 'target', - 'tokenizer_type', 'return_softmax') + _log_attributes = ('pretrained_model', 'framework', 'top_n', 'mask_pos', + 'mask_token', 'target', 'tokenizer_type', 'return_softmax') def __init__(self, pretrained_model='bert-base-uncased', tokenizer='bert-base-uncased', framework='pt', - top_n=None, + top_n=100, mask='[MASK]', target=None, + threshold=None, return_softmax=False, model_kwargs=None, tokenizer_kwargs=None): @@ -648,12 +645,19 @@ def __init__(self, model_class='BertForMaskedLM') if top_n and target: raise ValueError('top_n and target are mutually exclusive') - self.return_softmax = return_softmax + if top_n and threshold: + raise ValueError('top_n and threshold are mutually exclusive') self.top_n = top_n - self.mask = mask - if type(self.mask) not in [int, str]: - raise ValueError('mask arguments must be an integer or a string' - f' ({type(self.mask)} passed).') + + if type(mask) == int: + self.mask_pos = mask + self.mask_token = None + if type(mask) == str: + self.mask_pos = None + self.mask_token = mask + else: + raise ValueError('mask argument must be an integer or a string') + self.target = listify(target) if self.target: for t in self.target: @@ -666,58 +670,55 @@ def __init__(self, f'from_pretrained(\'{tokenizer}\').vocab.keys() to see' 'which tokens are part of the tokenizer vocabulary.') + self.return_softmax = return_softmax + self.threshold = threshold + def _mask(self, wds): mwds = wds.copy() - if type(self.mask) == int: - mwds[self.mask] = '[MASK]' - elif type(self.mask) == str: - w_idx = np.where(np.array(mwds)==self.mask)[0][0] + if self.mask_pos: + mwds[self.mask_pos] = '[MASK]' + self.mask_token = wds[self.mask_pos] + elif self.mask_token: + w_idx = np.where(np.array(mwds)==self.mask_token)[0][0] mwds[w_idx] = '[MASK]' - else: - logging.warning(f'{self.mask} is not a valid mask value') - if '[MASK]' not in mwds: - raise ValueError('No valid mask tokens.') + self.mask_pos = w_idx + nr_masks = len(np.where(np.array(mwds)=='[MASK]')[0]) + if nr_masks == 0: + raise ValueError('No valid mask tokens found.') + elif nr_masks > 1: + raise ValueError('Too many masked items.') return mwds def _postprocess(self, preds, tok, wds, ons, dur): preds = preds[0].numpy()[:,1:-1,:] + if self.return_softmax: preds = scipy.special.softmax(preds, axis=-1) + + out_idx = preds[0,self.mask_pos,:].argsort(axis=-1)[::-1] if self.target: - target_ids = self.tokenizer.convert_tokens_to_ids(self.target) - preds = preds[:,:,target_ids] + target_idx = self.tokenizer.convert_tokens_to_ids(self.target) + out_idx = list(set(out_idx) & set(target_idx)) + elif self.top_n: + out_idx = out_idx[:self.top_n] + if self.threshold: + th_idx = np.where(preds[0,self.mask_pos,:] > self.threshold)[0] + out_idx = list(set(out_idx) & set(th_idx)) + out_wds = self.tokenizer.convert_ids_to_tokens(out_idx) + out_scores = preds[0,self.mask_pos,out_idx] - m_wds, m_ons, m_dur = ([] for i in range(3)) - top_wd, top_scores = ([] for i in range(2)) - top_n = self.top_n or preds.shape[2] - - m_idx = [i for i, t in enumerate(tok) if t=='[MASK]'] - sorted_idx = preds[0,m_idx,:].argsort(axis=-1) - top_idx = np.flip(sorted_idx[-top_n:]) - m_wds.append(wds[m_idx]) - m_ons.append(ons[m_idx]) - m_dur.append(dur[m_idx]) - top_wd.append(self.tokenizer.convert_ids_to_tokens(top_idx)) - top_scores.append([preds[0,m_idx,t] for t in top_idx]) - - seq = ' '.join(tok) - data = [top_wd, top_scores, m_wds, seq] - return data, m_ons, m_dur - - def 
_get_feature_names(self): - return ['top_wd', 'top_scores', 'masked_word', 'sequence'] + data = out_scores + return data, out_wds, listify(ons[self.mask_pos]), listify(dur[self.mask_pos]) def _get_model_attributes(self): - return ['pretrained_model', 'framework', 'top_n', 'mask', 'target', - 'tokenizer_type'] + return ['pretrained_model', 'framework', 'top_n', 'mask_idx', + 'target', 'mask_token', 'tokenizer_type'] -# TO DOs: -# Add routine for tracking target words -# Output as array or columns? +# TO DO: +# Add option to support additional metadata to to_df +# Check output as columns and init # Add option to extract other layers/attention heads from BertExtractor -# Add option to return encodings? -# Which metrics? Rank over vocab size? -# Fix init +# Add option to return encodings from LM or SentModels class WordCounterExtractor(ComplexTextExtractor): From 60b91f0014396b1bb8d9dca1d86000df4407cd8b Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 3 Mar 2020 17:13:36 -0600 Subject: [PATCH 10/89] edit docstring --- pliers/extractors/text.py | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 8bc42bfa..b501862d 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -443,19 +443,12 @@ def __init__(self, if framework not in ['pt', 'tf']: raise(ValueError('''Invalid framework; must be one of 'pt' (pytorch) or 'tf' (tensorflow)''')) - - if model_kwargs is None: - model_kwargs = {} - if tokenizer_kwargs is None: - tokenizer_kwargs = {} - self.pretrained_model = pretrained_model self.tokenizer_type = tokenizer self.framework = framework - self.tokenizer_kwargs = tokenizer_kwargs self.model_class = model_class - self.model_kwargs = model_kwargs - self.tokenizer_kwargs = tokenizer_kwargs + self.model_kwargs = model_kwargs if model_kwargs else {} + self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} model = model_class if self.framework == 'pt' else 'TF' + model_class self.model = getattr(transformers, model).from_pretrained( @@ -528,11 +521,9 @@ class BertSequenceEncodingExtractor(BertExtractor): token. model_kwargs (dict): Named arguments for pretrained model. See: https://huggingface.co/transformers/main_classes/model.html - and https://huggingface.co/transformers/model_doc/bert.html for - further info. + and https://huggingface.co/transformers/model_doc/bert.html tokenizer_kwargs (dict): Named arguments for tokenizer. See https://huggingface.co/transformers/main_classes/tokenizer.html - for further info. ''' _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', @@ -602,7 +593,7 @@ class BertLMExtractor(BertExtractor): framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. top_n (int): Specifies how many of the highest-probability tokens are - to be returned. + to be returned. Mutually exclusive with target and threshold. mask (int or str): Words to be masked (string) or indices of words in the sequence to be masked (indexing starts at 0). Can be either a single word/index or a list of words/indices. @@ -611,15 +602,15 @@ class BertLMExtractor(BertExtractor): target (str or list): Vocabulary token(s) for which probability is to be returned. Tokens defined in the vocabulary change across tokenizers. + threshold (float): If defined, only values above this threshold will + be returned. Mutually exclusive with top_n. 
return_softmax (bool): if True, returns probability scores instead of raw predictions scores for language modeling. model_kwargs (dict): Named arguments for pretrained model. See: https://huggingface.co/transformers/main_classes/model.html - and https://huggingface.co/transformers/model_doc/bert.html for - further info. + and https://huggingface.co/transformers/model_doc/bert.html. tokenizer_kwargs (dict): Named arguments for tokenizer. - See https://huggingface.co/transformers/main_classes/tokenizer.html - for further info. + See https://huggingface.co/transformers/main_classes/tokenizer.html. ''' _log_attributes = ('pretrained_model', 'framework', 'top_n', 'mask_pos', @@ -629,10 +620,10 @@ def __init__(self, pretrained_model='bert-base-uncased', tokenizer='bert-base-uncased', framework='pt', - top_n=100, mask='[MASK]', - target=None, + top_n=100, threshold=None, + target=None, return_softmax=False, model_kwargs=None, tokenizer_kwargs=None): From b4fbb45d3552712b61e154ee78bb94af44c6de86 Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 6 Mar 2020 16:37:40 -0600 Subject: [PATCH 11/89] allow keep info on true word --- pliers/extractors/text.py | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index b501862d..fc21c875 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -406,7 +406,7 @@ def _extract(self, stim): class BertExtractor(ComplexTextExtractor): ''' Base class for all Extractors based on pretrained BERT. - This model returns the last hidden layer (including special tokens) + This model returns the last hidden layer (wihtout special tokens) Args: pretrained_model (str): A string specifying which BERT @@ -570,15 +570,15 @@ def _postprocess(self, preds, tok, wds, ons, dur): else: out = preds[1] data = [out.tolist(), tok] - feat = ['encoding', 'sequence'] + feat = ['encoding', 'sequence'] return data, feat, ons, dur def _get_model_attributes(self): return ['pretrained_model', 'framework', 'model_class', 'pooling', 'return_sep', 'tokenizer_type'] -class BertLMExtractor(BertExtractor): +class BertLMExtractor(BertExtractor): ''' Use BERT for masked words prediction. Args: @@ -606,6 +606,8 @@ class BertLMExtractor(BertExtractor): be returned. Mutually exclusive with top_n. return_softmax (bool): if True, returns probability scores instead of raw predictions scores for language modeling. + return_true (bool): if True, returns masked word (if defined in the + tokenizer dictionary) and its probability. model_kwargs (dict): Named arguments for pretrained model. See: https://huggingface.co/transformers/main_classes/model.html and https://huggingface.co/transformers/model_doc/bert.html. 
@@ -625,6 +627,7 @@ def __init__(self, threshold=None, target=None, return_softmax=False, + return_true=False, model_kwargs=None, tokenizer_kwargs=None): @@ -663,6 +666,7 @@ def __init__(self, self.return_softmax = return_softmax self.threshold = threshold + self.return_true = return_true def _mask(self, wds): mwds = wds.copy() @@ -693,23 +697,29 @@ def _postprocess(self, preds, tok, wds, ons, dur): elif self.top_n: out_idx = out_idx[:self.top_n] if self.threshold: - th_idx = np.where(preds[0,self.mask_pos,:] > self.threshold)[0] - out_idx = list(set(out_idx) & set(th_idx)) - out_wds = self.tokenizer.convert_ids_to_tokens(out_idx) - out_scores = preds[0,self.mask_pos,out_idx] - - data = out_scores - return data, out_wds, listify(ons[self.mask_pos]), listify(dur[self.mask_pos]) + thr_idx = np.where(preds[0,self.mask_pos,:] > self.threshold)[0] + out_idx = list(set(out_idx) & set(thr_idx)) + feats = self.tokenizer.convert_ids_to_tokens(out_idx) + data = preds[0,self.mask_pos,out_idx] + if self.return_true: + if self.mask_token in self.tokenizer.vocab: + true_vocab_idx = self.tokenizer.vocab[self.mask_token] + true_score = preds[0, self.mask_pos, true_vocab_idx] + else: + true_vocab_idx, true_score = ('true_word', np.nan) + feats += ['true_word', 'true_word_score'] + data += [self.mask_token, true_score] + + return data, feats, listify(ons[self.mask_pos]), \ + listify(dur[self.mask_pos]) def _get_model_attributes(self): return ['pretrained_model', 'framework', 'top_n', 'mask_idx', 'target', 'mask_token', 'tokenizer_type'] # TO DO: -# Add option to support additional metadata to to_df -# Check output as columns and init -# Add option to extract other layers/attention heads from BertExtractor -# Add option to return encodings from LM or SentModels +# Target words called in extract + class WordCounterExtractor(ComplexTextExtractor): From c66dfb0e5691755532ce19bdf500af89e1d13487 Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 6 Mar 2020 17:48:17 -0600 Subject: [PATCH 12/89] move mask specification to extract --- pliers/extractors/text.py | 39 ++++++++++++++------------------------- 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index fc21c875..ff52d00e 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -457,13 +457,13 @@ def __init__(self, tokenizer, **tokenizer_kwargs) super(BertExtractor, self).__init__() - def _mask(self, wds): + def _mask(self, wds, mask): return wds - def _preprocess(self, stims): + def _preprocess(self, stims, mask): wds, ons, dur = zip(*[(s.text, s.onset, s.duration) for s in stims.elements]) wds, ons, dur = map(list, [wds, ons, dur]) - tok = [self.tokenizer.tokenize(w) for w in self._mask(wds)] + tok = [self.tokenizer.tokenize(w) for w in self._mask(wds, mask)] n_tok = [len(t) for t in tok] wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) tok = list(flatten(tok)) @@ -476,8 +476,9 @@ def _postprocess(self, preds, tok, wds, ons, dur): feat = ['encoding', 'token', 'word'] return data, feat, ons, dur - def _extract(self, stims): - wds, ons, dur, tok, idx = self._preprocess(stims) + def _extract(self, stims, **kwargs): + mask = kwargs['mask'] if 'mask' in kwargs else None + wds, ons, dur, tok, idx = self._preprocess(stims, mask) preds = self.model(idx) preds = [p.detach() if self.framework == 'pt' else p for p in preds] data, feat, ons, dur = self._postprocess(preds, tok, wds, ons, dur) @@ -643,15 +644,6 @@ def __init__(self, raise ValueError('top_n and threshold are 
mutually exclusive') self.top_n = top_n - if type(mask) == int: - self.mask_pos = mask - self.mask_token = None - if type(mask) == str: - self.mask_pos = None - self.mask_token = mask - else: - raise ValueError('mask argument must be an integer or a string') - self.target = listify(target) if self.target: for t in self.target: @@ -668,15 +660,17 @@ def __init__(self, self.threshold = threshold self.return_true = return_true - def _mask(self, wds): + def _mask(self, wds, mask): mwds = wds.copy() - if self.mask_pos: + if type(mask) == int: mwds[self.mask_pos] = '[MASK]' - self.mask_token = wds[self.mask_pos] - elif self.mask_token: - w_idx = np.where(np.array(mwds)==self.mask_token)[0][0] + self.mask_pos, self.mask_token = (mask, wds[self.mask_pos]) + if type(mask) == str: + w_idx = np.where(np.array(mwds)==mask)[0][0] mwds[w_idx] = '[MASK]' - self.mask_pos = w_idx + self.mask_token, self.mask_pos = (mask, w_idx) + else: + raise ValueError('mask argument must be an integer or a string') nr_masks = len(np.where(np.array(mwds)=='[MASK]')[0]) if nr_masks == 0: raise ValueError('No valid mask tokens found.') @@ -686,10 +680,8 @@ def _mask(self, wds): def _postprocess(self, preds, tok, wds, ons, dur): preds = preds[0].numpy()[:,1:-1,:] - if self.return_softmax: preds = scipy.special.softmax(preds, axis=-1) - out_idx = preds[0,self.mask_pos,:].argsort(axis=-1)[::-1] if self.target: target_idx = self.tokenizer.convert_tokens_to_ids(self.target) @@ -717,9 +709,6 @@ def _get_model_attributes(self): return ['pretrained_model', 'framework', 'top_n', 'mask_idx', 'target', 'mask_token', 'tokenizer_type'] -# TO DO: -# Target words called in extract - class WordCounterExtractor(ComplexTextExtractor): From a9d24f38564a1b3ff84457a73c178f9a96fe396f Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 9 Mar 2020 15:46:09 -0500 Subject: [PATCH 13/89] fix mask-based indexing in mask method --- pliers/extractors/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index ff52d00e..0fdbee30 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -663,8 +663,8 @@ def __init__(self, def _mask(self, wds, mask): mwds = wds.copy() if type(mask) == int: - mwds[self.mask_pos] = '[MASK]' - self.mask_pos, self.mask_token = (mask, wds[self.mask_pos]) + mwds[mask] = '[MASK]' + self.mask_pos, self.mask_token = (mask, wds[mask]) if type(mask) == str: w_idx = np.where(np.array(mwds)==mask)[0][0] mwds[w_idx] = '[MASK]' From 68ddd94e9f80f43cb623e41056c0759ee3883df0 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 10 Mar 2020 13:26:13 -0500 Subject: [PATCH 14/89] refine logic --- pliers/extractors/text.py | 165 ++++++++++++++++++++------------------ 1 file changed, 86 insertions(+), 79 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 0fdbee30..94967fb3 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -421,6 +421,8 @@ class BertExtractor(ComplexTextExtractor): 'BertModel' or 'BertForLM'. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. + return_metadata (bool): if True, the extractor returns encoded token + and encoded word as features. model_kwargs (dict): Named arguments for transformer model. See https://huggingface.co/transformers/main_classes/model.html tokenizer_kwargs (dict): Named arguments for tokenizer. 
@@ -435,6 +437,7 @@ def __init__(self, tokenizer='bert-base-uncased', model_class='BertModel', framework='pt', + return_metadata=False, model_kwargs=None, tokenizer_kwargs=None): @@ -445,8 +448,9 @@ def __init__(self, must be one of 'pt' (pytorch) or 'tf' (tensorflow)''')) self.pretrained_model = pretrained_model self.tokenizer_type = tokenizer - self.framework = framework self.model_class = model_class + self.framework = framework + self.return_metadata = return_metadata self.model_kwargs = model_kwargs if model_kwargs else {} self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} @@ -461,8 +465,8 @@ def _mask(self, wds, mask): return wds def _preprocess(self, stims, mask): - wds, ons, dur = zip(*[(s.text, s.onset, s.duration) for s in stims.elements]) - wds, ons, dur = map(list, [wds, ons, dur]) + els = [(e.text, e.onset, e.duration) for e in stims.elements] + wds, ons, dur = map(list, zip(*els)) tok = [self.tokenizer.tokenize(w) for w in self._mask(wds, mask)] n_tok = [len(t) for t in tok] wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) @@ -470,20 +474,23 @@ def _preprocess(self, stims, mask): idx = self.tokenizer.encode(tok, return_tensors=self.framework) return wds, ons, dur, tok, idx - def _postprocess(self, preds, tok, wds, ons, dur): - out = preds[0][:, 1:-1, :].numpy().squeeze() - data = [out.tolist(), tok, wds] - feat = ['encoding', 'token', 'word'] - return data, feat, ons, dur - def _extract(self, stims, **kwargs): mask = kwargs['mask'] if 'mask' in kwargs else None wds, ons, dur, tok, idx = self._preprocess(stims, mask) preds = self.model(idx) preds = [p.detach() if self.framework == 'pt' else p for p in preds] data, feat, ons, dur = self._postprocess(preds, tok, wds, ons, dur) - return ExtractorResult(data, stims, self, - features=feat, onsets=ons, durations=dur) + return ExtractorResult(data, stims, self, features=feat, onsets=ons, + durations=dur) + + def _postprocess(self, preds, tok, wds, ons, dur): + out = preds[0][:, 1:-1, :].numpy().squeeze() + data = [out.tolist()] + feat = ['encoding'] + if self.return_metadata: + data += [tok, wds] + feat += ['token', 'word'] + return data, feat, ons, dur def _get_model_attributes(self): return ['pretrained_model', 'framework', 'model_class', @@ -495,7 +502,6 @@ def _to_df(self, result, include_attributes=True): log_dict = {attr: getattr(result.extractor, attr) for attr in self._get_model_attributes()} res_dict.update(log_dict) - # include sequence? res_df = pd.DataFrame(res_dict) res_df['object_id'] = range(res_df.shape[0]) return res_df @@ -520,6 +526,8 @@ class BertSequenceEncodingExtractor(BertExtractor): encodings. return_sep (bool): defines whether to return encoding for the [SEP] token. + return_metadata (bool): If True, the extractor returns an additional + feature column with the encoded sequence. model_kwargs (dict): Named arguments for pretrained model. 
See: https://huggingface.co/transformers/main_classes/model.html and https://huggingface.co/transformers/model_doc/bert.html @@ -537,13 +545,13 @@ def __init__(self, framework='pt', pooling=None, return_sep=False, + return_metadata=False, model_kwargs=None, tokenizer_kwargs=None): super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, - tokenizer, framework, model_kwargs, tokenizer_kwargs, - model_class='BertModel') - + tokenizer, framework, return_metadata, model_kwargs, + tokenizer_kwargs, model_class='BertModel') if pooling: if return_sep: raise(ValueError('Pooling and return_seq argument are ' @@ -552,11 +560,11 @@ def __init__(self, getattr(np, pooling) except: raise(ValueError('Pooling must be a valid numpy function.')) - self.return_sep = return_sep self.pooling = pooling + self.return_sep = return_sep def _postprocess(self, preds, tok, wds, ons, dur): - preds = [p.numpy().squeeze() for p in preds] #check + preds = [p.numpy().squeeze() for p in preds] tok = [' '.join(wds)] try: dur = ons[-1] + dur[-1] - ons[0] @@ -570,8 +578,9 @@ def _postprocess(self, preds, tok, wds, ons, dur): out = pool_func(preds[0][:, 1:-1, :], axis=1, keepdims=True) else: out = preds[1] - data = [out.tolist(), tok] - feat = ['encoding', 'sequence'] + if self.return_metadata: + data += [tok] + feat += ['sequence'] return data, feat, ons, dur def _get_model_attributes(self): @@ -595,20 +604,15 @@ class BertLMExtractor(BertExtractor): (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. top_n (int): Specifies how many of the highest-probability tokens are to be returned. Mutually exclusive with target and threshold. - mask (int or str): Words to be masked (string) or indices of - words in the sequence to be masked (indexing starts at 0). Can - be either a single word/index or a list of words/indices. - If str is passed and more than one word in the input matches the - string, only the first one is masked. target (str or list): Vocabulary token(s) for which probability is to be returned. Tokens defined in the vocabulary change across - tokenizers. + tokenizers. Mutually exclusive with top_n and threshold. threshold (float): If defined, only values above this threshold will - be returned. Mutually exclusive with top_n. + be returned. Mutually exclusive with top_n and target. return_softmax (bool): if True, returns probability scores instead of - raw predictions scores for language modeling. - return_true (bool): if True, returns masked word (if defined in the - tokenizer dictionary) and its probability. + raw predictions scores. + return_metadata (bool): if True, returns masked word (if defined in the + tokenizer vocabulary) and its probability. model_kwargs (dict): Named arguments for pretrained model. See: https://huggingface.co/transformers/main_classes/model.html and https://huggingface.co/transformers/model_doc/bert.html. @@ -638,77 +642,80 @@ def __init__(self, model_kwargs=model_kwargs, tokenizer_kwargs=tokenizer_kwargs, model_class='BertForMaskedLM') - if top_n and target: - raise ValueError('top_n and target are mutually exclusive') - if top_n and threshold: - raise ValueError('top_n and threshold are mutually exclusive') + if any([top_n and target, top_n and threshold, threshold and target]): + raise ValueError('top_n, threshold and target arguments ' + 'are mutually exclusive') self.top_n = top_n - + self.threshold = threshold self.target = listify(target) if self.target: - for t in self.target: - if t not in list(self.tokenizer.vocab.keys()): - logging.warning(f'{t} is not in vocabulary. 
Dropping.') - self.target.remove(t) + missing = set(self.target) - set(self.tokenizer.vocab.keys()) + if missing: + logging.warning(f'{missing} is not in vocabulary. Dropping.') + self.target = set(self.target) & set(self.tokenizer.vocab.keys()) if self.target == []: - raise ValueError('No valid target tokens provided. Import ' - 'transformers and run transformers.BertTokenizer.' - f'from_pretrained(\'{tokenizer}\').vocab.keys() to see' - 'which tokens are part of the tokenizer vocabulary.') - + raise ValueError('No valid target token. Import transformers' + ' and run transformers.BertTokenizer.from_pretrained' + f'(\'{tokenizer}\').vocab.keys() to see available tokens') self.return_softmax = return_softmax - self.threshold = threshold self.return_true = return_true def _mask(self, wds, mask): - mwds = wds.copy() - if type(mask) == int: - mwds[mask] = '[MASK]' - self.mask_pos, self.mask_token = (mask, wds[mask]) - if type(mask) == str: - w_idx = np.where(np.array(mwds)==mask)[0][0] - mwds[w_idx] = '[MASK]' - self.mask_token, self.mask_pos = (mask, w_idx) - else: + if not type(mask) in [int, str]: raise ValueError('mask argument must be an integer or a string') - nr_masks = len(np.where(np.array(mwds)=='[MASK]')[0]) - if nr_masks == 0: - raise ValueError('No valid mask tokens found.') - elif nr_masks > 1: - raise ValueError('Too many masked items.') + mwds = wds.copy() + self.mask_token = mask if type(mask) == str else mwds[mask] + self.mask_pos = np.where(np.array(mwds)==self.mask_token)[0][0] + mwds[self.mask_pos] = '[MASK]' return mwds + def _extract(self, stims, mask): + ''' + Args: + mask (int or str): Words to be masked (string) or indices of + words in the sequence to be masked (indexing starts at 0). Can + be either a single word/index or a list of words/indices. + If str is passed and more than one word in the input matches + the string, only the first one is masked. 
+ ''' + return super()._extract(stims=stims, mask=mask) + def _postprocess(self, preds, tok, wds, ons, dur): preds = preds[0].numpy()[:,1:-1,:] if self.return_softmax: preds = scipy.special.softmax(preds, axis=-1) out_idx = preds[0,self.mask_pos,:].argsort(axis=-1)[::-1] - if self.target: - target_idx = self.tokenizer.convert_tokens_to_ids(self.target) - out_idx = list(set(out_idx) & set(target_idx)) - elif self.top_n: - out_idx = out_idx[:self.top_n] - if self.threshold: - thr_idx = np.where(preds[0,self.mask_pos,:] > self.threshold)[0] - out_idx = list(set(out_idx) & set(thr_idx)) - feats = self.tokenizer.convert_ids_to_tokens(out_idx) + if self.top_n: + sub_idx = range(self.top_n) + elif self.target: + sub_idx = self.tokenizer.convert_tokens_to_ids(self.target) + elif self.threshold: + sub_idx = np.where(preds[0,self.mask_pos,:] > self.threshold)[0] + out_idx = list(set(out_idx) & set(sub_idx)) if sub_idx else out_idx + feat = self.tokenizer.convert_ids_to_tokens(out_idx) data = preds[0,self.mask_pos,out_idx] - if self.return_true: - if self.mask_token in self.tokenizer.vocab: - true_vocab_idx = self.tokenizer.vocab[self.mask_token] - true_score = preds[0, self.mask_pos, true_vocab_idx] - else: - true_vocab_idx, true_score = ('true_word', np.nan) - feats += ['true_word', 'true_word_score'] - data += [self.mask_token, true_score] + if self.return_metadata: + feat, data = self._retrieve_true_token(preds, feat, data) + ons, dur = map(lambda x: listify(x[self.mask_pos]), [ons, dur]) + return data, feat, ons, dur - return data, feats, listify(ons[self.mask_pos]), \ - listify(dur[self.mask_pos]) + def _retrieve_true_token(self, preds, feat, data): + if self.mask_token in self.tokenizer.vocab: + true_vocab_idx = self.tokenizer.vocab[self.mask_token] + true_score = preds[0, self.mask_pos, true_vocab_idx] + feat += ['true_word', 'true_word_score'] + data += [self.mask_token, true_score] + else: + logging.warning('True token not in vocabulary, cannot return') + return feat, data def _get_model_attributes(self): - return ['pretrained_model', 'framework', 'top_n', 'mask_idx', - 'target', 'mask_token', 'tokenizer_type'] + return ['pretrained_model', 'framework', 'top_n', 'mask_pos', + 'target', 'threshold', 'mask_token', 'tokenizer_type'] +# What to do with SEP token? Does it need to be there? +# Return other layers +# Return attention class WordCounterExtractor(ComplexTextExtractor): From 0f2918f8b437110433c5ee8a9056c9be85f29f34 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 10 Mar 2020 14:11:45 -0500 Subject: [PATCH 15/89] checkpoint --- pliers/extractors/text.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 94967fb3..3a48e0e7 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -578,6 +578,8 @@ def _postprocess(self, preds, tok, wds, ons, dur): out = pool_func(preds[0][:, 1:-1, :], axis=1, keepdims=True) else: out = preds[1] + data = [out.tolist()] + feat = ['encoding'] if self.return_metadata: data += [tok] feat += ['sequence'] @@ -589,6 +591,7 @@ def _get_model_attributes(self): class BertLMExtractor(BertExtractor): + ''' Use BERT for masked words prediction. 
Args: @@ -695,11 +698,11 @@ def _postprocess(self, preds, tok, wds, ons, dur): feat = self.tokenizer.convert_ids_to_tokens(out_idx) data = preds[0,self.mask_pos,out_idx] if self.return_metadata: - feat, data = self._retrieve_true_token(preds, feat, data) + feat, data = self._return_true_token(preds, feat, data) ons, dur = map(lambda x: listify(x[self.mask_pos]), [ons, dur]) return data, feat, ons, dur - def _retrieve_true_token(self, preds, feat, data): + def _return_true_token(self, preds, feat, data): if self.mask_token in self.tokenizer.vocab: true_vocab_idx = self.tokenizer.vocab[self.mask_token] true_score = preds[0, self.mask_pos, true_vocab_idx] @@ -714,8 +717,11 @@ def _get_model_attributes(self): 'target', 'threshold', 'mask_token', 'tokenizer_type'] # What to do with SEP token? Does it need to be there? -# Return other layers -# Return attention +# Return other layers and/or attentions? +# Couple of mixins (sequence coherence, probability) +# Look into the sentiment extractor +# Discuss probability mixin with Tal +# Metadata as features / Add other field to store additional info? class WordCounterExtractor(ComplexTextExtractor): From dbf4d20c5af15f8fc73ff14421decf70ae739dd4 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 10 Mar 2020 19:50:26 -0400 Subject: [PATCH 16/89] restore mask in init --- pliers/extractors/text.py | 48 +++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 3a48e0e7..e18f5db2 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -456,9 +456,9 @@ def __init__(self, model = model_class if self.framework == 'pt' else 'TF' + model_class self.model = getattr(transformers, model).from_pretrained( - pretrained_model, **model_kwargs) + pretrained_model, **self.model_kwargs) self.tokenizer = transformers.BertTokenizer.from_pretrained( - tokenizer, **tokenizer_kwargs) + tokenizer, **self.tokenizer_kwargs) super(BertExtractor, self).__init__() def _mask(self, wds, mask): @@ -474,9 +474,9 @@ def _preprocess(self, stims, mask): idx = self.tokenizer.encode(tok, return_tensors=self.framework) return wds, ons, dur, tok, idx - def _extract(self, stims, **kwargs): - mask = kwargs['mask'] if 'mask' in kwargs else None - wds, ons, dur, tok, idx = self._preprocess(stims, mask) + def _extract(self, stims): + mask = self.mask or None + wds, ons, dur, tok, idx = self._preprocess(stims, mask=mask) preds = self.model(idx) preds = [p.detach() if self.framework == 'pt' else p for p in preds] data, feat, ons, dur = self._postprocess(preds, tok, wds, ons, dur) @@ -605,6 +605,11 @@ class BertLMExtractor(BertExtractor): unknown tokens. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. + mask (int or str): Words to be masked (string) or indices of + words in the sequence to be masked (indexing starts at 0). Can + be either a single word/index or a list of words/indices. + If str is passed and more than one word in the input matches + the string, only the first one is masked. top_n (int): Specifies how many of the highest-probability tokens are to be returned. Mutually exclusive with target and threshold. target (str or list): Vocabulary token(s) for which probability is to @@ -623,8 +628,7 @@ class BertLMExtractor(BertExtractor): See https://huggingface.co/transformers/main_classes/tokenizer.html. 
''' - _log_attributes = ('pretrained_model', 'framework', 'top_n', 'mask_pos', - 'mask_token', 'target', 'tokenizer_type', 'return_softmax') + _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', 'tokenizer_type', 'return_softmax') def __init__(self, pretrained_model='bert-base-uncased', @@ -662,6 +666,7 @@ def __init__(self, f'(\'{tokenizer}\').vocab.keys() to see available tokens') self.return_softmax = return_softmax self.return_true = return_true + self.mask = mask def _mask(self, wds, mask): if not type(mask) in [int, str]: @@ -672,17 +677,6 @@ def _mask(self, wds, mask): mwds[self.mask_pos] = '[MASK]' return mwds - def _extract(self, stims, mask): - ''' - Args: - mask (int or str): Words to be masked (string) or indices of - words in the sequence to be masked (indexing starts at 0). Can - be either a single word/index or a list of words/indices. - If str is passed and more than one word in the input matches - the string, only the first one is masked. - ''' - return super()._extract(stims=stims, mask=mask) - def _postprocess(self, preds, tok, wds, ons, dur): preds = preds[0].numpy()[:,1:-1,:] if self.return_softmax: @@ -706,22 +700,28 @@ def _return_true_token(self, preds, feat, data): if self.mask_token in self.tokenizer.vocab: true_vocab_idx = self.tokenizer.vocab[self.mask_token] true_score = preds[0, self.mask_pos, true_vocab_idx] - feat += ['true_word', 'true_word_score'] - data += [self.mask_token, true_score] else: - logging.warning('True token not in vocabulary, cannot return') + true_score = np.nan + logging.warning('True token not in vocabulary. Returning NaN') + feat += ['true_word', 'true_word_score'] + data += [self.mask_token, true_score] return feat, data def _get_model_attributes(self): - return ['pretrained_model', 'framework', 'top_n', 'mask_pos', - 'target', 'threshold', 'mask_token', 'tokenizer_type'] + return ['pretrained_model', 'framework', 'top_n', 'mask', + 'target', 'threshold', 'tokenizer_type'] +# To discuss: # What to do with SEP token? Does it need to be there? # Return other layers and/or attentions? # Couple of mixins (sequence coherence, probability) # Look into the sentiment extractor # Discuss probability mixin with Tal -# Metadata as features / Add other field to store additional info? + +# To dos: +# Metadata as features / Add other field to store additional info (?) 
+# Log input sequence in LM extractor +# NB: a bit suboptimal to set mask in init, but handier class WordCounterExtractor(ComplexTextExtractor): From 527c6d477b92cfc421d7da00c448ab5deaf4d6da Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 10 Mar 2020 20:43:15 -0400 Subject: [PATCH 17/89] fix to_df and indexing --- pliers/extractors/text.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index e18f5db2..789422fc 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -681,16 +681,16 @@ def _postprocess(self, preds, tok, wds, ons, dur): preds = preds[0].numpy()[:,1:-1,:] if self.return_softmax: preds = scipy.special.softmax(preds, axis=-1) - out_idx = preds[0,self.mask_pos,:].argsort(axis=-1)[::-1] + out_idx = preds[0,self.mask_pos,:].argsort()[::-1] if self.top_n: - sub_idx = range(self.top_n) + sub_idx = out_idx[:self.top_n] elif self.target: sub_idx = self.tokenizer.convert_tokens_to_ids(self.target) elif self.threshold: sub_idx = np.where(preds[0,self.mask_pos,:] > self.threshold)[0] - out_idx = list(set(out_idx) & set(sub_idx)) if sub_idx else out_idx + out_idx = [idx for idx in out_idx if idx in sub_idx] feat = self.tokenizer.convert_ids_to_tokens(out_idx) - data = preds[0,self.mask_pos,out_idx] + data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] if self.return_metadata: feat, data = self._return_true_token(preds, feat, data) ons, dur = map(lambda x: listify(x[self.mask_pos]), [ons, dur]) From c8b1c368b973f9f98f41b2bac52373d34596b7eb Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 10 Mar 2020 20:43:48 -0400 Subject: [PATCH 18/89] notes --- pliers/extractors/text.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 789422fc..c8e46f14 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -711,18 +711,17 @@ def _get_model_attributes(self): return ['pretrained_model', 'framework', 'top_n', 'mask', 'target', 'threshold', 'tokenizer_type'] -# To discuss: -# What to do with SEP token? Does it need to be there? -# Return other layers and/or attentions? -# Couple of mixins (sequence coherence, probability) -# Look into the sentiment extractor -# Discuss probability mixin with Tal - # To dos: # Metadata as features / Add other field to store additional info (?) # Log input sequence in LM extractor # NB: a bit suboptimal to set mask in init, but handier +# To discuss: +# Return other layers and/or attentions? +# Couple of mixins (sequence coherence, probability)? 
+# Look into the sentiment extractor +# Sep token for sliding window + class WordCounterExtractor(ComplexTextExtractor): ''' Extracts number of times each unique word has occurred within text From 445ef09764fdaaee44719018d12e9b7f71784d9d Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 18 Mar 2020 08:47:21 +0100 Subject: [PATCH 19/89] checkpoint --- pliers/extractors/text.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index c8e46f14..11c141b3 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -469,6 +469,7 @@ def _preprocess(self, stims, mask): wds, ons, dur = map(list, zip(*els)) tok = [self.tokenizer.tokenize(w) for w in self._mask(wds, mask)] n_tok = [len(t) for t in tok] + stims.name = ' '.join(wds) if stims.name == '' else stims.name wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) tok = list(flatten(tok)) idx = self.tokenizer.encode(tok, return_tensors=self.framework) @@ -628,7 +629,8 @@ class BertLMExtractor(BertExtractor): See https://huggingface.co/transformers/main_classes/tokenizer.html. ''' - _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', 'tokenizer_type', 'return_softmax') + _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', + 'tokenizer_type', 'return_softmax') def __init__(self, pretrained_model='bert-base-uncased', @@ -711,16 +713,6 @@ def _get_model_attributes(self): return ['pretrained_model', 'framework', 'top_n', 'mask', 'target', 'threshold', 'tokenizer_type'] -# To dos: -# Metadata as features / Add other field to store additional info (?) -# Log input sequence in LM extractor -# NB: a bit suboptimal to set mask in init, but handier - -# To discuss: -# Return other layers and/or attentions? -# Couple of mixins (sequence coherence, probability)? -# Look into the sentiment extractor -# Sep token for sliding window class WordCounterExtractor(ComplexTextExtractor): From 92ff1eb7daedcd7ec8e959545c80bc8cba243b37 Mon Sep 17 00:00:00 2001 From: Roberta Rocca <32483140+rbroc@users.noreply.github.com> Date: Wed, 18 Mar 2020 12:03:41 +0100 Subject: [PATCH 20/89] Update pliers/extractors/text.py Co-Authored-By: Tal Yarkoni --- pliers/extractors/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index c8e46f14..e0060a57 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -618,7 +618,7 @@ class BertLMExtractor(BertExtractor): threshold (float): If defined, only values above this threshold will be returned. Mutually exclusive with top_n and target. return_softmax (bool): if True, returns probability scores instead of - raw predictions scores. + raw predictions. return_metadata (bool): if True, returns masked word (if defined in the tokenizer vocabulary) and its probability. model_kwargs (dict): Named arguments for pretrained model. 
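The patches up to this point settle the masked-LM interface: the mask is fixed at init, and top_n, target, or threshold (mutually exclusive) select which vocabulary scores are returned for the masked position. A minimal usage sketch, assuming the argument names introduced above and a word-tokenized ComplexTextStim (the example sentence and parameter values are illustrative, not taken from the diffs):

    from pliers.stimuli import ComplexTextStim
    from pliers.extractors import BertLMExtractor

    # any transcribed / word-tokenized ComplexTextStim works here
    stim = ComplexTextStim(text='the cat sat on the mat')

    # mask the word 'mat' and return softmax scores for the ten
    # most probable vocabulary items at the masked position
    ext = BertLMExtractor(mask='mat', top_n=10, return_softmax=True)
    df = ext.transform(stim).to_df()

Swapping top_n=10 for target=['mat', 'rug'] would instead restrict the returned scores to those vocabulary items, and threshold=0.01 would keep only items scoring above that value, per the mutual-exclusivity check added in the patches above.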
From 6089fc6aee309a4e985130436ee96f0713af0506 Mon Sep 17 00:00:00 2001 From: Roberta Rocca <32483140+rbroc@users.noreply.github.com> Date: Wed, 18 Mar 2020 12:04:11 +0100 Subject: [PATCH 21/89] Update pliers/extractors/text.py Co-Authored-By: Tal Yarkoni --- pliers/extractors/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index e0060a57..088ddd9a 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -406,7 +406,7 @@ def _extract(self, stim): class BertExtractor(ComplexTextExtractor): ''' Base class for all Extractors based on pretrained BERT. - This model returns the last hidden layer (wihtout special tokens) + This model returns the last hidden layer (without special tokens) Args: pretrained_model (str): A string specifying which BERT From c6020a95e2ccf713bdb3a182c4666cab34ff8f76 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 18 Mar 2020 12:38:58 +0100 Subject: [PATCH 22/89] _model_attributes as class attribute --- pliers/extractors/text.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 69e25ee1..5256d6ed 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -431,6 +431,8 @@ class BertExtractor(ComplexTextExtractor): _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', 'model_class', 'model_kwargs', 'tokenizer_kwargs') + _model_attributes = ('pretrained_model', 'framework', 'model_class', + 'tokenizer_type') def __init__(self, pretrained_model='bert-base-uncased', @@ -493,20 +495,17 @@ def _postprocess(self, preds, tok, wds, ons, dur): feat += ['token', 'word'] return data, feat, ons, dur - def _get_model_attributes(self): - return ['pretrained_model', 'framework', 'model_class', - 'tokenizer_type'] - def _to_df(self, result, include_attributes=True): res_dict = dict(zip(result.features, result._data)) if include_attributes: log_dict = {attr: getattr(result.extractor, attr) for - attr in self._get_model_attributes()} + attr in self._model_attributes} res_dict.update(log_dict) res_df = pd.DataFrame(res_dict) res_df['object_id'] = range(res_df.shape[0]) return res_df + class BertSequenceEncodingExtractor(BertExtractor): ''' Extract contextualized encodings for words or sequences using @@ -537,8 +536,10 @@ class BertSequenceEncodingExtractor(BertExtractor): ''' _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'pooling', 'return_sep', 'model_class', 'model_kwargs', - 'tokenizer_kwargs') + 'pooling', 'return_sep', 'model_class', 'model_kwargs', + 'tokenizer_kwargs') + _model_attributes = ('pretrained_model', 'framework', 'model_class', + 'pooling', 'return_sep', 'tokenizer_type') def __init__(self, pretrained_model='bert-base-uncased', @@ -585,10 +586,6 @@ def _postprocess(self, preds, tok, wds, ons, dur): data += [tok] feat += ['sequence'] return data, feat, ons, dur - - def _get_model_attributes(self): - return ['pretrained_model', 'framework', 'model_class', - 'pooling', 'return_sep', 'tokenizer_type'] class BertLMExtractor(BertExtractor): @@ -630,7 +627,9 @@ class BertLMExtractor(BertExtractor): ''' _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', - 'tokenizer_type', 'return_softmax') + 'tokenizer_type', 'return_softmax') + _model_attributes = ('pretrained_model', 'framework', 'top_n', 'mask', + 'target', 'threshold', 'tokenizer_type') def __init__(self, pretrained_model='bert-base-uncased', @@ -709,10 
+708,6 @@ def _return_true_token(self, preds, feat, data): data += [self.mask_token, true_score] return feat, data - def _get_model_attributes(self): - return ['pretrained_model', 'framework', 'top_n', 'mask', - 'target', 'threshold', 'tokenizer_type'] - class WordCounterExtractor(ComplexTextExtractor): From 9dc891df06f3358844303356afc25cc00793569a Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 18 Mar 2020 12:39:55 +0100 Subject: [PATCH 23/89] check pooling arg before superclass initializer --- pliers/extractors/text.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 5256d6ed..1e62b030 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -550,10 +550,6 @@ def __init__(self, return_metadata=False, model_kwargs=None, tokenizer_kwargs=None): - - super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, - tokenizer, framework, return_metadata, model_kwargs, - tokenizer_kwargs, model_class='BertModel') if pooling: if return_sep: raise(ValueError('Pooling and return_seq argument are ' @@ -562,6 +558,9 @@ def __init__(self, getattr(np, pooling) except: raise(ValueError('Pooling must be a valid numpy function.')) + super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, + tokenizer, framework, return_metadata, model_kwargs, + tokenizer_kwargs, model_class='BertModel') self.pooling = pooling self.return_sep = return_sep From 5629a0bfcb7f623a5a942cc066c8f6e98f7ee3a8 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 18 Mar 2020 12:45:05 +0100 Subject: [PATCH 24/89] move superclass init after argument validation --- pliers/extractors/text.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 1e62b030..489867f7 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -558,11 +558,11 @@ def __init__(self, getattr(np, pooling) except: raise(ValueError('Pooling must be a valid numpy function.')) + self.pooling = pooling + self.return_sep = return_sep super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, tokenizer, framework, return_metadata, model_kwargs, tokenizer_kwargs, model_class='BertModel') - self.pooling = pooling - self.return_sep = return_sep def _postprocess(self, preds, tok, wds, ons, dur): preds = [p.numpy().squeeze() for p in preds] @@ -642,13 +642,6 @@ def __init__(self, return_true=False, model_kwargs=None, tokenizer_kwargs=None): - - super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, - tokenizer=tokenizer, - framework=framework, - model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, - model_class='BertForMaskedLM') if any([top_n and target, top_n and threshold, threshold and target]): raise ValueError('top_n, threshold and target arguments ' 'are mutually exclusive') @@ -667,7 +660,13 @@ def __init__(self, self.return_softmax = return_softmax self.return_true = return_true self.mask = mask - + super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, + tokenizer=tokenizer, + framework=framework, + model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, + model_class='BertForMaskedLM') + def _mask(self, wds, mask): if not type(mask) in [int, str]: raise ValueError('mask argument must be an integer or a string') From d74ace4b9c60031f210a12ad01e92d1928ca4772 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 18 Mar 2020 15:29:03 +0100 Subject: [PATCH 25/89] add docstring to additional 
methods --- pliers/extractors/text.py | 42 +++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 489867f7..5a434fe9 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -444,7 +444,6 @@ def __init__(self, tokenizer_kwargs=None): verify_dependencies(['transformers']) - if framework not in ['pt', 'tf']: raise(ValueError('''Invalid framework; must be one of 'pt' (pytorch) or 'tf' (tensorflow)''')) @@ -453,6 +452,7 @@ def __init__(self, self.model_class = model_class self.framework = framework self.return_metadata = return_metadata + self.mask = None self.model_kwargs = model_kwargs if model_kwargs else {} self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} @@ -463,13 +463,33 @@ def __init__(self, tokenizer, **self.tokenizer_kwargs) super(BertExtractor, self).__init__() - def _mask(self, wds, mask): + def _mask_words(self, wds, mask): + ''' Method called by _preprocess to mask word in the input sequence. + If masking is not relevant (e.g. if objective is not language + modelling), returns list of input words. Overridden in subclasses. + Args: + wds (list): list of words in the input sequence (i.e. the .text + attribute of the input ComplexTextStim) + mask (str or int): the self.mask attribute, i.e. an integer + indicating the index of the word in wds that is to be masked, + or a string that matches the word in wds to be masked. + ''' return wds def _preprocess(self, stims, mask): + ''' Extracts text, onset, duration from ComplexTextStim, masks target + words (if relevant), tokenizes the input, and casts word, onset, + and duration information to token level lists. This is fairly + model-specific, so it needs to be overridden by each subclass. + Args: + stims (Stim): the ComplexTextStim input + mask (str or int): an integer indicating the index of the word to + masked in the input sequence, or a string that matches the + word. + ''' els = [(e.text, e.onset, e.duration) for e in stims.elements] wds, ons, dur = map(list, zip(*els)) - tok = [self.tokenizer.tokenize(w) for w in self._mask(wds, mask)] + tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds, mask)] n_tok = [len(t) for t in tok] stims.name = ' '.join(wds) if stims.name == '' else stims.name wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) @@ -478,8 +498,7 @@ def _preprocess(self, stims, mask): return wds, ons, dur, tok, idx def _extract(self, stims): - mask = self.mask or None - wds, ons, dur, tok, idx = self._preprocess(stims, mask=mask) + wds, ons, dur, tok, idx = self._preprocess(stims, mask=self.mask) preds = self.model(idx) preds = [p.detach() if self.framework == 'pt' else p for p in preds] data, feat, ons, dur = self._postprocess(preds, tok, wds, ons, dur) @@ -487,6 +506,17 @@ def _extract(self, stims): durations=dur) def _postprocess(self, preds, tok, wds, ons, dur): + ''' Processes the output of the model (subsets relevant information, + transforms it where relevant, adds model metadata if requested). + Needs to be overridden by subclasses. + Args: + preds (array): model output + tok (list): list of tokens (strings) used as input for the model + wds (list): list of words in the original sequence each token is + part of. 
+ ons (list): list of onsets (one per token) + dur (list): list of durations (one per token) + ''' out = preds[0][:, 1:-1, :].numpy().squeeze() data = [out.tolist()] feat = ['encoding'] @@ -667,7 +697,7 @@ def __init__(self, tokenizer_kwargs=tokenizer_kwargs, model_class='BertForMaskedLM') - def _mask(self, wds, mask): + def _mask_words(self, wds, mask): if not type(mask) in [int, str]: raise ValueError('mask argument must be an integer or a string') mwds = wds.copy() From 8dc3202a520435a4271132c784cbcc032aafce74 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 18 Mar 2020 16:03:48 +0100 Subject: [PATCH 26/89] add self.mask in __init__ --- pliers/extractors/text.py | 54 ++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 5a434fe9..f04db6ab 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -423,6 +423,9 @@ class BertExtractor(ComplexTextExtractor): (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. return_metadata (bool): if True, the extractor returns encoded token and encoded word as features. + mask (str or int): if defined, specifies which word is to be replaced + with [MASK] token either by its index (int) in the input sequence, + or by specifying the word itself (str). model_kwargs (dict): Named arguments for transformer model. See https://huggingface.co/transformers/main_classes/model.html tokenizer_kwargs (dict): Named arguments for tokenizer. @@ -441,7 +444,8 @@ def __init__(self, framework='pt', return_metadata=False, model_kwargs=None, - tokenizer_kwargs=None): + tokenizer_kwargs=None, + mask=None): verify_dependencies(['transformers']) if framework not in ['pt', 'tf']: @@ -452,7 +456,7 @@ def __init__(self, self.model_class = model_class self.framework = framework self.return_metadata = return_metadata - self.mask = None + self.mask = mask self.model_kwargs = model_kwargs if model_kwargs else {} self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} @@ -463,33 +467,34 @@ def __init__(self, tokenizer, **self.tokenizer_kwargs) super(BertExtractor, self).__init__() - def _mask_words(self, wds, mask): - ''' Method called by _preprocess to mask word in the input sequence. - If masking is not relevant (e.g. if objective is not language - modelling), returns list of input words. Overridden in subclasses. + def _mask_words(self, wds): + ''' Preprocessing step called by _preprocess method. Replaces word + in the input sequence with [MASK] token. Args: wds (list): list of words in the input sequence (i.e. the .text attribute of the input ComplexTextStim) - mask (str or int): the self.mask attribute, i.e. an integer - indicating the index of the word in wds that is to be masked, - or a string that matches the word in wds to be masked. ''' + if self.mask: + if type(self.mask) == int: + wds[self.mask] = '[MASK]' + elif type(self.mask) == str: + idx = np.where(np.array(wds)==self.mask)[0][0] + wds[idx] = '[MASK]' + else: + raise ValueError('Invalid mask argument') return wds - def _preprocess(self, stims, mask): + def _preprocess(self, stims): ''' Extracts text, onset, duration from ComplexTextStim, masks target words (if relevant), tokenizes the input, and casts word, onset, and duration information to token level lists. This is fairly model-specific, so it needs to be overridden by each subclass. 
Args: stims (Stim): the ComplexTextStim input - mask (str or int): an integer indicating the index of the word to - masked in the input sequence, or a string that matches the - word. ''' els = [(e.text, e.onset, e.duration) for e in stims.elements] wds, ons, dur = map(list, zip(*els)) - tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds, mask)] + tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds)] n_tok = [len(t) for t in tok] stims.name = ' '.join(wds) if stims.name == '' else stims.name wds, ons, dur = map(lambda x: np.repeat(x, n_tok), [wds, ons, dur]) @@ -498,7 +503,7 @@ def _preprocess(self, stims, mask): return wds, ons, dur, tok, idx def _extract(self, stims): - wds, ons, dur, tok, idx = self._preprocess(stims, mask=self.mask) + wds, ons, dur, tok, idx = self._preprocess(stims) preds = self.model(idx) preds = [p.detach() if self.framework == 'pt' else p for p in preds] data, feat, ons, dur = self._postprocess(preds, tok, wds, ons, dur) @@ -578,6 +583,7 @@ def __init__(self, pooling=None, return_sep=False, return_metadata=False, + mask=None, model_kwargs=None, tokenizer_kwargs=None): if pooling: @@ -591,7 +597,7 @@ def __init__(self, self.pooling = pooling self.return_sep = return_sep super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, - tokenizer, framework, return_metadata, model_kwargs, + tokenizer, framework, return_metadata, mask, model_kwargs, tokenizer_kwargs, model_class='BertModel') def _postprocess(self, preds, tok, wds, ons, dur): @@ -646,7 +652,7 @@ class BertLMExtractor(BertExtractor): be returned. Mutually exclusive with top_n and target. return_softmax (bool): if True, returns probability scores instead of raw predictions. - return_metadata (bool): if True, returns masked word (if defined in the + return_true (bool): if True, returns masked word (if defined in the tokenizer vocabulary) and its probability. model_kwargs (dict): Named arguments for pretrained model. 
See: https://huggingface.co/transformers/main_classes/model.html @@ -689,19 +695,19 @@ def __init__(self, f'(\'{tokenizer}\').vocab.keys() to see available tokens') self.return_softmax = return_softmax self.return_true = return_true - self.mask = mask super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, tokenizer=tokenizer, framework=framework, model_kwargs=model_kwargs, tokenizer_kwargs=tokenizer_kwargs, + mask=mask, model_class='BertForMaskedLM') - def _mask_words(self, wds, mask): - if not type(mask) in [int, str]: + def _mask_words(self, wds): + if not type(self.mask) in [int, str]: raise ValueError('mask argument must be an integer or a string') mwds = wds.copy() - self.mask_token = mask if type(mask) == str else mwds[mask] + self.mask_token = self.mask if type(self.mask) == str else mwds[self.mask] self.mask_pos = np.where(np.array(mwds)==self.mask_token)[0][0] mwds[self.mask_pos] = '[MASK]' return mwds @@ -720,12 +726,12 @@ def _postprocess(self, preds, tok, wds, ons, dur): out_idx = [idx for idx in out_idx if idx in sub_idx] feat = self.tokenizer.convert_ids_to_tokens(out_idx) data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] - if self.return_metadata: - feat, data = self._return_true_token(preds, feat, data) + if self.return_true: + feat, data = self._return_true(preds, feat, data) ons, dur = map(lambda x: listify(x[self.mask_pos]), [ons, dur]) return data, feat, ons, dur - def _return_true_token(self, preds, feat, data): + def _return_true(self, preds, feat, data): if self.mask_token in self.tokenizer.vocab: true_vocab_idx = self.tokenizer.vocab[self.mask_token] true_score = preds[0, self.mask_pos, true_vocab_idx] From d4c8c87c0f949b6047dff7602ee82ab590d92c07 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 18 Mar 2020 16:11:56 +0100 Subject: [PATCH 27/89] fix docstrings --- pliers/extractors/text.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index f04db6ab..dabd4c75 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -486,11 +486,20 @@ def _mask_words(self, wds): def _preprocess(self, stims): ''' Extracts text, onset, duration from ComplexTextStim, masks target - words (if relevant), tokenizes the input, and casts word, onset, - and duration information to token level lists. This is fairly - model-specific, so it needs to be overridden by each subclass. + words (if relevant), tokenizes the input, and casts words, onsets, + and durations to token-level lists. Called within _extract method + to prepare input for the model. Same for all subclasses. Args: stims (Stim): the ComplexTextStim input + Outputs: + tok (list): list of tokens (strings) + wds (list): list of words in stimulus sequence. Same length as tok, + needed to keep mapping between transformers sub-word tokens + and whole words. + ons (list): list of onsets (one per token) + dur (list): list of durations (one per token) + idx (list): index of each token in model vocabulary + ''' els = [(e.text, e.onset, e.duration) for e in stims.elements] wds, ons, dur = map(list, zip(*els)) @@ -511,16 +520,13 @@ def _extract(self, stims): durations=dur) def _postprocess(self, preds, tok, wds, ons, dur): - ''' Processes the output of the model (subsets relevant information, - transforms it where relevant, adds model metadata if requested). - Needs to be overridden by subclasses. 
+ ''' Takes model output as input and processes it (subsets relevant + information,transforms it where relevant, adds model metadata + if requested). Fairly model-specific, therefore overridden by + each subclass. Args: preds (array): model output - tok (list): list of tokens (strings) used as input for the model - wds (list): list of words in the original sequence each token is - part of. - ons (list): list of onsets (one per token) - dur (list): list of durations (one per token) + tok, wds, ons, dur, tok: see output of _preprocess method. ''' out = preds[0][:, 1:-1, :].numpy().squeeze() data = [out.tolist()] From 0aa2f9338dd21c373e8951aba082da7da8666d46 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 18 Mar 2020 16:16:56 +0100 Subject: [PATCH 28/89] rename return_metadata args --- pliers/extractors/text.py | 42 +++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index dabd4c75..ba0fa617 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -442,7 +442,7 @@ def __init__(self, tokenizer='bert-base-uncased', model_class='BertModel', framework='pt', - return_metadata=False, + return_tokens=False, model_kwargs=None, tokenizer_kwargs=None, mask=None): @@ -455,7 +455,7 @@ def __init__(self, self.tokenizer_type = tokenizer self.model_class = model_class self.framework = framework - self.return_metadata = return_metadata + self.return_tokens = return_tokens self.mask = mask self.model_kwargs = model_kwargs if model_kwargs else {} self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} @@ -531,7 +531,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): out = preds[0][:, 1:-1, :].numpy().squeeze() data = [out.tolist()] feat = ['encoding'] - if self.return_metadata: + if self.return_tokens: data += [tok, wds] feat += ['token', 'word'] return data, feat, ons, dur @@ -567,7 +567,7 @@ class BertSequenceEncodingExtractor(BertExtractor): encodings. return_sep (bool): defines whether to return encoding for the [SEP] token. - return_metadata (bool): If True, the extractor returns an additional + return_sequence (bool): If True, the extractor returns an additional feature column with the encoded sequence. model_kwargs (dict): Named arguments for pretrained model. See: https://huggingface.co/transformers/main_classes/model.html @@ -588,7 +588,7 @@ def __init__(self, framework='pt', pooling=None, return_sep=False, - return_metadata=False, + return_sequence=False, mask=None, model_kwargs=None, tokenizer_kwargs=None): @@ -602,9 +602,10 @@ def __init__(self, raise(ValueError('Pooling must be a valid numpy function.')) self.pooling = pooling self.return_sep = return_sep + self.return_sequence = return_sequence super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, - tokenizer, framework, return_metadata, mask, model_kwargs, - tokenizer_kwargs, model_class='BertModel') + tokenizer, framework, mask, model_kwargs, tokenizer_kwargs, + model_class='BertModel') def _postprocess(self, preds, tok, wds, ons, dur): preds = [p.numpy().squeeze() for p in preds] @@ -623,7 +624,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): out = preds[1] data = [out.tolist()] feat = ['encoding'] - if self.return_metadata: + if self.return_sequence: data += [tok] feat += ['sequence'] return data, feat, ons, dur @@ -658,8 +659,8 @@ class BertLMExtractor(BertExtractor): be returned. Mutually exclusive with top_n and target. 
return_softmax (bool): if True, returns probability scores instead of raw predictions. - return_true (bool): if True, returns masked word (if defined in the - tokenizer vocabulary) and its probability. + return_masked_word (bool): if True, returns masked word (if defined + in the tokenizer vocabulary) and its probability. model_kwargs (dict): Named arguments for pretrained model. See: https://huggingface.co/transformers/main_classes/model.html and https://huggingface.co/transformers/model_doc/bert.html. @@ -681,7 +682,7 @@ def __init__(self, threshold=None, target=None, return_softmax=False, - return_true=False, + return_masked_word=False, model_kwargs=None, tokenizer_kwargs=None): if any([top_n and target, top_n and threshold, threshold and target]): @@ -700,14 +701,11 @@ def __init__(self, ' and run transformers.BertTokenizer.from_pretrained' f'(\'{tokenizer}\').vocab.keys() to see available tokens') self.return_softmax = return_softmax - self.return_true = return_true - super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, - tokenizer=tokenizer, - framework=framework, - model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, - mask=mask, - model_class='BertForMaskedLM') + self.return_masked_word = return_masked_word + super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, + tokenizer=tokenizer, framework=framework, model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, mask=mask, + model_class='BertForMaskedLM') def _mask_words(self, wds): if not type(self.mask) in [int, str]: @@ -732,12 +730,12 @@ def _postprocess(self, preds, tok, wds, ons, dur): out_idx = [idx for idx in out_idx if idx in sub_idx] feat = self.tokenizer.convert_ids_to_tokens(out_idx) data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] - if self.return_true: - feat, data = self._return_true(preds, feat, data) + if self.return_masked_word: + feat, data = self._return_masked_word(preds, feat, data) ons, dur = map(lambda x: listify(x[self.mask_pos]), [ons, dur]) return data, feat, ons, dur - def _return_true(self, preds, feat, data): + def _return_masked_word(self, preds, feat, data): if self.mask_token in self.tokenizer.vocab: true_vocab_idx = self.tokenizer.vocab[self.mask_token] true_score = preds[0, self.mask_pos, true_vocab_idx] From f6060495446165d75f9a0404e7df282f4de1f954 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 18 Mar 2020 16:47:51 +0100 Subject: [PATCH 29/89] set class to AutoModel to enable any BERT-like model (ALBERT, RoBERTA, etc.) --- pliers/extractors/text.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index ba0fa617..824ba417 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -409,16 +409,20 @@ class BertExtractor(ComplexTextExtractor): This model returns the last hidden layer (without special tokens) Args: - pretrained_model (str): A string specifying which BERT - model to use. Can be one of pretrained BERT models listed at + pretrained_model (str): A string specifying which transformer + model to use. Can be any pretrained BERT or BERT-derived (ALBERT, + DistilBERT, RoBERTa, CamemBERT etc.) models listed at https://huggingface.co/transformers/pretrained_models.html - (valid values include all the models with 'bert' prefix) or path to custom model. tokenizer (str): Type of tokenization used in the tokenization step. If different from model, out-of-vocabulary tokens may be treated as unknown tokens. 
- model_class (str): Specifies class of Bert model. Must be one of - 'BertModel' or 'BertForLM'. + model_class (str): Specifies model type. Must be one of 'AutoModel' + (encoding extractor) or 'AutoModelWithLMHead' (language model). + These are generic model classes, which use the value of + pretrained_model to infer the model-specific transformers + class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel + or RobertaForMaskedLM for RoBERTa). Fixed by each subclass. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. return_metadata (bool): if True, the extractor returns encoded token @@ -440,8 +444,8 @@ class BertExtractor(ComplexTextExtractor): def __init__(self, pretrained_model='bert-base-uncased', tokenizer='bert-base-uncased', - model_class='BertModel', framework='pt', + model_class='AutoModel', return_tokens=False, model_kwargs=None, tokenizer_kwargs=None, @@ -552,10 +556,10 @@ class BertSequenceEncodingExtractor(BertExtractor): ''' Extract contextualized encodings for words or sequences using pretrained BertModel. Args: - pretrained_model (str): A string specifying which BERT - model to use. Can be one of pretrained BERT models listed at + pretrained_model (str): A string specifying which transformer + model to use. Can be any pretrained BERT or BERT-derived (ALBERT, + DistilBERT, RoBERTa, CamemBERT etc.) models listed at https://huggingface.co/transformers/pretrained_models.html - (valid values include all the models with 'bert' prefix) or path to custom model. tokenizer (str): Type of tokenization used in the tokenization step. If different from model, out-of-vocabulary tokens may be treated as @@ -604,8 +608,7 @@ def __init__(self, self.return_sep = return_sep self.return_sequence = return_sequence super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, - tokenizer, framework, mask, model_kwargs, tokenizer_kwargs, - model_class='BertModel') + tokenizer, framework, mask, model_kwargs, tokenizer_kwargs) def _postprocess(self, preds, tok, wds, ons, dur): preds = [p.numpy().squeeze() for p in preds] @@ -635,10 +638,10 @@ class BertLMExtractor(BertExtractor): ''' Use BERT for masked words prediction. Args: - pretrained_model (str): A string specifying which BERT - model to use. Can be one of pretrained BERT models listed at + pretrained_model (str): A string specifying which transformer + model to use. Can be any pretrained BERT or BERT-derived (ALBERT, + DistilBERT, RoBERTa, CamemBERT etc.) models listed at https://huggingface.co/transformers/pretrained_models.html - (valid values include all the models with 'bert' prefix) or path to custom model. tokenizer (str): Type of tokenization used in the tokenization step. 
If different from model, out-of-vocabulary tokens may be treated as @@ -705,7 +708,7 @@ def __init__(self, super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, tokenizer=tokenizer, framework=framework, model_kwargs=model_kwargs, tokenizer_kwargs=tokenizer_kwargs, mask=mask, - model_class='BertForMaskedLM') + model_class='AutoModelWithLMHead') def _mask_words(self, wds): if not type(self.mask) in [int, str]: From 6a4c8deeee4531c05d155c0ed5d80f0997d06e51 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 19 Mar 2020 09:26:49 +0100 Subject: [PATCH 30/89] try BertBase as metaclass --- pliers/extractors/text.py | 122 ++++++++++++++++++++++++++------------ 1 file changed, 85 insertions(+), 37 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 824ba417..ce797679 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -2,7 +2,7 @@ Extractors that operate primarily or exclusively on Text stimuli. ''' import sys - +from abc import ABCMeta, abstractmethod from pliers.stimuli.text import TextStim, ComplexTextStim from pliers.extractors.base import Extractor, ExtractorResult from pliers.support.exceptions import PliersError @@ -403,8 +403,7 @@ def _extract(self, stim): features=self.features, orders=order_list) -class BertExtractor(ComplexTextExtractor): - +class BertBaseExtractor(ComplexTextExtractor, metaclass=ABCMeta): ''' Base class for all Extractors based on pretrained BERT. This model returns the last hidden layer (without special tokens) @@ -425,7 +424,7 @@ class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel or RobertaForMaskedLM for RoBERTa). Fixed by each subclass. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. - return_metadata (bool): if True, the extractor returns encoded token + return_tokens (bool): if True, the extractor returns encoded token and encoded word as features. mask (str or int): if defined, specifies which word is to be replaced with [MASK] token either by its index (int) in the input sequence, @@ -463,14 +462,14 @@ def __init__(self, self.mask = mask self.model_kwargs = model_kwargs if model_kwargs else {} self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} - model = model_class if self.framework == 'pt' else 'TF' + model_class self.model = getattr(transformers, model).from_pretrained( pretrained_model, **self.model_kwargs) self.tokenizer = transformers.BertTokenizer.from_pretrained( tokenizer, **self.tokenizer_kwargs) - super(BertExtractor, self).__init__() + super(BertBaseExtractor, self).__init__() + @abstractmethod def _mask_words(self, wds): ''' Preprocessing step called by _preprocess method. Replaces word in the input sequence with [MASK] token. @@ -478,32 +477,13 @@ def _mask_words(self, wds): wds (list): list of words in the input sequence (i.e. the .text attribute of the input ComplexTextStim) ''' - if self.mask: - if type(self.mask) == int: - wds[self.mask] = '[MASK]' - elif type(self.mask) == str: - idx = np.where(np.array(wds)==self.mask)[0][0] - wds[idx] = '[MASK]' - else: - raise ValueError('Invalid mask argument') - return wds + pass def _preprocess(self, stims): ''' Extracts text, onset, duration from ComplexTextStim, masks target words (if relevant), tokenizes the input, and casts words, onsets, and durations to token-level lists. Called within _extract method - to prepare input for the model. Same for all subclasses. 
- Args: - stims (Stim): the ComplexTextStim input - Outputs: - tok (list): list of tokens (strings) - wds (list): list of words in stimulus sequence. Same length as tok, - needed to keep mapping between transformers sub-word tokens - and whole words. - ons (list): list of onsets (one per token) - dur (list): list of durations (one per token) - idx (list): index of each token in model vocabulary - + to prepare input for the model. ''' els = [(e.text, e.onset, e.duration) for e in stims.elements] wds, ons, dur = map(list, zip(*els)) @@ -523,7 +503,8 @@ def _extract(self, stims): return ExtractorResult(data, stims, self, features=feat, onsets=ons, durations=dur) - def _postprocess(self, preds, tok, wds, ons, dur): + @abstractmethod + def _postprocess(self): ''' Takes model output as input and processes it (subsets relevant information,transforms it where relevant, adds model metadata if requested). Fairly model-specific, therefore overridden by @@ -532,13 +513,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): preds (array): model output tok, wds, ons, dur, tok: see output of _preprocess method. ''' - out = preds[0][:, 1:-1, :].numpy().squeeze() - data = [out.tolist()] - feat = ['encoding'] - if self.return_tokens: - data += [tok, wds] - feat += ['token', 'word'] - return data, feat, ons, dur + pass def _to_df(self, result, include_attributes=True): res_dict = dict(zip(result.features, result._data)) @@ -550,8 +525,81 @@ def _to_df(self, result, include_attributes=True): res_df['object_id'] = range(res_df.shape[0]) return res_df +class BertEncodingExtractor(BertBaseExtractor): + + ''' Extractor returning encodings from Bert or Bert-derived (ALBERT, + DistilBERT, RoBERTa, CamemBERT, etc.) encodings from the last + hidden layer (excludes special tokens). + + Args: + pretrained_model (str): A string specifying which transformer + model to use. Can be any pretrained BERT or BERT-derived (ALBERT, + DistilBERT, RoBERTa, CamemBERT etc.) models listed at + https://huggingface.co/transformers/pretrained_models.html + or path to custom model. + tokenizer (str): Type of tokenization used in the tokenization step. + If different from model, out-of-vocabulary tokens may be treated + as unknown tokens. + model_class (str): Specifies model type. Must be one of 'AutoModel' + (encoding extractor) or 'AutoModelWithLMHead' (language model). + These are generic model classes, which use the value of + pretrained_model to infer the model-specific transformers + class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel + or RobertaForMaskedLM for RoBERTa). Fixed by each subclass. + framework (str): name deep learning framework to use. Must be 'pt' + (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. + return_tokens (bool): if True, the extractor returns encoded token + and encoded word as features. + mask (str or int): if defined, specifies which word is to be replaced + with [MASK] token either by its index (int) in the input sequence, + or by specifying the word itself (str). + model_kwargs (dict): Named arguments for transformer model. + See https://huggingface.co/transformers/main_classes/model.html + tokenizer_kwargs (dict): Named arguments for tokenizer. 
+ See https://huggingface.co/transformers/main_classes/tokenizer.html + ''' + + _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', + 'model_class', 'model_kwargs', 'tokenizer_kwargs') + _model_attributes = ('pretrained_model', 'framework', 'model_class', + 'tokenizer_type') + + def _mask_words(self, wds): + ''' Preprocessing step called by _preprocess method. Replaces word + in the input sequence with [MASK] token. + Args: + wds (list): list of words in the input sequence (i.e. the .text + attribute of the input ComplexTextStim) + ''' + if self.mask: + if type(self.mask) == int: + wds[self.mask] = '[MASK]' + elif type(self.mask) == str: + idx = np.where(np.array(wds)==self.mask)[0][0] + wds[idx] = '[MASK]' + else: + raise ValueError('Invalid mask argument') + return wds + + def _postprocess(self, preds, tok, wds, ons, dur): + ''' Takes model output as input and processes it (subsets relevant + information,transforms it where relevant, adds model metadata + if requested). Fairly model-specific, therefore overridden by + each subclass. + Args: + preds (array): model output + tok, wds, ons, dur, tok: see output of _preprocess method. + ''' + out = preds[0][:, 1:-1, :].numpy().squeeze() + data = [out.tolist()] + feat = ['encoding'] + if self.return_tokens: + data += [tok, wds] + feat += ['token', 'word'] + return data, feat, ons, dur + -class BertSequenceEncodingExtractor(BertExtractor): +class BertSequenceEncodingExtractor(BertBaseExtractor): ''' Extract contextualized encodings for words or sequences using pretrained BertModel. @@ -633,7 +681,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): return data, feat, ons, dur -class BertLMExtractor(BertExtractor): +class BertLMExtractor(BertBaseExtractor): ''' Use BERT for masked words prediction. From 98a5fa0f65d6204070248b6cddf9ff910866188d Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 19 Mar 2020 09:38:45 +0100 Subject: [PATCH 31/89] simplify children classes --- pliers/extractors/text.py | 86 +++++++-------------------------------- 1 file changed, 15 insertions(+), 71 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index ce797679..8a6872d4 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -449,7 +449,6 @@ def __init__(self, model_kwargs=None, tokenizer_kwargs=None, mask=None): - verify_dependencies(['transformers']) if framework not in ['pt', 'tf']: raise(ValueError('''Invalid framework; @@ -471,11 +470,9 @@ def __init__(self, @abstractmethod def _mask_words(self, wds): - ''' Preprocessing step called by _preprocess method. Replaces word - in the input sequence with [MASK] token. - Args: - wds (list): list of words in the input sequence (i.e. the .text - attribute of the input ComplexTextStim) + ''' Called by _preprocess method. Replaces word with [MASK] token. + Takes list of words in the Stim as input (i.e. 
the .text attribute + for each TextStim in the ComplexTextStim) ''' pass @@ -496,6 +493,8 @@ def _preprocess(self, stims): return wds, ons, dur, tok, idx def _extract(self, stims): + ''' Takes stim as input, preprocesses it, feeds it to Bert model, + then postprocesses the output ''' wds, ons, dur, tok, idx = self._preprocess(stims) preds = self.model(idx) preds = [p.detach() if self.framework == 'pt' else p for p in preds] @@ -504,14 +503,11 @@ def _extract(self, stims): durations=dur) @abstractmethod - def _postprocess(self): - ''' Takes model output as input and processes it (subsets relevant - information,transforms it where relevant, adds model metadata - if requested). Fairly model-specific, therefore overridden by - each subclass. - Args: - preds (array): model output - tok, wds, ons, dur, tok: see output of _preprocess method. + def _postprocess(self, preds, tok, wds, ons, dur): + ''' Postprocesses model output (subsets relevant information, + transforms it where relevant, adds model metadata). + Takes prediction array, token list, word list, onsets + and durations and input. ''' pass @@ -526,51 +522,10 @@ def _to_df(self, result, include_attributes=True): return res_df class BertEncodingExtractor(BertBaseExtractor): - - ''' Extractor returning encodings from Bert or Bert-derived (ALBERT, - DistilBERT, RoBERTa, CamemBERT, etc.) encodings from the last - hidden layer (excludes special tokens). - - Args: - pretrained_model (str): A string specifying which transformer - model to use. Can be any pretrained BERT or BERT-derived (ALBERT, - DistilBERT, RoBERTa, CamemBERT etc.) models listed at - https://huggingface.co/transformers/pretrained_models.html - or path to custom model. - tokenizer (str): Type of tokenization used in the tokenization step. - If different from model, out-of-vocabulary tokens may be treated - as unknown tokens. - model_class (str): Specifies model type. Must be one of 'AutoModel' - (encoding extractor) or 'AutoModelWithLMHead' (language model). - These are generic model classes, which use the value of - pretrained_model to infer the model-specific transformers - class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel - or RobertaForMaskedLM for RoBERTa). Fixed by each subclass. - framework (str): name deep learning framework to use. Must be 'pt' - (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. - return_tokens (bool): if True, the extractor returns encoded token - and encoded word as features. - mask (str or int): if defined, specifies which word is to be replaced - with [MASK] token either by its index (int) in the input sequence, - or by specifying the word itself (str). - model_kwargs (dict): Named arguments for transformer model. - See https://huggingface.co/transformers/main_classes/model.html - tokenizer_kwargs (dict): Named arguments for tokenizer. - See https://huggingface.co/transformers/main_classes/tokenizer.html + ''' Returns from Bert or Bert-derived (ALBERT, DistilBERT, RoBERTa, + CamemBERT) encodings from the last hidden layer (excludes special tokens). ''' - - _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'model_class', 'model_kwargs', 'tokenizer_kwargs') - _model_attributes = ('pretrained_model', 'framework', 'model_class', - 'tokenizer_type') - def _mask_words(self, wds): - ''' Preprocessing step called by _preprocess method. Replaces word - in the input sequence with [MASK] token. - Args: - wds (list): list of words in the input sequence (i.e. 
the .text - attribute of the input ComplexTextStim) - ''' if self.mask: if type(self.mask) == int: wds[self.mask] = '[MASK]' @@ -582,14 +537,7 @@ def _mask_words(self, wds): return wds def _postprocess(self, preds, tok, wds, ons, dur): - ''' Takes model output as input and processes it (subsets relevant - information,transforms it where relevant, adds model metadata - if requested). Fairly model-specific, therefore overridden by - each subclass. - Args: - preds (array): model output - tok, wds, ons, dur, tok: see output of _preprocess method. - ''' + ' Only returns encoding for tokens (excludes special tokens) ' out = preds[0][:, 1:-1, :].numpy().squeeze() data = [out.tolist()] feat = ['encoding'] @@ -600,7 +548,6 @@ def _postprocess(self, preds, tok, wds, ons, dur): class BertSequenceEncodingExtractor(BertBaseExtractor): - ''' Extract contextualized encodings for words or sequences using pretrained BertModel. Args: @@ -634,8 +581,7 @@ class BertSequenceEncodingExtractor(BertBaseExtractor): _model_attributes = ('pretrained_model', 'framework', 'model_class', 'pooling', 'return_sep', 'tokenizer_type') - def __init__(self, - pretrained_model='bert-base-uncased', + def __init__(self, pretrained_model='bert-base-uncased', tokenizer='bert-base-uncased', framework='pt', pooling=None, @@ -682,9 +628,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): class BertLMExtractor(BertBaseExtractor): - - ''' Use BERT for masked words prediction. - + ''' Returns masked words predictions for BERT (or BERT-derived) models. Args: pretrained_model (str): A string specifying which transformer model to use. Can be any pretrained BERT or BERT-derived (ALBERT, From 835726a4052d46feb90f9a6fda44fa8d5ae2a1ae Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 19 Mar 2020 10:30:20 +0100 Subject: [PATCH 32/89] add update_mask method and remove mask from nonLM extractors --- pliers/extractors/text.py | 58 ++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 8a6872d4..2cab2a5b 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -445,10 +445,8 @@ def __init__(self, tokenizer='bert-base-uncased', framework='pt', model_class='AutoModel', - return_tokens=False, model_kwargs=None, - tokenizer_kwargs=None, - mask=None): + tokenizer_kwargs=None): verify_dependencies(['transformers']) if framework not in ['pt', 'tf']: raise(ValueError('''Invalid framework; @@ -457,8 +455,6 @@ def __init__(self, self.tokenizer_type = tokenizer self.model_class = model_class self.framework = framework - self.return_tokens = return_tokens - self.mask = mask self.model_kwargs = model_kwargs if model_kwargs else {} self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} model = model_class if self.framework == 'pt' else 'TF' + model_class @@ -468,13 +464,14 @@ def __init__(self, tokenizer, **self.tokenizer_kwargs) super(BertBaseExtractor, self).__init__() - @abstractmethod def _mask_words(self, wds): - ''' Called by _preprocess method. Replaces word with [MASK] token. - Takes list of words in the Stim as input (i.e. the .text attribute - for each TextStim in the ComplexTextStim) + ''' Called by _preprocess method. Takes list of words in the Stim as + input (i.e. the .text attribute for each TextStim in the + ComplexTextStim). 
If class has mask attribute, replaces word in + the input sequence with [MASK] token based on the value of mask + (either index in the sequence, or word to replace) ''' - pass + return wds def _preprocess(self, stims): ''' Extracts text, onset, duration from ComplexTextStim, masks target @@ -521,20 +518,22 @@ def _to_df(self, result, include_attributes=True): res_df['object_id'] = range(res_df.shape[0]) return res_df + class BertEncodingExtractor(BertBaseExtractor): ''' Returns from Bert or Bert-derived (ALBERT, DistilBERT, RoBERTa, CamemBERT) encodings from the last hidden layer (excludes special tokens). ''' - def _mask_words(self, wds): - if self.mask: - if type(self.mask) == int: - wds[self.mask] = '[MASK]' - elif type(self.mask) == str: - idx = np.where(np.array(wds)==self.mask)[0][0] - wds[idx] = '[MASK]' - else: - raise ValueError('Invalid mask argument') - return wds + def __init__(self, + pretrained_model='bert-base-uncased', + tokenizer='bert-base-uncased', + framework='pt', + return_tokens=None, + model_kwargs=None, + tokenizer_kwargs=None): + super(BertEncodingExtractor, self).__init__(pretrained_model, + tokenizer, framework, 'AutoModelWithLMHead', model_kwargs, + tokenizer_kwargs) + self.return_tokens = return_tokens def _postprocess(self, preds, tok, wds, ons, dur): ' Only returns encoding for tokens (excludes special tokens) ' @@ -587,7 +586,6 @@ def __init__(self, pretrained_model='bert-base-uncased', pooling=None, return_sep=False, return_sequence=False, - mask=None, model_kwargs=None, tokenizer_kwargs=None): if pooling: @@ -601,9 +599,9 @@ def __init__(self, pretrained_model='bert-base-uncased', self.pooling = pooling self.return_sep = return_sep self.return_sequence = return_sequence - super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, - tokenizer, framework, mask, model_kwargs, tokenizer_kwargs) - + super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, + tokenizer, framework, 'AutoModel', model_kwargs, tokenizer_kwargs) + def _postprocess(self, preds, tok, wds, ons, dur): preds = [p.numpy().squeeze() for p in preds] tok = [' '.join(wds)] @@ -683,6 +681,8 @@ def __init__(self, if any([top_n and target, top_n and threshold, threshold and target]): raise ValueError('top_n, threshold and target arguments ' 'are mutually exclusive') + if type(mask) not in [int, str]: + raise ValueError('Mask must be a string or an integer.') self.top_n = top_n self.threshold = threshold self.target = listify(target) @@ -699,12 +699,14 @@ def __init__(self, self.return_masked_word = return_masked_word super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, tokenizer=tokenizer, framework=framework, model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, mask=mask, - model_class='AutoModelWithLMHead') + tokenizer_kwargs=tokenizer_kwargs, model_class='AutoModelWithLMHead') + def update_mask(self, new_mask): + if type(new_mask) not in [str, int]: + raise ValueError('Mask must be an integer or a string') + self.mask = new_mask + def _mask_words(self, wds): - if not type(self.mask) in [int, str]: - raise ValueError('mask argument must be an integer or a string') mwds = wds.copy() self.mask_token = self.mask if type(self.mask) == str else mwds[self.mask] self.mask_pos = np.where(np.array(mwds)==self.mask_token)[0][0] From 031cbd10b3aab83cbf86ab50df2001053977b01f Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 19 Mar 2020 11:11:40 +0100 Subject: [PATCH 33/89] add prototype sentiment extractor --- pliers/extractors/text.py | 55 
+++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 2cab2a5b..3d9b42d0 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -744,6 +744,61 @@ def _return_masked_word(self, preds, feat, data): return feat, data +class BertSentimentExtractor(BertBaseExtractor): + ''' Extracts sentiment for sequences using Bert or Bert-derived models + fine-tuned for sentiment classification. + Args: + pretrained_model (str): A string specifying which transformer + model to use (must be one fine-tuned for sentiment classification) + tokenizer (str): Type of tokenization used in the tokenization step. + framework (str): name deep learning framework to use. Must be 'pt' + (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. + return_softmax (bool): If True, the extractor returns softmaxed + sentiment scores instead of raw model predictions. + return_sequence (bool): If True, the extractor returns an additional + feature column with the encoded sequence. + model_kwargs (dict): Named arguments for pretrained model. + tokenizer_kwargs (dict): Named arguments for tokenizer. + ''' + + _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', + 'return_softmax', 'return_sequence', 'model_class', 'model_kwargs', + 'tokenizer_kwargs') + _model_attributes = ('pretrained_model', 'framework', 'tokenizer_type', + 'return_sequence', 'return_softmax',) + + def __init__(self, + pretrained_model='distilbert-base-uncased-finetuned-sst-2-english', + tokenizer='distilbert-base-uncased', + framework='pt', + return_softmax=True, + return_sequence=True, + model_kwargs=None, + tokenizer_kwargs=None): + self.return_sequence = return_sequence + self.return_softmax = return_softmax + super(BertSentimentExtractor, self).__init__(pretrained_model, + tokenizer, framework, 'AutoModelForSequenceClassification', + model_kwargs, tokenizer_kwargs) + + def _postprocess(self, preds, tok, wds, ons, dur): + data = preds[0].numpy().squeeze() + if self.return_softmax: + data = scipy.special.softmax(data) + data = data.tolist() + tok = [' '.join(wds)] + try: + dur = ons[-1] + dur[-1] - ons[0] + except: + dur = None + ons = ons[0] + feat = ['sent_pos', 'sent_neg'] + if self.return_sequence: + data += [tok] + feat += ['sequence'] + return data, feat, ons, dur + + class WordCounterExtractor(ComplexTextExtractor): ''' Extracts number of times each unique word has occurred within text From e577149fa85e0880faf8c08c5c6f2652ca4c0acd Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 24 Mar 2020 11:28:17 +0100 Subject: [PATCH 34/89] restore class hierarchy --- pliers/extractors/text.py | 73 +++++++------------ .../tests/extractors/test_text_extractors.py | 5 ++ 2 files changed, 31 insertions(+), 47 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 3d9b42d0..d616b9dd 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -403,10 +403,10 @@ def _extract(self, stim): features=self.features, orders=order_list) -class BertBaseExtractor(ComplexTextExtractor, metaclass=ABCMeta): - ''' Base class for all Extractors based on pretrained BERT. - This model returns the last hidden layer (without special tokens) - +class BertExtractor(ComplexTextExtractor): + ''' Returns encodings from the last hidden layer of a Bert or Bert-derived + model (ALBERT, DistilBERT, RoBERTa, CamemBERT). Excludes special tokens. + Base class for other Bert extractors. 
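For illustration, typical usage of this base class (mirroring the tests added later in this series) might look like the following sketch; encoding values depend on the pretrained weights:

    from pliers.stimuli.text import ComplexTextStim
    from pliers.extractors import BertExtractor

    stim = ComplexTextStim(text='This is not a tokenized sentence.')
    ext = BertExtractor(pretrained_model='bert-base-uncased')
    df = ext.transform(stim).to_df()
    # one row per word-piece token; each 'encoding' cell holds a 768-dimensional vector
    assert len(df['encoding'][0]) == 768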
Args: pretrained_model (str): A string specifying which transformer model to use. Can be any pretrained BERT or BERT-derived (ALBERT, @@ -426,9 +426,6 @@ class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. return_tokens (bool): if True, the extractor returns encoded token and encoded word as features. - mask (str or int): if defined, specifies which word is to be replaced - with [MASK] token either by its index (int) in the input sequence, - or by specifying the word itself (str). model_kwargs (dict): Named arguments for transformer model. See https://huggingface.co/transformers/main_classes/model.html tokenizer_kwargs (dict): Named arguments for tokenizer. @@ -443,8 +440,9 @@ class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel def __init__(self, pretrained_model='bert-base-uncased', tokenizer='bert-base-uncased', - framework='pt', model_class='AutoModel', + framework='pt', + return_tokens=False, model_kwargs=None, tokenizer_kwargs=None): verify_dependencies(['transformers']) @@ -455,6 +453,7 @@ def __init__(self, self.tokenizer_type = tokenizer self.model_class = model_class self.framework = framework + self.return_tokens=return_tokens self.model_kwargs = model_kwargs if model_kwargs else {} self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} model = model_class if self.framework == 'pt' else 'TF' + model_class @@ -462,14 +461,15 @@ def __init__(self, pretrained_model, **self.model_kwargs) self.tokenizer = transformers.BertTokenizer.from_pretrained( tokenizer, **self.tokenizer_kwargs) - super(BertBaseExtractor, self).__init__() + super(BertExtractor, self).__init__() def _mask_words(self, wds): ''' Called by _preprocess method. Takes list of words in the Stim as input (i.e. the .text attribute for each TextStim in the ComplexTextStim). If class has mask attribute, replaces word in the input sequence with [MASK] token based on the value of mask - (either index in the sequence, or word to replace) + (either index in the sequence, or word to replace). Here, returns + list of words (without masking) ''' return wds @@ -477,8 +477,7 @@ def _preprocess(self, stims): ''' Extracts text, onset, duration from ComplexTextStim, masks target words (if relevant), tokenizes the input, and casts words, onsets, and durations to token-level lists. Called within _extract method - to prepare input for the model. - ''' + to prepare input for the model. ''' els = [(e.text, e.onset, e.duration) for e in stims.elements] wds, ons, dur = map(list, zip(*els)) tok = [self.tokenizer.tokenize(w) for w in self._mask_words(wds)] @@ -491,7 +490,7 @@ def _preprocess(self, stims): def _extract(self, stims): ''' Takes stim as input, preprocesses it, feeds it to Bert model, - then postprocesses the output ''' + then postprocesses the output ''' wds, ons, dur, tok, idx = self._preprocess(stims) preds = self.model(idx) preds = [p.detach() if self.framework == 'pt' else p for p in preds] @@ -499,16 +498,23 @@ def _extract(self, stims): return ExtractorResult(data, stims, self, features=feat, onsets=ons, durations=dur) - @abstractmethod def _postprocess(self, preds, tok, wds, ons, dur): ''' Postprocesses model output (subsets relevant information, transforms it where relevant, adds model metadata). Takes prediction array, token list, word list, onsets - and durations and input. + and durations and input. Here, returns token-level encodings + (excluding special tokens). 
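As a rough illustration of the slicing used in the body below: the hidden-state tensor returned by the model has shape (batch, n_tokens + 2, hidden_size), because the tokenizer wraps the input in [CLS] ... [SEP]; dropping the first and last positions therefore keeps only the word-piece tokens (variable names here are illustrative):

    # hidden_states: (1, n_tokens + 2, hidden_size)
    token_encodings = hidden_states[:, 1:-1, :]    # drop [CLS] and [SEP]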
''' - pass - + out = preds[0][:, 1:-1, :].numpy().squeeze() + data = [out.tolist()] + feat = ['encoding'] + if self.return_tokens: + data += [tok, wds] + feat += ['token', 'word'] + return data, feat, ons, dur + def _to_df(self, result, include_attributes=True): + pass res_dict = dict(zip(result.features, result._data)) if include_attributes: log_dict = {attr: getattr(result.extractor, attr) for @@ -519,34 +525,7 @@ def _to_df(self, result, include_attributes=True): return res_df -class BertEncodingExtractor(BertBaseExtractor): - ''' Returns from Bert or Bert-derived (ALBERT, DistilBERT, RoBERTa, - CamemBERT) encodings from the last hidden layer (excludes special tokens). - ''' - def __init__(self, - pretrained_model='bert-base-uncased', - tokenizer='bert-base-uncased', - framework='pt', - return_tokens=None, - model_kwargs=None, - tokenizer_kwargs=None): - super(BertEncodingExtractor, self).__init__(pretrained_model, - tokenizer, framework, 'AutoModelWithLMHead', model_kwargs, - tokenizer_kwargs) - self.return_tokens = return_tokens - - def _postprocess(self, preds, tok, wds, ons, dur): - ' Only returns encoding for tokens (excludes special tokens) ' - out = preds[0][:, 1:-1, :].numpy().squeeze() - data = [out.tolist()] - feat = ['encoding'] - if self.return_tokens: - data += [tok, wds] - feat += ['token', 'word'] - return data, feat, ons, dur - - -class BertSequenceEncodingExtractor(BertBaseExtractor): +class BertSequenceEncodingExtractor(BertExtractor): ''' Extract contextualized encodings for words or sequences using pretrained BertModel. Args: @@ -625,7 +604,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): return data, feat, ons, dur -class BertLMExtractor(BertBaseExtractor): +class BertLMExtractor(BertExtractor): ''' Returns masked words predictions for BERT (or BERT-derived) models. Args: pretrained_model (str): A string specifying which transformer @@ -744,7 +723,7 @@ def _return_masked_word(self, preds, feat, data): return feat, data -class BertSentimentExtractor(BertBaseExtractor): +class BertSentimentExtractor(BertExtractor): ''' Extracts sentiment for sequences using Bert or Bert-derived models fine-tuned for sentiment classification. 
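For illustration, a minimal call to this extractor (using the defaults of the prototype added earlier in this series) could look roughly as follows; exact scores depend on the fine-tuned checkpoint:

    from pliers.stimuli.text import ComplexTextStim
    from pliers.extractors import BertSentimentExtractor

    stim = ComplexTextStim(text='What a wonderful day')
    ext = BertSentimentExtractor()   # distilbert-base-uncased-finetuned-sst-2-english by default
    df = ext.transform(stim).to_df()
    # one row per sequence; 'sent_pos' and 'sent_neg' are softmaxed scores by default
    print(df[['sent_pos', 'sent_neg']])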
Args: diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 49bf7897..22dc1533 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -318,7 +318,12 @@ def test_pretrained_bert_large_extractor(): res = ext.transform(stim).to_df() assert len(res['encoding'][0]) == 1024 +def test_bert_sequence_extract(): +def test_bert_LM_extractor(): + +def test_bert_sentiment_extractor(): + def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' ' again and again Sometimes they are ' From 1b9c4aea8ae13db9aec48087e4a8b8d33fb99ccb Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 24 Mar 2020 14:17:13 +0100 Subject: [PATCH 35/89] checkpoint --- pliers/extractors/__init__.py | 4 ++- pliers/extractors/text.py | 32 +++++++++++-------- .../tests/extractors/test_text_extractors.py | 5 +-- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/pliers/extractors/__init__.py b/pliers/extractors/__init__.py index dd82db8e..64d73fac 100644 --- a/pliers/extractors/__init__.py +++ b/pliers/extractors/__init__.py @@ -64,7 +64,8 @@ WordEmbeddingExtractor, TextVectorizerExtractor, VADERSentimentExtractor, SpaCyExtractor, WordCounterExtractor, BertExtractor, - BertSequenceEncodingExtractor, BertLMExtractor) + BertSequenceEncodingExtractor, BertLMExtractor, + BertSentimentExtractor) from .video import (FarnebackOpticalFlowExtractor) __all__ = [ @@ -142,5 +143,6 @@ 'BertExtractor', 'BertSequenceEncodingExtractor', 'BertLMExtractor', + 'BertSentimentExtractor', 'WordCounterExtractor' ] diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index d616b9dd..22553b1b 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -514,7 +514,6 @@ def _postprocess(self, preds, tok, wds, ons, dur): return data, feat, ons, dur def _to_df(self, result, include_attributes=True): - pass res_dict = dict(zip(result.features, result._data)) if include_attributes: log_dict = {attr: getattr(result.extractor, attr) for @@ -578,11 +577,14 @@ def __init__(self, pretrained_model='bert-base-uncased', self.pooling = pooling self.return_sep = return_sep self.return_sequence = return_sequence - super(BertSequenceEncodingExtractor, self).__init__(pretrained_model, - tokenizer, framework, 'AutoModel', model_kwargs, tokenizer_kwargs) + super(BertSequenceEncodingExtractor, self).__init__( + pretrained_model=pretrained_model, tokenizer=tokenizer, + model_class='AutoModel', framework=framework, model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs) def _postprocess(self, preds, tok, wds, ons, dur): preds = [p.numpy().squeeze() for p in preds] + self.preds = preds tok = [' '.join(wds)] try: dur = ons[-1] + dur[-1] - ons[0] @@ -590,13 +592,13 @@ def _postprocess(self, preds, tok, wds, ons, dur): dur = None ons = ons[0] if self.return_sep: - out = preds[0][:,-1,:] + out = preds[0][-1,:] elif self.pooling: pool_func = getattr(np, self.pooling) - out = pool_func(preds[0][:, 1:-1, :], axis=1, keepdims=True) + out = pool_func(preds[0][1:-1, :], axis=1, keepdims=True) else: out = preds[1] - data = [out.tolist()] + data = [[out.tolist()]] feat = ['encoding'] if self.return_sequence: data += [tok] @@ -650,7 +652,7 @@ def __init__(self, tokenizer='bert-base-uncased', framework='pt', mask='[MASK]', - top_n=100, + top_n=None, threshold=None, target=None, return_softmax=False, @@ -662,23 +664,25 @@ def __init__(self, 'are mutually exclusive') if 
type(mask) not in [int, str]: raise ValueError('Mask must be a string or an integer.') - self.top_n = top_n - self.threshold = threshold + super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, + tokenizer=tokenizer, framework=framework, model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, model_class='AutoModelWithLMHead') self.target = listify(target) if self.target: missing = set(self.target) - set(self.tokenizer.vocab.keys()) if missing: logging.warning(f'{missing} is not in vocabulary. Dropping.') - self.target = set(self.target) & set(self.tokenizer.vocab.keys()) + present = set(self.target) & set(self.tokenizer.vocab.keys()) + self.target = list(present) if self.target == []: raise ValueError('No valid target token. Import transformers' ' and run transformers.BertTokenizer.from_pretrained' f'(\'{tokenizer}\').vocab.keys() to see available tokens') + self.mask = mask + self.top_n = top_n + self.threshold = threshold self.return_softmax = return_softmax self.return_masked_word = return_masked_word - super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, - tokenizer=tokenizer, framework=framework, model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, model_class='AutoModelWithLMHead') def update_mask(self, new_mask): if type(new_mask) not in [str, int]: @@ -708,6 +712,8 @@ def _postprocess(self, preds, tok, wds, ons, dur): data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] if self.return_masked_word: feat, data = self._return_masked_word(preds, feat, data) + if len(self.target) > 1: + self.target = [self.target] ons, dur = map(lambda x: listify(x[self.mask_pos]), [ons, dur]) return data, feat, ons, dur diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 22dc1533..4b9fe062 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -313,8 +313,9 @@ def test_pretrained_bert_encoding_extractor(): reason='model too large') def test_pretrained_bert_large_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') - ext = PretrainedBertEncodingExtractor(pretrained_model_or_path='bert-large-uncased', - tokenizer='bert-large-uncased') + ext = PretrainedBertEncodingExtractor( + pretrained_model_or_path='bert-large-uncased', + tokenizer='bert-large-uncased') res = ext.transform(stim).to_df() assert len(res['encoding'][0]) == 1024 From 15e09e4b86e4da6ca05f9002f4a909daa4420f6e Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 25 Mar 2020 14:55:05 +0100 Subject: [PATCH 36/89] added full test suite for token-level encoding extractor --- pliers/extractors/text.py | 13 ++- .../tests/extractors/test_text_extractors.py | 80 +++++++++---------- 2 files changed, 49 insertions(+), 44 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 22553b1b..16059c30 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -433,7 +433,7 @@ class (e.g. 
BertModel or BertForMaskedLM for BERT, RobertaModel ''' _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'model_class', 'model_kwargs', 'tokenizer_kwargs') + 'model_class', 'return_tokens', 'model_kwargs', 'tokenizer_kwargs') _model_attributes = ('pretrained_model', 'framework', 'model_class', 'tokenizer_type') @@ -553,8 +553,8 @@ class BertSequenceEncodingExtractor(BertExtractor): ''' _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'pooling', 'return_sep', 'model_class', 'model_kwargs', - 'tokenizer_kwargs') + 'pooling', 'return_sep', 'return_sequence', 'model_class', + 'model_kwargs', 'tokenizer_kwargs') _model_attributes = ('pretrained_model', 'framework', 'model_class', 'pooling', 'return_sep', 'tokenizer_type') @@ -643,7 +643,7 @@ class BertLMExtractor(BertExtractor): ''' _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', - 'tokenizer_type', 'return_softmax') + 'tokenizer_type', 'return_softmax', 'return_masked_word') _model_attributes = ('pretrained_model', 'framework', 'top_n', 'mask', 'target', 'threshold', 'tokenizer_type') @@ -657,6 +657,7 @@ def __init__(self, target=None, return_softmax=False, return_masked_word=False, + return_sequence=False, model_kwargs=None, tokenizer_kwargs=None): if any([top_n and target, top_n and threshold, threshold and target]): @@ -683,6 +684,7 @@ def __init__(self, self.threshold = threshold self.return_softmax = return_softmax self.return_masked_word = return_masked_word + self.return_sequence = return_sequence def update_mask(self, new_mask): if type(new_mask) not in [str, int]: @@ -712,6 +714,9 @@ def _postprocess(self, preds, tok, wds, ons, dur): data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] if self.return_masked_word: feat, data = self._return_masked_word(preds, feat, data) + if self.return_sequence: + data += [' '.join(wds)] + feat += ['sequence'] if len(self.target) > 1: self.target = [self.target] ons, dur = map(lambda x: listify(x[self.mask_pos]), [ons, dur]) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 4b9fe062..01215cd8 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -7,7 +7,7 @@ WordEmbeddingExtractor, VADERSentimentExtractor, SpaCyExtractor, - PretrainedBertEncodingExtractor, + BertExtractor, WordCounterExtractor) from pliers.extractors.base import merge_results @@ -265,59 +265,59 @@ def test_spacy_doc_extractor(): assert result['is_sentenced'][3] -def test_pretrained_bert_encoding_extractor(): +def test_bert_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) + stim_cam = ComplexTextStim(text='ceci n\'est pas un pipe') - ext_base = PretrainedBertEncodingExtractor(pretrained_model_or_path='bert-base-uncased') - ext_base_tf = PretrainedBertEncodingExtractor(framework='tf') - ext_sequence = PretrainedBertEncodingExtractor(encoding_level='sequence') - ext_sequence_pooling = PretrainedBertEncodingExtractor(encoding_level='sequence', pooling='mean') + ext_base = BertExtractor(pretrained_model='bert-base-uncased') + ext_large = BertExtractor(pretrained_model='bert-large-uncased') + ext_tf = BertExtractor(pretrained_model='bert-base-uncased', framework='tf') + ext_base_token = BertExtractor(pretrained_model='bert-base-uncased', + return_tokens=True) + ext_distilbert = BertExtractor(pretrained_model='distilbert-base-uncased') + 
ext_roberta = BertExtractor(pretrained_model='roberta-base') + ext_camembert = BertExtractor(pretrained_model='camembert-base') - res = ext_base.transform(stim).to_df(metadata=True) - res_file = ext_base.transform(stim_file).to_df(metadata=True) - res_base_tf = ext_base_tf.transform(stim).to_df(metadata=True) - res_sequence = ext_sequence.transform(stim).to_df(metadata=True) - res_sequence_pooling = ext_sequence_pooling.transform(stim).to_df(metadata=True) + base_result = ext_base.transform(stim) + res = base_result.to_df() + res_model_attr = base_result.to_df(include_attributes=True) + res_large = ext_large.transform(stim).to_df() + res_tf = ext_tf.transform(stim).to_df() + res_token = ext_base_token.transform(stim).to_df() + res_file = ext_base.transform(stim_file).to_df() + res_distilbert = ext_base.transform(stim).to_df() + res_roberta = ext_base.transform(stim).to_df() + res_camembert = ext_base.transform(stim_cam).to_df() + # Test encoding shape assert len(res['encoding'][0]) == 768 + assert len(res_large['encoding'][0]) == 1024 + assert len(res_file['encoding'][0]) == 768 + assert len(res_distilbert['encoding'][0]) == 768 + assert len(res_roberta['encoding'][0]) == 768 + assert len(res_camembert['encoding'][0]) == 768 + + # test base extractor assert res.shape[0] == 8 - assert res['token'][5] == '##ized' - assert res['word'][5] == 'tokenized' - assert res['object_id'][5] == 5 - - assert len(res_base_tf['encoding'][0]) == 768 - assert all(np.round(res_base_tf['encoding'][0],3) == np.round(res['encoding'][0],3)) + assert res_token.shape[0] == 8 + assert res_token['token'][5] == '##ized' + assert res_token['word'][5] == 'tokenized' + assert res_token['object_id'][5] == 5 + # test base extractor on file assert res_file.shape[0] == 8 assert res_file['onset'][3] == 1.3 assert res_file['duration'][5] == 0.5 - assert res_file['duration'][5] == 0.5 - assert res_file['token'][5] == 'transform' - assert res_file['word'][5] == 'transformer' assert res_file['object_id'][5] == 5 - - assert res_sequence.shape[0] == 1 - assert len(res_sequence['encoding'][0]) == 768 - assert res_sequence_pooling.shape[0] == 1 - assert res_sequence_pooling['pooling'][0] == 'mean' - assert res_sequence['encoding'][0] != res_sequence_pooling['encoding'][0] - assert res_sequence['token'][0] == 'This is not a tokenized sentence .' 
- assert res_sequence['word'][0] == 'None' - del ext_base, ext_base_tf, ext_sequence, ext_sequence_pooling - del res, res_file, res_base_tf, res_sequence, res_sequence_pooling + # test tensorflow vs torch + cors = [np.corrcoef(res['encoding'][i], res_tf['encoding'][i])[0,1] + for i in range(res.shape[0])] + assert all(np.isclose(cors, 1)) - -@pytest.mark.skipif(environ.get('TRAVIS', False) == 'true', - reason='model too large') -def test_pretrained_bert_large_extractor(): - stim = ComplexTextStim(text='This is not a tokenized sentence.') - ext = PretrainedBertEncodingExtractor( - pretrained_model_or_path='bert-large-uncased', - tokenizer='bert-large-uncased') - res = ext.transform(stim).to_df() - assert len(res['encoding'][0]) == 1024 + # test model attributes + assert all([a in res_model_attr.columns for a in ext_base._model_attributes]) def test_bert_sequence_extract(): From 15a54468cfdd6fe10287e0b1d6180e9b5f49fb7c Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 25 Mar 2020 16:52:41 +0100 Subject: [PATCH 37/89] added BertSequenceEncoding test suit --- pliers/extractors/text.py | 53 ++++++------ .../tests/extractors/test_text_extractors.py | 83 +++++++++++++++++-- 2 files changed, 104 insertions(+), 32 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 16059c30..e3b0497b 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -424,7 +424,7 @@ class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel or RobertaForMaskedLM for RoBERTa). Fixed by each subclass. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. - return_tokens (bool): if True, the extractor returns encoded token + return_input (bool): if True, the extractor returns encoded token and encoded word as features. model_kwargs (dict): Named arguments for transformer model. See https://huggingface.co/transformers/main_classes/model.html @@ -433,7 +433,7 @@ class (e.g. BertModel or BertForMaskedLM for BERT, RobertaModel ''' _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'model_class', 'return_tokens', 'model_kwargs', 'tokenizer_kwargs') + 'model_class', 'return_input', 'model_kwargs', 'tokenizer_kwargs') _model_attributes = ('pretrained_model', 'framework', 'model_class', 'tokenizer_type') @@ -442,7 +442,7 @@ def __init__(self, tokenizer='bert-base-uncased', model_class='AutoModel', framework='pt', - return_tokens=False, + return_input=False, model_kwargs=None, tokenizer_kwargs=None): verify_dependencies(['transformers']) @@ -453,7 +453,7 @@ def __init__(self, self.tokenizer_type = tokenizer self.model_class = model_class self.framework = framework - self.return_tokens=return_tokens + self.return_input=return_input self.model_kwargs = model_kwargs if model_kwargs else {} self.tokenizer_kwargs = tokenizer_kwargs if tokenizer_kwargs else {} model = model_class if self.framework == 'pt' else 'TF' + model_class @@ -508,7 +508,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): out = preds[0][:, 1:-1, :].numpy().squeeze() data = [out.tolist()] feat = ['encoding'] - if self.return_tokens: + if self.return_input: data += [tok, wds] feat += ['token', 'word'] return data, feat, ons, dur @@ -543,7 +543,7 @@ class BertSequenceEncodingExtractor(BertExtractor): encodings. return_sep (bool): defines whether to return encoding for the [SEP] token. 
- return_sequence (bool): If True, the extractor returns an additional + return_input (bool): If True, the extractor returns an additional feature column with the encoded sequence. model_kwargs (dict): Named arguments for pretrained model. See: https://huggingface.co/transformers/main_classes/model.html @@ -553,7 +553,7 @@ class BertSequenceEncodingExtractor(BertExtractor): ''' _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'pooling', 'return_sep', 'return_sequence', 'model_class', + 'pooling', 'return_sep', 'return_input', 'model_class', 'model_kwargs', 'tokenizer_kwargs') _model_attributes = ('pretrained_model', 'framework', 'model_class', 'pooling', 'return_sep', 'tokenizer_type') @@ -563,7 +563,7 @@ def __init__(self, pretrained_model='bert-base-uncased', framework='pt', pooling=None, return_sep=False, - return_sequence=False, + return_input=False, model_kwargs=None, tokenizer_kwargs=None): if pooling: @@ -576,10 +576,10 @@ def __init__(self, pretrained_model='bert-base-uncased', raise(ValueError('Pooling must be a valid numpy function.')) self.pooling = pooling self.return_sep = return_sep - self.return_sequence = return_sequence super(BertSequenceEncodingExtractor, self).__init__( pretrained_model=pretrained_model, tokenizer=tokenizer, - model_class='AutoModel', framework=framework, model_kwargs=model_kwargs, + return_input=return_input, model_class='AutoModel', + framework=framework, model_kwargs=model_kwargs, tokenizer_kwargs=tokenizer_kwargs) def _postprocess(self, preds, tok, wds, ons, dur): @@ -600,7 +600,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): out = preds[1] data = [[out.tolist()]] feat = ['encoding'] - if self.return_sequence: + if self.return_input: data += [tok] feat += ['sequence'] return data, feat, ons, dur @@ -657,7 +657,7 @@ def __init__(self, target=None, return_softmax=False, return_masked_word=False, - return_sequence=False, + return_input=False, model_kwargs=None, tokenizer_kwargs=None): if any([top_n and target, top_n and threshold, threshold and target]): @@ -666,8 +666,9 @@ def __init__(self, if type(mask) not in [int, str]: raise ValueError('Mask must be a string or an integer.') super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, - tokenizer=tokenizer, framework=framework, model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, model_class='AutoModelWithLMHead') + tokenizer=tokenizer, framework=framework, return_input=return_input, + model_class='AutoModelWithLMHead', model_kwargs=model_kwargs, + tokenizer_kwargs=tokenizer_kwargs, ) self.target = listify(target) if self.target: missing = set(self.target) - set(self.tokenizer.vocab.keys()) @@ -684,7 +685,6 @@ def __init__(self, self.threshold = threshold self.return_softmax = return_softmax self.return_masked_word = return_masked_word - self.return_sequence = return_sequence def update_mask(self, new_mask): if type(new_mask) not in [str, int]: @@ -714,7 +714,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] if self.return_masked_word: feat, data = self._return_masked_word(preds, feat, data) - if self.return_sequence: + if self.return_input: data += [' '.join(wds)] feat += ['sequence'] if len(self.target) > 1: @@ -745,31 +745,32 @@ class BertSentimentExtractor(BertExtractor): (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. return_softmax (bool): If True, the extractor returns softmaxed sentiment scores instead of raw model predictions. 
- return_sequence (bool): If True, the extractor returns an additional + return_input (bool): If True, the extractor returns an additional feature column with the encoded sequence. model_kwargs (dict): Named arguments for pretrained model. tokenizer_kwargs (dict): Named arguments for tokenizer. ''' _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'return_softmax', 'return_sequence', 'model_class', 'model_kwargs', + 'return_softmax', 'return_input', 'model_class', 'model_kwargs', 'tokenizer_kwargs') _model_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'return_sequence', 'return_softmax',) + 'return_input', 'return_softmax',) def __init__(self, pretrained_model='distilbert-base-uncased-finetuned-sst-2-english', tokenizer='distilbert-base-uncased', framework='pt', return_softmax=True, - return_sequence=True, + return_input=True, model_kwargs=None, - tokenizer_kwargs=None): - self.return_sequence = return_sequence + tokenizer_kwargs=None): self.return_softmax = return_softmax - super(BertSentimentExtractor, self).__init__(pretrained_model, - tokenizer, framework, 'AutoModelForSequenceClassification', - model_kwargs, tokenizer_kwargs) + super(BertSentimentExtractor, self).__init__( + pretrained_model=pretrained_model, tokenizer=tokenizer, + framework=framework, return_input=return_input, + model_class='AutoModelForSequenceClassification', + model_kwargs=model_kwargs, tokenizer_kwargs=tokenizer_kwargs) def _postprocess(self, preds, tok, wds, ons, dur): data = preds[0].numpy().squeeze() @@ -783,7 +784,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): dur = None ons = ons[0] feat = ['sent_pos', 'sent_neg'] - if self.return_sequence: + if self.return_input: data += [tok] feat += ['sequence'] return data, feat, ons, dur diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 01215cd8..cd9ec009 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -8,6 +8,7 @@ VADERSentimentExtractor, SpaCyExtractor, BertExtractor, + BertSequenceEncodingExtractor, WordCounterExtractor) from pliers.extractors.base import merge_results @@ -274,10 +275,11 @@ def test_bert_extractor(): ext_large = BertExtractor(pretrained_model='bert-large-uncased') ext_tf = BertExtractor(pretrained_model='bert-base-uncased', framework='tf') ext_base_token = BertExtractor(pretrained_model='bert-base-uncased', - return_tokens=True) + return_input=True) ext_distilbert = BertExtractor(pretrained_model='distilbert-base-uncased') ext_roberta = BertExtractor(pretrained_model='roberta-base') - ext_camembert = BertExtractor(pretrained_model='camembert-base') + ext_camembert = BertExtractor(pretrained_model='camembert-base', + return_input=True) base_result = ext_base.transform(stim) res = base_result.to_df() @@ -286,9 +288,9 @@ def test_bert_extractor(): res_tf = ext_tf.transform(stim).to_df() res_token = ext_base_token.transform(stim).to_df() res_file = ext_base.transform(stim_file).to_df() - res_distilbert = ext_base.transform(stim).to_df() - res_roberta = ext_base.transform(stim).to_df() - res_camembert = ext_base.transform(stim_cam).to_df() + res_distilbert = ext_distilbert.transform(stim).to_df() + res_roberta = ext_roberta.transform(stim).to_df() + res_camembert = ext_camembert.transform(stim_cam).to_df() # Test encoding shape assert len(res['encoding'][0]) == 768 @@ -316,10 +318,79 @@ def test_bert_extractor(): for i in range(res.shape[0])] assert 
all(np.isclose(cors, 1)) + # test camembert + assert res_camembert['token'][4] == 'est' + # test model attributes assert all([a in res_model_attr.columns for a in ext_base._model_attributes]) -def test_bert_sequence_extract(): + # catch error if framework is invalid + with pytest.raises(ValueError) as err: + BertExtractor(framework='keras') + assert 'Invalid framework' in str(err.value) + + +def test_bert_sequence_extractor(): + stim = ComplexTextStim(text='This is not a tokenized sentence.') + stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) + + ext = BertSequenceEncodingExtractor() + ext_tf = BertSequenceEncodingExtractor(framework='tf') + ext_sequence = BertSequenceEncodingExtractor(return_input=True) + ext_sep = BertSequenceEncodingExtractor(return_sep=True) + ext_mean = BertSequenceEncodingExtractor(pooling='mean') + ext_max = BertSequenceEncodingExtractor(pooling='max') + #ext_distil = BertSequenceEncodingExtractor(pretrained_model='distilbert-base-uncased') + + res = ext.transform(stim).to_df() + res_tf = ext.transform(stim).to_df() + res_file = ext.transform(stim_file).to_df() + res_sequence = ext_sequence.transform(stim).to_df() + res_sep = ext_sep.transform(stim).to_df() + res_mean = ext_mean.transform(stim).to_df() + res_max = ext_max.transform(stim).to_df() + #res_distil = ext_distil.transform(stim).to_df() + + # Check shape + assert len(res['encoding'][0]) == 768 + assert len(res_sep['encoding'][0]) == 768 + assert len(res_mean['encoding'][0]) == 768 + assert len(res_max['encoding'][0]) == 768 + #assert len(res_distil['encoding']) == 768 + assert res.shape[0] == 1 + assert res_sep.shape[0] == 1 + assert res_mean.shape[0] == 1 + assert res_max.shape[0] == 1 + #assert res_distil.shape[0] == 1 + + # Make sure pool/sep/no arguments return different encodings + assert res['encoding'][0] != res_sep['encoding'][0] + assert res['encoding'][0] != res_mean['encoding'][0] + assert res_sep['encoding'][0] != res_mean['encoding'][0] + assert res_max['encoding'][0] != res_mean['encoding'][0] + assert all(res_max['encoding'][0] >= res_mean['encoding'][0]) + + # test return sequence + assert res_sequence['sequence'][0] == 'This is not a tokenized sentence .' + + # test file stim + assert res_file['duration'][0] == 3.1 + assert res_file['onset'][0] == 0.2 + + # test tf vs. 
torch + cor = np.corrcoef(res['encoding'][0], res_tf['encoding'][0])[0,1] + assert np.isclose(cor, 1) + + # catch error with wrong numpy function + with pytest.raises(ValueError) as err: + BertSequenceEncodingExtractor(pooling='avg') + assert 'valid numpy function' in str(err.value) + + # catch error when both pooling and return_sep are defined + with pytest.raises(ValueError) as err: + BertSequenceEncodingExtractor(return_sep=True, pooling='mean') + assert 'mutually exclusive' in str(err.value) + def test_bert_LM_extractor(): From e16e743532de9ddd3727a73321b6620bc1137272 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 26 Mar 2020 12:19:17 +0100 Subject: [PATCH 38/89] fix pooling and return_special for sequence extractor; also fix return_input arg --- pliers/extractors/text.py | 63 +++++++++++-------- .../tests/extractors/test_text_extractors.py | 56 +++++++++-------- 2 files changed, 68 insertions(+), 51 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index e3b0497b..d06d75ce 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -498,7 +498,7 @@ def _extract(self, stims): return ExtractorResult(data, stims, self, features=feat, onsets=ons, durations=dur) - def _postprocess(self, preds, tok, wds, ons, dur): + def _postprocess(self, stims, preds, tok, wds, ons, dur): ''' Postprocesses model output (subsets relevant information, transforms it where relevant, adds model metadata). Takes prediction array, token list, word list, onsets @@ -538,11 +538,17 @@ class BertSequenceEncodingExtractor(BertExtractor): unknown tokens. framework (str): name deep learning framework to use. Must be 'pt' (PyTorch) or 'tf' (tensorflow). Defaults to 'pt'. - pooling (str): defines whether to return encoding for [CLS] token - (None, default), or the numpy function to use to pool token-level - encodings. - return_sep (bool): defines whether to return encoding for the [SEP] - token. + pooling (str): defines numpy function to use to pool token-level + encodings (excludes special tokens). + return_special (str): defines whether to return encoding for special + sequence tokens ('[CLS]' or '[SEP]'), instead of pooling of + other tokens. Must be '[CLS]', '[SEP]', or 'pooler_output'. + The latter option returns last layer hidden-state of [CLS] token + further processed by a linear layer and tanh activation function, + with linear weights trained on the next sentence classification + task. Note that some Bert-derived models, such as DistilBert, + were not trained on this task. For these models, setting this + argument to 'pooler_output' will return an error. return_input (bool): If True, the extractor returns an additional feature column with the encoded sequence. model_kwargs (dict): Named arguments for pretrained model. 
@@ -553,55 +559,62 @@ class BertSequenceEncodingExtractor(BertExtractor): ''' _log_attributes = ('pretrained_model', 'framework', 'tokenizer_type', - 'pooling', 'return_sep', 'return_input', 'model_class', + 'pooling', 'return_special', 'return_input', 'model_class', 'model_kwargs', 'tokenizer_kwargs') _model_attributes = ('pretrained_model', 'framework', 'model_class', - 'pooling', 'return_sep', 'tokenizer_type') + 'pooling', 'return_special', 'tokenizer_type') def __init__(self, pretrained_model='bert-base-uncased', tokenizer='bert-base-uncased', framework='pt', - pooling=None, - return_sep=False, + pooling='mean', + return_special=None, return_input=False, model_kwargs=None, tokenizer_kwargs=None): + if return_special and pooling: + logging.warning('Pooling and return_special argument are ' + 'mutually exclusive. Setting pooling to None.') + pooling = None if pooling: - if return_sep: - raise(ValueError('Pooling and return_seq argument are ' - 'mutually exclusive.')) try: getattr(np, pooling) except: raise(ValueError('Pooling must be a valid numpy function.')) + elif return_special: + if return_special not in ['[CLS]', '[SEP]', 'pooler_output']: + raise(ValueError('Value of return_special argument must be ' + 'one of \'[CLS]\', \'[SEP]\' or \'pooler_output\'')) self.pooling = pooling - self.return_sep = return_sep + self.return_special = return_special super(BertSequenceEncodingExtractor, self).__init__( pretrained_model=pretrained_model, tokenizer=tokenizer, return_input=return_input, model_class='AutoModel', framework=framework, model_kwargs=model_kwargs, tokenizer_kwargs=tokenizer_kwargs) - def _postprocess(self, preds, tok, wds, ons, dur): + def _postprocess(self, stims, preds, tok, wds, ons, dur): preds = [p.numpy().squeeze() for p in preds] self.preds = preds - tok = [' '.join(wds)] try: dur = ons[-1] + dur[-1] - ons[0] except: dur = None ons = ons[0] - if self.return_sep: - out = preds[0][-1,:] - elif self.pooling: + if self.pooling: pool_func = getattr(np, self.pooling) - out = pool_func(preds[0][1:-1, :], axis=1, keepdims=True) - else: - out = preds[1] + out = pool_func(preds[0][1:-1, :], axis=0) + elif self.return_special: + if self.return_special == '[CLS]': + out = preds[0][0,:] + elif self.return_special == '[SEP]': + out = preds[0][1,:] + else: + out = preds[1] data = [[out.tolist()]] feat = ['encoding'] if self.return_input: - data += [tok] + data += [stims.name] feat += ['sequence'] return data, feat, ons, dur @@ -698,7 +711,7 @@ def _mask_words(self, wds): mwds[self.mask_pos] = '[MASK]' return mwds - def _postprocess(self, preds, tok, wds, ons, dur): + def _postprocess(self, stims,preds, tok, wds, ons, dur): preds = preds[0].numpy()[:,1:-1,:] if self.return_softmax: preds = scipy.special.softmax(preds, axis=-1) @@ -715,7 +728,7 @@ def _postprocess(self, preds, tok, wds, ons, dur): if self.return_masked_word: feat, data = self._return_masked_word(preds, feat, data) if self.return_input: - data += [' '.join(wds)] + data += [stims.name] feat += ['sequence'] if len(self.target) > 1: self.target = [self.target] diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index cd9ec009..3c9aaedd 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -337,59 +337,63 @@ def test_bert_sequence_extractor(): ext = BertSequenceEncodingExtractor() ext_tf = BertSequenceEncodingExtractor(framework='tf') ext_sequence = BertSequenceEncodingExtractor(return_input=True) - 
ext_sep = BertSequenceEncodingExtractor(return_sep=True) - ext_mean = BertSequenceEncodingExtractor(pooling='mean') + ext_cls = BertSequenceEncodingExtractor(return_special='[CLS]') + ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') ext_max = BertSequenceEncodingExtractor(pooling='max') - #ext_distil = BertSequenceEncodingExtractor(pretrained_model='distilbert-base-uncased') + ext_distil = BertSequenceEncodingExtractor(pretrained_model='distilbert-base-uncased') + + # Test correct behavior when setting return_special + assert ext_cls.pooling is None + assert ext_pooler.pooling is None + assert ext_cls.return_special == '[CLS]' + assert ext_pooler.return_special == 'pooler_output' res = ext.transform(stim).to_df() res_tf = ext.transform(stim).to_df() res_file = ext.transform(stim_file).to_df() res_sequence = ext_sequence.transform(stim).to_df() - res_sep = ext_sep.transform(stim).to_df() - res_mean = ext_mean.transform(stim).to_df() + res_cls = ext_cls.transform(stim).to_df() + res_pooler = ext_pooler.transform(stim).to_df() res_max = ext_max.transform(stim).to_df() - #res_distil = ext_distil.transform(stim).to_df() + res_distil = ext_distil.transform(stim).to_df() # Check shape assert len(res['encoding'][0]) == 768 - assert len(res_sep['encoding'][0]) == 768 - assert len(res_mean['encoding'][0]) == 768 + assert len(res_cls['encoding'][0]) == 768 + assert len(res_pooler['encoding'][0]) == 768 assert len(res_max['encoding'][0]) == 768 - #assert len(res_distil['encoding']) == 768 + assert len(res_distil['encoding'][0]) == 768 assert res.shape[0] == 1 - assert res_sep.shape[0] == 1 - assert res_mean.shape[0] == 1 + assert res_cls.shape[0] == 1 + assert res_pooler.shape[0] == 1 assert res_max.shape[0] == 1 - #assert res_distil.shape[0] == 1 - - # Make sure pool/sep/no arguments return different encodings - assert res['encoding'][0] != res_sep['encoding'][0] - assert res['encoding'][0] != res_mean['encoding'][0] - assert res_sep['encoding'][0] != res_mean['encoding'][0] - assert res_max['encoding'][0] != res_mean['encoding'][0] - assert all(res_max['encoding'][0] >= res_mean['encoding'][0]) - + assert res_distil.shape[0] == 1 + + # Make sure pooler/cls/no arguments return different encodings + assert res['encoding'][0] != res_cls['encoding'][0] + assert res['encoding'][0] != res_pooler['encoding'][0] + assert res['encoding'][0] != res_max['encoding'][0] + assert all([res_max['encoding'][0][i] >= res['encoding'][0][i] + for i in range(768)]) + # test return sequence assert res_sequence['sequence'][0] == 'This is not a tokenized sentence .' # test file stim - assert res_file['duration'][0] == 3.1 + assert res_file['duration'][0] == 2.9 assert res_file['onset'][0] == 0.2 # test tf vs. 
torch cor = np.corrcoef(res['encoding'][0], res_tf['encoding'][0])[0,1] assert np.isclose(cor, 1) - # catch error with wrong numpy function + # catch error with wrong numpy function and wrong special token arg with pytest.raises(ValueError) as err: BertSequenceEncodingExtractor(pooling='avg') assert 'valid numpy function' in str(err.value) - - # catch error when both pooling and return_sep are defined with pytest.raises(ValueError) as err: - BertSequenceEncodingExtractor(return_sep=True, pooling='mean') - assert 'mutually exclusive' in str(err.value) + BertSequenceEncodingExtractor(return_special='[MASK]') + assert 'must be one of' in str(err.value) def test_bert_LM_extractor(): From ad62c99f58e61e89ac9080fcd5f7e4c0b149edfb Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 26 Mar 2020 22:25:56 +0100 Subject: [PATCH 39/89] add first bertLM tests --- pliers/extractors/text.py | 2 +- .../tests/extractors/test_text_extractors.py | 60 ++++++++++++++++--- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index d06d75ce..da099059 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -681,7 +681,7 @@ def __init__(self, super(BertLMExtractor, self).__init__(pretrained_model=pretrained_model, tokenizer=tokenizer, framework=framework, return_input=return_input, model_class='AutoModelWithLMHead', model_kwargs=model_kwargs, - tokenizer_kwargs=tokenizer_kwargs, ) + tokenizer_kwargs=tokenizer_kwargs) self.target = listify(target) if self.target: missing = set(self.target) - set(self.tokenizer.vocab.keys()) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 3c9aaedd..44b131aa 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -272,14 +272,18 @@ def test_bert_extractor(): stim_cam = ComplexTextStim(text='ceci n\'est pas un pipe') ext_base = BertExtractor(pretrained_model='bert-base-uncased') - ext_large = BertExtractor(pretrained_model='bert-large-uncased') - ext_tf = BertExtractor(pretrained_model='bert-base-uncased', framework='tf') + ext_large = BertExtractor(pretrained_model='bert-large-uncased', + tokenizer='bert-large-uncased') + ext_tf = BertExtractor(pretrained_model='bert-base-uncased', + framework='tf') ext_base_token = BertExtractor(pretrained_model='bert-base-uncased', return_input=True) - ext_distilbert = BertExtractor(pretrained_model='distilbert-base-uncased') - ext_roberta = BertExtractor(pretrained_model='roberta-base') - ext_camembert = BertExtractor(pretrained_model='camembert-base', - return_input=True) + ext_distilbert = BertExtractor(pretrained_model='distilbert-base-uncased', + tokenizer='distilbert-base-uncased') + ext_roberta = BertExtractor(pretrained_model='roberta-base', + tokenizer='roberta-base') + ext_camembert = BertExtractor(pretrained_model='camembert-base', + tokenizer='camembert-base', return_input=True) base_result = ext_base.transform(stim) res = base_result.to_df() @@ -340,7 +344,8 @@ def test_bert_sequence_extractor(): ext_cls = BertSequenceEncodingExtractor(return_special='[CLS]') ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') ext_max = BertSequenceEncodingExtractor(pooling='max') - ext_distil = BertSequenceEncodingExtractor(pretrained_model='distilbert-base-uncased') + ext_distil = BertSequenceEncodingExtractor(pretrained_model='distilbert-base-uncased', + tokenizer='distilbert-base-uncased') # Test correct behavior when 
setting return_special assert ext_cls.pooling is None @@ -397,6 +402,47 @@ def test_bert_sequence_extractor(): def test_bert_LM_extractor(): + stim = ComplexTextStim(text='This is not a tokenized sentence.') + stim_masked = ComplexTextStim(text='This is [MASK] tokenized sentence.') + stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) + + with pytest.raises(ValueError) as err: + BertLMExtractor(top_n=100, target='test') + assert 'mutually exclusive' in str(err.value) + with pytest.raises(ValueError) as err: + BertLMExtractor(top_n=100, threshold=.5) + assert 'mutually exclusive' in str(err.value) + with pytest.raises*(ValueError) as err: + BertLMExtractor(target='test', threshold=.5) + assert 'mutually exclusive' in str(err.value) + with pytest.raises(ValueError) as err: + BertLMExtractor(mask=['test', 'mask']) + assert 'must be a string' in str(err.value) + with pytest.raises(ValueError) as err: + BertLMExtractor(target='nonwd') + assert 'No valid target token' in str(err.value) + + ext = BertLMExtractor(mask=2) + ext_masked = BertLMExtractor() + ext_target = BertLMExtractor(target=['target','word']) + ext_distil = BertLMExtractor(mask=2, pretrained_model='distilbert-base-uncased', + tokenizer='distilbert-base-uncased') + + res = ext.transform(stim).to_df() + res_masked = ext_masked.transform(stim_masked).to_df() + res_file = ext.transform(stim_file).to_df() + res_target = ext_target(stim).to_df() + res_distil = ext_distil.transform(stim).to_df() + + assert res.shape[0] == 1 + assert res_distil.shape[0] == 1 + + # test onset/duration + assert res_file['onset'][0] == 1.0 + assert res_file['duration'][0] == 0.2 + + + def test_bert_sentiment_extractor(): From 5275d70dcc655a3a99333c0361dc8c24c77ed2c5 Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 27 Mar 2020 15:22:26 +0100 Subject: [PATCH 40/89] added all tests for bertLM and sentiment extractor --- pliers/extractors/text.py | 33 +++--- .../tests/extractors/test_text_extractors.py | 109 +++++++++++++++--- 2 files changed, 109 insertions(+), 33 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index da099059..d88a99cf 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -494,7 +494,7 @@ def _extract(self, stims): wds, ons, dur, tok, idx = self._preprocess(stims) preds = self.model(idx) preds = [p.detach() if self.framework == 'pt' else p for p in preds] - data, feat, ons, dur = self._postprocess(preds, tok, wds, ons, dur) + data, feat, ons, dur = self._postprocess(stims, preds, tok, wds, ons, dur) return ExtractorResult(data, stims, self, features=feat, onsets=ons, durations=dur) @@ -516,7 +516,7 @@ def _postprocess(self, stims, preds, tok, wds, ons, dur): def _to_df(self, result, include_attributes=True): res_dict = dict(zip(result.features, result._data)) if include_attributes: - log_dict = {attr: getattr(result.extractor, attr) for + log_dict = {attr: [getattr(result.extractor, attr)] for attr in self._model_attributes} res_dict.update(log_dict) res_df = pd.DataFrame(res_dict) @@ -656,7 +656,7 @@ class BertLMExtractor(BertExtractor): ''' _log_attributes = ('pretrained_model', 'framework', 'top_n', 'target', - 'tokenizer_type', 'return_softmax', 'return_masked_word') + 'mask', 'tokenizer_type', 'return_softmax', 'return_masked_word') _model_attributes = ('pretrained_model', 'framework', 'top_n', 'mask', 'target', 'threshold', 'tokenizer_type') @@ -664,7 +664,7 @@ def __init__(self, pretrained_model='bert-base-uncased', tokenizer='bert-base-uncased', framework='pt', - 
mask='[MASK]', + mask='MASK', top_n=None, threshold=None, target=None, @@ -673,7 +673,9 @@ def __init__(self, return_input=False, model_kwargs=None, tokenizer_kwargs=None): - if any([top_n and target, top_n and threshold, threshold and target]): + if any([top_n and target, + top_n and threshold, + threshold and target]): raise ValueError('top_n, threshold and target arguments ' 'are mutually exclusive') if type(mask) not in [int, str]: @@ -686,7 +688,7 @@ def __init__(self, if self.target: missing = set(self.target) - set(self.tokenizer.vocab.keys()) if missing: - logging.warning(f'{missing} is not in vocabulary. Dropping.') + logging.warning(f'{missing} not in vocabulary. Dropping.') present = set(self.target) & set(self.tokenizer.vocab.keys()) self.target = list(present) if self.target == []: @@ -701,7 +703,7 @@ def __init__(self, def update_mask(self, new_mask): if type(new_mask) not in [str, int]: - raise ValueError('Mask must be an integer or a string') + raise ValueError('Mask must be a string or an integer.') self.mask = new_mask def _mask_words(self, wds): @@ -711,7 +713,7 @@ def _mask_words(self, wds): mwds[self.mask_pos] = '[MASK]' return mwds - def _postprocess(self, stims,preds, tok, wds, ons, dur): + def _postprocess(self, stims, preds, tok, wds, ons, dur): preds = preds[0].numpy()[:,1:-1,:] if self.return_softmax: preds = scipy.special.softmax(preds, axis=-1) @@ -721,24 +723,25 @@ def _postprocess(self, stims,preds, tok, wds, ons, dur): elif self.target: sub_idx = self.tokenizer.convert_tokens_to_ids(self.target) elif self.threshold: - sub_idx = np.where(preds[0,self.mask_pos,:] > self.threshold)[0] + sub_idx = np.where(preds[0,self.mask_pos,:] >= self.threshold)[0] + else: + sub_idx = out_idx out_idx = [idx for idx in out_idx if idx in sub_idx] feat = self.tokenizer.convert_ids_to_tokens(out_idx) + feat = [f.upper() for f in feat] data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] if self.return_masked_word: feat, data = self._return_masked_word(preds, feat, data) if self.return_input: data += [stims.name] feat += ['sequence'] - if len(self.target) > 1: - self.target = [self.target] ons, dur = map(lambda x: listify(x[self.mask_pos]), [ons, dur]) return data, feat, ons, dur def _return_masked_word(self, preds, feat, data): if self.mask_token in self.tokenizer.vocab: true_vocab_idx = self.tokenizer.vocab[self.mask_token] - true_score = preds[0, self.mask_pos, true_vocab_idx] + true_score = preds[0,self.mask_pos,true_vocab_idx] else: true_score = np.nan logging.warning('True token not in vocabulary. 
Returning NaN') @@ -772,10 +775,10 @@ class BertSentimentExtractor(BertExtractor): def __init__(self, pretrained_model='distilbert-base-uncased-finetuned-sst-2-english', - tokenizer='distilbert-base-uncased', + tokenizer='bert-base-uncased', framework='pt', return_softmax=True, - return_input=True, + return_input=False, model_kwargs=None, tokenizer_kwargs=None): self.return_softmax = return_softmax @@ -785,7 +788,7 @@ def __init__(self, model_class='AutoModelForSequenceClassification', model_kwargs=model_kwargs, tokenizer_kwargs=tokenizer_kwargs) - def _postprocess(self, preds, tok, wds, ons, dur): + def _postprocess(self, stims, preds, tok, wds, ons, dur): data = preds[0].numpy().squeeze() if self.return_softmax: data = scipy.special.softmax(data) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 44b131aa..54aafe8d 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -9,8 +9,9 @@ SpaCyExtractor, BertExtractor, BertSequenceEncodingExtractor, + BertLMExtractor, + BertSentimentExtractor, WordCounterExtractor) - from pliers.extractors.base import merge_results from pliers.stimuli import TextStim, ComplexTextStim from ..utils import get_test_data_path @@ -20,6 +21,7 @@ import pytest import spacy from os import environ +from transformers import BertTokenizer TEXT_DIR = join(get_test_data_path(), 'text') @@ -278,12 +280,10 @@ def test_bert_extractor(): framework='tf') ext_base_token = BertExtractor(pretrained_model='bert-base-uncased', return_input=True) - ext_distilbert = BertExtractor(pretrained_model='distilbert-base-uncased', - tokenizer='distilbert-base-uncased') - ext_roberta = BertExtractor(pretrained_model='roberta-base', - tokenizer='roberta-base') - ext_camembert = BertExtractor(pretrained_model='camembert-base', - tokenizer='camembert-base', return_input=True) + ext_distilbert = BertExtractor(pretrained_model='distilbert-base-uncased') + ext_roberta = BertExtractor(pretrained_model='roberta-base') + ext_camembert = BertExtractor(pretrained_model='camembert-base', + return_input=True) base_result = ext_base.transform(stim) res = base_result.to_df() @@ -332,7 +332,7 @@ def test_bert_extractor(): with pytest.raises(ValueError) as err: BertExtractor(framework='keras') assert 'Invalid framework' in str(err.value) - + def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') @@ -344,8 +344,7 @@ def test_bert_sequence_extractor(): ext_cls = BertSequenceEncodingExtractor(return_special='[CLS]') ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') ext_max = BertSequenceEncodingExtractor(pooling='max') - ext_distil = BertSequenceEncodingExtractor(pretrained_model='distilbert-base-uncased', - tokenizer='distilbert-base-uncased') + ext_distil = BertSequenceEncodingExtractor(pretrained_model='distilbert-base-uncased') # Test correct behavior when setting return_special assert ext_cls.pooling is None @@ -403,16 +402,17 @@ def test_bert_sequence_extractor(): def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') - stim_masked = ComplexTextStim(text='This is [MASK] tokenized sentence.') + stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) + # Test mutual exclusivity and mask values with pytest.raises(ValueError) as err: BertLMExtractor(top_n=100, target='test') assert 'mutually 
exclusive' in str(err.value) with pytest.raises(ValueError) as err: BertLMExtractor(top_n=100, threshold=.5) assert 'mutually exclusive' in str(err.value) - with pytest.raises*(ValueError) as err: + with pytest.raises(ValueError) as err: BertLMExtractor(target='test', threshold=.5) assert 'mutually exclusive' in str(err.value) with pytest.raises(ValueError) as err: @@ -421,18 +421,29 @@ def test_bert_LM_extractor(): with pytest.raises(ValueError) as err: BertLMExtractor(target='nonwd') assert 'No valid target token' in str(err.value) - + + target_wds = ['target','word'] ext = BertLMExtractor(mask=2) ext_masked = BertLMExtractor() - ext_target = BertLMExtractor(target=['target','word']) - ext_distil = BertLMExtractor(mask=2, pretrained_model='distilbert-base-uncased', - tokenizer='distilbert-base-uncased') + ext_target = BertLMExtractor(mask=1, target=target_wds) + ext_target_tf = BertLMExtractor(framework='tf', mask=1, target=target_wds) + ext_topn = BertLMExtractor(mask=3, top_n=100) + ext_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True) + ext_distil = BertLMExtractor(mask=2, pretrained_model='distilbert-base-uncased') + ext_default = BertLMExtractor() + ext_return_mask = BertLMExtractor(mask=1, top_n=10, + return_masked_word=True, return_input=True) res = ext.transform(stim).to_df() res_masked = ext_masked.transform(stim_masked).to_df() res_file = ext.transform(stim_file).to_df() - res_target = ext_target(stim).to_df() + res_target = ext_target.transform(stim).to_df() + res_target_tf = ext_target_tf.transform(stim).to_df() + res_topn = ext_topn.transform(stim).to_df() + res_threshold = ext_threshold.transform(stim).to_df() res_distil = ext_distil.transform(stim).to_df() + res_default = ext_default.transform(stim_masked).to_df() + res_return_mask = ext_return_mask.transform(stim).to_df() assert res.shape[0] == 1 assert res_distil.shape[0] == 1 @@ -441,11 +452,73 @@ def test_bert_LM_extractor(): assert res_file['onset'][0] == 1.0 assert res_file['duration'][0] == 0.2 + # Check target words + assert all([w.upper() in res_target.columns for w in target_wds]) + assert res_target.shape[1] == 13 + + # Check top_n + assert res_topn.shape[1] == 111 + assert all([res_topn.iloc[:,3][0] > res_topn.iloc[:,i][0] for i in range(4,103)]) + + # Check threshold and return_softmax + tknz = BertTokenizer.from_pretrained('bert-base-uncased') + vocab = tknz.vocab.keys() + for v in vocab: + if v.upper() in res_threshold.columns: + assert res_threshold[v.upper()][0] >= .1 + assert res_threshold[v.upper()][0] <= 1 + + # torch vs tf + assert all([np.isclose(res_target[t.upper()][0], + res_target_tf[t.upper()][0]) for t in target_wds]) + + # Test update mask method + assert ext_target.mask == 1 + ext_target.update_mask(new_mask='sentence') + assert ext_target.mask == 'sentence' + res_target_new = ext_target.transform(stim).to_df() + #assert all([res_target[c][0] != res_target_new[c][0] + # for c in ['TARGET', 'WORD', 'mask']]) + with pytest.raises(ValueError) as err: + ext_target.update_mask(new_mask=['some', 'mask']) + assert 'must be a string' in str(err.value) + + # Test default mask + assert res_default.shape[0] == 1 + assert res_default['mask'][0] == 'MASK' + # Test return mask and input + assert res_return_mask['true_word'][0] == 'is' + assert 'true_word_score' in res_return_mask.columns + assert res_return_mask['sequence'][0] == 'This is not a tokenized sentence .' 
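(Illustrative sketch, not part of the patch: a minimal usage example of the masked-LM extractor exercised by the tests above, assuming the API exactly as introduced in this diff.)

    from pliers.stimuli import ComplexTextStim
    from pliers.extractors import BertLMExtractor

    stim = ComplexTextStim(text='This is not a tokenized sentence.')

    # Mask the word at position 2 ('not') and keep only the 10 most
    # probable candidate tokens for that position.
    ext = BertLMExtractor(mask=2, top_n=10, return_masked_word=True)
    df = ext.transform(stim).to_df()

    assert df.shape[0] == 1            # a single row of predictions
    assert 'true_word' in df.columns   # added by return_masked_word
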
def test_bert_sentiment_extractor(): - + stim = ComplexTextStim(text='This is the best day of my life.') + stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) + + ext = BertSentimentExtractor() + ext_tf = BertSentimentExtractor(framework='tf') + ext_seq = BertSentimentExtractor(return_input=True) + ext_softmax = BertSentimentExtractor(return_softmax=True) + + res = ext.transform(stim).to_df() + res_file = ext.transform(stim_file).to_df() + res_tf = ext_tf.transform(stim).to_df() + res_seq = ext_seq.transform(stim).to_df() + res_softmax = ext_softmax.transform(stim).to_df() + + assert res.shape[0] == 1 + assert res_file['onset'][0] == 0.2 + assert res_file['duration'][0] == 2.9 + assert all([s in res.columns for s in ['sent_pos', 'sent_neg']]) + assert np.isclose(res['sent_pos'][0], res_tf['sent_pos'][0]) + assert np.isclose(res['sent_neg'][0], res_tf['sent_neg'][0]) + assert res_seq['sequence'][0] == 'This is the best day of my life .' + assert all([res_softmax[s][0] >= 0 for s in ['sent_pos','sent_neg'] ]) + assert all([res_softmax[s][0] <= 1 for s in ['sent_pos','sent_neg'] ]) + + def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' ' again and again Sometimes they are ' From b47bbbbc93881afd39da6e9159278827f1deb6c1 Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 27 Mar 2020 15:59:15 +0100 Subject: [PATCH 41/89] disable caching for LM test --- pliers/extractors/text.py | 2 +- .../tests/extractors/test_text_extractors.py | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index d88a99cf..bdff9868 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -728,7 +728,7 @@ def _postprocess(self, stims, preds, tok, wds, ons, dur): sub_idx = out_idx out_idx = [idx for idx in out_idx if idx in sub_idx] feat = self.tokenizer.convert_ids_to_tokens(out_idx) - feat = [f.upper() for f in feat] + feat = [f.capitalize() for f in feat] data = [listify(p) for p in preds[0,self.mask_pos,out_idx]] if self.return_masked_word: feat, data = self._return_masked_word(preds, feat, data) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 54aafe8d..96d5e0e5 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -1,3 +1,4 @@ +from pliers import config from pliers.extractors import (DictionaryExtractor, PartOfSpeechExtractor, LengthExtractor, @@ -401,6 +402,7 @@ def test_bert_sequence_extractor(): def test_bert_LM_extractor(): + config.set_option('cache_transformers', False) stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -453,7 +455,7 @@ def test_bert_LM_extractor(): assert res_file['duration'][0] == 0.2 # Check target words - assert all([w.upper() in res_target.columns for w in target_wds]) + assert all([w.capitalize() in res_target.columns for w in target_wds]) assert res_target.shape[1] == 13 # Check top_n @@ -464,21 +466,21 @@ def test_bert_LM_extractor(): tknz = BertTokenizer.from_pretrained('bert-base-uncased') vocab = tknz.vocab.keys() for v in vocab: - if v.upper() in res_threshold.columns: - assert res_threshold[v.upper()][0] >= .1 - assert res_threshold[v.upper()][0] <= 1 + if v.capitalize() in res_threshold.columns: + assert 
res_threshold[v.capitalize()][0] >= .1 + assert res_threshold[v.capitalize()][0] <= 1 # torch vs tf - assert all([np.isclose(res_target[t.upper()][0], - res_target_tf[t.upper()][0]) for t in target_wds]) + assert all([np.isclose(res_target[t.capitalize()][0], + res_target_tf[t.capitalize()][0]) for t in target_wds]) # Test update mask method assert ext_target.mask == 1 ext_target.update_mask(new_mask='sentence') assert ext_target.mask == 'sentence' res_target_new = ext_target.transform(stim).to_df() - #assert all([res_target[c][0] != res_target_new[c][0] - # for c in ['TARGET', 'WORD', 'mask']]) + assert all([res_target[c][0] != res_target_new[c][0] + for c in ['TARGET', 'WORD', 'mask']]) with pytest.raises(ValueError) as err: ext_target.update_mask(new_mask=['some', 'mask']) assert 'must be a string' in str(err.value) From 6bb970f3b46aa657a3cc03949cab3650d80a2268 Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 27 Mar 2020 16:12:47 +0100 Subject: [PATCH 42/89] resolve conflict --- pliers/extractors/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pliers/extractors/__init__.py b/pliers/extractors/__init__.py index 6b2c7e15..23045912 100644 --- a/pliers/extractors/__init__.py +++ b/pliers/extractors/__init__.py @@ -146,6 +146,5 @@ 'BertLMExtractor', 'BertSentimentExtractor', 'AudiosetLabelExtractor', - 'PretrainedBertEncodingExtractor', 'WordCounterExtractor' ] From 846111c005ec8a479d3c1f2dd5d5b840e2f4260e Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 30 Mar 2020 14:37:50 +0200 Subject: [PATCH 43/89] fix ExtractorResult caching issue --- pliers/extractors/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pliers/extractors/base.py b/pliers/extractors/base.py index 5396a065..f9910c2c 100644 --- a/pliers/extractors/base.py +++ b/pliers/extractors/base.py @@ -7,6 +7,7 @@ from pliers.transformers import Transformer from pliers.utils import isgenerator, flatten, listify from pandas.api.types import is_numeric_dtype +from copy import deepcopy class Extractor(with_metaclass(ABCMeta, Transformer)): @@ -56,7 +57,7 @@ def __init__(self, data, stim, extractor, features=None, onsets=None, durations=None, orders=None): self._data = data self.stim = stim - self.extractor = extractor + self.extractor = deepcopy(extractor) self.features = features self._history = None self.onset = onsets From bff8b24b701001124f4d793a9200776a32d304b0 Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 30 Mar 2020 16:02:56 +0200 Subject: [PATCH 44/89] remove set cache_transformer statement --- pliers/tests/extractors/test_text_extractors.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 96d5e0e5..fd35eb6a 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -402,7 +402,6 @@ def test_bert_sequence_extractor(): def test_bert_LM_extractor(): - config.set_option('cache_transformers', False) stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -480,7 +479,7 @@ def test_bert_LM_extractor(): assert ext_target.mask == 'sentence' res_target_new = ext_target.transform(stim).to_df() assert all([res_target[c][0] != res_target_new[c][0] - for c in ['TARGET', 'WORD', 'mask']]) + for c in ['Target', 'Word', 'mask']]) with pytest.raises(ValueError) as err: 
ext_target.update_mask(new_mask=['some', 'mask']) assert 'must be a string' in str(err.value) From ab926384af4b305ed80f0e358a21c805bcaf2248 Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 30 Mar 2020 20:51:23 +0200 Subject: [PATCH 45/89] fix to_df format --- pliers/extractors/text.py | 12 +++++------- pliers/tests/extractors/test_text_extractors.py | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index bdff9868..6e85494e 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -514,12 +514,10 @@ def _postprocess(self, stims, preds, tok, wds, ons, dur): return data, feat, ons, dur def _to_df(self, result, include_attributes=True): - res_dict = dict(zip(result.features, result._data)) + res_df = pd.DataFrame(dict(zip(result.features, result._data))) if include_attributes: - log_dict = {attr: [getattr(result.extractor, attr)] for - attr in self._model_attributes} - res_dict.update(log_dict) - res_df = pd.DataFrame(res_dict) + for attr in self._model_attributes: + res_df[attr] = pd.Series([getattr(result.extractor, attr)]) res_df['object_id'] = range(res_df.shape[0]) return res_df @@ -792,7 +790,7 @@ def _postprocess(self, stims, preds, tok, wds, ons, dur): data = preds[0].numpy().squeeze() if self.return_softmax: data = scipy.special.softmax(data) - data = data.tolist() + data = [listify(d) for d in data.tolist()] tok = [' '.join(wds)] try: dur = ons[-1] + dur[-1] - ons[0] @@ -801,7 +799,7 @@ def _postprocess(self, stims, preds, tok, wds, ons, dur): ons = ons[0] feat = ['sent_pos', 'sent_neg'] if self.return_input: - data += [tok] + data += tok feat += ['sequence'] return data, feat, ons, dur diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index fd35eb6a..2c7e954f 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -15,7 +15,7 @@ WordCounterExtractor) from pliers.extractors.base import merge_results from pliers.stimuli import TextStim, ComplexTextStim -from ..utils import get_test_data_path +from pliers.tests.utils import get_test_data_path import numpy as np from os.path import join From c7bde953e158d5f43fcd9730055cb61d8874cbcc Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 30 Mar 2020 21:42:30 +0200 Subject: [PATCH 46/89] fix spacing --- pliers/tests/extractors/test_text_extractors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 2c7e954f..ecc23b5e 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -16,7 +16,6 @@ from pliers.extractors.base import merge_results from pliers.stimuli import TextStim, ComplexTextStim from pliers.tests.utils import get_test_data_path - import numpy as np from os.path import join import pytest From 9f267fb95d0e20690b47a25536262925df5a8231 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 10:16:07 +0200 Subject: [PATCH 47/89] try tests without tf --- pliers/tests/extractors/test_text_extractors.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index ecc23b5e..ede375dd 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -276,8 +276,8 @@ def test_bert_extractor(): ext_base = 
BertExtractor(pretrained_model='bert-base-uncased') ext_large = BertExtractor(pretrained_model='bert-large-uncased', tokenizer='bert-large-uncased') - ext_tf = BertExtractor(pretrained_model='bert-base-uncased', - framework='tf') + #ext_tf = BertExtractor(pretrained_model='bert-base-uncased', + # framework='tf') ext_base_token = BertExtractor(pretrained_model='bert-base-uncased', return_input=True) ext_distilbert = BertExtractor(pretrained_model='distilbert-base-uncased') @@ -289,7 +289,7 @@ def test_bert_extractor(): res = base_result.to_df() res_model_attr = base_result.to_df(include_attributes=True) res_large = ext_large.transform(stim).to_df() - res_tf = ext_tf.transform(stim).to_df() + #res_tf = ext_tf.transform(stim).to_df() res_token = ext_base_token.transform(stim).to_df() res_file = ext_base.transform(stim_file).to_df() res_distilbert = ext_distilbert.transform(stim).to_df() @@ -318,9 +318,9 @@ def test_bert_extractor(): assert res_file['object_id'][5] == 5 # test tensorflow vs torch - cors = [np.corrcoef(res['encoding'][i], res_tf['encoding'][i])[0,1] - for i in range(res.shape[0])] - assert all(np.isclose(cors, 1)) + #cors = [np.corrcoef(res['encoding'][i], res_tf['encoding'][i])[0,1] + # for i in range(res.shape[0])] + #assert all(np.isclose(cors, 1)) # test camembert assert res_camembert['token'][4] == 'est' @@ -333,6 +333,7 @@ def test_bert_extractor(): BertExtractor(framework='keras') assert 'Invalid framework' in str(err.value) +''' def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') @@ -549,3 +550,4 @@ def test_word_counter_extractor(): assert result_stim_txt['log_word_count'][15] == np.log(2) assert result_stim_txt['log_word_count'][44] == np.log(3) +''' From aa7d9d7ece1df99bee4caf3ef65b4123a8b49686 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 11:08:39 +0200 Subject: [PATCH 48/89] try splitting tests --- .../tests/extractors/test_text_extractors.py | 50 +++++++++---------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index ede375dd..0185bb15 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -271,38 +271,20 @@ def test_spacy_doc_extractor(): def test_bert_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - stim_cam = ComplexTextStim(text='ceci n\'est pas un pipe') ext_base = BertExtractor(pretrained_model='bert-base-uncased') - ext_large = BertExtractor(pretrained_model='bert-large-uncased', - tokenizer='bert-large-uncased') - #ext_tf = BertExtractor(pretrained_model='bert-base-uncased', - # framework='tf') - ext_base_token = BertExtractor(pretrained_model='bert-base-uncased', + ext_base_token = BertExtractor(pretrained_model='bert-base-uncased', return_input=True) - ext_distilbert = BertExtractor(pretrained_model='distilbert-base-uncased') - ext_roberta = BertExtractor(pretrained_model='roberta-base') - ext_camembert = BertExtractor(pretrained_model='camembert-base', - return_input=True) base_result = ext_base.transform(stim) res = base_result.to_df() res_model_attr = base_result.to_df(include_attributes=True) - res_large = ext_large.transform(stim).to_df() - #res_tf = ext_tf.transform(stim).to_df() res_token = ext_base_token.transform(stim).to_df() res_file = ext_base.transform(stim_file).to_df() - res_distilbert = 
ext_distilbert.transform(stim).to_df() - res_roberta = ext_roberta.transform(stim).to_df() - res_camembert = ext_camembert.transform(stim_cam).to_df() # Test encoding shape assert len(res['encoding'][0]) == 768 - assert len(res_large['encoding'][0]) == 1024 assert len(res_file['encoding'][0]) == 768 - assert len(res_distilbert['encoding'][0]) == 768 - assert len(res_roberta['encoding'][0]) == 768 - assert len(res_camembert['encoding'][0]) == 768 # test base extractor assert res.shape[0] == 8 @@ -317,14 +299,6 @@ def test_bert_extractor(): assert res_file['duration'][5] == 0.5 assert res_file['object_id'][5] == 5 - # test tensorflow vs torch - #cors = [np.corrcoef(res['encoding'][i], res_tf['encoding'][i])[0,1] - # for i in range(res.shape[0])] - #assert all(np.isclose(cors, 1)) - - # test camembert - assert res_camembert['token'][4] == 'est' - # test model attributes assert all([a in res_model_attr.columns for a in ext_base._model_attributes]) @@ -333,6 +307,28 @@ def test_bert_extractor(): BertExtractor(framework='keras') assert 'Invalid framework' in str(err.value) +models = ['bert-large-uncased', 'distilbert-base-uncased', + 'roberta-base','camembert-base'] +@pytest.mark.parametrize('model', models) +def test_bert_other_models(): + if model == 'camembert-base': + stim = ComplexTextStim(text='ceci n\'est pas un pipe') + else: + stim = ComplexTextStim(text='This is not a tokenized sentence.') + ext = BertExtractor(pretrained_model=model, return_input=True) + res = ext.transform(stim).to_df() + if model == 'bert-large-uncased': + shape = 1024 + else: + shape = 768 + assert len(res['encoding'][0]) == shape + if model == 'camembert-base': + assert res_camembert['token'][4] == 'est' + + + + + ''' def test_bert_sequence_extractor(): From 20e520a979229859b4a8a17c558b095e2cc26d3e Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 11:27:21 +0200 Subject: [PATCH 49/89] try deleting models --- .../tests/extractors/test_text_extractors.py | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 0185bb15..04c5a490 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -18,6 +18,8 @@ from pliers.tests.utils import get_test_data_path import numpy as np from os.path import join +from pathlib import Path +import shutil import pytest import spacy from os import environ @@ -307,10 +309,14 @@ def test_bert_extractor(): BertExtractor(framework='keras') assert 'Invalid framework' in str(err.value) -models = ['bert-large-uncased', 'distilbert-base-uncased', - 'roberta-base','camembert-base'] -@pytest.mark.parametrize('model', models) -def test_bert_other_models(): + # Delete the models + del res, res_token, res_file, ext_base, ext_base_token + + +@pytest.mark.parametrize('model', ['bert-large-uncased', + 'distilbert-base-uncased', + 'roberta-base','camembert-base']) +def test_bert_other_models(model): if model == 'camembert-base': stim = ComplexTextStim(text='ceci n\'est pas un pipe') else: @@ -325,7 +331,13 @@ def test_bert_other_models(): if model == 'camembert-base': assert res_camembert['token'][4] == 'est' - + # delete the model + home = str(Path.home()) + model_path = str(home / '.cache' / 'torch' / 'transformers') + shutil.rmtree(model_path) + + # remove variables + del ext, res, stim From 288858ecd2a54226293cfe2a3e1c7ab88f896258 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 12:11:19 
+0200 Subject: [PATCH 50/89] fix typo --- pliers/tests/extractors/test_text_extractors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 04c5a490..84167f45 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -329,10 +329,10 @@ def test_bert_other_models(model): shape = 768 assert len(res['encoding'][0]) == shape if model == 'camembert-base': - assert res_camembert['token'][4] == 'est' + assert res['token'][4] == 'est' # delete the model - home = str(Path.home()) + home = Path.home() model_path = str(home / '.cache' / 'torch' / 'transformers') shutil.rmtree(model_path) From 6c12160bb90bf32e705790f470e178407ca86bfa Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 13:09:27 +0200 Subject: [PATCH 51/89] remove all after use --- .../tests/extractors/test_text_extractors.py | 54 ++++++++++--------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 84167f45..12218d89 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -309,6 +309,11 @@ def test_bert_extractor(): BertExtractor(framework='keras') assert 'Invalid framework' in str(err.value) + # delete the model + home = Path.home() + model_path = str(home / '.cache' / 'torch' / 'transformers') + shutil.rmtree(model_path) + # Delete the models del res, res_token, res_file, ext_base, ext_base_token @@ -340,20 +345,15 @@ def test_bert_other_models(model): del ext, res, stim - -''' - def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) ext = BertSequenceEncodingExtractor() - ext_tf = BertSequenceEncodingExtractor(framework='tf') ext_sequence = BertSequenceEncodingExtractor(return_input=True) ext_cls = BertSequenceEncodingExtractor(return_special='[CLS]') ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') ext_max = BertSequenceEncodingExtractor(pooling='max') - ext_distil = BertSequenceEncodingExtractor(pretrained_model='distilbert-base-uncased') # Test correct behavior when setting return_special assert ext_cls.pooling is None @@ -362,25 +362,21 @@ def test_bert_sequence_extractor(): assert ext_pooler.return_special == 'pooler_output' res = ext.transform(stim).to_df() - res_tf = ext.transform(stim).to_df() res_file = ext.transform(stim_file).to_df() res_sequence = ext_sequence.transform(stim).to_df() res_cls = ext_cls.transform(stim).to_df() res_pooler = ext_pooler.transform(stim).to_df() res_max = ext_max.transform(stim).to_df() - res_distil = ext_distil.transform(stim).to_df() # Check shape assert len(res['encoding'][0]) == 768 assert len(res_cls['encoding'][0]) == 768 assert len(res_pooler['encoding'][0]) == 768 assert len(res_max['encoding'][0]) == 768 - assert len(res_distil['encoding'][0]) == 768 assert res.shape[0] == 1 assert res_cls.shape[0] == 1 assert res_pooler.shape[0] == 1 assert res_max.shape[0] == 1 - assert res_distil.shape[0] == 1 # Make sure pooler/cls/no arguments return different encodings assert res['encoding'][0] != res_cls['encoding'][0] @@ -397,7 +393,6 @@ def test_bert_sequence_extractor(): assert res_file['onset'][0] == 0.2 # test tf vs. 
torch - cor = np.corrcoef(res['encoding'][0], res_tf['encoding'][0])[0,1] assert np.isclose(cor, 1) # catch error with wrong numpy function and wrong special token arg @@ -408,6 +403,13 @@ def test_bert_sequence_extractor(): BertSequenceEncodingExtractor(return_special='[MASK]') assert 'must be one of' in str(err.value) + # delete the model + home = Path.home() + model_path = str(home / '.cache' / 'torch' / 'transformers') + shutil.rmtree(model_path) + + del ext, ext_sequence, ext_cls, ext_pooler, ext_max + def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') @@ -435,10 +437,8 @@ def test_bert_LM_extractor(): ext = BertLMExtractor(mask=2) ext_masked = BertLMExtractor() ext_target = BertLMExtractor(mask=1, target=target_wds) - ext_target_tf = BertLMExtractor(framework='tf', mask=1, target=target_wds) ext_topn = BertLMExtractor(mask=3, top_n=100) ext_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True) - ext_distil = BertLMExtractor(mask=2, pretrained_model='distilbert-base-uncased') ext_default = BertLMExtractor() ext_return_mask = BertLMExtractor(mask=1, top_n=10, return_masked_word=True, return_input=True) @@ -447,15 +447,12 @@ def test_bert_LM_extractor(): res_masked = ext_masked.transform(stim_masked).to_df() res_file = ext.transform(stim_file).to_df() res_target = ext_target.transform(stim).to_df() - res_target_tf = ext_target_tf.transform(stim).to_df() res_topn = ext_topn.transform(stim).to_df() res_threshold = ext_threshold.transform(stim).to_df() - res_distil = ext_distil.transform(stim).to_df() res_default = ext_default.transform(stim_masked).to_df() res_return_mask = ext_return_mask.transform(stim).to_df() assert res.shape[0] == 1 - assert res_distil.shape[0] == 1 # test onset/duration assert res_file['onset'][0] == 1.0 @@ -476,10 +473,6 @@ def test_bert_LM_extractor(): if v.capitalize() in res_threshold.columns: assert res_threshold[v.capitalize()][0] >= .1 assert res_threshold[v.capitalize()][0] <= 1 - - # torch vs tf - assert all([np.isclose(res_target[t.capitalize()][0], - res_target_tf[t.capitalize()][0]) for t in target_wds]) # Test update mask method assert ext_target.mask == 1 @@ -501,19 +494,27 @@ def test_bert_LM_extractor(): assert 'true_word_score' in res_return_mask.columns assert res_return_mask['sequence'][0] == 'This is not a tokenized sentence .' 
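(Illustrative sketch, not part of the patch: the update_mask pattern tested above, i.e. re-using one extractor instance on different mask positions, assuming the API exactly as introduced in this diff.)

    from pliers.stimuli import ComplexTextStim
    from pliers.extractors import BertLMExtractor

    stim = ComplexTextStim(text='This is not a tokenized sentence.')
    ext = BertLMExtractor(mask=1, target=['target', 'word'])

    df_by_index = ext.transform(stim).to_df()   # mask given as a position

    ext.update_mask(new_mask='sentence')        # re-point mask at a literal word
    df_by_word = ext.transform(stim).to_df()

    # Anything other than a string or an integer raises ValueError, e.g.:
    # ext.update_mask(new_mask=['some', 'mask'])
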
+ # delete the model + home = Path.home() + model_path = str(home / '.cache' / 'torch' / 'transformers') + shutil.rmtree(model_path) + + # remove variables + del ext, ext_masked, ext_target, ext_topn, ext_threshold, ext_default, \ + ext_return_mask + del res, res_masked, res_file, res_target, res_topn, res_threshold, \ + res_default, res_return_mask def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) ext = BertSentimentExtractor() - ext_tf = BertSentimentExtractor(framework='tf') ext_seq = BertSentimentExtractor(return_input=True) ext_softmax = BertSentimentExtractor(return_softmax=True) res = ext.transform(stim).to_df() res_file = ext.transform(stim_file).to_df() - res_tf = ext_tf.transform(stim).to_df() res_seq = ext_seq.transform(stim).to_df() res_softmax = ext_softmax.transform(stim).to_df() @@ -521,12 +522,18 @@ def test_bert_sentiment_extractor(): assert res_file['onset'][0] == 0.2 assert res_file['duration'][0] == 2.9 assert all([s in res.columns for s in ['sent_pos', 'sent_neg']]) - assert np.isclose(res['sent_pos'][0], res_tf['sent_pos'][0]) - assert np.isclose(res['sent_neg'][0], res_tf['sent_neg'][0]) assert res_seq['sequence'][0] == 'This is the best day of my life .' assert all([res_softmax[s][0] >= 0 for s in ['sent_pos','sent_neg'] ]) assert all([res_softmax[s][0] <= 1 for s in ['sent_pos','sent_neg'] ]) + # delete the model + home = Path.home() + model_path = str(home / '.cache' / 'torch' / 'transformers') + shutil.rmtree(model_path) + + del ext, ext_seq, ext_softmax + del res, res_file, res_seq, res_softmax + def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' @@ -558,4 +565,3 @@ def test_word_counter_extractor(): assert result_stim_txt['log_word_count'][15] == np.log(2) assert result_stim_txt['log_word_count'][44] == np.log(3) -''' From 9c7041e8833dfa0bca74fe62449729ecec82e21b Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 14:09:18 +0200 Subject: [PATCH 52/89] add debug statement --- pliers/tests/extractors/test_text_extractors.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 12218d89..675cbb88 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -408,6 +408,8 @@ def test_bert_sequence_extractor(): model_path = str(home / '.cache' / 'torch' / 'transformers') shutil.rmtree(model_path) + print('DONE WITH SEQUENCE!') + del ext, ext_sequence, ext_cls, ext_pooler, ext_max @@ -499,7 +501,7 @@ def test_bert_LM_extractor(): model_path = str(home / '.cache' / 'torch' / 'transformers') shutil.rmtree(model_path) - # remove variables + # remove del ext, ext_masked, ext_target, ext_topn, ext_threshold, ext_default, \ ext_return_mask del res, res_masked, res_file, res_target, res_topn, res_threshold, \ From 1e53e30be7b37e7687d1f98d24ab81f1a871f589 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 14:11:59 +0200 Subject: [PATCH 53/89] add more debug statements --- .../tests/extractors/test_text_extractors.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 675cbb88..1d31548c 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ 
b/pliers/tests/extractors/test_text_extractors.py @@ -349,11 +349,14 @@ def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - ext = BertSequenceEncodingExtractor() ext_sequence = BertSequenceEncodingExtractor(return_input=True) + print('Initialized ext_seq') ext_cls = BertSequenceEncodingExtractor(return_special='[CLS]') + print('Initialized ext_cls') ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') + print('Initialized ext_pooler') ext_max = BertSequenceEncodingExtractor(pooling='max') + print('Initialized ext_max') # Test correct behavior when setting return_special assert ext_cls.pooling is None @@ -361,28 +364,27 @@ def test_bert_sequence_extractor(): assert ext_cls.return_special == '[CLS]' assert ext_pooler.return_special == 'pooler_output' - res = ext.transform(stim).to_df() - res_file = ext.transform(stim_file).to_df() res_sequence = ext_sequence.transform(stim).to_df() + res_file = ext.transform(stim_file).to_df() res_cls = ext_cls.transform(stim).to_df() res_pooler = ext_pooler.transform(stim).to_df() res_max = ext_max.transform(stim).to_df() # Check shape - assert len(res['encoding'][0]) == 768 + assert len(res_sequence['encoding'][0]) == 768 assert len(res_cls['encoding'][0]) == 768 assert len(res_pooler['encoding'][0]) == 768 assert len(res_max['encoding'][0]) == 768 - assert res.shape[0] == 1 + assert res_sequence.shape[0] == 1 assert res_cls.shape[0] == 1 assert res_pooler.shape[0] == 1 assert res_max.shape[0] == 1 # Make sure pooler/cls/no arguments return different encodings - assert res['encoding'][0] != res_cls['encoding'][0] - assert res['encoding'][0] != res_pooler['encoding'][0] - assert res['encoding'][0] != res_max['encoding'][0] - assert all([res_max['encoding'][0][i] >= res['encoding'][0][i] + assert res_sequence['encoding'][0] != res_cls['encoding'][0] + assert res_sequence['encoding'][0] != res_pooler['encoding'][0] + assert res_sequence['encoding'][0] != res_max['encoding'][0] + assert all([res_max['encoding'][0][i] >= res_sequence['encoding'][0][i] for i in range(768)]) # test return sequence @@ -408,8 +410,6 @@ def test_bert_sequence_extractor(): model_path = str(home / '.cache' / 'torch' / 'transformers') shutil.rmtree(model_path) - print('DONE WITH SEQUENCE!') - del ext, ext_sequence, ext_cls, ext_pooler, ext_max From 7760d0a5ef78c069374d77d625fc7416ea51f909 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 14:31:27 +0200 Subject: [PATCH 54/89] skip all but word counter test --- pliers/tests/extractors/test_text_extractors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 1d31548c..ff789281 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -344,7 +344,7 @@ def test_bert_other_models(model): # remove variables del ext, res, stim - +''' def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -535,7 +535,7 @@ def test_bert_sentiment_extractor(): del ext, ext_seq, ext_softmax del res, res_file, res_seq, res_softmax - +''' def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' From 73737bea50c96c431386502f0754d696ead15210 Mon Sep 17 00:00:00 
2001 From: rbroc Date: Tue, 31 Mar 2020 14:59:41 +0200 Subject: [PATCH 55/89] skip models only --- pliers/tests/extractors/test_text_extractors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index ff789281..eac0115b 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -317,7 +317,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token - +''' @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -535,7 +535,7 @@ def test_bert_sentiment_extractor(): del ext, ext_seq, ext_softmax del res, res_file, res_seq, res_softmax -''' + def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' From f77f02c7dccfac05f76d6ea75451ae10bc20f892 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 15:27:11 +0200 Subject: [PATCH 56/89] skip sequence --- pliers/tests/extractors/test_text_extractors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index eac0115b..5e43fcc3 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -344,7 +344,7 @@ def test_bert_other_models(model): # remove variables del ext, res, stim -''' + def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -411,7 +411,7 @@ def test_bert_sequence_extractor(): shutil.rmtree(model_path) del ext, ext_sequence, ext_cls, ext_pooler, ext_max - +''' def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') From c8ed0647e60b5a44651a91ab4619c51e5c105d28 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 16:31:23 +0200 Subject: [PATCH 57/89] restore all tests --- pliers/tests/extractors/test_text_extractors.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 5e43fcc3..16dae9cf 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -317,7 +317,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token -''' + @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -411,7 +411,7 @@ def test_bert_sequence_extractor(): shutil.rmtree(model_path) del ext, ext_sequence, ext_cls, ext_pooler, ext_max -''' + def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') @@ -507,6 +507,7 @@ def test_bert_LM_extractor(): del res, res_masked, res_file, res_target, res_topn, res_threshold, \ res_default, res_return_mask + def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) From 35388530efb124bf0bc6cb98e6400762b0e32fde Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 16:34:02 +0200 Subject: [PATCH 58/89] try only base, sequence, sentiment --- pliers/tests/extractors/test_text_extractors.py | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 16dae9cf..0ff2156b 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -317,7 +317,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token - +''' @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -343,7 +343,7 @@ def test_bert_other_models(model): # remove variables del ext, res, stim - +''' def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') @@ -412,7 +412,7 @@ def test_bert_sequence_extractor(): del ext, ext_sequence, ext_cls, ext_pooler, ext_max - +''' def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') @@ -506,7 +506,7 @@ def test_bert_LM_extractor(): ext_return_mask del res, res_masked, res_file, res_target, res_topn, res_threshold, \ res_default, res_return_mask - +''' def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') From ff7c834d3354403de48edf080f8d35579b562bdf Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 31 Mar 2020 18:15:49 +0200 Subject: [PATCH 59/89] remove custom log_attribute logic from to_df, adapt tests and restore extractor pointer --- pliers/extractors/base.py | 3 +-- pliers/extractors/text.py | 3 --- pliers/tests/extractors/test_text_extractors.py | 15 +++++---------- pliers/transformers/base.py | 1 + 4 files changed, 7 insertions(+), 15 deletions(-) diff --git a/pliers/extractors/base.py b/pliers/extractors/base.py index f9910c2c..5396a065 100644 --- a/pliers/extractors/base.py +++ b/pliers/extractors/base.py @@ -7,7 +7,6 @@ from pliers.transformers import Transformer from pliers.utils import isgenerator, flatten, listify from pandas.api.types import is_numeric_dtype -from copy import deepcopy class Extractor(with_metaclass(ABCMeta, Transformer)): @@ -57,7 +56,7 @@ def __init__(self, data, stim, extractor, features=None, onsets=None, durations=None, orders=None): self._data = data self.stim = stim - self.extractor = deepcopy(extractor) + self.extractor = extractor self.features = features self._history = None self.onset = onsets diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 6e85494e..1386cbe8 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -515,9 +515,6 @@ def _postprocess(self, stims, preds, tok, wds, ons, dur): def _to_df(self, result, include_attributes=True): res_df = pd.DataFrame(dict(zip(result.features, result._data))) - if include_attributes: - for attr in self._model_attributes: - res_df[attr] = pd.Series([getattr(result.extractor, attr)]) res_df['object_id'] = range(res_df.shape[0]) return res_df diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 0ff2156b..9d3480d4 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -317,7 +317,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token -''' + @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -343,20 +343,16 @@ def test_bert_other_models(model): # remove 
variables del ext, res, stim -''' + def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) ext_sequence = BertSequenceEncodingExtractor(return_input=True) - print('Initialized ext_seq') ext_cls = BertSequenceEncodingExtractor(return_special='[CLS]') - print('Initialized ext_cls') ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') - print('Initialized ext_pooler') ext_max = BertSequenceEncodingExtractor(pooling='max') - print('Initialized ext_max') # Test correct behavior when setting return_special assert ext_cls.pooling is None @@ -412,7 +408,7 @@ def test_bert_sequence_extractor(): del ext, ext_sequence, ext_cls, ext_pooler, ext_max -''' + def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') @@ -482,14 +478,13 @@ def test_bert_LM_extractor(): assert ext_target.mask == 'sentence' res_target_new = ext_target.transform(stim).to_df() assert all([res_target[c][0] != res_target_new[c][0] - for c in ['Target', 'Word', 'mask']]) + for c in ['Target', 'Word']]) with pytest.raises(ValueError) as err: ext_target.update_mask(new_mask=['some', 'mask']) assert 'must be a string' in str(err.value) # Test default mask assert res_default.shape[0] == 1 - assert res_default['mask'][0] == 'MASK' # Test return mask and input assert res_return_mask['true_word'][0] == 'is' @@ -506,7 +501,7 @@ def test_bert_LM_extractor(): ext_return_mask del res, res_masked, res_file, res_target, res_topn, res_threshold, \ res_default, res_return_mask -''' + def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') diff --git a/pliers/transformers/base.py b/pliers/transformers/base.py index 04bc1b10..f3dbc98a 100644 --- a/pliers/transformers/base.py +++ b/pliers/transformers/base.py @@ -315,3 +315,4 @@ def get_transformer(name, base=None, *args, **kwargs): return cls(*args, **kwargs) raise KeyError("No transformer named '%s' found." 
% name) + From fb7ef793071088f9d15f5e3d892ff22e0dc124bd Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 1 Apr 2020 09:03:54 +0200 Subject: [PATCH 60/89] skip test several models --- pliers/tests/extractors/test_text_extractors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 9d3480d4..76c2a706 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -317,7 +317,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token - +''' @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -343,7 +343,7 @@ def test_bert_other_models(model): # remove variables del ext, res, stim - +''' def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') From ae9737130c6398699966c3f45ae2541f4c8fa832 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 1 Apr 2020 09:24:21 +0200 Subject: [PATCH 61/89] only try one test --- pliers/tests/extractors/test_text_extractors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 76c2a706..74e138f2 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -343,7 +343,7 @@ def test_bert_other_models(model): # remove variables del ext, res, stim -''' + def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') @@ -531,7 +531,7 @@ def test_bert_sentiment_extractor(): del ext, ext_seq, ext_softmax del res, res_file, res_seq, res_softmax - +''' def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' From 95e3cd36c0c180fd319590702bbbcc9897065236 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 1 Apr 2020 09:30:42 +0200 Subject: [PATCH 62/89] no _log_attribute tests --- pliers/tests/extractors/test_text_extractors.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 74e138f2..ec1ff866 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -301,9 +301,6 @@ def test_bert_extractor(): assert res_file['duration'][5] == 0.5 assert res_file['object_id'][5] == 5 - # test model attributes - assert all([a in res_model_attr.columns for a in ext_base._model_attributes]) - # catch error if framework is invalid with pytest.raises(ValueError) as err: BertExtractor(framework='keras') @@ -317,7 +314,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token -''' + @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -361,7 +358,7 @@ def test_bert_sequence_extractor(): assert ext_pooler.return_special == 'pooler_output' res_sequence = ext_sequence.transform(stim).to_df() - res_file = ext.transform(stim_file).to_df() + res_file = ext_sequence.transform(stim_file).to_df() res_cls = ext_cls.transform(stim).to_df() res_pooler = ext_pooler.transform(stim).to_df() res_max = ext_max.transform(stim).to_df() @@ -458,7 +455,7 @@ def test_bert_LM_extractor(): # Check target words assert 
all([w.capitalize() in res_target.columns for w in target_wds]) - assert res_target.shape[1] == 13 + assert res_target.shape[1] == 6 # Check top_n assert res_topn.shape[1] == 111 @@ -531,7 +528,7 @@ def test_bert_sentiment_extractor(): del ext, ext_seq, ext_softmax del res, res_file, res_seq, res_softmax -''' + def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' From 4ee6dff5cccff52395531c66dd7f1be3426c0c48 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 1 Apr 2020 09:48:16 +0200 Subject: [PATCH 63/89] disable pytest caching --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 5ee7e32c..c1f4c6c2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ before_script: - python -m spacy download en_core_web_sm script: - py.test pliers/tests/test_* pliers/tests/converters pliers/tests/filters --cov=pliers --cov-report= -m "not requires_payment" -W ignore::UserWarning -- py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning +- py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning --cache-clear after_success: - coveralls before_cache: From 4b9e8849e5b1b19515ef368f68f664bc75340ee2 Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 1 Apr 2020 10:15:17 +0200 Subject: [PATCH 64/89] fix shape assertion --- .../tests/extractors/test_text_extractors.py | 25 +------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index ec1ff866..dfe3094d 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -306,11 +306,6 @@ def test_bert_extractor(): BertExtractor(framework='keras') assert 'Invalid framework' in str(err.value) - # delete the model - home = Path.home() - model_path = str(home / '.cache' / 'torch' / 'transformers') - shutil.rmtree(model_path) - # Delete the models del res, res_token, res_file, ext_base, ext_base_token @@ -387,9 +382,6 @@ def test_bert_sequence_extractor(): assert res_file['duration'][0] == 2.9 assert res_file['onset'][0] == 0.2 - # test tf vs. torch - assert np.isclose(cor, 1) - # catch error with wrong numpy function and wrong special token arg with pytest.raises(ValueError) as err: BertSequenceEncodingExtractor(pooling='avg') @@ -398,11 +390,6 @@ def test_bert_sequence_extractor(): BertSequenceEncodingExtractor(return_special='[MASK]') assert 'must be one of' in str(err.value) - # delete the model - home = Path.home() - model_path = str(home / '.cache' / 'torch' / 'transformers') - shutil.rmtree(model_path) - del ext, ext_sequence, ext_cls, ext_pooler, ext_max @@ -458,7 +445,7 @@ def test_bert_LM_extractor(): assert res_target.shape[1] == 6 # Check top_n - assert res_topn.shape[1] == 111 + assert res_topn.shape[1] == 104 assert all([res_topn.iloc[:,3][0] > res_topn.iloc[:,i][0] for i in range(4,103)]) # Check threshold and return_softmax @@ -488,11 +475,6 @@ def test_bert_LM_extractor(): assert 'true_word_score' in res_return_mask.columns assert res_return_mask['sequence'][0] == 'This is not a tokenized sentence .' 
- # delete the model - home = Path.home() - model_path = str(home / '.cache' / 'torch' / 'transformers') - shutil.rmtree(model_path) - # remove del ext, ext_masked, ext_target, ext_topn, ext_threshold, ext_default, \ ext_return_mask @@ -521,11 +503,6 @@ def test_bert_sentiment_extractor(): assert all([res_softmax[s][0] >= 0 for s in ['sent_pos','sent_neg'] ]) assert all([res_softmax[s][0] <= 1 for s in ['sent_pos','sent_neg'] ]) - # delete the model - home = Path.home() - model_path = str(home / '.cache' / 'torch' / 'transformers') - shutil.rmtree(model_path) - del ext, ext_seq, ext_softmax del res, res_file, res_seq, res_softmax From c98735e4703e71f0cfd39df682fd113c3710db2f Mon Sep 17 00:00:00 2001 From: rbroc Date: Wed, 1 Apr 2020 11:00:46 +0200 Subject: [PATCH 65/89] skip test models --- pliers/tests/extractors/test_text_extractors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index dfe3094d..4ce9802d 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -309,7 +309,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token - +''' @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -335,7 +335,7 @@ def test_bert_other_models(model): # remove variables del ext, res, stim - +''' def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') From 8950729ae44ebd96e72ca65a17efd3cc36c7c301 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 2 Apr 2020 11:18:25 +0200 Subject: [PATCH 66/89] no storing of extractors --- .../tests/extractors/test_text_extractors.py | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 4ce9802d..2b10d63a 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -341,22 +341,22 @@ def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - ext_sequence = BertSequenceEncodingExtractor(return_input=True) - ext_cls = BertSequenceEncodingExtractor(return_special='[CLS]') - ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') - ext_max = BertSequenceEncodingExtractor(pooling='max') + #ext_sequence = BertSequenceEncodingExtractor(return_input=True) + #ext_cls = BertSequenceEncodingExtractor(return_special='[CLS]') + #ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') + #ext_max = BertSequenceEncodingExtractor(pooling='max') # Test correct behavior when setting return_special - assert ext_cls.pooling is None - assert ext_pooler.pooling is None - assert ext_cls.return_special == '[CLS]' - assert ext_pooler.return_special == 'pooler_output' + #assert ext_cls.pooling is None + #assert ext_pooler.pooling is None + #assert ext_cls.return_special == '[CLS]' + #assert ext_pooler.return_special == 'pooler_output' - res_sequence = ext_sequence.transform(stim).to_df() - res_file = ext_sequence.transform(stim_file).to_df() - res_cls = ext_cls.transform(stim).to_df() - res_pooler = ext_pooler.transform(stim).to_df() - res_max = ext_max.transform(stim).to_df() + res_sequence = 
BertSequenceEncodingExtractor(return_input=True).transform(stim).to_df() + res_file = BertSequenceEncodingExtractor(return_input=True).transform(stim_file).to_df() + res_cls = BertSequenceEncodingExtractor(return_special='[CLS]').transform(stim).to_df() + res_pooler = BertSequenceEncodingExtractor(return_special='pooler_output').transform(stim).to_df() + res_max = BertSequenceEncodingExtractor(pooling='max').transform(stim).to_df() # Check shape assert len(res_sequence['encoding'][0]) == 768 @@ -390,7 +390,7 @@ def test_bert_sequence_extractor(): BertSequenceEncodingExtractor(return_special='[MASK]') assert 'must be one of' in str(err.value) - del ext, ext_sequence, ext_cls, ext_pooler, ext_max + #del ext, ext_sequence, ext_cls, ext_pooler, ext_max def test_bert_LM_extractor(): @@ -416,23 +416,23 @@ def test_bert_LM_extractor(): assert 'No valid target token' in str(err.value) target_wds = ['target','word'] - ext = BertLMExtractor(mask=2) - ext_masked = BertLMExtractor() + #ext = BertLMExtractor(mask=2) + #ext_masked = BertLMExtractor() ext_target = BertLMExtractor(mask=1, target=target_wds) - ext_topn = BertLMExtractor(mask=3, top_n=100) - ext_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True) - ext_default = BertLMExtractor() - ext_return_mask = BertLMExtractor(mask=1, top_n=10, - return_masked_word=True, return_input=True) - - res = ext.transform(stim).to_df() - res_masked = ext_masked.transform(stim_masked).to_df() - res_file = ext.transform(stim_file).to_df() + #ext_topn = BertLMExtractor(mask=3, top_n=100) + #ext_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True) + #ext_default = BertLMExtractor() + #ext_return_mask = BertLMExtractor(mask=1, top_n=10, + # return_masked_word=True, return_input=True) + + res = BertLMExtractor(mask=2).transform(stim).to_df() + res_masked = BertLMExtractor().transform(stim_masked).to_df() + res_file = BertLMExtractor(mask=2).transform(stim_file).to_df() res_target = ext_target.transform(stim).to_df() - res_topn = ext_topn.transform(stim).to_df() - res_threshold = ext_threshold.transform(stim).to_df() - res_default = ext_default.transform(stim_masked).to_df() - res_return_mask = ext_return_mask.transform(stim).to_df() + res_topn = BertLMExtractor(mask=3, top_n=100).transform(stim).to_df() + res_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True).transform(stim).to_df() + res_default = BertLMExtractor().transform(stim_masked).to_df() + res_return_mask = BertLMExtractor(mask=1, top_n=10, return_masked_word=True, return_input=True).transform(stim).to_df() assert res.shape[0] == 1 @@ -476,24 +476,24 @@ def test_bert_LM_extractor(): assert res_return_mask['sequence'][0] == 'This is not a tokenized sentence .' 
# remove - del ext, ext_masked, ext_target, ext_topn, ext_threshold, ext_default, \ - ext_return_mask - del res, res_masked, res_file, res_target, res_topn, res_threshold, \ - res_default, res_return_mask + #del ext, ext_masked, ext_target, ext_topn, ext_threshold, ext_default, \ + # ext_return_mask + #del res, res_masked, res_file, res_target, res_topn, res_threshold, \ + # res_default, res_return_mask def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - ext = BertSentimentExtractor() - ext_seq = BertSentimentExtractor(return_input=True) - ext_softmax = BertSentimentExtractor(return_softmax=True) + #ext = BertSentimentExtractor() + #ext_seq = BertSentimentExtractor(return_input=True) + #ext_softmax = BertSentimentExtractor(return_softmax=True) - res = ext.transform(stim).to_df() - res_file = ext.transform(stim_file).to_df() - res_seq = ext_seq.transform(stim).to_df() - res_softmax = ext_softmax.transform(stim).to_df() + res = BertSentimentExtractor().transform(stim).to_df() + res_file = BertSentimentExtractor().transform(stim_file).to_df() + res_seq = BertSentimentExtractor(return_input=True).transform(stim).to_df() + res_softmax = BertSentimentExtractor(return_softmax=True).transform(stim).to_df() assert res.shape[0] == 1 assert res_file['onset'][0] == 0.2 @@ -503,7 +503,7 @@ def test_bert_sentiment_extractor(): assert all([res_softmax[s][0] >= 0 for s in ['sent_pos','sent_neg'] ]) assert all([res_softmax[s][0] <= 1 for s in ['sent_pos','sent_neg'] ]) - del ext, ext_seq, ext_softmax + #del ext, ext_seq, ext_softmax del res, res_file, res_seq, res_softmax From 2543d83ad1eba416d1b4a396e4e202a5ddfcc92b Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 2 Apr 2020 11:58:42 +0200 Subject: [PATCH 67/89] no distilbert download --- pliers/tests/extractors/test_text_extractors.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 2b10d63a..a88090d9 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -309,7 +309,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token -''' + @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -318,8 +318,7 @@ def test_bert_other_models(model): stim = ComplexTextStim(text='ceci n\'est pas un pipe') else: stim = ComplexTextStim(text='This is not a tokenized sentence.') - ext = BertExtractor(pretrained_model=model, return_input=True) - res = ext.transform(stim).to_df() + res = BertExtractor(pretrained_model=model, return_input=True).transform(stim).to_df() if model == 'bert-large-uncased': shape = 1024 else: @@ -335,7 +334,7 @@ def test_bert_other_models(model): # remove variables del ext, res, stim -''' + def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') @@ -417,7 +416,6 @@ def test_bert_LM_extractor(): target_wds = ['target','word'] #ext = BertLMExtractor(mask=2) - #ext_masked = BertLMExtractor() ext_target = BertLMExtractor(mask=1, target=target_wds) #ext_topn = BertLMExtractor(mask=3, top_n=100) #ext_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True) @@ -426,7 +424,6 @@ def test_bert_LM_extractor(): # return_masked_word=True, return_input=True) res = 
BertLMExtractor(mask=2).transform(stim).to_df() - res_masked = BertLMExtractor().transform(stim_masked).to_df() res_file = BertLMExtractor(mask=2).transform(stim_file).to_df() res_target = ext_target.transform(stim).to_df() res_topn = BertLMExtractor(mask=3, top_n=100).transform(stim).to_df() @@ -481,7 +478,7 @@ def test_bert_LM_extractor(): #del res, res_masked, res_file, res_target, res_topn, res_threshold, \ # res_default, res_return_mask - +''' def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -505,7 +502,7 @@ def test_bert_sentiment_extractor(): #del ext, ext_seq, ext_softmax del res, res_file, res_seq, res_softmax - +''' def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' From 13bb469d149e97e41cf6158852fbae261160c0fb Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 2 Apr 2020 12:05:00 +0200 Subject: [PATCH 68/89] disable timeout --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c1f4c6c2..9519f046 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ before_script: - python -m spacy download en_core_web_sm script: - py.test pliers/tests/test_* pliers/tests/converters pliers/tests/filters --cov=pliers --cov-report= -m "not requires_payment" -W ignore::UserWarning -- py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning --cache-clear +- travis_wait 20 py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning --cache-clear after_success: - coveralls before_cache: From ecd9d9a50633e86977a9003ddc3f3b6c30beeb47 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 2 Apr 2020 12:35:16 +0200 Subject: [PATCH 69/89] remove all other models --- pliers/tests/extractors/test_text_extractors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index a88090d9..8d0364b9 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -309,7 +309,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token - +''' @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -333,8 +333,8 @@ def test_bert_other_models(model): shutil.rmtree(model_path) # remove variables - del ext, res, stim - + del res, stim +''' def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') From 23b121bb09ab3f75cb72d23b02edcd4cfbf53eec Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 3 Apr 2020 09:16:37 +0200 Subject: [PATCH 70/89] revert --- .../tests/extractors/test_text_extractors.py | 30 ++----------------- 1 file changed, 2 insertions(+), 28 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 8d0364b9..e6a85b2e 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -309,7 +309,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token -''' + @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 
'roberta-base','camembert-base']) @@ -334,21 +334,16 @@ def test_bert_other_models(model): # remove variables del res, stim -''' + def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - #ext_sequence = BertSequenceEncodingExtractor(return_input=True) - #ext_cls = BertSequenceEncodingExtractor(return_special='[CLS]') #ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') - #ext_max = BertSequenceEncodingExtractor(pooling='max') # Test correct behavior when setting return_special - #assert ext_cls.pooling is None #assert ext_pooler.pooling is None - #assert ext_cls.return_special == '[CLS]' #assert ext_pooler.return_special == 'pooler_output' res_sequence = BertSequenceEncodingExtractor(return_input=True).transform(stim).to_df() @@ -389,8 +384,6 @@ def test_bert_sequence_extractor(): BertSequenceEncodingExtractor(return_special='[MASK]') assert 'must be one of' in str(err.value) - #del ext, ext_sequence, ext_cls, ext_pooler, ext_max - def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') @@ -415,13 +408,7 @@ def test_bert_LM_extractor(): assert 'No valid target token' in str(err.value) target_wds = ['target','word'] - #ext = BertLMExtractor(mask=2) ext_target = BertLMExtractor(mask=1, target=target_wds) - #ext_topn = BertLMExtractor(mask=3, top_n=100) - #ext_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True) - #ext_default = BertLMExtractor() - #ext_return_mask = BertLMExtractor(mask=1, top_n=10, - # return_masked_word=True, return_input=True) res = BertLMExtractor(mask=2).transform(stim).to_df() res_file = BertLMExtractor(mask=2).transform(stim_file).to_df() @@ -472,21 +459,11 @@ def test_bert_LM_extractor(): assert 'true_word_score' in res_return_mask.columns assert res_return_mask['sequence'][0] == 'This is not a tokenized sentence .' 
- # remove - #del ext, ext_masked, ext_target, ext_topn, ext_threshold, ext_default, \ - # ext_return_mask - #del res, res_masked, res_file, res_target, res_topn, res_threshold, \ - # res_default, res_return_mask -''' def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - #ext = BertSentimentExtractor() - #ext_seq = BertSentimentExtractor(return_input=True) - #ext_softmax = BertSentimentExtractor(return_softmax=True) - res = BertSentimentExtractor().transform(stim).to_df() res_file = BertSentimentExtractor().transform(stim_file).to_df() res_seq = BertSentimentExtractor(return_input=True).transform(stim).to_df() @@ -500,9 +477,6 @@ def test_bert_sentiment_extractor(): assert all([res_softmax[s][0] >= 0 for s in ['sent_pos','sent_neg'] ]) assert all([res_softmax[s][0] <= 1 for s in ['sent_pos','sent_neg'] ]) - #del ext, ext_seq, ext_softmax - del res, res_file, res_seq, res_softmax -''' def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' From 736e661b76c366af512f4cdf97c0e609a8117c8d Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 3 Apr 2020 11:08:14 +0200 Subject: [PATCH 71/89] test text ext only --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 9519f046..2fbc9e49 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ before_script: - python -m spacy download en_core_web_sm script: - py.test pliers/tests/test_* pliers/tests/converters pliers/tests/filters --cov=pliers --cov-report= -m "not requires_payment" -W ignore::UserWarning -- travis_wait 20 py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning --cache-clear +- travis_wait 30 py.test pliers/tests/extractors/test_text_extractors.py --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning after_success: - coveralls before_cache: From 1cc1756310805cd0d342fc7aef60f4fd44e2002d Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 3 Apr 2020 11:30:56 +0200 Subject: [PATCH 72/89] no init target --- .../tests/extractors/test_text_extractors.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index e6a85b2e..82e2f51a 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -408,11 +408,11 @@ def test_bert_LM_extractor(): assert 'No valid target token' in str(err.value) target_wds = ['target','word'] - ext_target = BertLMExtractor(mask=1, target=target_wds) + #ext_target = BertLMExtractor(mask=1, target=target_wds) res = BertLMExtractor(mask=2).transform(stim).to_df() res_file = BertLMExtractor(mask=2).transform(stim_file).to_df() - res_target = ext_target.transform(stim).to_df() + #res_target = ext_target.transform(stim).to_df() res_topn = BertLMExtractor(mask=3, top_n=100).transform(stim).to_df() res_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True).transform(stim).to_df() res_default = BertLMExtractor().transform(stim_masked).to_df() @@ -425,8 +425,8 @@ def test_bert_LM_extractor(): assert res_file['duration'][0] == 0.2 # Check target words - assert all([w.capitalize() in res_target.columns for w in target_wds]) - assert res_target.shape[1] == 6 + #assert all([w.capitalize() in 
res_target.columns for w in target_wds]) + #assert res_target.shape[1] == 6 # Check top_n assert res_topn.shape[1] == 104 @@ -441,15 +441,15 @@ def test_bert_LM_extractor(): assert res_threshold[v.capitalize()][0] <= 1 # Test update mask method - assert ext_target.mask == 1 - ext_target.update_mask(new_mask='sentence') - assert ext_target.mask == 'sentence' - res_target_new = ext_target.transform(stim).to_df() - assert all([res_target[c][0] != res_target_new[c][0] - for c in ['Target', 'Word']]) - with pytest.raises(ValueError) as err: - ext_target.update_mask(new_mask=['some', 'mask']) - assert 'must be a string' in str(err.value) + #assert ext_target.mask == 1 + #ext_target.update_mask(new_mask='sentence') + #assert ext_target.mask == 'sentence' + #res_target_new = ext_target.transform(stim).to_df() + #assert all([res_target[c][0] != res_target_new[c][0] + # for c in ['Target', 'Word']]) + #with pytest.raises(ValueError) as err: + # ext_target.update_mask(new_mask=['some', 'mask']) + #assert 'must be a string' in str(err.value) # Test default mask assert res_default.shape[0] == 1 From f8b6d9082705579377fd7f7b7fa45ce74e1c2c0d Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 3 Apr 2020 13:17:57 +0200 Subject: [PATCH 73/89] separate file --- .travis.yml | 3 +- .../tests/extractors/test_bert_extractors.py | 236 ++++++++++++++++++ 2 files changed, 237 insertions(+), 2 deletions(-) create mode 100644 pliers/tests/extractors/test_bert_extractors.py diff --git a/.travis.yml b/.travis.yml index 2fbc9e49..39aaefbf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,8 +48,7 @@ before_script: - python -m pliers.support.download - python -m spacy download en_core_web_sm script: -- py.test pliers/tests/test_* pliers/tests/converters pliers/tests/filters --cov=pliers --cov-report= -m "not requires_payment" -W ignore::UserWarning -- travis_wait 30 py.test pliers/tests/extractors/test_text_extractors.py --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning + - travis_wait 30 py.test pliers/tests/extractors/test_bert_extractors.py --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning after_success: - coveralls before_cache: diff --git a/pliers/tests/extractors/test_bert_extractors.py b/pliers/tests/extractors/test_bert_extractors.py new file mode 100644 index 00000000..a1a5b28d --- /dev/null +++ b/pliers/tests/extractors/test_bert_extractors.py @@ -0,0 +1,236 @@ +from pliers import config +from pliers.extractors import (DictionaryExtractor, + PartOfSpeechExtractor, + LengthExtractor, + NumUniqueWordsExtractor, + PredefinedDictionaryExtractor, + TextVectorizerExtractor, + WordEmbeddingExtractor, + VADERSentimentExtractor, + SpaCyExtractor, + BertExtractor, + BertSequenceEncodingExtractor, + BertLMExtractor, + BertSentimentExtractor, + WordCounterExtractor) +from pliers.extractors.base import merge_results +from pliers.stimuli import TextStim, ComplexTextStim +from pliers.tests.utils import get_test_data_path +import numpy as np +from os.path import join +from pathlib import Path +import shutil +import pytest +import spacy +from os import environ +from transformers import BertTokenizer + +TEXT_DIR = join(get_test_data_path(), 'text') + +def test_bert_extractor(): + stim = ComplexTextStim(text='This is not a tokenized sentence.') + stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) + + ext_base = BertExtractor(pretrained_model='bert-base-uncased') + ext_base_token = BertExtractor(pretrained_model='bert-base-uncased', + 
return_input=True) + + base_result = ext_base.transform(stim) + res = base_result.to_df() + res_model_attr = base_result.to_df(include_attributes=True) + res_token = ext_base_token.transform(stim).to_df() + res_file = ext_base.transform(stim_file).to_df() + + # Test encoding shape + assert len(res['encoding'][0]) == 768 + assert len(res_file['encoding'][0]) == 768 + + # test base extractor + assert res.shape[0] == 8 + assert res_token.shape[0] == 8 + assert res_token['token'][5] == '##ized' + assert res_token['word'][5] == 'tokenized' + assert res_token['object_id'][5] == 5 + + # test base extractor on file + assert res_file.shape[0] == 8 + assert res_file['onset'][3] == 1.3 + assert res_file['duration'][5] == 0.5 + assert res_file['object_id'][5] == 5 + + # catch error if framework is invalid + with pytest.raises(ValueError) as err: + BertExtractor(framework='keras') + assert 'Invalid framework' in str(err.value) + + # Delete the models + del res, res_token, res_file, ext_base, ext_base_token + + +@pytest.mark.parametrize('model', ['bert-large-uncased', + 'distilbert-base-uncased', + 'roberta-base','camembert-base']) +def test_bert_other_models(model): + if model == 'camembert-base': + stim = ComplexTextStim(text='ceci n\'est pas un pipe') + else: + stim = ComplexTextStim(text='This is not a tokenized sentence.') + res = BertExtractor(pretrained_model=model, return_input=True).transform(stim).to_df() + if model == 'bert-large-uncased': + shape = 1024 + else: + shape = 768 + assert len(res['encoding'][0]) == shape + if model == 'camembert-base': + assert res['token'][4] == 'est' + + # delete the model + home = Path.home() + model_path = str(home / '.cache' / 'torch' / 'transformers') + shutil.rmtree(model_path) + + # remove variables + del res, stim + + +def test_bert_sequence_extractor(): + stim = ComplexTextStim(text='This is not a tokenized sentence.') + stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) + + #ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') + + # Test correct behavior when setting return_special + #assert ext_pooler.pooling is None + #assert ext_pooler.return_special == 'pooler_output' + + res_sequence = BertSequenceEncodingExtractor(return_input=True).transform(stim).to_df() + res_file = BertSequenceEncodingExtractor(return_input=True).transform(stim_file).to_df() + res_cls = BertSequenceEncodingExtractor(return_special='[CLS]').transform(stim).to_df() + res_pooler = BertSequenceEncodingExtractor(return_special='pooler_output').transform(stim).to_df() + res_max = BertSequenceEncodingExtractor(pooling='max').transform(stim).to_df() + + # Check shape + assert len(res_sequence['encoding'][0]) == 768 + assert len(res_cls['encoding'][0]) == 768 + assert len(res_pooler['encoding'][0]) == 768 + assert len(res_max['encoding'][0]) == 768 + assert res_sequence.shape[0] == 1 + assert res_cls.shape[0] == 1 + assert res_pooler.shape[0] == 1 + assert res_max.shape[0] == 1 + + # Make sure pooler/cls/no arguments return different encodings + assert res_sequence['encoding'][0] != res_cls['encoding'][0] + assert res_sequence['encoding'][0] != res_pooler['encoding'][0] + assert res_sequence['encoding'][0] != res_max['encoding'][0] + assert all([res_max['encoding'][0][i] >= res_sequence['encoding'][0][i] + for i in range(768)]) + + # test return sequence + assert res_sequence['sequence'][0] == 'This is not a tokenized sentence .' 
+ + # test file stim + assert res_file['duration'][0] == 2.9 + assert res_file['onset'][0] == 0.2 + + # catch error with wrong numpy function and wrong special token arg + with pytest.raises(ValueError) as err: + BertSequenceEncodingExtractor(pooling='avg') + assert 'valid numpy function' in str(err.value) + with pytest.raises(ValueError) as err: + BertSequenceEncodingExtractor(return_special='[MASK]') + assert 'must be one of' in str(err.value) + + +def test_bert_LM_extractor(): + stim = ComplexTextStim(text='This is not a tokenized sentence.') + stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') + stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) + + # Test mutual exclusivity and mask values + with pytest.raises(ValueError) as err: + BertLMExtractor(top_n=100, target='test') + assert 'mutually exclusive' in str(err.value) + with pytest.raises(ValueError) as err: + BertLMExtractor(top_n=100, threshold=.5) + assert 'mutually exclusive' in str(err.value) + with pytest.raises(ValueError) as err: + BertLMExtractor(target='test', threshold=.5) + assert 'mutually exclusive' in str(err.value) + with pytest.raises(ValueError) as err: + BertLMExtractor(mask=['test', 'mask']) + assert 'must be a string' in str(err.value) + with pytest.raises(ValueError) as err: + BertLMExtractor(target='nonwd') + assert 'No valid target token' in str(err.value) + + target_wds = ['target','word'] + #ext_target = BertLMExtractor(mask=1, target=target_wds) + + res = BertLMExtractor(mask=2).transform(stim).to_df() + res_file = BertLMExtractor(mask=2).transform(stim_file).to_df() + #res_target = ext_target.transform(stim).to_df() + res_topn = BertLMExtractor(mask=3, top_n=100).transform(stim).to_df() + res_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True).transform(stim).to_df() + res_default = BertLMExtractor().transform(stim_masked).to_df() + res_return_mask = BertLMExtractor(mask=1, top_n=10, return_masked_word=True, return_input=True).transform(stim).to_df() + + assert res.shape[0] == 1 + + # test onset/duration + assert res_file['onset'][0] == 1.0 + assert res_file['duration'][0] == 0.2 + + # Check target words + #assert all([w.capitalize() in res_target.columns for w in target_wds]) + #assert res_target.shape[1] == 6 + + # Check top_n + assert res_topn.shape[1] == 104 + assert all([res_topn.iloc[:,3][0] > res_topn.iloc[:,i][0] for i in range(4,103)]) + + # Check threshold and return_softmax + tknz = BertTokenizer.from_pretrained('bert-base-uncased') + vocab = tknz.vocab.keys() + for v in vocab: + if v.capitalize() in res_threshold.columns: + assert res_threshold[v.capitalize()][0] >= .1 + assert res_threshold[v.capitalize()][0] <= 1 + + # Test update mask method + #assert ext_target.mask == 1 + #ext_target.update_mask(new_mask='sentence') + #assert ext_target.mask == 'sentence' + #res_target_new = ext_target.transform(stim).to_df() + #assert all([res_target[c][0] != res_target_new[c][0] + # for c in ['Target', 'Word']]) + #with pytest.raises(ValueError) as err: + # ext_target.update_mask(new_mask=['some', 'mask']) + #assert 'must be a string' in str(err.value) + + # Test default mask + assert res_default.shape[0] == 1 + + # Test return mask and input + assert res_return_mask['true_word'][0] == 'is' + assert 'true_word_score' in res_return_mask.columns + assert res_return_mask['sequence'][0] == 'This is not a tokenized sentence .' 
+ + +def test_bert_sentiment_extractor(): + stim = ComplexTextStim(text='This is the best day of my life.') + stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) + + res = BertSentimentExtractor().transform(stim).to_df() + res_file = BertSentimentExtractor().transform(stim_file).to_df() + res_seq = BertSentimentExtractor(return_input=True).transform(stim).to_df() + res_softmax = BertSentimentExtractor(return_softmax=True).transform(stim).to_df() + + assert res.shape[0] == 1 + assert res_file['onset'][0] == 0.2 + assert res_file['duration'][0] == 2.9 + assert all([s in res.columns for s in ['sent_pos', 'sent_neg']]) + assert res_seq['sequence'][0] == 'This is the best day of my life .' + assert all([res_softmax[s][0] >= 0 for s in ['sent_pos','sent_neg'] ]) + assert all([res_softmax[s][0] <= 1 for s in ['sent_pos','sent_neg'] ]) + From a58e2963e92d386a7cbad0de635468f9d1cc1f0b Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 3 Apr 2020 15:47:11 +0200 Subject: [PATCH 74/89] try last test edit --- .travis.yml | 2 +- .../tests/extractors/test_text_extractors.py | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 39aaefbf..180b327f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,7 +48,7 @@ before_script: - python -m pliers.support.download - python -m spacy download en_core_web_sm script: - - travis_wait 30 py.test pliers/tests/extractors/test_bert_extractors.py --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning + - travis_wait 30 py.test pliers/tests/extractors/test_text_extractors.py --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning after_success: - coveralls before_cache: diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 82e2f51a..33753754 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -384,6 +384,14 @@ def test_bert_sequence_extractor(): BertSequenceEncodingExtractor(return_special='[MASK]') assert 'must be one of' in str(err.value) + # delete the model + home = Path.home() + model_path = str(home / '.cache' / 'torch' / 'transformers') + shutil.rmtree(model_path) + + # remove variables + del res, stim + def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') @@ -459,6 +467,13 @@ def test_bert_LM_extractor(): assert 'true_word_score' in res_return_mask.columns assert res_return_mask['sequence'][0] == 'This is not a tokenized sentence .' 
+ # delete the model + home = Path.home() + model_path = str(home / '.cache' / 'torch' / 'transformers') + shutil.rmtree(model_path) + + # remove variables + del res, stim def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') @@ -477,6 +492,14 @@ def test_bert_sentiment_extractor(): assert all([res_softmax[s][0] >= 0 for s in ['sent_pos','sent_neg'] ]) assert all([res_softmax[s][0] <= 1 for s in ['sent_pos','sent_neg'] ]) + # delete the model + home = Path.home() + model_path = str(home / '.cache' / 'torch' / 'transformers') + shutil.rmtree(model_path) + + # remove variables + del res, stim + def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' From e9c2edb3e9dcfe20eec2669e1ca09c37115a8a54 Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 3 Apr 2020 16:33:20 +0200 Subject: [PATCH 75/89] revert --- .travis.yml | 3 +- .../tests/extractors/test_text_extractors.py | 48 +++++++++++-------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/.travis.yml b/.travis.yml index 180b327f..ac00eddd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -48,7 +48,8 @@ before_script: - python -m pliers.support.download - python -m spacy download en_core_web_sm script: - - travis_wait 30 py.test pliers/tests/extractors/test_text_extractors.py --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning + - py.test pliers/tests/test_* pliers/tests/converters pliers/tests/filters --cov=pliers --cov-report= -m "not requires_payment" -W ignore::UserWarning + - py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning after_success: - coveralls before_cache: diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 33753754..9c8088fa 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -277,12 +277,14 @@ def test_bert_extractor(): ext_base = BertExtractor(pretrained_model='bert-base-uncased') ext_base_token = BertExtractor(pretrained_model='bert-base-uncased', return_input=True) + ext_tf = BertExtractor(pretrained_model='bert-base-uncased', framework='tf') base_result = ext_base.transform(stim) res = base_result.to_df() res_model_attr = base_result.to_df(include_attributes=True) res_token = ext_base_token.transform(stim).to_df() res_file = ext_base.transform(stim_file).to_df() + res_tf = ext_tf.transform(stim_file).to_df() # Test encoding shape assert len(res['encoding'][0]) == 768 @@ -301,6 +303,11 @@ def test_bert_extractor(): assert res_file['duration'][5] == 0.5 assert res_file['object_id'][5] == 5 + # test tf vs torch + cors = [np.corrcoef(res['encoding'][i], res_tf['encoding'][i])[0,1] + for i in range(res.shape[0])] + assert all(np.isclose(cors, 1)) + # catch error if framework is invalid with pytest.raises(ValueError) as err: BertExtractor(framework='keras') @@ -340,16 +347,16 @@ def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - #ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') + ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') # Test correct behavior when setting return_special - #assert ext_pooler.pooling is None - #assert ext_pooler.return_special == 'pooler_output' + assert ext_pooler.pooling is None + assert 
ext_pooler.return_special == 'pooler_output' res_sequence = BertSequenceEncodingExtractor(return_input=True).transform(stim).to_df() res_file = BertSequenceEncodingExtractor(return_input=True).transform(stim_file).to_df() res_cls = BertSequenceEncodingExtractor(return_special='[CLS]').transform(stim).to_df() - res_pooler = BertSequenceEncodingExtractor(return_special='pooler_output').transform(stim).to_df() + res_pooler = ext_pooler.transform(stim).to_df() res_max = BertSequenceEncodingExtractor(pooling='max').transform(stim).to_df() # Check shape @@ -390,7 +397,7 @@ def test_bert_sequence_extractor(): shutil.rmtree(model_path) # remove variables - del res, stim + del ext_pooler, res_cls, res_max, res_pooler, res_sequence, res_file, stim def test_bert_LM_extractor(): @@ -416,11 +423,11 @@ def test_bert_LM_extractor(): assert 'No valid target token' in str(err.value) target_wds = ['target','word'] - #ext_target = BertLMExtractor(mask=1, target=target_wds) + ext_target = BertLMExtractor(mask=1, target=target_wds) res = BertLMExtractor(mask=2).transform(stim).to_df() res_file = BertLMExtractor(mask=2).transform(stim_file).to_df() - #res_target = ext_target.transform(stim).to_df() + res_target = ext_target.transform(stim).to_df() res_topn = BertLMExtractor(mask=3, top_n=100).transform(stim).to_df() res_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True).transform(stim).to_df() res_default = BertLMExtractor().transform(stim_masked).to_df() @@ -433,8 +440,8 @@ def test_bert_LM_extractor(): assert res_file['duration'][0] == 0.2 # Check target words - #assert all([w.capitalize() in res_target.columns for w in target_wds]) - #assert res_target.shape[1] == 6 + assert all([w.capitalize() in res_target.columns for w in target_wds]) + assert res_target.shape[1] == 6 # Check top_n assert res_topn.shape[1] == 104 @@ -449,15 +456,15 @@ def test_bert_LM_extractor(): assert res_threshold[v.capitalize()][0] <= 1 # Test update mask method - #assert ext_target.mask == 1 - #ext_target.update_mask(new_mask='sentence') - #assert ext_target.mask == 'sentence' - #res_target_new = ext_target.transform(stim).to_df() - #assert all([res_target[c][0] != res_target_new[c][0] - # for c in ['Target', 'Word']]) - #with pytest.raises(ValueError) as err: - # ext_target.update_mask(new_mask=['some', 'mask']) - #assert 'must be a string' in str(err.value) + assert ext_target.mask == 1 + ext_target.update_mask(new_mask='sentence') + assert ext_target.mask == 'sentence' + res_target_new = ext_target.transform(stim).to_df() + assert all([res_target[c][0] != res_target_new[c][0] + for c in ['Target', 'Word']]) + with pytest.raises(ValueError) as err: + ext_target.update_mask(new_mask=['some', 'mask']) + assert 'must be a string' in str(err.value) # Test default mask assert res_default.shape[0] == 1 @@ -473,7 +480,8 @@ def test_bert_LM_extractor(): shutil.rmtree(model_path) # remove variables - del res, stim + del ext_target, res, res_file, res_target, res_topn, \ + res_threshold, res_default, res_return_mask def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') @@ -498,7 +506,7 @@ def test_bert_sentiment_extractor(): shutil.rmtree(model_path) # remove variables - del res, stim + del res, res_file, res_seq, res_softmax def test_word_counter_extractor(): From 706100b172fc19cbf7aa24da8828ed284db02fd5 Mon Sep 17 00:00:00 2001 From: rbroc Date: Fri, 3 Apr 2020 19:31:50 +0200 Subject: [PATCH 76/89] do not clear models --- .../tests/extractors/test_text_extractors.py | 20 
------------------- 1 file changed, 20 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 9c8088fa..6cecd097 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -334,11 +334,6 @@ def test_bert_other_models(model): if model == 'camembert-base': assert res['token'][4] == 'est' - # delete the model - home = Path.home() - model_path = str(home / '.cache' / 'torch' / 'transformers') - shutil.rmtree(model_path) - # remove variables del res, stim @@ -391,11 +386,6 @@ def test_bert_sequence_extractor(): BertSequenceEncodingExtractor(return_special='[MASK]') assert 'must be one of' in str(err.value) - # delete the model - home = Path.home() - model_path = str(home / '.cache' / 'torch' / 'transformers') - shutil.rmtree(model_path) - # remove variables del ext_pooler, res_cls, res_max, res_pooler, res_sequence, res_file, stim @@ -474,11 +464,6 @@ def test_bert_LM_extractor(): assert 'true_word_score' in res_return_mask.columns assert res_return_mask['sequence'][0] == 'This is not a tokenized sentence .' - # delete the model - home = Path.home() - model_path = str(home / '.cache' / 'torch' / 'transformers') - shutil.rmtree(model_path) - # remove variables del ext_target, res, res_file, res_target, res_topn, \ res_threshold, res_default, res_return_mask @@ -500,11 +485,6 @@ def test_bert_sentiment_extractor(): assert all([res_softmax[s][0] >= 0 for s in ['sent_pos','sent_neg'] ]) assert all([res_softmax[s][0] <= 1 for s in ['sent_pos','sent_neg'] ]) - # delete the model - home = Path.home() - model_path = str(home / '.cache' / 'torch' / 'transformers') - shutil.rmtree(model_path) - # remove variables del res, res_file, res_seq, res_softmax From f876fac0bdde366fbfdee8940a01af49d592a116 Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 6 Apr 2020 09:17:30 +0200 Subject: [PATCH 77/89] add markers --- .travis.yml | 2 +- pliers/tests/extractors/test_text_extractors.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index ac00eddd..4ce8f885 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ before_script: - python -m spacy download en_core_web_sm script: - py.test pliers/tests/test_* pliers/tests/converters pliers/tests/filters --cov=pliers --cov-report= -m "not requires_payment" -W ignore::UserWarning - - py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning + - py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" "not high_mem" --cov-append -W ignore::UserWarning after_success: - coveralls before_cache: diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 6cecd097..246cd134 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -316,7 +316,7 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token - +@pytest.mark.high_mem @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -337,7 +337,6 @@ def test_bert_other_models(model): # remove variables del res, stim - def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -468,6 +467,7 @@ def 
test_bert_LM_extractor(): del ext_target, res, res_file, res_target, res_topn, \ res_threshold, res_default, res_return_mask +@pytest.mark.high_mem def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) From d2fa69867b1cc02985f765ce261f2dfcaf2dab03 Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 6 Apr 2020 09:52:13 +0200 Subject: [PATCH 78/89] fix travis flag --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4ce8f885..49ac6ee9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ before_script: - python -m spacy download en_core_web_sm script: - py.test pliers/tests/test_* pliers/tests/converters pliers/tests/filters --cov=pliers --cov-report= -m "not requires_payment" -W ignore::UserWarning - - py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" "not high_mem" --cov-append -W ignore::UserWarning + - py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" -m "not high_mem" --cov-append -W ignore::UserWarning after_success: - coveralls before_cache: From 561bd91037a5554a53e7c089ae66f7d6c618c145 Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 6 Apr 2020 11:46:49 +0200 Subject: [PATCH 79/89] mark all as high mem --- pliers/tests/extractors/test_text_extractors.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 246cd134..97d9debf 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -269,7 +269,6 @@ def test_spacy_doc_extractor(): assert result['is_tagged'][3] assert result['is_sentenced'][3] - def test_bert_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -337,6 +336,7 @@ def test_bert_other_models(model): # remove variables del res, stim +@pytest.mark.high_mem def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -388,7 +388,7 @@ def test_bert_sequence_extractor(): # remove variables del ext_pooler, res_cls, res_max, res_pooler, res_sequence, res_file, stim - +@pytest.mark.high_mem def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') @@ -488,7 +488,6 @@ def test_bert_sentiment_extractor(): # remove variables del res, res_file, res_seq, res_softmax - def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' ' again and again Sometimes they are ' From f4a2690067b36901b04f5ab06a1c0da5604d0cdd Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 6 Apr 2020 13:34:03 +0200 Subject: [PATCH 80/89] skipif in travis --- .travis.yml | 2 +- pliers/tests/extractors/test_text_extractors.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 49ac6ee9..ac00eddd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ before_script: - python -m spacy download en_core_web_sm script: - py.test pliers/tests/test_* pliers/tests/converters pliers/tests/filters --cov=pliers --cov-report= -m "not requires_payment" -W ignore::UserWarning - - 
py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" -m "not high_mem" --cov-append -W ignore::UserWarning + - py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning after_success: - coveralls before_cache: diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 97d9debf..b31147f0 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -315,7 +315,8 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token -@pytest.mark.high_mem +@pytest.mark.skipif(environ.get('TRAVIS', False) == 'true', + reason='high memory') @pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) @@ -336,7 +337,6 @@ def test_bert_other_models(model): # remove variables del res, stim -@pytest.mark.high_mem def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -388,7 +388,6 @@ def test_bert_sequence_extractor(): # remove variables del ext_pooler, res_cls, res_max, res_pooler, res_sequence, res_file, stim -@pytest.mark.high_mem def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') @@ -467,7 +466,8 @@ def test_bert_LM_extractor(): del ext_target, res, res_file, res_target, res_topn, \ res_threshold, res_default, res_return_mask -@pytest.mark.high_mem +@pytest.mark.skipif(environ.get('TRAVIS', False) == 'true', + reason='high memory') def test_bert_sentiment_extractor(): stim = ComplexTextStim(text='This is the best day of my life.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) From 1142d14ede5143cb9f56ab887e43be3f1419863a Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 6 Apr 2020 17:32:31 +0200 Subject: [PATCH 81/89] clear cache opt --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index ac00eddd..2047a68e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ before_script: - python -m spacy download en_core_web_sm script: - py.test pliers/tests/test_* pliers/tests/converters pliers/tests/filters --cov=pliers --cov-report= -m "not requires_payment" -W ignore::UserWarning - - py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning + - py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning --cache-clear after_success: - coveralls before_cache: From e78e921a517e6732e06c7acc9ddf781816b0ad97 Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 6 Apr 2020 17:47:14 +0200 Subject: [PATCH 82/89] delete bert_test file --- .../tests/extractors/test_bert_extractors.py | 236 ------------------ 1 file changed, 236 deletions(-) delete mode 100644 pliers/tests/extractors/test_bert_extractors.py diff --git a/pliers/tests/extractors/test_bert_extractors.py b/pliers/tests/extractors/test_bert_extractors.py deleted file mode 100644 index a1a5b28d..00000000 --- a/pliers/tests/extractors/test_bert_extractors.py +++ /dev/null @@ -1,236 +0,0 @@ -from pliers import config -from pliers.extractors import (DictionaryExtractor, - PartOfSpeechExtractor, - LengthExtractor, - 
NumUniqueWordsExtractor, - PredefinedDictionaryExtractor, - TextVectorizerExtractor, - WordEmbeddingExtractor, - VADERSentimentExtractor, - SpaCyExtractor, - BertExtractor, - BertSequenceEncodingExtractor, - BertLMExtractor, - BertSentimentExtractor, - WordCounterExtractor) -from pliers.extractors.base import merge_results -from pliers.stimuli import TextStim, ComplexTextStim -from pliers.tests.utils import get_test_data_path -import numpy as np -from os.path import join -from pathlib import Path -import shutil -import pytest -import spacy -from os import environ -from transformers import BertTokenizer - -TEXT_DIR = join(get_test_data_path(), 'text') - -def test_bert_extractor(): - stim = ComplexTextStim(text='This is not a tokenized sentence.') - stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - - ext_base = BertExtractor(pretrained_model='bert-base-uncased') - ext_base_token = BertExtractor(pretrained_model='bert-base-uncased', - return_input=True) - - base_result = ext_base.transform(stim) - res = base_result.to_df() - res_model_attr = base_result.to_df(include_attributes=True) - res_token = ext_base_token.transform(stim).to_df() - res_file = ext_base.transform(stim_file).to_df() - - # Test encoding shape - assert len(res['encoding'][0]) == 768 - assert len(res_file['encoding'][0]) == 768 - - # test base extractor - assert res.shape[0] == 8 - assert res_token.shape[0] == 8 - assert res_token['token'][5] == '##ized' - assert res_token['word'][5] == 'tokenized' - assert res_token['object_id'][5] == 5 - - # test base extractor on file - assert res_file.shape[0] == 8 - assert res_file['onset'][3] == 1.3 - assert res_file['duration'][5] == 0.5 - assert res_file['object_id'][5] == 5 - - # catch error if framework is invalid - with pytest.raises(ValueError) as err: - BertExtractor(framework='keras') - assert 'Invalid framework' in str(err.value) - - # Delete the models - del res, res_token, res_file, ext_base, ext_base_token - - -@pytest.mark.parametrize('model', ['bert-large-uncased', - 'distilbert-base-uncased', - 'roberta-base','camembert-base']) -def test_bert_other_models(model): - if model == 'camembert-base': - stim = ComplexTextStim(text='ceci n\'est pas un pipe') - else: - stim = ComplexTextStim(text='This is not a tokenized sentence.') - res = BertExtractor(pretrained_model=model, return_input=True).transform(stim).to_df() - if model == 'bert-large-uncased': - shape = 1024 - else: - shape = 768 - assert len(res['encoding'][0]) == shape - if model == 'camembert-base': - assert res['token'][4] == 'est' - - # delete the model - home = Path.home() - model_path = str(home / '.cache' / 'torch' / 'transformers') - shutil.rmtree(model_path) - - # remove variables - del res, stim - - -def test_bert_sequence_extractor(): - stim = ComplexTextStim(text='This is not a tokenized sentence.') - stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - - #ext_pooler = BertSequenceEncodingExtractor(return_special='pooler_output') - - # Test correct behavior when setting return_special - #assert ext_pooler.pooling is None - #assert ext_pooler.return_special == 'pooler_output' - - res_sequence = BertSequenceEncodingExtractor(return_input=True).transform(stim).to_df() - res_file = BertSequenceEncodingExtractor(return_input=True).transform(stim_file).to_df() - res_cls = BertSequenceEncodingExtractor(return_special='[CLS]').transform(stim).to_df() - res_pooler = BertSequenceEncodingExtractor(return_special='pooler_output').transform(stim).to_df() - res_max = 
BertSequenceEncodingExtractor(pooling='max').transform(stim).to_df() - - # Check shape - assert len(res_sequence['encoding'][0]) == 768 - assert len(res_cls['encoding'][0]) == 768 - assert len(res_pooler['encoding'][0]) == 768 - assert len(res_max['encoding'][0]) == 768 - assert res_sequence.shape[0] == 1 - assert res_cls.shape[0] == 1 - assert res_pooler.shape[0] == 1 - assert res_max.shape[0] == 1 - - # Make sure pooler/cls/no arguments return different encodings - assert res_sequence['encoding'][0] != res_cls['encoding'][0] - assert res_sequence['encoding'][0] != res_pooler['encoding'][0] - assert res_sequence['encoding'][0] != res_max['encoding'][0] - assert all([res_max['encoding'][0][i] >= res_sequence['encoding'][0][i] - for i in range(768)]) - - # test return sequence - assert res_sequence['sequence'][0] == 'This is not a tokenized sentence .' - - # test file stim - assert res_file['duration'][0] == 2.9 - assert res_file['onset'][0] == 0.2 - - # catch error with wrong numpy function and wrong special token arg - with pytest.raises(ValueError) as err: - BertSequenceEncodingExtractor(pooling='avg') - assert 'valid numpy function' in str(err.value) - with pytest.raises(ValueError) as err: - BertSequenceEncodingExtractor(return_special='[MASK]') - assert 'must be one of' in str(err.value) - - -def test_bert_LM_extractor(): - stim = ComplexTextStim(text='This is not a tokenized sentence.') - stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') - stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - - # Test mutual exclusivity and mask values - with pytest.raises(ValueError) as err: - BertLMExtractor(top_n=100, target='test') - assert 'mutually exclusive' in str(err.value) - with pytest.raises(ValueError) as err: - BertLMExtractor(top_n=100, threshold=.5) - assert 'mutually exclusive' in str(err.value) - with pytest.raises(ValueError) as err: - BertLMExtractor(target='test', threshold=.5) - assert 'mutually exclusive' in str(err.value) - with pytest.raises(ValueError) as err: - BertLMExtractor(mask=['test', 'mask']) - assert 'must be a string' in str(err.value) - with pytest.raises(ValueError) as err: - BertLMExtractor(target='nonwd') - assert 'No valid target token' in str(err.value) - - target_wds = ['target','word'] - #ext_target = BertLMExtractor(mask=1, target=target_wds) - - res = BertLMExtractor(mask=2).transform(stim).to_df() - res_file = BertLMExtractor(mask=2).transform(stim_file).to_df() - #res_target = ext_target.transform(stim).to_df() - res_topn = BertLMExtractor(mask=3, top_n=100).transform(stim).to_df() - res_threshold = BertLMExtractor(mask=4, threshold=.1, return_softmax=True).transform(stim).to_df() - res_default = BertLMExtractor().transform(stim_masked).to_df() - res_return_mask = BertLMExtractor(mask=1, top_n=10, return_masked_word=True, return_input=True).transform(stim).to_df() - - assert res.shape[0] == 1 - - # test onset/duration - assert res_file['onset'][0] == 1.0 - assert res_file['duration'][0] == 0.2 - - # Check target words - #assert all([w.capitalize() in res_target.columns for w in target_wds]) - #assert res_target.shape[1] == 6 - - # Check top_n - assert res_topn.shape[1] == 104 - assert all([res_topn.iloc[:,3][0] > res_topn.iloc[:,i][0] for i in range(4,103)]) - - # Check threshold and return_softmax - tknz = BertTokenizer.from_pretrained('bert-base-uncased') - vocab = tknz.vocab.keys() - for v in vocab: - if v.capitalize() in res_threshold.columns: - assert res_threshold[v.capitalize()][0] >= .1 - assert 
res_threshold[v.capitalize()][0] <= 1 - - # Test update mask method - #assert ext_target.mask == 1 - #ext_target.update_mask(new_mask='sentence') - #assert ext_target.mask == 'sentence' - #res_target_new = ext_target.transform(stim).to_df() - #assert all([res_target[c][0] != res_target_new[c][0] - # for c in ['Target', 'Word']]) - #with pytest.raises(ValueError) as err: - # ext_target.update_mask(new_mask=['some', 'mask']) - #assert 'must be a string' in str(err.value) - - # Test default mask - assert res_default.shape[0] == 1 - - # Test return mask and input - assert res_return_mask['true_word'][0] == 'is' - assert 'true_word_score' in res_return_mask.columns - assert res_return_mask['sequence'][0] == 'This is not a tokenized sentence .' - - -def test_bert_sentiment_extractor(): - stim = ComplexTextStim(text='This is the best day of my life.') - stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) - - res = BertSentimentExtractor().transform(stim).to_df() - res_file = BertSentimentExtractor().transform(stim_file).to_df() - res_seq = BertSentimentExtractor(return_input=True).transform(stim).to_df() - res_softmax = BertSentimentExtractor(return_softmax=True).transform(stim).to_df() - - assert res.shape[0] == 1 - assert res_file['onset'][0] == 0.2 - assert res_file['duration'][0] == 2.9 - assert all([s in res.columns for s in ['sent_pos', 'sent_neg']]) - assert res_seq['sequence'][0] == 'This is the best day of my life .' - assert all([res_softmax[s][0] >= 0 for s in ['sent_pos','sent_neg'] ]) - assert all([res_softmax[s][0] <= 1 for s in ['sent_pos','sent_neg'] ]) - From 0e59c257a5abe268447bd9662fc076b78c8f5c28 Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 6 Apr 2020 18:16:59 +0200 Subject: [PATCH 83/89] checkpoint --- pliers/tests/extractors/test_text_extractors.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index b31147f0..3445b40a 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -269,6 +269,7 @@ def test_spacy_doc_extractor(): assert result['is_tagged'][3] assert result['is_sentenced'][3] + def test_bert_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -315,9 +316,10 @@ def test_bert_extractor(): # Delete the models del res, res_token, res_file, ext_base, ext_base_token + @pytest.mark.skipif(environ.get('TRAVIS', False) == 'true', reason='high memory') -@pytest.mark.parametrize('model', ['bert-large-uncased', +@pytest.mark.parametrize('model', ['bert-large-uncased', 'distilbert-base-uncased', 'roberta-base','camembert-base']) def test_bert_other_models(model): @@ -337,6 +339,7 @@ def test_bert_other_models(model): # remove variables del res, stim + def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) @@ -348,7 +351,7 @@ def test_bert_sequence_extractor(): assert ext_pooler.return_special == 'pooler_output' res_sequence = BertSequenceEncodingExtractor(return_input=True).transform(stim).to_df() - res_file = BertSequenceEncodingExtractor(return_input=True).transform(stim_file).to_df() + res_file = BertSequenceEncodingExtractor(return_input=True).transform(stim_file).to_df() res_cls = 
BertSequenceEncodingExtractor(return_special='[CLS]').transform(stim).to_df() res_pooler = ext_pooler.transform(stim).to_df() res_max = BertSequenceEncodingExtractor(pooling='max').transform(stim).to_df() @@ -388,6 +391,7 @@ def test_bert_sequence_extractor(): # remove variables del ext_pooler, res_cls, res_max, res_pooler, res_sequence, res_file, stim + def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') @@ -466,6 +470,7 @@ def test_bert_LM_extractor(): del ext_target, res, res_file, res_target, res_topn, \ res_threshold, res_default, res_return_mask + @pytest.mark.skipif(environ.get('TRAVIS', False) == 'true', reason='high memory') def test_bert_sentiment_extractor(): @@ -488,6 +493,7 @@ def test_bert_sentiment_extractor(): # remove variables del res, res_file, res_seq, res_softmax + def test_word_counter_extractor(): stim_txt = ComplexTextStim(text='This is a text where certain words occur' ' again and again Sometimes they are ' From 2ac62dfa12d34b1fe593d97e10104d16b8dc0bab Mon Sep 17 00:00:00 2001 From: rbroc Date: Mon, 6 Apr 2020 18:31:37 +0200 Subject: [PATCH 84/89] tf test --- pliers/tests/extractors/test_text_extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 3445b40a..5b996c58 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -284,7 +284,7 @@ def test_bert_extractor(): res_model_attr = base_result.to_df(include_attributes=True) res_token = ext_base_token.transform(stim).to_df() res_file = ext_base.transform(stim_file).to_df() - res_tf = ext_tf.transform(stim_file).to_df() + res_tf = ext_tf.transform(stim).to_df() # Test encoding shape assert len(res['encoding'][0]) == 768 From a8bda72d37217b54203f6627c60b64a095fa82fc Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 7 Apr 2020 09:25:25 +0200 Subject: [PATCH 85/89] checkpoint --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 2047a68e..ac00eddd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -49,7 +49,7 @@ before_script: - python -m spacy download en_core_web_sm script: - py.test pliers/tests/test_* pliers/tests/converters pliers/tests/filters --cov=pliers --cov-report= -m "not requires_payment" -W ignore::UserWarning - - py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning --cache-clear + - py.test pliers/tests/extractors --cov=pliers --cov-report= -m "not requires_payment" --cov-append -W ignore::UserWarning after_success: - coveralls before_cache: From 10e157f6f5f46614d5425850fb81266c50019aba Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 7 Apr 2020 09:39:11 +0200 Subject: [PATCH 86/89] skip one more test --- pliers/tests/extractors/test_text_extractors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index 5b996c58..c904001d 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -340,6 +340,8 @@ def test_bert_other_models(model): del res, stim +@pytest.mark.skipif(environ.get('TRAVIS', False) == 'true', + reason='high memory') def test_bert_sequence_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') 
stim_file = ComplexTextStim(join(TEXT_DIR, 'sentence_with_header.txt')) From 4a077bf37ca610a7728ba53f6141a32a79f81d67 Mon Sep 17 00:00:00 2001 From: rbroc Date: Tue, 7 Apr 2020 11:19:06 +0200 Subject: [PATCH 87/89] only run encoding extractor --- pliers/tests/extractors/test_text_extractors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pliers/tests/extractors/test_text_extractors.py b/pliers/tests/extractors/test_text_extractors.py index c904001d..3af007bd 100644 --- a/pliers/tests/extractors/test_text_extractors.py +++ b/pliers/tests/extractors/test_text_extractors.py @@ -394,6 +394,8 @@ def test_bert_sequence_extractor(): del ext_pooler, res_cls, res_max, res_pooler, res_sequence, res_file, stim +@pytest.mark.skipif(environ.get('TRAVIS', False) == 'true', + reason='high memory') def test_bert_LM_extractor(): stim = ComplexTextStim(text='This is not a tokenized sentence.') stim_masked = ComplexTextStim(text='This is MASK tokenized sentence.') From 380950b1b7e2266804f3de94b4e2a1c1c9e4e7e8 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 16 Apr 2020 17:32:34 +0200 Subject: [PATCH 88/89] add docstring to update_mask --- pliers/extractors/text.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index 1386cbe8..c1b3cb27 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -697,6 +697,11 @@ def __init__(self, self.return_masked_word = return_masked_word def update_mask(self, new_mask): + ''' Updates mask attribute with value of new_mask. + Args: + new_mask (str or int): word to mask (str) or index/position of the + word to mask in input sequence (int). Indexing starts at 0. + ''' if type(new_mask) not in [str, int]: raise ValueError('Mask must be a string or an integer.') self.mask = new_mask From 71d9b7c3395bf10c7a585d9d38a6dd26127688d2 Mon Sep 17 00:00:00 2001 From: rbroc Date: Thu, 16 Apr 2020 17:34:57 +0200 Subject: [PATCH 89/89] remove abc import --- pliers/extractors/text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pliers/extractors/text.py b/pliers/extractors/text.py index c1b3cb27..52ff112e 100644 --- a/pliers/extractors/text.py +++ b/pliers/extractors/text.py @@ -2,7 +2,6 @@ Extractors that operate primarily or exclusively on Text stimuli. ''' import sys -from abc import ABCMeta, abstractmethod from pliers.stimuli.text import TextStim, ComplexTextStim from pliers.extractors.base import Extractor, ExtractorResult from pliers.support.exceptions import PliersError
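A short usage sketch for the update_mask API documented in PATCH 88/89 above. This is an illustrative example only, not part of any patch in the series; it assumes BertLMExtractor and ComplexTextStim are importable as in the deleted test_bert_extractors.py file, and that the default pretrained 'bert-base-uncased' weights can be fetched by the extractor.

    import pytest
    from pliers.stimuli import ComplexTextStim
    from pliers.extractors import BertLMExtractor

    stim = ComplexTextStim(text='This is not a tokenized sentence.')

    # Mask by position: index 2 ('not'), with indexing starting at 0 as the
    # new docstring specifies.
    ext = BertLMExtractor(mask=2, top_n=10)
    by_position = ext.transform(stim).to_df()

    # Re-point the same extractor at a literal word instead of an index;
    # update_mask accepts either a str (word) or an int (position).
    ext.update_mask(new_mask='sentence')
    by_word = ext.transform(stim).to_df()

    # Anything other than str or int is rejected, mirroring the check in
    # update_mask that raises 'Mask must be a string or an integer.'
    with pytest.raises(ValueError):
        ext.update_mask(new_mask=['some', 'mask'])

Reusing one extractor across masks this way appears to be the point of exposing update_mask: the pretrained model and tokenizer are loaded once, and only the mask attribute changes between transform calls.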