diff --git a/pliers/converters/api.py b/pliers/converters/api.py
index 368d8ab9..001715ea 100644
--- a/pliers/converters/api.py
+++ b/pliers/converters/api.py
@@ -52,7 +52,7 @@ def _convert(self, audio):
         text = getattr(self.recognizer, self.recognize_method)(clip,
                                                                self.api_key)

-        return ComplexTextStim(text=text, onset=audio.onset)
+        return ComplexTextStim(text=text)


 class WitTranscriptionConverter(SpeechRecognitionAPIConverter):
diff --git a/pliers/converters/google.py b/pliers/converters/google.py
index 4dd4bfca..4e5a6418 100644
--- a/pliers/converters/google.py
+++ b/pliers/converters/google.py
@@ -89,7 +89,7 @@ def _convert(self, stim):
                                       onset=offset + onset,
                                       duration=duration))

-        return ComplexTextStim(elements=words, onset=stim.onset)
+        return ComplexTextStim(elements=words)


 class GoogleVisionAPITextConverter(GoogleVisionAPITransformer,
@@ -121,30 +121,24 @@
         responses = self._query_api(request)

         texts = []
-        for i, response in enumerate(responses):
-            stim = stims[i]
+        for response in responses:
             if response and self.response_object in response:
                 annotations = response[self.response_object]
                 # Combine the annotations
                 if self.handle_annotations == 'first':
                     text = annotations[0]['description']
-                    texts.append(TextStim(text=text, onset=stim.onset,
-                                          duration=stim.duration))
+                    texts.append(TextStim(text=text))
                 elif self.handle_annotations == 'concatenate':
                     text = ''
                     for annotation in annotations:
                         text = ' '.join([text, annotation['description']])
-                    texts.append(TextStim(text=text, onset=stim.onset,
-                                          duration=stim.duration))
+                    texts.append(TextStim(text=text))
                 elif self.handle_annotations == 'list':
                     for annotation in annotations:
-                        texts.append(TextStim(text=annotation['description'],
-                                              onset=stim.onset,
-                                              duration=stim.duration))
+                        texts.append(TextStim(text=annotation['description']))
             elif 'error' in response:
                 raise Exception(response['error']['message'])
             else:
-                texts.append(TextStim(text='', onset=stim.onset,
-                                      duration=stim.duration))
+                texts.append(TextStim(text=''))

         return texts
diff --git a/pliers/converters/image.py b/pliers/converters/image.py
index 43dc6499..827cd3dc 100644
--- a/pliers/converters/image.py
+++ b/pliers/converters/image.py
@@ -26,5 +26,4 @@ class TesseractConverter(ImageToTextConverter):
     def _convert(self, stim):
         verify_dependencies(['pytesseract'])
         text = pytesseract.image_to_string(Image.fromarray(stim.data))
-        return TextStim(text=text, onset=stim.onset, duration=stim.duration,
-                        order=stim.order)
+        return TextStim(text=text)
diff --git a/pliers/converters/microsoft.py b/pliers/converters/microsoft.py
index cc2c8156..d87d8ae2 100644
--- a/pliers/converters/microsoft.py
+++ b/pliers/converters/microsoft.py
@@ -29,4 +29,4 @@ def _convert(self, stim):
             lines.append(' '.join([w['text'] for w in l['words']]))
         text = '\n'.join(lines)

-        return TextStim(text=text, onset=stim.onset, duration=stim.duration)
+        return TextStim(text=text)
diff --git a/pliers/converters/video.py b/pliers/converters/video.py
index e4ab8cdf..6071a9f6 100644
--- a/pliers/converters/video.py
+++ b/pliers/converters/video.py
@@ -16,5 +16,4 @@ class VideoToAudioConverter(Converter):
     def _convert(self, video):
         fps = AudioStim.get_sampling_rate(video.filename)
         return AudioStim(sampling_rate=fps,
-                         clip=video.clip.audio,
-                         onset=video.onset)
+                         clip=video.clip.audio)
diff --git a/pliers/extractors/base.py b/pliers/extractors/base.py
index c387c037..c012bc41 100644
--- a/pliers/extractors/base.py
+++ b/pliers/extractors/base.py
@@ -66,6 +66,9 @@ def __init__(self, data, stim, extractor, features=None, onsets=None,
         self.features = features
         self.raw = raw
         self._history = None
+        self.onset = onsets
+        self.duration = durations
+        self.order = orders

         # Eventually, the goal is to make raw mandatory, and always
         # generate the .data property via calls to to_array() or to_df()
@@ -73,18 +76,6 @@
         # warning, we provide a backward-compatible version for the time being.
         self.data = np.array(data)

-        if onsets is None:
-            onsets = stim.onset
-        self.onsets = onsets if onsets is not None else np.nan
-
-        if durations is None:
-            durations = stim.duration
-        self.durations = durations if durations is not None else np.nan
-
-        if orders is None:
-            orders = stim.order
-        self.orders = orders if orders is not None else np.nan
-
     def to_df(self, timing=True, metadata=False, format='wide',
               extractor_name=False, object_id=True, **to_df_kwargs):
         ''' Convert current instance to a pandas DatasFrame.
@@ -130,6 +121,10 @@
                         for i in range(self.data.shape[1])]
             df = pd.DataFrame(self.data, columns=features)

+        onsets = np.nan if self.onset is None else self.onset
+        durations = np.nan if self.duration is None else self.duration
+        orders = np.nan if self.order is None else self.order
+
         index_cols = []

         # Generally we leave it to Extractors to properly track the number of
@@ -139,8 +134,8 @@
         # counter for any row in the DF that cannot be uniquely distinguished
         # from other rows by onset and duration.
         if object_id and 'object_id' not in df.columns:
-            index = pd.Series(self.onsets).astype(str) + '_' + \
-                pd.Series(self.durations).astype(str)
+            index = pd.Series(onsets).astype(str) + '_' + \
+                pd.Series(durations).astype(str)
             if object_id is True or (object_id == 'auto' and
                                      len(set(index)) > 1):
                 ids = np.arange(len(df)) if len(index) == 1 \
@@ -149,11 +144,11 @@
             index_cols = ['object_id']

         if timing is True or (timing == 'auto' and
-                              (np.isfinite(self.durations).any() or
-                               np.isfinite(self.orders).any())):
-            df.insert(0, 'duration', self.durations)
-            df.insert(0, 'order', self.orders)
-            df.insert(0, 'onset', self.onsets)
+                              (np.isfinite(durations).any() or
+                               np.isfinite(orders).any())):
+            df.insert(0, 'duration', durations)
+            df.insert(0, 'order', orders)
+            df.insert(0, 'onset', onsets)
             index_cols.extend(['onset', 'order', 'duration'])

         if format == 'long':
diff --git a/pliers/filters/image.py b/pliers/filters/image.py
index b792eca3..0dba4b5c 100644
--- a/pliers/filters/image.py
+++ b/pliers/filters/image.py
@@ -40,9 +40,7 @@ def _filter(self, stim):
         x0, y0, x1, y1 = pillow_img.getbbox()
         new_img = stim.data[y0:y1, x0:x1]
         return ImageStim(stim.filename,
-                         data=new_img,
-                         onset=stim.onset,
-                         duration=stim.duration)
+                         data=new_img)


 class PillowImageFilter(ImageFilter):
@@ -87,6 +85,4 @@ def _filter(self, stim):
         pillow_img = Image.fromarray(stim.data)
         new_img = np.array(pillow_img.filter(self.filter))
         return ImageStim(stim.filename,
-                         data=new_img,
-                         onset=stim.onset,
-                         duration=stim.duration)
+                         data=new_img)
diff --git a/pliers/filters/text.py b/pliers/filters/text.py
index ee1e9156..15a41026 100644
--- a/pliers/filters/text.py
+++ b/pliers/filters/text.py
@@ -67,8 +67,7 @@ def _filter(self, stim):
             stemmed = ' '.join([self.stemmer.stem(tok) for tok in tokens])
         else:
             stemmed = self.stemmer.stem(stim.text)
-        return TextStim(stim.filename, stemmed, onset=stim.onset,
-                        duration=stim.duration, order=stim.order)
+        return TextStim(stim.filename, stemmed)


 class TokenizingFilter(TextFilter):
@@ -97,8 +96,7 @@ def _filter(self, stim):
             tokens = self.tokenizer.tokenize(stim.text)
         else:
             tokens = word_tokenize(stim.text)
-        stims = [TextStim(stim.filename, token, onset=stim.onset,
-                          duration=stim.duration, order=i)
+        stims = [TextStim(stim.filename, token, order=i)
                  for i, token in enumerate(tokens)]
         return stims

@@ -134,8 +132,7 @@ def _filter(self, stim):
         tokens = word_tokenize(stim.text)
         tokens = [tok for tok in tokens if tok not in self.tokens]
         text = ' '.join(tokens)
-        return TextStim(stim.filename, text, onset=stim.onset,
-                        duration=stim.duration, order=stim.order)
+        return TextStim(stim.filename, text)


 class PunctuationRemovalFilter(TokenRemovalFilter):
@@ -151,5 +148,4 @@ class LowerCasingFilter(TextFilter):
     ''' Lower cases the text in a TextStim. '''

     def _filter(self, stim):
-        return TextStim(stim.filename, stim.text.lower(), onset=stim.onset,
-                        duration=stim.duration, order=stim.order)
+        return TextStim(stim.filename, stim.text.lower())
diff --git a/pliers/filters/video.py b/pliers/filters/video.py
index 86bff450..d446479a 100644
--- a/pliers/filters/video.py
+++ b/pliers/filters/video.py
@@ -57,8 +57,7 @@ def _filter(self, video):
         frame_index = sorted(list(set(video.frame_index)
                                   .intersection(new_idx)))
         return VideoFrameCollectionStim(filename=video.filename,
-                                        frame_index=frame_index,
-                                        onset=video.onset)
+                                        frame_index=frame_index)


 class VideoTrimmingFilter(TemporalTrimmingFilter, VideoFilter):
diff --git a/pliers/stimuli/text.py b/pliers/stimuli/text.py
index 000527aa..13dc9960 100644
--- a/pliers/stimuli/text.py
+++ b/pliers/stimuli/text.py
@@ -121,7 +121,7 @@ def __init__(self, filename=None, onset=None, duration=None, columns=None,

     @property
     def elements(self):
-        return self._elements
+        return [f for f in self]

     def _from_file(self, filename, columns, default_duration):
         tod_names = {'t': 'text', 'o': 'onset', 'd': 'duration'}
@@ -142,7 +142,7 @@ def _from_file(self, filename, columns, default_duration):
             if duration is None:
                 duration = default_duration
             elem = TextStim(filename, r['text'], r['onset'], duration)
-            self.add_elem(elem)
+            self._elements.append(elem)

     def save(self, path):
         with open(path, 'w') as f:
@@ -173,16 +173,13 @@ def _from_srt(self, filename):
         for i, r in df.iterrows():
             elem = TextStim(filename, text=r['text'], onset=r['onset'],
                             duration=r['duration'], order=i)
-            self.add_elem(elem)
-
-    def add_elem(self, elem):
-        offset = 0.0 if self.onset is None else self.onset
-        elem.onset = offset if elem.onset is None else offset + elem.onset
-        self._elements.append(elem)
+            self._elements.append(elem)

     def __iter__(self):
         """ Iterate text elements. """
         for elem in self._elements:
+            offset = 0.0 if self.onset is None else self.onset
+            elem.onset = offset if elem.onset is None else offset + elem.onset
             yield elem

     def _to_sec(self, tup):
@@ -215,5 +212,5 @@ def tokenize_text(text):

         tokens = tokenize_text(text)
         for i, t in enumerate(tokens):
-            self.add_elem(TextStim(text=t, onset=None, duration=None,
-                                   order=i))
+            self._elements.append(TextStim(text=t, onset=None, duration=None,
+                                           order=i))
diff --git a/pliers/tests/filters/test_text_filters.py b/pliers/tests/filters/test_text_filters.py
index 4c54dff6..1e34763b 100644
--- a/pliers/tests/filters/test_text_filters.py
+++ b/pliers/tests/filters/test_text_filters.py
@@ -54,11 +54,15 @@ def test_word_stemming_filter():


 def test_tokenizing_filter():
-    stim = TextStim(join(TEXT_DIR, 'scandal.txt'))
+    stim = TextStim(join(TEXT_DIR, 'scandal.txt'), onset=4.2)
     filt = TokenizingFilter()
     words = filt.transform(stim)
     assert len(words) == 231
     assert words[0].text == 'To'
+    assert words[0].onset == 4.2
+    assert words[0].order == 0
+    assert words[1].onset == 4.2
+    assert words[1].order == 1

     custom_tokenizer = PunktSentenceTokenizer()
     filt = TokenizingFilter(tokenizer=custom_tokenizer)
diff --git a/pliers/transformers/base.py b/pliers/transformers/base.py
index cc397931..1b25d9dc 100644
--- a/pliers/transformers/base.py
+++ b/pliers/transformers/base.py
@@ -129,6 +129,7 @@ def transform(self, stims, validation='strict', *args, **kwargs):
             result = _log_transformation(validated_stim, result, self)
             if isgenerator(result):
                 result = list(result)
+            self._propagate_context(validated_stim, result)
             return result

     def _validate(self, stim):
@@ -186,6 +187,18 @@ def _transform(s):
         return (t for t in (self.transform(s, *args, **kwargs)
                             for s in stims) if t)

+    def _propagate_context(self, stim, result):
+        if isiterable(result):
+            for r in result:
+                self._propagate_context(stim, r)
+        else:
+            if result.onset is None:
+                result.onset = stim.onset
+            if result.duration is None:
+                result.duration = stim.duration
+            if result.order is None:
+                result.order = stim.order
+
     @abstractmethod
     def _transform(self, stim):
         pass
@@ -223,6 +236,7 @@ def _iterate(self, stims, validation='strict', *args, **kwargs):
             res = self._transform(batch, *args, **kwargs)
             for i, stim in enumerate(batch):
                 res[i] = _log_transformation(stim, res[i], self)
+                self._propagate_context(stim, res[i])
             results.extend(res)
         return results