Merge branch 'api-remote' into batch-caching

PsychoinformaticsLab · Mar 31, 2018 · d60f664 · d60f664
2 parents 6c7b1ba + c71f736
commit d60f664
Show file tree

Hide file tree

Showing 18 changed files with 182 additions and 41 deletions.
diff --git a/pliers/converters/api/google.py b/pliers/converters/api/google.py
@@ -22,6 +22,13 @@ class GoogleSpeechAPIConverter(GoogleAPITransformer, AudioToTextConverter):
         speech_contexts (list): A list of a list of favored phrases or words
             to assist the API. The inner list is a sequence of word tokens,
             each outer element is a potential context.
+        discovery_file (str): path to discovery file containing Google
+            application credentials.
+        api_version (str): API version to use.
+        max_results (int): Max number of results per page.
+        num_retries (int): Number of times to retry query on failure.
+        rate_limit (int): The minimum number of seconds required between
+            transform calls on this Transformer.
     '''
 
     api_name = 'speech'
@@ -55,14 +62,13 @@ def _build_request(self, stim):
             data = f.read()
         os.remove(tmp)
 
-        content = base64.b64encode(data).decode()
         if self.speech_contexts:
             speech_contexts = [{'phrases': c} for c in self.speech_contexts]
         else:
             speech_contexts = []
         request = {
             'audio': {
-                'content': content
+                'content': base64.b64encode(data).decode()
             },
             'config': {
                 'encoding': 'FLAC',

diff --git a/pliers/converters/api/ibm.py b/pliers/converters/api/ibm.py
@@ -30,6 +30,8 @@ class IBMSpeechAPIConverter(APITransformer, AudioToTextConverter):
             be separated by (i.e. the unit each TextStim in the ComplexTextStim
             elements should be). Currently, only 'words' or 'phrases' are
             supported.
+        rate_limit (int): The minimum number of seconds required between
+            transform calls on this Transformer.
     '''
 
     _env_keys = ('IBM_USERNAME', 'IBM_PASSWORD')

diff --git a/pliers/converters/api/microsoft.py b/pliers/converters/api/microsoft.py
@@ -8,7 +8,21 @@
 class MicrosoftAPITextConverter(MicrosoftVisionAPITransformer,
                                 ImageToTextConverter):
 
-    ''' Detects text within images using the Microsoft Vision API. '''
+    ''' Detects text within images using the Microsoft Vision API.
+
+    Args:
+        language (str): Target language to detect in the image.
+        subscription_key (str): A valid subscription key for Microsoft Cognitive
+            Services. Only needs to be passed the first time the extractor is
+            initialized.
+        location (str): Region the subscription key has been registered in.
+            It will be the first part of the endpoint URL suggested by
+            Microsoft when you first created the key.
+            Examples include: westus, westcentralus, eastus
+        api_version (str): API version to use.
+        rate_limit (int): The minimum number of seconds required between
+            transform calls on this Transformer.
+    '''
 
     api_method = 'ocr'
     _log_attributes = ('subscription_key', 'api_version', 'language')

diff --git a/pliers/converters/api/wit.py b/pliers/converters/api/wit.py
@@ -20,6 +20,8 @@ class SpeechRecognitionAPIConverter(APITransformer, AudioToTextConverter):
     Args:
         api_key (str): API key. Must be passed explicitly or stored in
             the environment variable specified in the _env_keys field.
+        rate_limit (int): The minimum number of seconds required between
+            transform calls on this Transformer.
     '''
 
     _log_attributes = ('api_key', 'recognize_method')

diff --git a/pliers/extractors/api/clarifai.py b/pliers/extractors/api/clarifai.py
@@ -38,6 +38,9 @@ class ClarifaiAPIExtractor(APITransformer, BatchTransformerMixin,
             number of label predictions returned.
         select_concepts (list): List of concepts (strings) to query from the
             API. For example, ['food', 'animal'].
+        rate_limit (int): The minimum number of seconds required between
+            transform calls on this Transformer.
+        batch_size (int): Number of stims to send per batched API request.
     '''
 
     _log_attributes = ('api_key', 'model', 'model_name', 'min_value',
@@ -88,13 +91,17 @@ def _extract(self, stims):
         moc = clarifai_client.ModelOutputConfig(min_value=self.min_value,
                                                 max_concepts=self.max_concepts,
                                                 select_concepts=self.select_concepts)
-        output_config = moc
-        model_output_info = clarifai_client.ModelOutputInfo(output_config=output_config)
+        model_output_info = clarifai_client.ModelOutputInfo(output_config=moc)
 
         # ExitStack lets us use filename context managers simultaneously
         with ExitStack() as stack:
-            files = [stack.enter_context(s.get_filename()) for s in stims]
-            imgs = [clarifai_client.Image(filename=filename) for filename in files]
+            imgs = []
+            for s in stims:
+                if s.url:
+                    imgs.append(clarifai_client.Image(url=s.url))
+                else:
+                    f = stack.enter_context(s.get_filename())
+                    imgs.append(clarifai_client.Image(filename=f))
             tags = self.model.predict(imgs, model_output_info=model_output_info)
 
         extracted = []

diff --git a/pliers/extractors/api/indico.py b/pliers/extractors/api/indico.py
@@ -22,6 +22,9 @@ class IndicoAPIExtractor(APITransformer, BatchTransformerMixin, Extractor):
         api_key (str): A valid API key for the Indico API. Only needs to be
             passed the first time the extractor is initialized.
         models (list): The names of the Indico models to use.
+        rate_limit (int): The minimum number of seconds required between
+            transform calls on this Transformer.
+        batch_size (int): Number of stims to send per batched API request.
     '''
 
     _log_attributes = ('api_key', 'models', 'model_names')
@@ -75,8 +78,11 @@ def check_valid_keys(self):
                 # If valid key, a data error (None passed) is expected here
                 return True
 
+    def _get_tokens(self, stims):
+        return [stim.data for stim in stims if stim.data is not None]
+
     def _extract(self, stims):
-        tokens = [stim.data for stim in stims if stim.data is not None]
+        tokens = self._get_tokens(stims)
         scores = [model(tokens) for model in self.models]
 
         results = []
@@ -100,6 +106,14 @@ class IndicoAPITextExtractor(TextExtractor, IndicoAPIExtractor):
 
     ''' Uses the Indico API to extract features from text, such as
     sentiment extraction.
+
+    Args:
+        api_key (str): A valid API key for the Indico API. Only needs to be
+            passed the first time the extractor is initialized.
+        models (list): The names of the Indico models to use.
+        rate_limit (int): The minimum number of seconds required between
+            transform calls on this Transformer.
+        batch_size (int): Number of stims to send per batched API request.
     '''
 
     def __init__(self, api_key=None, models=None, rate_limit=None,
@@ -116,6 +130,14 @@ class IndicoAPIImageExtractor(ImageExtractor, IndicoAPIExtractor):
 
     ''' Uses the Indico API to extract features from Images, such as
     facial emotion recognition or content filtering.
+
+    Args:
+        api_key (str): A valid API key for the Indico API. Only needs to be
+            passed the first time the extractor is initialized.
+        models (list): The names of the Indico models to use.
+        rate_limit (int): The minimum number of seconds required between
+            transform calls on this Transformer.
+        batch_size (int): Number of stims to send per batched API request.
     '''
 
     def __init__(self, api_key=None, models=None, rate_limit=None,
@@ -126,3 +148,12 @@ def __init__(self, api_key=None, models=None, rate_limit=None,
                                                       models=models,
                                                       rate_limit=rate_limit,
                                                       batch_size=batch_size)
+
+    def _get_tokens(self, stims):
+        toks = []
+        for s in stims:
+            if s.url:
+                toks.append(s.url)
+            elif s.data is not None:
+                toks.append(s.data)
+        return toks
diff --git a/pliers/extractors/api/microsoft.py b/pliers/extractors/api/microsoft.py
@@ -15,15 +15,25 @@ class MicrosoftAPIFaceExtractor(MicrosoftAPITransformer, ImageExtractor):
     image using the Microsoft Azure Cognitive Services API.
 
     Args:
-        face_id (bool): return faceIds of the detected faces or not. The
+        face_id (bool): Return faceIds of the detected faces or not. The
             default value is False.
-        landmarks (str): return face landmarks of the detected faces or
+        landmarks (bool): Return face landmarks of the detected faces or
             not. The default value is False.
-        attributes (list): one or more specified face attributes as strings.
+        attributes (list): One or more specified face attributes as strings.
             Supported face attributes include accessories, age, blur, emotion,
             exposure, facialHair, gender, glasses, hair, headPose, makeup,
             noise, occlusion, and smile. Note that each attribute has
             additional computational and time cost.
+        subscription_key (str): A valid subscription key for Microsoft Cognitive
+            Services. Only needs to be passed the first time the extractor is
+            initialized.
+        location (str): Region the subscription key has been registered in.
+            It will be the first part of the endpoint URL suggested by
+            Microsoft when you first created the key.
+            Examples include: westus, westcentralus, eastus
+        api_version (str): API version to use.
+        rate_limit (int): The minimum number of seconds required between
+            transform calls on this Transformer.
     '''
 
     api_name = 'face'
@@ -116,11 +126,21 @@ class MicrosoftVisionAPIExtractor(MicrosoftVisionAPITransformer,
     ''' Base MicrosoftVisionAPIExtractor class.
 
     Args:
-        features (list): one or more specified vision features as strings.
+        features (list): One or more specified vision features as strings.
             Supported vision features include Tags, Categories, ImageType,
             Color, and Adult. Note that each attribute has additional
             computational and time cost. By default extracts all visual
             features from an image.
+        subscription_key (str): A valid subscription key for Microsoft Cognitive
+            Services. Only needs to be passed the first time the extractor is
+            initialized.
+        location (str): Region the subscription key has been registered in.
+            It will be the first part of the endpoint URL suggested by
+            Microsoft when you first created the key.
+            Examples include: westus, westcentralus, eastus
+        api_version (str): API version to use.
+        rate_limit (int): The minimum number of seconds required between
+            transform calls on this Transformer.
     '''
 
     api_method = 'analyze'
@@ -149,14 +169,15 @@ def _to_df(self, result):
         data_dict = {}
         for feat in self.features:
             feat = feat[0].lower() + feat[1:]
-            if feat == 'tags':
-                for tag in result._data[feat]:
-                    data_dict[tag['name']] = tag['confidence']
-            elif feat == 'categories':
-                for cat in result._data[feat]:
-                    data_dict[cat['name']] = cat['score']
-            else:
-                data_dict.update(result._data[feat])
+            if feat in result._data:
+                if feat == 'tags':
+                    for tag in result._data[feat]:
+                        data_dict[tag['name']] = tag['confidence']
+                elif feat == 'categories':
+                    for cat in result._data[feat]:
+                        data_dict[cat['name']] = cat['score']
+                else:
+                    data_dict.update(result._data[feat])
         return pd.DataFrame([data_dict.values()], columns=data_dict.keys())
 
 

diff --git a/pliers/stimuli/audio.py b/pliers/stimuli/audio.py
@@ -54,7 +54,7 @@ def __init__(self, filename=None, onset=None, sampling_rate=None, url=None,
             self.data = self.data.mean(axis=1)
 
         super(AudioStim, self).__init__(
-            filename, onset=onset, duration=duration, order=order)
+            filename, onset=onset, duration=duration, order=order, url=url)
 
     @staticmethod
     def get_sampling_rate(filename):

diff --git a/pliers/stimuli/base.py b/pliers/stimuli/base.py
@@ -33,13 +33,14 @@ class Stim(with_metaclass(ABCMeta)):
     '''
 
     def __init__(self, filename=None, onset=None, duration=None, order=None,
-                 name=None):
+                 name=None, url=None):
 
         self.filename = filename
         self.onset = onset
         self.duration = duration
         self.order = order
         self._history = None
+        self.url = url
 
         if name is None:
             name = '' if self.filename is None else basename(self.filename)

diff --git a/pliers/stimuli/image.py b/pliers/stimuli/image.py
@@ -38,7 +38,7 @@ def __init__(self, filename=None, onset=None, duration=None, data=None,
             filename = url
         self.data = data
         super(ImageStim, self).__init__(filename, onset=onset,
-                                        duration=duration)
+                                        duration=duration, url=url)
 
     def save(self, path):
         imsave(path, self.data)
diff --git a/pliers/stimuli/text.py b/pliers/stimuli/text.py
@@ -35,7 +35,8 @@ def __init__(self, filename=None, text=None, onset=None, duration=None,
             text = urlopen(url).read()
         self.text = text
         name = 'text[%s]' % text[:40]  # Truncate at 40 chars
-        super(TextStim, self).__init__(filename, onset, duration, order, name)
+        super(TextStim, self).__init__(filename, onset, duration, order,
+                                       name=name, url=url)
 
     @property
     def data(self):

diff --git a/pliers/stimuli/video.py b/pliers/stimuli/video.py
@@ -75,7 +75,8 @@ def __init__(self, filename=None, frame_index=None, onset=None, url=None,
         self.n_frames = len(self.frame_index)
         super(VideoFrameCollectionStim, self).__init__(filename,
                                                        onset=onset,
-                                                       duration=duration)
+                                                       duration=duration,
+                                                       url=url)
 
     def _load_clip(self):
         audio_fps = AudioStim.get_sampling_rate(self.filename)

diff --git a/pliers/tests/extractors/api/test_clarifai_extractors.py b/pliers/tests/extractors/api/test_clarifai_extractors.py
@@ -32,6 +32,12 @@ def test_clarifai_api_extractor():
     assert result.shape == (1, 6)
     assert 'cat' in result.columns and 'dog' in result.columns
 
+    url = 'https://tuition.utexas.edu/sites/all/themes/tuition/logo.png'
+    stim = ImageStim(url=url)
+    result = ClarifaiAPIExtractor(max_concepts=5).transform(stim).to_df()
+    assert result.shape == (1, 9)
+    assert result['symbol'][0] > 0.8
+
     ext = ClarifaiAPIExtractor(api_key='nogood')
     assert not ext.validate_keys()
 

diff --git a/pliers/tests/extractors/api/test_google_extractors.py b/pliers/tests/extractors/api/test_google_extractors.py
@@ -122,6 +122,11 @@ def test_google_vision_api_label_extractor():
     assert 'apple' in result.columns
     assert result['apple'][0] > 0.75
 
+    url = 'https://tuition.utexas.edu/sites/all/themes/tuition/logo.png'
+    stim = ImageStim(url=url)
+    result = ext.transform(stim).to_df()
+    assert result['orange'][0] > 0.7
+
     ext = GoogleVisionAPILabelExtractor(discovery_file='nogood')
     assert not ext.validate_keys()
 

diff --git a/pliers/tests/extractors/api/test_indico_extractors.py b/pliers/tests/extractors/api/test_indico_extractors.py
@@ -86,6 +86,11 @@ def test_indico_api_image_extractor():
     assert set(result2.columns) == outdfKeysCheck
     assert result2['fer_Happy'][0] > 0.7
 
+    url = 'https://tuition.utexas.edu/sites/all/themes/tuition/logo.png'
+    stim = ImageStim(url=url)
+    result = ext.transform(stim).to_df()
+    assert result['fer_Neutral'][0] > 0.1
+
 
 @pytest.mark.skipif("'INDICO_APP_KEY' not in os.environ")
 def test_indico_api_extractor_large():

diff --git a/pliers/tests/extractors/api/test_microsoft_extractors.py b/pliers/tests/extractors/api/test_microsoft_extractors.py
@@ -93,6 +93,11 @@ def test_microsoft_vision_api_tag_extractor():
     assert 'apple' in res.columns
     assert res['apple'][0] > 0.7
 
+    url = 'https://tuition.utexas.edu/sites/all/themes/tuition/logo.png'
+    stim = ImageStim(url=url)
+    result = ext.transform(stim).to_df()
+    assert result['plate'][0] > 0.1  # doesn't give great labels
+
 
 @pytest.mark.requires_payment
 @pytest.mark.skipif("'MICROSOFT_VISION_SUBSCRIPTION_KEY' not in os.environ")