Skip to content

Commit

Permalink
Merge pull request #306 from tyarkoni/google-language
Browse files Browse the repository at this point in the history
Google Natural Language API support
  • Loading branch information
tyarkoni committed Jul 9, 2018
2 parents b9b843c + 5a391a9 commit 00ee12b
Show file tree
Hide file tree
Showing 16 changed files with 433 additions and 11 deletions.
2 changes: 2 additions & 0 deletions pliers/converters/api/ibm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import base64
import json
import logging
from pliers.stimuli.text import TextStim, ComplexTextStim
from pliers.utils import attempt_to_import, verify_dependencies
from pliers.converters.audio import AudioToTextConverter
Expand Down Expand Up @@ -66,6 +67,7 @@ def check_valid_keys(self):
return True
except Exception as e:
if 'Not Authorized' in str(e):
logging.warn(str(e))
return False
else:
raise e
Expand Down
4 changes: 3 additions & 1 deletion pliers/converters/api/wit.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
''' Wit.ai API-based Converters '''

import logging
import os
from abc import abstractproperty
from pliers.stimuli.text import ComplexTextStim
Expand Down Expand Up @@ -73,5 +74,6 @@ def check_valid_keys(self):
try:
urlopen(request)
return True
except HTTPError:
except HTTPError as e:
logging.warn(str(e))
return False
12 changes: 12 additions & 0 deletions pliers/extractors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@
GoogleVideoAPILabelDetectionExtractor,
GoogleVideoAPIShotDetectionExtractor,
GoogleVideoAPIExplicitDetectionExtractor,
GoogleLanguageAPIExtractor,
GoogleLanguageAPIEntityExtractor,
GoogleLanguageAPISentimentExtractor,
GoogleLanguageAPISyntaxExtractor,
GoogleLanguageAPITextCategoryExtractor,
GoogleLanguageAPIEntitySentimentExtractor,
MicrosoftAPIFaceExtractor,
MicrosoftAPIFaceEmotionExtractor,
MicrosoftVisionAPIExtractor,
Expand Down Expand Up @@ -87,6 +93,12 @@
'GoogleVideoAPILabelDetectionExtractor',
'GoogleVideoAPIShotDetectionExtractor',
'GoogleVideoAPIExplicitDetectionExtractor',
'GoogleLanguageAPIExtractor',
'GoogleLanguageAPIEntityExtractor',
'GoogleLanguageAPISentimentExtractor',
'GoogleLanguageAPISyntaxExtractor',
'GoogleLanguageAPITextCategoryExtractor',
'GoogleLanguageAPIEntitySentimentExtractor',
'BrightnessExtractor',
'SaliencyExtractor',
'SharpnessExtractor',
Expand Down
14 changes: 13 additions & 1 deletion pliers/extractors/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,13 @@
GoogleVideoIntelligenceAPIExtractor,
GoogleVideoAPILabelDetectionExtractor,
GoogleVideoAPIShotDetectionExtractor,
GoogleVideoAPIExplicitDetectionExtractor)
GoogleVideoAPIExplicitDetectionExtractor,
GoogleLanguageAPIExtractor,
GoogleLanguageAPIEntityExtractor,
GoogleLanguageAPISentimentExtractor,
GoogleLanguageAPISyntaxExtractor,
GoogleLanguageAPITextCategoryExtractor,
GoogleLanguageAPIEntitySentimentExtractor)
from .microsoft import (MicrosoftAPIFaceExtractor,
MicrosoftAPIFaceEmotionExtractor,
MicrosoftVisionAPIExtractor,
Expand All @@ -34,6 +40,12 @@
'GoogleVideoAPILabelDetectionExtractor',
'GoogleVideoAPIShotDetectionExtractor',
'GoogleVideoAPIExplicitDetectionExtractor',
'GoogleLanguageAPIExtractor',
'GoogleLanguageAPIEntityExtractor',
'GoogleLanguageAPISentimentExtractor',
'GoogleLanguageAPISyntaxExtractor',
'GoogleLanguageAPITextCategoryExtractor',
'GoogleLanguageAPIEntitySentimentExtractor',
'MicrosoftAPIFaceExtractor',
'MicrosoftAPIFaceEmotionExtractor',
'MicrosoftVisionAPIExtractor',
Expand Down
4 changes: 3 additions & 1 deletion pliers/extractors/api/clarifai.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Extractors that interact with the Clarifai API.
'''

import logging
import os
try:
from contextlib import ExitStack
Expand Down Expand Up @@ -64,7 +65,8 @@ def __init__(self, api_key=None, model='general-v1.3', min_value=None,
try:
self.api = clarifai_client.ClarifaiApp(api_key=api_key)
self.model = self.api.models.get(model)
except clarifai_client.ApiError:
except clarifai_client.ApiError as e:
logging.warn(str(e))
self.api = None
self.model = None
self.model_name = model
Expand Down
232 changes: 231 additions & 1 deletion pliers/extractors/api/google.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@

import base64
from pliers.extractors.image import ImageExtractor
from pliers.extractors.text import TextExtractor
from pliers.extractors.video import VideoExtractor
from pliers.transformers import (GoogleVisionAPITransformer,
from pliers.transformers import (GoogleAPITransformer,
GoogleVisionAPITransformer,
GoogleAPITransformer)
from pliers.extractors.base import ExtractorResult
from pliers.utils import flatten_dict
import numpy as np
import pandas as pd
import logging
Expand Down Expand Up @@ -369,3 +372,230 @@ def __init__(self, segments=None, config=None, timeout=90, request_rate=5,
max_results=max_results,
num_retries=num_retries,
rate_limit=rate_limit)


class GoogleLanguageAPIExtractor(GoogleAPITransformer, TextExtractor):

    ''' Extracts natural language features from text documents using the
    Google Natural Language API.
    Args:
        features (list): List of features (str) to extract. Available
            features: extractSyntax, extractEntities, extractDocumentSentiment,
            extractEntitySentiment, and classifyText. See Google Natural
            Language API documentation for more details. If None, all
            available features are requested.
        language (str): The ISO-639-1 or BCP-47 identifier for the document
            language. If None is provided, API auto-detects the language.
        is_html (bool): When True, the document's text is expected to be
            HTML. Otherwise, plain text is assumed.
        discovery_file (str): path to discovery file containing Google
            application credentials.
        api_version (str): API version to use.
        max_results (int): Max number of results per page.
        num_retries (int): Number of times to retry query on failure.
        rate_limit (int): The minimum number of seconds required between
            transform calls on this Transformer.
    '''

    api_name = 'language'
    _log_attributes = ('discovery_file', 'api_version', 'features',
                       'language', 'is_html')

    # Every feature the annotateText endpoint supports; used as the default
    # request when the caller does not restrict the feature set.
    _ALL_FEATURES = ('extractSyntax',
                     'extractEntities',
                     'extractDocumentSentiment',
                     'extractEntitySentiment',
                     'classifyText')

    def __init__(self, features=None, language=None, is_html=False,
                 discovery_file=None, api_version='v1', max_results=100,
                 num_retries=3, rate_limit=None):
        # None (rather than a mutable list default) avoids sharing one list
        # object across all instances of this class.
        if features is None:
            features = list(self._ALL_FEATURES)
        self.features = features
        self.language = language
        self.is_html = is_html
        super(GoogleLanguageAPIExtractor,
              self).__init__(discovery_file=discovery_file,
                             api_version=api_version,
                             max_results=max_results,
                             num_retries=num_retries,
                             rate_limit=rate_limit)

    def _query_api(self, request):
        ''' Send a single documents.annotateText request and return the raw
        JSON response. '''
        request_obj = self.service.documents().annotateText(body=request)
        return request_obj.execute(num_retries=self.num_retries)

    def _build_request(self, stim):
        ''' Build the annotateText request body for the given TextStim. '''
        document = {
            'type': 'HTML' if self.is_html else 'PLAIN_TEXT',
            'content': stim.text
        }

        # Omitting 'language' lets the API auto-detect it.
        if self.language:
            document['language'] = self.language

        request = {
            'document': document,
            'features': {f: True for f in self.features},
            # UTF32 offsets should align with Python (unicode) string
            # indexing of stim.text — NOTE(review): confirm for surrogates.
            'encodingType': 'UTF32'
        }

        return request

    def _extract(self, stim):
        ''' Run all configured features over the stimulus in one API call
        and wrap the raw response in an ExtractorResult. '''
        request = self._build_request(stim)
        response = self._query_api(request)
        return ExtractorResult(response, stim, self)

    def _get_span(self, text_json):
        ''' Convert an API text-span object into a dict with begin/end
        character indices and the span's raw text. '''
        offset = text_json['text']['beginOffset']
        content = text_json['text']['content']
        return {'begin_char_index': offset,
                'end_char_index': offset + len(content),
                'text': content}

    def _to_df(self, result):
        ''' Flatten the API JSON response into a DataFrame with one row per
        document, sentence, entity mention, or token, depending on which
        features were requested. '''
        response = result._data
        data = []

        # One row/object for all document-level features
        document_data = {}

        if 'extractDocumentSentiment' in self.features:
            sentiment = response['documentSentiment']
            document_data.update(flatten_dict(sentiment, 'sentiment'))

            # Sentence level sentiment
            for sentence in response.get('sentences', []):
                sentence_data = self._get_span(sentence)
                sentiment = sentence['sentiment']
                sentence_data.update(flatten_dict(sentiment, 'sentiment'))
                data.append(sentence_data)

        # BUG FIX: default to [] — response.get('categories') returns None
        # (not iterable) when classifyText was not requested or the API
        # returns no categories.
        for category in response.get('categories', []):
            key = 'category_%s' % category['name']
            document_data[key] = category['confidence']

        # Include only if there are document-level features
        if document_data:
            data.append(document_data)

        # Entity-level features
        for entity in response.get('entities', []):
            entity_copy = entity.copy()
            mentions = entity_copy.pop('mentions', [])
            # 'name' duplicates each mention's span text, so drop it.
            entity_copy.pop('name', None)
            entity_copy = flatten_dict(entity_copy)

            for m in mentions:
                entity_data = self._get_span(m)
                entity_data.update(entity_copy)
                # Overwrite top-level sentiment with mention-level
                sentiment = m.get('sentiment', {})
                entity_data.update(flatten_dict(sentiment, 'sentiment'))
                data.append(entity_data)

        # Token-level syntax features
        for token in response.get('tokens', []):
            token_data = self._get_span(token)
            token_data['lemma'] = token['lemma']
            token_data.update(token['partOfSpeech'])
            dependency = flatten_dict(token['dependencyEdge'], 'dependency')
            token_data.update(dependency)
            data.append(token_data)

        df = pd.DataFrame(data)
        df['language'] = response['language']
        return df


class GoogleLanguageAPIEntityExtractor(GoogleLanguageAPIExtractor):

    ''' Convenience extractor that requests only entity labels from the
    Google Language API. '''

    def __init__(self, language=None, is_html=False, discovery_file=None,
                 api_version='v1', max_results=100, num_retries=3,
                 rate_limit=None):
        # Delegate to the base extractor with the feature set pinned to
        # entity extraction only.
        super(GoogleLanguageAPIEntityExtractor, self).__init__(
            features=['extractEntities'],
            language=language,
            is_html=is_html,
            discovery_file=discovery_file,
            api_version=api_version,
            max_results=max_results,
            num_retries=num_retries,
            rate_limit=rate_limit)


class GoogleLanguageAPISentimentExtractor(GoogleLanguageAPIExtractor):

    ''' Convenience extractor that requests only document (and sentence)
    sentiment from the Google Language API. '''

    def __init__(self, language=None, is_html=False, discovery_file=None,
                 api_version='v1', max_results=100, num_retries=3,
                 rate_limit=None):
        # Delegate to the base extractor with the feature set pinned to
        # document sentiment only.
        super(GoogleLanguageAPISentimentExtractor, self).__init__(
            features=['extractDocumentSentiment'],
            language=language,
            is_html=is_html,
            discovery_file=discovery_file,
            api_version=api_version,
            max_results=max_results,
            num_retries=num_retries,
            rate_limit=rate_limit)


class GoogleLanguageAPISyntaxExtractor(GoogleLanguageAPIExtractor):

    ''' Convenience extractor that requests only token-level syntax
    properties from the Google Language API. '''

    def __init__(self, language=None, is_html=False, discovery_file=None,
                 api_version='v1', max_results=100, num_retries=3,
                 rate_limit=None):
        # Delegate to the base extractor with the feature set pinned to
        # syntax extraction only.
        super(GoogleLanguageAPISyntaxExtractor, self).__init__(
            features=['extractSyntax'],
            language=language,
            is_html=is_html,
            discovery_file=discovery_file,
            api_version=api_version,
            max_results=max_results,
            num_retries=num_retries,
            rate_limit=rate_limit)


class GoogleLanguageAPITextCategoryExtractor(GoogleLanguageAPIExtractor):

    ''' Convenience extractor that requests only document classification
    from the Google Language API. See the API documentation for the
    taxonomy of categories:
    https://cloud.google.com/natural-language/docs/categories '''

    def __init__(self, language=None, is_html=False, discovery_file=None,
                 api_version='v1', max_results=100, num_retries=3,
                 rate_limit=None):
        # Delegate to the base extractor with the feature set pinned to
        # text classification only.
        super(GoogleLanguageAPITextCategoryExtractor, self).__init__(
            features=['classifyText'],
            language=language,
            is_html=is_html,
            discovery_file=discovery_file,
            api_version=api_version,
            max_results=max_results,
            num_retries=num_retries,
            rate_limit=rate_limit)


class GoogleLanguageAPIEntitySentimentExtractor(GoogleLanguageAPIExtractor):

    ''' Convenience extractor that requests entity-level sentiment from the
    Google Language API. Produces identical results to the entity extractor
    but with additional sentiment analysis. '''

    def __init__(self, language=None, is_html=False, discovery_file=None,
                 api_version='v1', max_results=100, num_retries=3,
                 rate_limit=None):
        # Delegate to the base extractor with the feature set pinned to
        # entity sentiment only.
        super(GoogleLanguageAPIEntitySentimentExtractor, self).__init__(
            features=['extractEntitySentiment'],
            language=language,
            is_html=is_html,
            discovery_file=discovery_file,
            api_version=api_version,
            max_results=max_results,
            num_retries=num_retries,
            rate_limit=rate_limit)
2 changes: 2 additions & 0 deletions pliers/extractors/api/indico.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Extractors that interact with the Indico API.
'''

import logging
import os
from pliers.extractors.image import ImageExtractor
from pliers.extractors.text import TextExtractor
Expand Down Expand Up @@ -73,6 +74,7 @@ def check_valid_keys(self):
api.api_handler(None, None, self.model_names[0])
except IndicoError as e:
if str(e) == 'Invalid API key':
logging.warn(str(e))
return False
else:
# If valid key, a data error (None passed) is expected here
Expand Down
2 changes: 1 addition & 1 deletion pliers/extractors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def to_df(self, timing=True, metadata=False, format='wide',
index = pd.Series(onsets).astype(str) + '_' + \
pd.Series(durations).astype(str)
if object_id is True or (object_id == 'auto' and
len(set(index)) > 1):
len(set(index)) < len(df)):
ids = np.arange(len(df)) if len(index) == 1 \
else df.groupby(index).cumcount()
df.insert(0, 'object_id', ids)
Expand Down
4 changes: 3 additions & 1 deletion pliers/stimuli/api.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
''' Stimuli that are inherently associated with remote resources. '''

import logging
import os

from .base import load_stims
Expand Down Expand Up @@ -70,7 +71,8 @@ def check_valid_keys(self):
try:
self.api.VerifyCredentials()
return True
except twitter.error.TwitterError:
except twitter.error.TwitterError as e:
logging.warn(str(e))
return False

def get_status(self, status_id):
Expand Down
1 change: 1 addition & 0 deletions pliers/tests/data/text/sample_text_with_entities.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Google, headquartered in Mountain View, unveiled the new Android phone at the Consumer Electronic Show. Sundar Pichai said in his keynote that users love their new Android phones.
Loading

0 comments on commit 00ee12b

Please sign in to comment.