In this workflow, we extract and highlight noun phrases from review text in order to surface the most important parts of each review.

In [None]:
# Workflow token, filled in through the notebook form field; it is decoded
# below into the configuration dictionary that drives the rest of the cell.
token = "" #@param {type: "string"}

# Install dependencies before importing them. RelevanceAI is pinned to 2.4.2;
# flair is installed unpinned.
!pip install -q RelevanceAI==2.4.2
!pip install -q flair

from relevanceai.utils import decode_workflow_token
config = decode_workflow_token(token)

# Settings read from the decoded config:
#   text_fields  - document fields whose text is tagged
#   auth_token   - token passed to the RelevanceAI client
#   model_name   - name handed to flair's SequenceTagger.load
text_fields = config['text_fields']
auth_token = config['authorizationToken']
model_name = config['model_name']
# When no output fields are configured, default to one "_nounchunk_.<field>"
# output field per text field (same order as text_fields).
if config['output_fields'] is None:
  output_fields = [f"_nounchunk_.{t}" for t in text_fields]
else:
  output_fields = config['output_fields']

from relevanceai import Client
client = Client(token=auth_token)
# Handle to the dataset that is read from and written back to.
ds = client.Dataset(config['dataset_id'])

from flair.data import Sentence
from flair.models import SequenceTagger

# Load the tagger once at module level so every extract_nouns call reuses it.
# NOTE(review): extract_nouns reads "np" spans, so model_name presumably has
# to name a chunking model - confirm against the workflow config.
tagger = SequenceTagger.load(model_name)

def extract_nouns(text, as_documents=False):
  """Tag ``text`` with the module-level ``tagger`` and return its "NP" spans.

  Args:
    text: The text to wrap in a flair ``Sentence`` and tag.
    as_documents: When true, each result is a dict
      ``{"noun": <span text>, "score": <span score>}``; otherwise each
      result is just the span text.

  Returns:
    A list with one entry per "np" span whose tag is "NP", in the order the
    spans are returned by ``Sentence.get_spans``.
  """
  tagged = Sentence(text)
  tagger.predict(tagged)

  def _render(span):
    # Shape a single span according to the requested output format.
    if as_documents:
      return {"noun": span.text, "score": span.score}
    return span.text

  return [_render(span) for span in tagged.get_spans("np") if span.tag == "NP"]

def extract_nouns_from_documents(docs, as_documents=False):
  """Build update documents holding the noun phrases of each input document.

  For every document in ``docs`` and every configured text field, the first
  200 characters of the field are passed to ``extract_nouns`` and the result
  is stored under the matching entry of ``output_fields`` in a new document
  that carries only the original ``_id`` plus the extracted values.

  Args:
    docs: Iterable of documents (dicts with an ``_id`` key) to process.
    as_documents: Forwarded to ``extract_nouns``; selects between plain
      strings and ``{"noun", "score"}`` dicts.

  Returns:
    A list of update documents, one per input document, in input order.
  """
  new_docs = []
  for d in docs:
      # Each input document gets its own output document. The previous code
      # indexed new_docs by the text-field index, which wrote results into
      # the wrong documents and could raise IndexError on small chunks.
      new_doc = {"_id": d['_id']}
      for i, t in enumerate(text_fields):
          value = extract_nouns(
              # Only the first 200 characters of the field are tagged.
              client.get_field(t, d)[:200],
              as_documents=as_documents
          )
          client.set_field(output_fields[i], new_doc, value)
      new_docs.append(new_doc)
  return new_docs

# Gather the "not exists" filters of every output field into one flat list.
# NOTE(review): this presumably limits the run to documents that have not
# been processed yet - confirm how the service combines multiple filters.
filters = []
for output_field in output_fields:
  filters.extend(ds[output_field].not_exists())

# Run the extraction over the dataset, retrieving 50 documents per chunk.
ds.bulk_apply(
    extract_nouns_from_documents,
    filters=filters,
    retrieve_chunksize=50,
)