In this workflow, we extract and highlight nouns (noun phrases) so that the important parts of each review stand out.

In [1]:
# Workflow token for this run; it is decoded below into the run configuration.
# The trailing #@param annotation looks like a Colab-style form field marker
# and is kept on the same line as the assignment.
token = "" #@param {type: "string"}

In [None]:
from relevanceai.utils import decode_workflow_token
# Decode the workflow token into the settings mapping used for this run.
config = decode_workflow_token(token)

# Settings read from the decoded config. The text and output field names are
# wrapped in single-element lists because the processing code below iterates
# over them.
text_fields = [config['text_field']]
auth_token = config['authorizationToken']
model_name = config['model_id']
output_fields = [config['outputField']]
# Minimum span score a tagged phrase needs in order to be kept by extract_nouns.
cutoff_probability = config['cutoffProbability']

from relevanceai import Client
# Create the API client with the decoded authorization token and open the
# dataset named in the config.
client = Client(token=auth_token)
ds = client.Dataset(config['dataset_id'])

from flair.data import Sentence
from flair.models import SequenceTagger
from tqdm.auto import tqdm

# Load the flair sequence tagger identified by the configured model id.
tagger = SequenceTagger.load(model_name)

def extract_nouns(text, as_documents=False):
  """Return the noun phrases the module-level tagger finds in ``text``.

  The text is wrapped in a flair ``Sentence`` and tagged with ``tagger``.
  Only "np" spans whose tag is "NP" and whose score is at least
  ``cutoff_probability`` are kept.

  Args:
    text: The text to tag.
    as_documents: When true, each result is a dict with "noun" and "score"
      keys; otherwise each result is just the span text.

  Returns:
    A list of span texts, or a list of {"noun", "score"} dicts.
  """
  tagged = Sentence(text)
  tagger.predict(tagged)

  # Keep only confident noun-phrase spans.
  kept = [
    span
    for span in tagged.get_spans("np")
    if span.tag == "NP" and span.score >= cutoff_probability
  ]

  if as_documents:
    return [{"noun": span.text, "score": span.score} for span in kept]
  return [span.text for span in kept]

def extract_nouns_from_documents(docs, as_documents=False, max_chars=200):
  """Build update documents holding the nouns extracted from each input document.

  For every document, each field in ``text_fields`` is read, truncated, and
  passed to ``extract_nouns``; the result is written to the matching entry of
  ``output_fields`` on a fresh document that carries only the original "_id"
  and the output fields.

  Args:
    docs: Documents to process; each must contain "_id" and the text fields.
    as_documents: Forwarded to ``extract_nouns`` to choose between plain
      strings and {"noun", "score"} dicts.
    max_chars: Only the first ``max_chars`` characters of each text value are
      tagged. Pass ``None`` to tag the full text.

  Returns:
    A list with one new document per input document, in input order.
  """
  new_docs = []
  for doc in tqdm(docs):
    # One output document per input document. The previous version indexed
    # the output list by the text-field index, so every result was written
    # onto the first document.
    new_doc = {"_id": doc["_id"]}
    for text_field, output_field in zip(text_fields, output_fields):
      text = client.get_field(text_field, doc)[:max_chars]
      nouns = extract_nouns(text, as_documents=as_documents)
      client.set_field(output_field, new_doc, nouns)
    new_docs.append(new_doc)
  return new_docs

# Collect a "field does not exist" filter for every output field; presumably
# this limits the run to documents that have not been processed yet.
filters = []
for output_field in output_fields:
  filters.extend(ds[output_field].not_exists())

# Walk the dataset chunk by chunk, fetching only the text fields, and write
# the extracted nouns back to the dataset after each chunk.
for chunk in ds.chunk_dataset(filters=filters, select_fields=text_fields):
  docs = extract_nouns_from_documents(chunk)
  ds.upsert_documents(docs)
