In [4]:
import stanza

# Fetch the default English model package (a no-op download if it is already on disk).
stanza.download('en')

# Build the neural pipeline used by every later cell.
# Tokenization uses the 'spacy' package; tagging, lemmatization and dependency
# parsing use the default English models.
# NOTE(review): the load log shows constituency, sentiment and ner being loaded
# as well, even though they are not listed here. If they are not needed, passing
# package=None presumably restricts loading to the listed processors — confirm
# against the Stanza Pipeline documentation before changing.
pipeline = stanza.Pipeline(
    'en',
    processors={
        'tokenize': 'spacy',
        # 'mwt': 'default',
        'pos': 'default',
        'lemma': 'default',
        'depparse': 'default',
    },
    download_method=stanza.DownloadMethod.REUSE_RESOURCES,
)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-01 17:31:28 INFO: Downloading default packages for language: en (English) ...
2023-12-01 17:31:29 INFO: File exists: /Users/sam/stanza_resources/en/default.zip
2023-12-01 17:31:33 INFO: Finished downloading models and saved to /Users/sam/stanza_resources.
2023-12-01 17:31:34 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | spacy               |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-12-01 17:31:34 INFO: Using device: cpu
2023-12-01 17:31:34 INFO: Loading: tokenize
2023-12-01 17:31:34 INFO: Loading: pos
2023-12-01 17:31:35 INFO: Loading: lemma
2023-12-01 17:31:35 INFO: Loading: constituency
2023-12-01 17:31:35 INFO: Loading: depparse
2023-12-01 17:31:35 INFO: Loading: sentiment
2023

In [6]:
# Read the sample article and print Stanza's sentence segmentation, one
# sentence per paragraph.
# The encoding is passed explicitly: the article contains non-ASCII punctuation
# (em dash, curly apostrophe), and without it open() falls back to the
# platform's locale encoding, which is not UTF-8 everywhere.
# NOTE(review): this is an absolute, user-specific path — consider deriving it
# from a configurable data directory so the notebook runs on other machines.
with open('/Users/sam/programs/skimmer/dev-data/test01.txt', encoding='utf-8') as f:
    text = f.read()

for sent in pipeline(text).sentences:
    # strip() drops whitespace that sentence segmentation leaves around the text.
    print(sent.text.strip())
    print()

Trading in shares of WeWork were halted Monday as rumors swirl that the office sharing company, once valued as high as $47 billion, will seek bankruptcy protection.

Last week, The Wall Street Journal and other media outlets reported that WeWork was planning to file for Chapter 11 bankrutpcy protection as early as this week — citing unnamed sources familiar with the matter.

A WeWork spokesperson said last week that the company does not comment on speculation and did not immediately return messages after trading in the company’s stock was halted Monday.

Shares of WeWork, which cost more than $400 two years ago, could be had Monday for less than $1.

WeWork is paying the price for aggressive expansion in its early years.

The company went public in October 2021 after its first attempt to do so two years earlier collapsed spectacularly.

The debacle led to the ouster of founder and CEO Adam Neumann, whose erratic behavior and exorbitant spending spooked early investors.

Japan’s SoftBan

In [7]:
# text = 'JetBlue canceled our flight this morning which was already late.'
# text = "When he first met the FTX founder Sam Bankman-Fried in late 2021, he took the cargo-shorted chief executive on a walk through the eucalyptus trees near his Berkeley, Calif., home."
text = 'The most bizarre thing in the world happened to me.'

# Pass the sentence through the pipeline
parsed = pipeline(text)

# Print the dependencies of every sentence in the doc object
# Format - (Token, Index of head, Nature of dependency)
# Index starts from 1, 0 is reserved for ROOT
for sent in parsed.sentences:
    sent.print_dependencies()


print("{:<15} | {:<10} | {:<15} ".format('Token', 'Relation', 'Head'))
print("-" * 50)

# Dictionary form of the first sentence. It is no longer used for the table
# below but is still assigned so it stays available for interactive inspection.
sent_dict = parsed.sentences[0].to_dict()

# Tabulate token, relation and head text for the first sentence only.
# The head is looked up through the sentence's word list (word ids are 1-based,
# so head id h lives at index h - 1) instead of indexing the to_dict() list.
# NOTE(review): to_dict() presumably also emits range entries for multi-word
# tokens, which carry no 'head' and would shift list positions — confirm
# against the Stanza data-object docs.
first_sentence_words = parsed.sentences[0].words
for word in first_sentence_words:
    head_text = first_sentence_words[word.head - 1].text if word.head > 0 else 'ROOT'
    print("{:<15} | {:<10} | {:<15} "
          .format(str(word.text), str(word.deprel), str(head_text)))


('The', 4, 'det')
('most', 3, 'advmod')
('bizarre', 4, 'amod')
('thing', 8, 'nsubj')
('in', 7, 'case')
('the', 7, 'det')
('world', 4, 'nmod')
('happened', 0, 'root')
('to', 10, 'case')
('me', 8, 'obl')
('.', 8, 'punct')
Token           | Relation   | Head            
--------------------------------------------------
The             | det        | thing           
most            | advmod     | bizarre         
bizarre         | amod       | thing           
thing           | nsubj      | happened        
in              | case       | world           
the             | det        | world           
world           | nmod       | thing           
happened        | root       | ROOT            
to              | case       | me              
me              | obl        | happened        
.               | punct      | happened        


In [25]:
#sent_dict
# Debugging aid: list the attributes exposed by a word object, then print each
# word of the first sentence as "<id> <text>: <head id> <deprel>".
# NOTE(review): the saved output of this cell is for a different sentence than
# the `text` currently assigned in the parsing cell, so `parsed` depended on
# out-of-order execution — re-run the notebook top to bottom before trusting it.
print(dir(parsed.sentences[0].words[0]))
for w in parsed.sentences[0].words:
    # Width 5 for the id and width 9 for the text keep the columns aligned.
    print(f'{w.id:5} {w.text:9}: {w.head} {w.deprel}')


['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_deprel', '_deps', '_end_char', '_feats', '_head', '_id', '_is_null', '_lemma', '_misc', '_parent', '_sent', '_start_char', '_text', '_upos', '_xpos', 'add_property', 'deprel', 'deps', 'end_char', 'feats', 'head', 'id', 'lemma', 'misc', 'parent', 'pos', 'pretty_print', 'sent', 'start_char', 'text', 'to_conll_text', 'to_dict', 'upos', 'xpos']
    1 When     : 4 advmod
    2 he       : 4 nsubj
    3 first    : 4 advmod
    4 met      : 17 advcl
    5 the      : 7 det
    6 FTX      : 7 compound
    7 founder  : 4 obj
    8 Sam      : 7 appos
    9 Bankman  : 8 flat
   10 -        : 11 punct
   11 Fried    : 8 flat
   12 in       : 14 case
   13 lat

In [4]:
# Map each word id of the first sentence to the id of its syntactic head.
# A head id of 0 marks the root of the dependency tree.
head_map = {word.id: word.head for word in parsed.sentences[0].words}
head_map

{1: 4,
 2: 4,
 3: 4,
 4: 17,
 5: 7,
 6: 7,
 7: 4,
 8: 7,
 9: 8,
 10: 11,
 11: 8,
 12: 14,
 13: 14,
 14: 4,
 15: 4,
 16: 17,
 17: 0,
 18: 23,
 19: 21,
 20: 19,
 21: 23,
 22: 23,
 23: 17,
 24: 26,
 25: 26,
 26: 17,
 27: 30,
 28: 30,
 29: 30,
 30: 26,
 31: 37,
 32: 37,
 33: 37,
 34: 35,
 35: 33,
 36: 33,
 37: 30,
 38: 17}

In [10]:
from typing import TypeVar, Iterable

def reverse_dict(d):
    """
    Invert a mapping, grouping the original keys by the value they map to.

    :param d: Dict whose values are hashable.
    :return: Dict mapping each distinct value of `d` to the list of keys that
        mapped to it, in the iteration order of `d`.
    """
    inverted = {}
    for source, target in d.items():
        if target not in inverted:
            inverted[target] = []
        inverted[target].append(source)
    return inverted


T = TypeVar('T')

def dfs(node: T, visited: set[T], graph: dict[T, set[T]]):
    """
    Mark every node reachable from `node` by adding it to `visited` in place.

    The traversal uses an explicit stack rather than recursion, so deep graphs
    (for example a long chain) do not hit Python's recursion limit.

    :param node: Start node. It is always added to `visited` and expanded, even
        if it is already a member.
    :param visited: Set updated in place. Nodes already in it on entry (other
        than `node`) are treated as explored and are not expanded.
    :param graph: Adjacency mapping from a node to an iterable of its
        neighbors. Nodes missing from the mapping have no neighbors.
    :return: None; the result is the mutated `visited` set.
    """
    visited.add(node)
    pending = [node]
    while pending:
        current = pending.pop()
        for neighbor in graph.get(current, ()):
            if neighbor not in visited:
                # Mark on push so a node is never queued twice.
                visited.add(neighbor)
                pending.append(neighbor)

def transitive_closure(d: dict[T, set[T]], keys: Iterable[T]) -> dict[T, set[T]]:
    """
    Compute, for each requested key, the set of nodes reachable from it via `d`.

    :param d: Dict defining the relation whose closure is taken. (a, b) is in
        the relation iff `b in d[a]`; keys absent from `d` have no successors.
    :param keys: Keys desired in the result.
    :return: Dict mapping each key in `keys` to the set of everything reachable
        from it. The key itself is always included in its own set.
    """
    result = {}
    for start in keys:
        # Seed with the start node so every key appears in its own closure.
        reachable = set()
        reachable.add(start)
        dfs(start, reachable, d)
        result[start] = reachable
    return result

In [11]:
from pprint import pprint

# Invert the child -> head map into head -> [children].
rev_map = reverse_dict(head_map)
pprint(rev_map)

# For every word id, collect the id itself plus all of its transitive dependents.
constituent_map = transitive_closure(rev_map, head_map.keys())
pprint(constituent_map)

# Print each word next to the text of its subtree, in sentence order.
words = parsed.sentences[0].words
for w in words:
    if w.id in constituent_map:
        cons = constituent_map[w.id]
        # Word ids are 1-based, so id c is stored at words[c - 1].
        cons_text = ' '.join([words[c - 1].text for c in sorted(cons)])
        print(f"{w.id:3} {w.text:10}: {w.deprel:10} {cons_text}")

{0: [17],
 4: [1, 2, 3, 7, 14, 15],
 7: [5, 6, 8],
 8: [9, 11],
 11: [10],
 14: [12, 13],
 17: [4, 16, 23, 26, 38],
 19: [20],
 21: [19],
 23: [18, 21, 22],
 26: [24, 25, 30],
 30: [27, 28, 29, 37],
 33: [35, 36],
 35: [34],
 37: [31, 32, 33]}
{1: {1},
 2: {2},
 3: {3},
 4: {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
 5: {5},
 6: {6},
 7: {5, 6, 7, 8, 9, 10, 11},
 8: {8, 9, 10, 11},
 9: {9},
 10: {10},
 11: {10, 11},
 12: {12},
 13: {13},
 14: {12, 13, 14},
 15: {15},
 16: {16},
 17: {1,
      2,
      3,
      4,
      5,
      6,
      7,
      8,
      9,
      10,
      11,
      12,
      13,
      14,
      15,
      16,
      17,
      18,
      19,
      20,
      21,
      22,
      23,
      24,
      25,
      26,
      27,
      28,
      29,
      30,
      31,
      32,
      33,
      34,
      35,
      36,
      37,
      38},
 18: {18},
 19: {19, 20},
 20: {20},
 21: {19, 20, 21},
 22: {22},
 23: {18, 19, 20, 21, 22, 23},
 24: {24},
 25: {25},
 26: {32, 33, 3

In [12]:
# Show the raw text of the first parsed sentence.
print(parsed.sentences[0].text)

When he first met the FTX founder Sam Bankman-Fried in late 2021, he took the cargo-shorted chief executive on a walk through the eucalyptus trees near his Berkeley, Calif., home.


In [17]:
# Same sentence text as a bare expression, so the notebook displays its repr
# (a quoted string) rather than printed output.
parsed.sentences[0].text

'When he first met the FTX founder Sam Bankman-Fried in late 2021, he took the cargo-shorted chief executive on a walk through the eucalyptus trees near his Berkeley, Calif., home.'