In [1]:
# Get Stanza from django: the pipeline factory lives in the project's
# `datacore.functions` module rather than being built here directly.
# NOTE(review): `stanza_phrase_analysis` is imported but not used in the
# cells visible in this notebook — confirm whether it is still needed.
from datacore.functions import get_stanza, stanza_phrase_analysis

# English pipeline with tokenization, POS tagging, lemmatization, dependency
# parsing and NER; the cells below read words, entities and dependencies
# from the documents it produces.
nlp = get_stanza(lang="en", processors="tokenize,pos,lemma,depparse,ner")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.0.json:   0%|   …

2022-09-09 14:10:45 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| ner       | ontonotes |

2022-09-09 14:10:45 INFO: Use device: cpu
2022-09-09 14:10:45 INFO: Loading: tokenize
2022-09-09 14:10:46 INFO: Loading: pos
2022-09-09 14:10:46 INFO: Loading: lemma
2022-09-09 14:10:47 INFO: Loading: depparse
2022-09-09 14:10:48 INFO: Loading: ner
2022-09-09 14:10:54 INFO: Done loading processors!


In [2]:
# Parse one sample text. The commented-out lines are alternative inputs that
# can be swapped in to exercise entities and longer dependency chains.
# doc = nlp("Tom is from The New York Times Journal, here for Tom.")
doc = nlp("This is a sentence.")
# doc = nlp("When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously.")
# Only the first sentence of the document is kept; the following cells all
# operate on `phrase`.
phrase = doc.sentences[0]
print(phrase)

[
  {
    "id": 1,
    "text": "This",
    "lemma": "this",
    "upos": "PRON",
    "xpos": "DT",
    "feats": "Number=Sing|PronType=Dem",
    "head": 4,
    "deprel": "nsubj",
    "start_char": 0,
    "end_char": 4,
    "ner": "O",
    "multi_ner": [
      "O"
    ]
  },
  {
    "id": 2,
    "text": "is",
    "lemma": "be",
    "upos": "AUX",
    "xpos": "VBZ",
    "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
    "head": 4,
    "deprel": "cop",
    "start_char": 5,
    "end_char": 7,
    "ner": "O",
    "multi_ner": [
      "O"
    ]
  },
  {
    "id": 3,
    "text": "a",
    "lemma": "a",
    "upos": "DET",
    "xpos": "DT",
    "feats": "Definite=Ind|PronType=Art",
    "head": 4,
    "deprel": "det",
    "start_char": 8,
    "end_char": 9,
    "ner": "O",
    "multi_ner": [
      "O"
    ]
  },
  {
    "id": 4,
    "text": "sentence",
    "lemma": "sentence",
    "upos": "NOUN",
    "xpos": "NN",
    "feats": "Number=Sing",
    "head": 0,
    "deprel": "root",
 

In [4]:
# Show the tokens of the current sentence individually (one print call per
# token) instead of dumping the whole token list at once.
for rendered in map(str, phrase.tokens):
    print(rendered)

[
  {
    "id": 1,
    "text": "Tina",
    "lemma": "Tina",
    "upos": "PROPN",
    "xpos": "NNP",
    "feats": "Number=Sing",
    "head": 2,
    "deprel": "compound",
    "start_char": 0,
    "end_char": 4,
    "ner": "B-PERSON"
  }
]
[
  {
    "id": 2,
    "text": "Arena",
    "lemma": "Arena",
    "upos": "PROPN",
    "xpos": "NNP",
    "feats": "Number=Sing",
    "head": 4,
    "deprel": "nsubj",
    "start_char": 5,
    "end_char": 10,
    "ner": "E-PERSON"
  }
]
[
  {
    "id": 3,
    "text": "is",
    "lemma": "be",
    "upos": "AUX",
    "xpos": "VBZ",
    "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
    "head": 4,
    "deprel": "cop",
    "start_char": 11,
    "end_char": 13,
    "ner": "O"
  }
]
[
  {
    "id": 4,
    "text": "here",
    "lemma": "here",
    "upos": "ADV",
    "xpos": "RB",
    "feats": "PronType=Dem",
    "head": 0,
    "deprel": "root",
    "start_char": 14,
    "end_char": 18,
    "ner": "O"
  }
]
[
  {
    "id": 5,
    "text": "in",


In [11]:
# Create a standard structure for the current sentence:
#   data          -> {word id: copy of the word, with feats expanded to a dict}
#   pos_template  -> POS tag of every word
#   xpos_template -> "POS(XPOS)" per word, or just "POS" when XPOS is empty
#   feat_template -> "POS(XPOS~feats)" per word (raw feats string)
import copy

data = {}
pos_template = []
xpos_template = []
feat_template = []
for word in phrase.words:
    # Work on a deep copy so that replacing `.feats` with a dict below does
    # not touch the word held by `phrase`; the templates still read the raw
    # feats string from the original word.
    data[word.id] = copy.deepcopy(word)
    if word.feats is not None:
        word_feats = {}
        for feat in str(word.feats).split("|"):
            # Split on the first "=" only, so a value that itself contains
            # "=" is kept whole instead of being silently truncated.
            # A pair without "=" fails loudly here.
            feat_name, feat_value = feat.split("=", 1)
            word_feats[feat_name] = feat_value
        data[word.id].feats = word_feats
    pos_template.append(word.pos)
    xpos_template.append(
        "{}({})".format(word.pos, word.xpos) if word.xpos else word.pos
    )
    # Words without features are rendered with a literal "None" after the
    # "~" (e.g. "ADP(IN~None)"); kept as-is so the template format is stable.
    feat_template.append("{}({}~{})".format(word.pos, word.xpos, word.feats))
print("-".join(pos_template))
print("-".join(xpos_template))
print("-".join(feat_template))
print(data)

PROPN-PROPN-AUX-ADV-ADP-PROPN-PUNCT
PROPN(NNP)-PROPN(NNP)-AUX(VBZ)-ADV(RB)-ADP(IN)-PROPN(NNP)-PUNCT(.)
PROPN(NNP~Number=Sing)-PROPN(NNP~Number=Sing)-AUX(VBZ~Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin)-ADV(RB~PronType=Dem)-ADP(IN~None)-PROPN(NNP~Number=Sing)-PUNCT(.~None)
{1: {
  "id": 1,
  "text": "Tina",
  "lemma": "Tina",
  "upos": "PROPN",
  "xpos": "NNP",
  "feats": {
    "Number": "Sing"
  },
  "head": 2,
  "deprel": "compound",
  "start_char": 0,
  "end_char": 4
}, 2: {
  "id": 2,
  "text": "Arena",
  "lemma": "Arena",
  "upos": "PROPN",
  "xpos": "NNP",
  "feats": {
    "Number": "Sing"
  },
  "head": 4,
  "deprel": "nsubj",
  "start_char": 5,
  "end_char": 10
}, 3: {
  "id": 3,
  "text": "is",
  "lemma": "be",
  "upos": "AUX",
  "xpos": "VBZ",
  "feats": {
    "Mood": "Ind",
    "Number": "Sing",
    "Person": "3",
    "Tense": "Pres",
    "VerbForm": "Fin"
  },
  "head": 4,
  "deprel": "cop",
  "start_char": 11,
  "end_char": 13
}, 4: {
  "id": 4,
  "text": "here",


In [10]:
# Materialize the entity spans of the sentence as a plain list and show it.
ent = [*phrase.ents]
print(ent)

[{
  "text": "Thrun",
  "type": "ORG",
  "start_char": 0,
  "end_char": 5
}, {
  "text": "Google",
  "type": "ORG",
  "start_char": 46,
  "end_char": 52
}, {
  "text": "2007",
  "type": "DATE",
  "start_char": 56,
  "end_char": 60
}]


In [11]:
print(phrase.dependencies)
dep_list = []
for dep in phrase.dependencies:
    # Each dependency is indexed as (governor, relation, dependent).
    # Emitted fields:
    #   (governor index, governor deprel, relation, dependent index)
    # The relation is the dependent's own deprel, so it is written only once;
    # the previous fifth argument had no placeholder and was silently ignored
    # by str.format.
    dep_list.append(
        "{},{},{},{}".format(
            dep[0].id,
            # The artificial root governor (id 0) carries no deprel.
            dep[0].deprel if dep[0].deprel is not None else "ROOT",
            dep[1],
            dep[2].id,
        )
    )
# One record per dependency, records separated by ";".
template_line = ";".join(dep_list)
print(template_line)

[({
  "id": 2,
  "text": "started",
  "lemma": "start",
  "upos": "VERB",
  "xpos": "VBD",
  "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
  "head": 0,
  "deprel": "root",
  "start_char": 6,
  "end_char": 13
}, 'nsubj', {
  "id": 1,
  "text": "Thrun",
  "lemma": "Thrun",
  "upos": "PROPN",
  "xpos": "NNP",
  "feats": "Number=Sing",
  "head": 2,
  "deprel": "nsubj",
  "start_char": 0,
  "end_char": 5
}), ({
  "id": 0,
  "text": "ROOT"
}, 'root', {
  "id": 2,
  "text": "started",
  "lemma": "start",
  "upos": "VERB",
  "xpos": "VBD",
  "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
  "head": 0,
  "deprel": "root",
  "start_char": 6,
  "end_char": 13
}), ({
  "id": 2,
  "text": "started",
  "lemma": "start",
  "upos": "VERB",
  "xpos": "VBD",
  "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
  "head": 0,
  "deprel": "root",
  "start_char": 6,
  "end_char": 13
}, 'xcomp', {
  "id": 3,
  "text": "working",
  "lemma": "work",
  "upos":

})]
2,root,nsubj,1;0,ROOT,root,2;2,root,xcomp,3;8,obl,case,4;7,amod,compound,5;5,compound,punct,6;8,obl,amod,7;3,xcomp,obl,8;10,obl,case,9;3,xcomp,obl,10;12,obl,case,11;3,xcomp,obl,12;20,parataxis,punct,13;15,nsubj,amod,14;20,parataxis,nsubj,15;19,nmod,case,16;19,nmod,case,17;19,nmod,det,18;15,nsubj,nmod,19;2,root,parataxis,20;20,parataxis,obj,21;20,parataxis,advmod,22;2,root,punct,23


In [8]:
# Dependency template: one "(id,POS,deprel,head id)" group per word of the
# sentence, with the groups chained by "-".
dep_list = [
    "({},{},{},{})".format(child.id, child.pos, child.deprel, head.id)
    for head, _relation, child in phrase.dependencies
]
template_line = "-".join(dep_list)
print(template_line)

(1,PROPN,compound,2)-(2,PROPN,nsubj,4)-(3,AUX,cop,4)-(4,ADV,root,0)-(5,ADP,case,6)-(6,PROPN,obl,4)-(7,PUNCT,punct,4)


In [9]:
# List every word of every sentence in the document on its own tab-separated
# line: id, text, head id, head text ("root" when the head id is 0) and
# dependency relation.
dep_lines = []
for sent in doc.sentences:
    for word in sent.words:
        # Head ids are 1-based, hence the -1 when indexing into sent.words.
        dep_lines.append(
            f'id: {word.id}\tword: {word.text}\thead id: {word.head}\thead: {sent.words[word.head-1].text if word.head > 0 else "root"}\tdeprel: {word.deprel}'
        )
print(*dep_lines, sep="\n")

id: 1	word: Tina	head id: 2	head: Arena	deprel: compound
id: 2	word: Arena	head id: 4	head: here	deprel: nsubj
id: 3	word: is	head id: 4	head: here	deprel: cop
id: 4	word: here	head id: 0	head: root	deprel: root
id: 5	word: in	head id: 6	head: London	deprel: case
id: 6	word: London	head id: 4	head: here	deprel: obl
id: 7	word: .	head id: 4	head: here	deprel: punct


In [10]:
# Visualize dependency in text format: one row per word, showing the word
# itself (id, text, POS, relation to its head) followed by its head (id, POS).
row_format = "{:<5} | {:<15} | {:<10} | {:<10} | {:<15} | {:<10}"

# Header labels follow the order of the values printed in the loop below:
# the first four columns describe the dependent word, the last two its head.
header = row_format.format("ID", "Text", "POS", "Relation", "Head ID", "Head POS")
print(header)
# Separator sized to the header so it always spans the full table width.
print("-" * len(header))

# Each entry of phrase.dependencies is indexed as (head, relation, dependent):
# dep[2] is the word being described, dep[0] is its head.
for dep in phrase.dependencies:
    print(
        row_format.format(
            str(dep[2].id),
            str(dep[2].text),
            str(dep[2].pos),
            str(dep[2].deprel),
            str(dep[0].id),
            # The artificial root head has no POS, so this prints "None".
            str(dep[0].pos),
        )
    )

ID    | Text            | Head POS   | Relation   | Dependent       | Dependent POS
---------------------------------------------------------------------------
1     | Tina            | PROPN      | compound   | 2               | PROPN     
2     | Arena           | PROPN      | nsubj      | 4               | ADV       
3     | is              | AUX        | cop        | 4               | ADV       
4     | here            | ADV        | root       | 0               | None      
5     | in              | ADP        | case       | 6               | PROPN     
6     | London          | PROPN      | obl        | 4               | ADV       
7     | .               | PUNCT      | punct      | 4               | ADV       


2022-02-25 00:28:35 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2022-02-25 00:28:35 INFO: Use device: cpu
2022-02-25 00:28:35 INFO: Loading: tokenize
2022-02-25 00:28:35 INFO: Loading: pos
2022-02-25 00:28:39 INFO: Done loading processors!


In [5]:
# Re-parse the sample sentence and keep its first sentence as `phrase`.
# Other inputs that were tried here:
#   "Tom is from The New York Times Journal, here for Tom."
#   "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
doc = nlp("This is a sentence.")
phrase = doc.sentences[0]
print(phrase)

# One aligned row per dependency: the word's id, text, POS and relation,
# followed by the id and POS of its head.
for head, _relation, child in phrase.dependencies:
    print(
        "{:<5} | {:<15} | {:<10} | {:<10} | {:<15} | {:<10}".format(
            *map(
                str,
                (child.id, child.text, child.pos, child.deprel, head.id, head.pos),
            )
        )
    )

[
  {
    "id": 1,
    "text": "This",
    "lemma": "this",
    "upos": "PRON",
    "xpos": "DT",
    "feats": "Number=Sing|PronType=Dem",
    "head": 4,
    "deprel": "nsubj",
    "start_char": 0,
    "end_char": 4,
    "ner": "O",
    "multi_ner": [
      "O"
    ]
  },
  {
    "id": 2,
    "text": "is",
    "lemma": "be",
    "upos": "AUX",
    "xpos": "VBZ",
    "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin",
    "head": 4,
    "deprel": "cop",
    "start_char": 5,
    "end_char": 7,
    "ner": "O",
    "multi_ner": [
      "O"
    ]
  },
  {
    "id": 3,
    "text": "a",
    "lemma": "a",
    "upos": "DET",
    "xpos": "DT",
    "feats": "Definite=Ind|PronType=Art",
    "head": 4,
    "deprel": "det",
    "start_char": 8,
    "end_char": 9,
    "ner": "O",
    "multi_ner": [
      "O"
    ]
  },
  {
    "id": 4,
    "text": "sentence",
    "lemma": "sentence",
    "upos": "NOUN",
    "xpos": "NN",
    "feats": "Number=Sing",
    "head": 0,
    "deprel": "root",
 