In [None]:
#https://colab.research.google.com/github/CLARIN-PL/NlpRest2-Tutorials/blob/master/agenda.ipynb
#notebook → https://colab.research.google.com/github/CLARIN-PL/NlpRest2-Tutorials/blob/master/part1.ipynb
#https://colab.research.google.com/github/CLARIN-PL/NlpRest2-Tutorials/blob/master/part2.ipynb
#https://colab.research.google.com/github/CLARIN-PL/NlpRest2-Tutorials/blob/master/part3.ipynb

In [1]:
import json
import requests

# Base address of the CLARIN-PL NLP REST service; endpoint paths such as
# "/process" are appended to it by later cells.
# NOTE(review): this is plain HTTP - consider HTTPS if the service offers it (confirm).
clarinpl_url = "http://ws.clarin-pl.eu/nlprest2/base"
# Sent as the 'user' field of the request payloads built in later cells.
user_mail = "demo2019@nlpday.pl"

In [2]:
# Lookup table: tagger part-of-speech class -> coarse grammar label.
# The labels presumably abbreviate Polish terms (rzecz ~ noun-like,
# przym ~ adjective, przys ~ adverb, czas ~ verb-like); '?' marks classes
# without a coarse category. Groups are listed in the table's original
# key order so that iterating over the dict yields the same sequence.
gram_mapper = {
    tag: label
    for label, tags in (
        ('rzecz', ('subst', 'depr', 'num', 'numcol')),
        ('przym', ('adj', 'adja', 'adjp', 'adjc')),
        ('przys', ('adv',)),
        ('rzecz', ('ppron12', 'ppron3', 'siebie')),
        ('czas', ('fin', 'bedzie', 'aglt', 'praet', 'impt', 'imps',
                  'inf', 'pcon', 'pant', 'ger', 'pact', 'ppas')),
        ('?', ('winien', 'pred', 'prep', 'conj', 'comp', 'qub',
               'brev', 'burk', 'interj', 'interp', 'xxx', 'ign')),
    )
    for tag in tags
}

In [3]:
# Describe a tagging job for the service: which pipeline to run ('lpmn'),
# on what text, and on whose behalf. The body is sent as JSON, hence the header.
headers = {"content-type": "application/json"}
text = "Na płocie siedzi kot."
lpmn = "wcrft2"
url = "{}/process".format(clarinpl_url)
payload = dict(text=text, lpmn=lpmn, user=user_mail)

In [4]:
# Submit the job and keep the response body (CCL XML) in `ccl` for the
# parsing cells that follow.
# The timeout (seconds) stops the cell from hanging forever if the service
# never answers; requests waits indefinitely when no timeout is given.
r = requests.post(url, data=json.dumps(payload), headers=headers, timeout=60)
# Fail here on an HTTP error status instead of handing an error page to the
# XML parsing below, where it would surface as a confusing parse error.
r.raise_for_status()
ccl = r.content.decode('utf-8')
print(ccl)

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE chunkList SYSTEM "ccl.dtd">
<chunkList>
 <chunk id="ch1" type="p">
  <sentence id="s1">
   <tok>
    <orth>Na</orth>
    <lex disamb="1"><base>na</base><ctag>prep:acc</ctag></lex>
   </tok>
   <tok>
    <orth>płocie</orth>
    <lex disamb="1"><base>płot</base><ctag>subst:sg:loc:m3</ctag></lex>
   </tok>
   <tok>
    <orth>siedzi</orth>
    <lex disamb="1"><base>siedzieć</base><ctag>fin:sg:ter:imperf</ctag></lex>
   </tok>
   <tok>
    <orth>kot</orth>
    <lex disamb="1"><base>kot</base><ctag>subst:sg:nom:m1</ctag></lex>
   </tok>
   <ns/>
   <tok>
    <orth>.</orth>
    <lex disamb="1"><base>.</base><ctag>interp</ctag></lex>
   </tok>
  </sentence>
 </chunk>
</chunkList>



In [5]:
import xml.etree.ElementTree as ET

def ccl_orths(ccl):
    """Collect the text of every ``<orth>`` element of a CCL XML document.

    Args:
        ccl: The XML document as a string.

    Returns:
        A list with the text of each ``<orth>`` element, in document order.
    """
    root = ET.fromstring(ccl)
    surface_forms = []
    for orth_node in root.iter('orth'):
        surface_forms.append(orth_node.text)
    return surface_forms

# Pull the <orth> texts out of the service response stored in `ccl` and show them.
orths = ccl_orths(ccl)

print(orths)


['Na', 'płocie', 'siedzi', 'kot', '.']


In [6]:
def ccl_bases(ccl):
    """Return the base form (``<base>`` text) of every token in a CCL document.

    A token may carry several ``<lex>`` interpretations. The one marked
    ``disamb="1"`` is preferred; if a token has no such marker, its first
    ``<lex>`` is used.

    Args:
        ccl: The CCL XML document as a string.

    Returns:
        A list with one base form per ``<tok>`` element, in document order.

    Raises:
        AttributeError: If a token has no ``<lex>/<base>`` element at all.
    """
    tree = ET.fromstring(ccl)
    bases = []
    for tok in tree.iter('tok'):
        # Prefer the interpretation flagged as disambiguated; taking the
        # first <lex> blindly yields the wrong lemma when it is not the
        # flagged one.
        base = tok.find("./lex[@disamb='1']/base")
        if base is None:
            base = tok.find('./lex/base')
        bases.append(base.text)
    return bases

# Pull the base forms out of the service response stored in `ccl` and show them.
bases = ccl_bases(ccl)
    
print(bases)

['na', 'płot', 'siedzieć', 'kot', '.']


In [7]:
def ccl_poses(ccl):
    """Return the part-of-speech class of every token in a CCL document.

    The class is the part of the ``<ctag>`` text before the first ``:``
    (e.g. ``subst`` for ``subst:sg:nom:m1``). A token may carry several
    ``<lex>`` interpretations. The one marked ``disamb="1"`` is preferred;
    if a token has no such marker, its first ``<lex>`` is used.

    Args:
        ccl: The CCL XML document as a string.

    Returns:
        A list with one class name per ``<tok>`` element, in document order.

    Raises:
        AttributeError: If a token has no ``<lex>/<ctag>`` element at all.
    """
    tree = ET.fromstring(ccl)
    pos_classes = []
    for tok in tree.iter('tok'):
        # Prefer the interpretation flagged as disambiguated; taking the
        # first <lex> blindly yields the wrong class when it is not the
        # flagged one.
        ctag = tok.find("./lex[@disamb='1']/ctag")
        if ctag is None:
            ctag = tok.find('./lex/ctag')
        pos_classes.append(ctag.text.split(":")[0])
    return pos_classes

# Pull the part-of-speech classes out of the service response stored in `ccl` and show them.
poses = ccl_poses(ccl)

print(poses)

['prep', 'subst', 'fin', 'subst', 'interp']


In [8]:
def ccl_simple_grammar(ccl):
    """Map every token of a CCL document to a coarse grammar label.

    The part-of-speech classes returned by ``ccl_poses`` are looked up in
    the module-level ``gram_mapper`` table.

    Args:
        ccl: The CCL XML document, as accepted by ``ccl_poses``.

    Returns:
        A list with one ``gram_mapper`` label per token, in document order.
        A class that is missing from ``gram_mapper`` is reported as ``'?'``,
        the label the table itself uses for classes without a coarse category.
    """
    # .get() rather than indexing: a class the table does not list (for
    # example from a pipeline with a different tagset) should not abort the
    # whole document with a KeyError.
    return [gram_mapper.get(pos, '?') for pos in ccl_poses(ccl)]

# Map the tokens of the service response stored in `ccl` to coarse labels and show them.
simple_grammar = ccl_simple_grammar(ccl)

print(simple_grammar)

['?', 'rzecz', 'czas', 'rzecz', '?']


## Tag and recognize named entities (coarse-grained categories)

In [9]:
# Describe a second job for the same endpoint. The pipeline string now chains
# two steps with '|'; presumably 'liner2' adds the named-entity annotation on
# top of the 'wcrft2' tagging (confirm against the service documentation).
# Set lpmn to 'wcrft2' alone to run only the first step.
headers = {"content-type": "application/json"}
text = "Tony Halik przyszedł na świat w Toruniu"
lpmn = "wcrft2|liner2"
url = "{}/process".format(clarinpl_url)
payload = dict(text=text, lpmn=lpmn, user=user_mail)

In [10]:
# Submit the job and print the response body (CCL XML).
# The timeout (seconds) stops the cell from hanging forever if the service
# never answers; requests waits indefinitely when no timeout is given.
r = requests.post(url, data=json.dumps(payload), headers=headers, timeout=60)
# Fail here on an HTTP error status instead of printing an error page as if
# it were the analysis result.
r.raise_for_status()
print(r.content.decode('utf-8'))

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE chunkList SYSTEM "ccl.dtd">
<chunkList>
 <chunk type="p" id="ch1">
  <sentence id="s1">
   <tok>
    <orth>Tony</orth>
    <lex disamb="1"><base>ton</base><ctag>subst:pl:nom:m3</ctag></lex>
    <ann chan="nam" head="1">1</ann>
   </tok>
   <tok>
    <orth>Halik</orth>
    <lex disamb="1"><base>Halik</base><ctag>ign</ctag></lex>
    <ann chan="nam">1</ann>
   </tok>
   <tok>
    <orth>przyszedł</orth>
    <lex disamb="1"><base>przyjść</base><ctag>praet:sg:m1:perf</ctag></lex>
    <ann chan="nam">0</ann>
   </tok>
   <tok>
    <orth>na</orth>
    <lex disamb="1"><base>na</base><ctag>prep:acc</ctag></lex>
    <ann chan="nam">0</ann>
   </tok>
   <tok>
    <orth>świat</orth>
    <lex disamb="1"><base>świat</base><ctag>subst:sg:nom:m3</ctag></lex>
    <ann chan="nam">0</ann>
   </tok>
   <tok>
    <orth>w</orth>
    <lex disamb="1"><base>w</base><ctag>prep:acc:nwok</ctag></lex>
    <ann chan="nam">0</ann>
   </tok>
   <tok>
    <orth>Toruniu<