In [None]:
# default_exp api

In [None]:
#hide
from nbdev.showdoc import show_doc

# API
> The UCREL Tool Chain API class:


In [None]:
# export

import functools
from typing import Optional, List, Tuple
import re
from xml.sax import saxutils

import requests

from ucrel_api.ucrel_doc import UCREL_Doc, UCREL_Token

class UCREL_API():
    '''
    Client for the UCREL Tool Chain REST API.

    An instance stores the connection details (user email, server address,
    optional port and request timeout) and exposes one method per tool, 
    currently only `usas`.
    '''

    # Characters that have to be sent to, and are returned from, the server 
    # as SGML entities. `&` is deliberately not in this mapping as it has to 
    # be handled separately: it is part of every other escaped entity.
    SGML_ENTITY_MAPPER = {'£': '&pound;', 
                          'é': '&eacute;', '<': '&lt;', 
                          '>': '&gt;', '[': '&lsqb;', 
                          ']': '&rsqb;'}
    REVERSE_SGML_ENTITY_MAPPER = {v: k for k, v in SGML_ENTITY_MAPPER.items()}

    @classmethod
    def _sgml_entity_escape(cls, text: str) -> str:
        '''
        The SGML entities that are escaped are those found in 
        the [CLAWS input/output format guidelines.](http://ucrel.lancs.ac.uk/claws/format.html)

        1. **text**: Text to escape
        
        **returns** The text with `&` and every character in 
        `SGML_ENTITY_MAPPER` replaced by its SGML entity.
        '''
        # Need to escape & first as it is contained in the other escaped 
        # entities, escaping it afterwards would corrupt those entities.
        text = text.replace('&', '&amp;')
        for entity, escaped_entity in cls.SGML_ENTITY_MAPPER.items():
            text = text.replace(entity, escaped_entity)
        return text
    
    @classmethod
    def _sgml_entity_un_escape(cls, text: str) -> str:
        '''
        The SGML entities that are un-escaped are those found in 
        the [CLAWS input/output format guidelines.](http://ucrel.lancs.ac.uk/claws/format.html)

        This is the exact inverse of `_sgml_entity_escape`.

        1. **text**: Text to un-escape
        
        **returns** The text with every SGML entity in 
        `REVERSE_SGML_ENTITY_MAPPER`, and `&amp;`, replaced by the character 
        it represents.
        '''
        for escaped_entity, entity in cls.REVERSE_SGML_ENTITY_MAPPER.items():
            text = text.replace(escaped_entity, entity)
        # `&amp;` has to be un-escaped last (the reverse order of escaping), 
        # otherwise text that literally contained an entity e.g. `&lt;`, 
        # which is escaped to `&amp;lt;`, would be un-escaped twice to `<`.
        return text.replace('&amp;', '&')

    def __init__(self, email: str, server_address: str, 
                 port: str = '', timeout: int = 60) -> None:
        '''
        Creates a UCREL API instance that is used to call the UCREL Tool chain.
        
        1. **email**: Email address of the user. This is used to identify the user 
        calling the UCREL Tool Chain API.
        2. **server_address**: The address of the UCREL Tool Chain e.g. 
        [http://ucrel-api.lancaster.ac.uk](http://ucrel-api.lancaster.ac.uk)
        3. **port**: The port to the server e.g. 8080. Can be left as empty string
        if port number is not required.
        4. **timeout**: The amount of time to allow each request to take before raising 
        a [`requests.exceptions.Timeout`](https://requests.readthedocs.io/en/latest/api/#requests.Timeout)
        '''
        self.email = email
        self.server_address = server_address
        self.port = port
        self.timeout = timeout

    def _ucrel_post_request(self, endpoint: str, text: str, **data_kwargs) -> str:
        '''
        1. **endpoint**: The POST endpoint of the UCREL Tool Chain server 
        to call. The endpoint is expected to require `text` key in the 
        multipart form data.
        2. **text**: The text to be processed by the given `endpoint`. 
        Leading and trailing whitespace is removed and SGML entities are 
        escaped before it is sent.
        3. **data_kwargs**: Optional, additional `key: value` data to 
        be sent with the multipart form data. The `style` and `text` keys 
        are always set by this method and cannot be overridden.

        **returns**: The string response from the UCREL Tool Chain 
        server after calling the given `endpoint` with the given `text`
        and any `data_kwargs`.

        **raises requests.exceptions.Timeout**: If the response from the POST request
        takes longer than `self.timeout`.
        **raises requests.exceptions.HTTPError**: If anything other than a status code 200
        is returned from the `endpoint`.
        **raises Exception**: If any error occurs while processing the POST request
        to the `endpoint`. The exception raised is of the same type as the 
        original error, with the URL added to the message.
        '''
        url = f'{self.server_address}{endpoint}'
        if self.port:
            url = f'{self.server_address}:{self.port}{endpoint}'
        # Escape the SGML entities
        escaped_text = self._sgml_entity_escape(text.strip())
        # Type here refers to the fact we want to use the REST API.
        # Style refers to the output format, `tab` returns one token per 
        # line with tab separated fields, which is what `usas` parses.
        # `data_kwargs` comes before `style` and `text` so that it can never 
        # replace them.
        data = {'type': 'rest', 'email': self.email, 
                **data_kwargs, 'style': 'tab',  'text': escaped_text}
        # NOTE(review): `files=` normally makes requests generate its own 
        # multipart Content-Type, this explicit header presumably replaces 
        # it -- confirm the server relies on this before changing it.
        headers = {'Accept':'text/plain; charset=utf-8', 
                   'Content-Type': 'text/plain; charset=utf-8'}
        try:
            post_response = requests.post(url, files=data, timeout=self.timeout,
                                          headers=headers)
            status_code = post_response.status_code
            if status_code != 200:
                error_msg = (f'Raised a status code of {status_code}. '
                             'Can only accept code 200.')
                raise requests.exceptions.HTTPError(error_msg)
            return post_response.text
        except requests.exceptions.Timeout as timeout_error:
            error_message = (f'URL: {url}. Failed due to a timeout for the '
                             f'request, timeout set to {self.timeout} seconds.')
            raise requests.exceptions.Timeout(error_message) from timeout_error
        except Exception as e:
            # Re-raise as the same exception type, so that callers can still 
            # catch e.g. `HTTPError`, but with the URL in the message.
            error_message = f'URL: {url}\nError: {str(e)}'
            try:
                url_error = type(e)(error_message)
            except Exception:
                # This exception type cannot be built from a single message, 
                # raising the original is better than masking it.
                raise e from None
            raise url_error from e

    def usas(self, text: str, tagset: str = 'c7') -> UCREL_Doc:
        '''
        1. **text**: The text to be tagged by USAS.
        2. **tagset**: The tagset to be used by USAS. Either `c5` or `c7`. 
        
        **returns**: A `UCREL_Doc` representing the text and the 
        linguistic attributes that are generated from tagging it 
        with [USAS.](http://ucrel.lancs.ac.uk/usas/) Only the most likely 
        USAS tag of each token is kept. If the server returns no data the 
        `UCREL_Doc` has no tokens and no sentences.
        '''
        # Call USAS endpoint.
        usas_endpoint = '/cgi-bin/usas.pl'
        usas_data = self._ucrel_post_request(usas_endpoint, text, tagset=tagset)
        usas_data = usas_data.strip()
        if not usas_data:
            return UCREL_Doc(text, tokens=[], sentence_indexes=[])
        
        ucrel_tokens: List[UCREL_Token] = []
        # (start, end) token index of each sentence, end is exclusive.
        sentence_indexes: List[Tuple[int, int]] = []
        token_index = 0
        last_sentence_index = 0
        sentence_end_tag = '</s>'

        # Each sentence is wrapped in `<s>` and `</s>` with one token per line.
        for sentence in usas_data.split('<s>'):
            sentence = sentence.strip()
            # Remove the closing tag as a whole suffix. `str.rstrip` cannot 
            # be used as it removes any trailing `<`, `/`, `s`, or `>` 
            # characters and could therefore remove part of the last token.
            if sentence.endswith(sentence_end_tag):
                sentence = sentence[:-len(sentence_end_tag)].rstrip()
            if not sentence:
                continue
            for token_values in sentence.split('\n'):
                token_values = token_values.strip()
                if not token_values:
                    continue

                token_values = token_values.split('\t')
                token_text, pos_tag, lemma, usas_tags = None, None, None, None
                # Punctuation does not get tagged with USAS tags.
                if len(token_values) == 3:
                    token_text, pos_tag, lemma = token_values
                else:
                    token_text, pos_tag, lemma, usas_tags = token_values
                token_text = self._sgml_entity_un_escape(token_text)
                lemma = self._sgml_entity_un_escape(lemma)
                usas_tag = None
                mwe_tag = None
                if usas_tags is not None:
                    # Most likely USAS tag
                    usas_tag = usas_tags.split()[0]
                    # Get the MWE tag, when it exists it follows the USAS 
                    # tag and is separated from it by `[i`
                    usas_and_mwe = usas_tag.split('[i')
                    if len(usas_and_mwe) == 2:
                        usas_tag, mwe_tag = usas_and_mwe

                ucrel_tokens.append(UCREL_Token(token_text, lemma=lemma, 
                                                pos_tag=pos_tag, 
                                                usas_tag=usas_tag, mwe_tag=mwe_tag))
                token_index += 1
            sentence_indexes.append((last_sentence_index, token_index))
            last_sentence_index = token_index

        return UCREL_Doc(text, tokens=ucrel_tokens, sentence_indexes=sentence_indexes)

    def __repr__(self) -> str:
        '''
        String representation of the UCREL API instance, format:
        
        UCREL API, server address {self.server_address}, port {self.port}, timeout {self.timeout} seconds

        `, port {self.port}` -- will only exist in string if `self.port!=''`
        '''
        base_repr = f'UCREL API, server address {self.server_address}'
        if self.port:
            base_repr += f', port {self.port}'
        base_repr += f', timeout {self.timeout} seconds'
        return base_repr

In [None]:
show_doc(UCREL_API.__init__)

<h4 id="UCREL_API.__init__" class="doc_header"><code>UCREL_API.__init__</code><a href="__main__.py#L56" class="source_link" style="float:right">[source]</a></h4>

> <code>UCREL_API.__init__</code>(**`email`**:`str`, **`server_address`**:`str`, **`port`**:`str`=*`''`*, **`timeout`**:`int`=*`60`*)

Creates a UCREL API instance that is used to call the UCREL Tool chain.

1. **email**: Email address of the user. This is used to identify the user 
calling the UCREL Tool Chain API.
2. **server_address**: The address of the UCREL Tool Chain e.g. 
[http://ucrel-api.lancaster.ac.uk](http://ucrel-api.lancaster.ac.uk)
3. **port**: The port to the server e.g. 8080. Can be left as empty string
if port number is not required.
4. **timeout**: The amount of time to allow each request to take before raising 
a `requests.exceptions.Timeout`

In [None]:
# API instance used by the examples below, no port is given so the default 
# empty port is used.
ucrel_api = UCREL_API('a.moore@lancaster.ac.uk', 'http://ucrel-api.lancaster.ac.uk')

In [None]:
show_doc(UCREL_API.__repr__)

<h4 id="UCREL_API.__repr__" class="doc_header"><code>UCREL_API.__repr__</code><a href="__main__.py#L185" class="source_link" style="float:right">[source]</a></h4>

> <code>UCREL_API.__repr__</code>()

String representation of the UCREL API instance, format:

UCREL API, server address {self.server_address}, port {self.port}, timeout {self.timeout}

`, port {self.port}` -- will only exist in string if `self.port!=''`

In [None]:
ucrel_api

UCREL API, server address http://ucrel-api.lancaster.ac.uk, timeout 60 seconds

In [None]:
show_doc(UCREL_API.usas)

<h4 id="UCREL_API.usas" class="doc_header"><code>UCREL_API.usas</code><a href="__main__.py#L124" class="source_link" style="float:right">[source]</a></h4>

> <code>UCREL_API.usas</code>(**`text`**:`str`, **`tagset`**:`str`=*`'c7'`*)

1. **text**: The text to be tagged by USAS.
2. **tagset**: The tagset to be used by USAS. Either `c5` or `c7`. 

**returns**: A [`UCREL_Doc`](/ucrel-python-api/ucrel_doc.html#UCREL_Doc) representing the text and the 
linguistic attributes that are generated from tagging it 
with [USAS.](http://ucrel.lancs.ac.uk/usas/)

In [None]:
# Tag three sentences with USAS and print every token, sentence by sentence.
# Each string fragment ends with a space so that the sentences are not 
# joined together when the fragments are concatenated.
ucrel_doc = ucrel_api.usas(('Hope you have a nice day. '
                            'Works with SGML entities e.g. 5 > 4. '
                            'Also with MWE like New York.'))
for index, sentence in enumerate(ucrel_doc.sentences):
    print(f'Sentence {index}')
    for token in sentence:
        print(token)
    # Blank lines between the sentences, but not after the last one.
    if index in (0, 1):
        print('\n')

Sentence 0
UCREL Token: Hope	Lemma: hope	POS tag: VV0	USAS tag: X2.6+
UCREL Token: you	Lemma: you	POS tag: PPY	USAS tag: Z8mf
UCREL Token: have	Lemma: have	POS tag: VH0	USAS tag: A9+
UCREL Token: a	Lemma: a	POS tag: AT1	USAS tag: Z5
UCREL Token: nice	Lemma: nice	POS tag: JJ	USAS tag: O4.2+
UCREL Token: day	Lemma: day	POS tag: NNT1	USAS tag: T1.3
UCREL Token: .	Lemma: PUNC	POS tag: .


Sentence 1
UCREL Token: Works	Lemma: works	POS tag: NN	USAS tag: I4/H1c
UCREL Token: with	Lemma: with	POS tag: IW	USAS tag: Z5
UCREL Token: SGML	Lemma: sgml	POS tag: NP1	USAS tag: Z99
UCREL Token: entities	Lemma: entity	POS tag: NN2	USAS tag: O2
UCREL Token: e.g.	Lemma: e.g.	POS tag: REX	USAS tag: A4.1
UCREL Token: 5	Lemma: 5	POS tag: MC	USAS tag: N1
UCREL Token: >	Lemma: >	POS tag: FO	USAS tag: Z99
UCREL Token: 4	Lemma: 4	POS tag: MC	USAS tag: N1
UCREL Token: .	Lemma: PUNC	POS tag: .


Sentence 2
UCREL Token: Also	Lemma: also	POS tag: RR	USAS tag: N5++
UCREL Token: with	Lemma: with	POS tag: IW	USAS tag: 

**Note** that even though `New York` is the first `MWE` identified, as shown above, it has the `MWE tag`s `2.2.1` and `2.2.2`, where the first number being `2` suggests that an MWE had already been found earlier. In fact, the USAS, POS, and MWE tags shown above are only the most likely tags: other, less probable tags are generated for each token, but they are not shown here as we only output the most probable tag for each token.

In [None]:
#hide

from ucrel_api.api import UCREL_API

def test_ucrel_api_repr() -> None:
    '''
    Checks the string representation of `UCREL_API` without a port, 
    with a port, and with a port and a non-default timeout.
    '''
    required = {'email':'test@example.com', 'server_address':'127.0.0.1'}
    # (optional constructor arguments, expected string representation)
    cases = [
        ({}, 
         'UCREL API, server address 127.0.0.1, timeout 60 seconds'),
        ({'port': '8070'}, 
         'UCREL API, server address 127.0.0.1, port 8070, timeout 60 seconds'),
        ({'port': '8070', 'timeout': 1}, 
         'UCREL API, server address 127.0.0.1, port 8070, timeout 1 seconds'),
    ]
    for optional, expected_repr in cases:
        api = UCREL_API(**required, **optional)
        assert expected_repr == str(api)
test_ucrel_api_repr()

In [None]:
#hide

from time import sleep
from typing import Optional

import pytest

from ucrel_api.ucrel_doc import UCREL_Doc
from ucrel_api.ucrel_token import UCREL_Token
from ucrel_api.api import UCREL_API

# Expected tokens, as (text, lemma, POS tag, USAS tag), for the text 
# "hello how are you it's. Great day". Slices of this list are the expected 
# tokens for the shorter texts used in `test_ucrel_api_usas`.
# The lemma of `.` is `PUNC` as that is what the API returns for punctuation.
DOC_TOKENS = [UCREL_Token('hello', 'hello','UH', 'Z4'), UCREL_Token('how', 'how', 'RRQ', 'Z5'), 
              UCREL_Token('are', 'be','VBR', 'A3+'), UCREL_Token('you', 'you', 'PPY', 'Z8mf'),
              UCREL_Token('it', 'it', 'PPH1', 'Z8'), UCREL_Token("'s", "be", 'VBZ', 'A3+'),
              UCREL_Token('.', 'PUNC','.', None), UCREL_Token('Great', 'great', 'JJ', 'A5.1+'),
              UCREL_Token('day', 'day', 'NNT1', 'T1.3')]

#@pytest.mark.parametrize("port", [None, '80'])
def test_ucrel_api_usas(port: Optional[str] = None) -> None:
    '''
    Calls the live UCREL API and checks the `UCREL_Doc` returned by 
    `UCREL_API.usas` for: empty text, one token, one sentence, two sentences, 
    text containing SGML entities, and text containing a MWE.

    1. **port**: Optional port to call the API on, if `None` no port is used.
    '''
    test_api = UCREL_API(email='a.moore@lancaster.ac.uk', 
                         server_address='http://ucrel-api.lancaster.ac.uk')
    # Test port
    if port is not None:
        test_api = UCREL_API(email='a.moore@lancaster.ac.uk', 
                             server_address='http://ucrel-api.lancaster.ac.uk',
                             port=port)
    # None case
    value = test_api.usas('')
    empty_doc = UCREL_Doc("", tokens=[], sentence_indexes=[])
    assert value == empty_doc
    # sleeps are to ensure not too many calls are made to the API
    sleep(0.5)

    # One sentence and token
    value = test_api.usas('hello')
    _doc = UCREL_Doc('hello', tokens=[DOC_TOKENS[0]], 
                     sentence_indexes=[(0,1)])
    assert value == _doc

    sleep(0.5)
    # One sentence many tokens
    value = test_api.usas("hello how are you it's")
    _doc = UCREL_Doc("hello how are you it's", tokens=DOC_TOKENS[:6], 
                     sentence_indexes=[(0,6)])
    assert value == _doc

    sleep(0.5)
    # Two sentences
    value = test_api.usas("hello how are you it's. Great day")
    _doc = UCREL_Doc("hello how are you it's. Great day", tokens=DOC_TOKENS, 
                     sentence_indexes=[(0,7), (7,9)])
    # Without this assert the two sentence case was never checked.
    assert value == _doc
    
    sleep(0.5)
    # Test the un-escaping of SGML entities
    value = test_api.usas("hello []")
    sgml_tokens = [UCREL_Token('hello', 'hello', 'NN1%', 'S1.1.1'), 
                   UCREL_Token('[', '[', '(', None), 
                   UCREL_Token(']', ']', ')', None)]
    assert value == UCREL_Doc("hello []", tokens=sgml_tokens, sentence_indexes=[(0,3)])

    sleep(0.5)
    value = test_api.usas("hello £100")
    sgml_tokens = [UCREL_Token('hello', 'hello', 'UH', 'Z4'), 
                   UCREL_Token('£100', '£100', 'NNU', 'I1')]
    assert value == UCREL_Doc("hello £100", tokens=sgml_tokens, sentence_indexes=[(0,2)])

    sleep(0.5)
    value = test_api.usas("hello <>")
    sgml_tokens = [UCREL_Token('hello', 'hello', 'UH', 'Z4'), 
                   UCREL_Token('<>', '<>', 'FO', 'Z99')]
    assert value == UCREL_Doc("hello <>", tokens=sgml_tokens, sentence_indexes=[(0,2)])

    # All of the SGML entities, including `&`, in one text
    sgml_tokens = [UCREL_Token('hello', 'hello', 'UH', 'Z4'),
                   UCREL_Token('&', '&', 'CC', 'Z5'),
                   UCREL_Token('another', 'another', 'DD1', 'A6.1-'),
                   UCREL_Token('£100', '£100', 'NNU', 'I1'),
                   UCREL_Token('<>', '<>', 'FO', 'Z99'),
                   UCREL_Token('[', '[', '(', None),
                   UCREL_Token(']', ']', ')', None),
                   UCREL_Token('André', 'andré', 'NP1', 'Z99'),
                   UCREL_Token('&', '&', 'CC', 'Z5')]

    sleep(0.5)
    value = test_api.usas("hello & another £100 <> [] André &")
    assert value == UCREL_Doc("hello & another £100 <> [] André &", tokens=sgml_tokens, 
                              sentence_indexes=[(0,9)])
    # MWE
    sleep(0.5)
    new_york_tokens = [UCREL_Token('hello', 'hello', 'UH', 'Z4'),
                       UCREL_Token('New', 'new', 'NP1', 'Z2', '2.2.1'),
                       UCREL_Token('York', 'york', 'NP1', 'Z2', '2.2.2')]
    value = test_api.usas("hello New York")
    assert value == UCREL_Doc("hello New York", tokens=new_york_tokens, 
                              sentence_indexes=[(0,3)])
test_ucrel_api_usas()

In [None]:
# hide

from time import sleep

import pytest
import responses
import requests

from ucrel_api.api import UCREL_API

# Shared by the `_ucrel_post_request` tests below: the server address and 
# endpoint form the URL that is mocked, and `TEST_API` is the instance that 
# sends the requests.
SERVER_ADDRESS = 'http://ucrel-api.lancaster.ac.uk'
ENDPOINT = '/cgi-bin/usas.pl'
# Sent as the `tagset` form data in every request.
TAGSET = 'c7'
TEST_API = UCREL_API(email='a.moore@lancaster.ac.uk', 
                     server_address=SERVER_ADDRESS)

@responses.activate
def test__timeout_ucrel_post_request():
    '''
    A timeout on the mocked POST request should be raised as a 
    `requests.exceptions.Timeout`.
    '''
    mocked_url = f'{SERVER_ADDRESS}{ENDPOINT}'
    responses.add('POST', mocked_url, body=requests.exceptions.Timeout())
    with pytest.raises(requests.exceptions.Timeout):
        TEST_API._ucrel_post_request(ENDPOINT, 'hello', tagset=TAGSET)

@responses.activate
def test__status_code_ucrel_post_request():
    '''
    A mocked response with a status code other than 200, here 301, should 
    be raised as a `requests.exceptions.HTTPError`.
    '''
    mocked_url = f'{SERVER_ADDRESS}{ENDPOINT}'
    responses.add('POST', mocked_url, status=301)
    with pytest.raises(requests.exceptions.HTTPError):
        TEST_API._ucrel_post_request(ENDPOINT, 'hello', tagset=TAGSET)

@responses.activate
def test__general_exception_ucrel_post_request():
    '''
    Any other error raised by the mocked POST request should be raised as 
    an `Exception`.
    '''
    mocked_url = f'{SERVER_ADDRESS}{ENDPOINT}'
    responses.add('POST', mocked_url, body=Exception('Unknown error.'))
    with pytest.raises(Exception):
        TEST_API._ucrel_post_request(ENDPOINT, 'hello', tagset=TAGSET)

def test__escaping_sgml_entities_ucrel_post_request():
    '''
    Calls the live USAS endpoint with texts that contain SGML entities and 
    checks the raw string response, in which the entities are still escaped.
    '''
    # (text to send, expected raw response)
    escaping_cases = [
        ("hello []",
         ('\n<s>\n</s>\n<s>\n'
          'hello\tNN1%\thello\tS1.1.1 \n'
          '&lsqb;\t(\t&lsqb;\t\n'
          '&rsqb;\t)\t&rsqb;\t\n'
          '</s>\n')),
        ("hello £100",
         ('\n<s>\n</s>\n<s>\n'
          'hello\tUH\thello\tZ4 \n'
          '&pound;100\tNNU\t&pound;100\tI1 \n'
          '</s>\n')),
        ("hello <>",
         ('\n<s>\n</s>\n<s>\n'
          'hello\tUH\thello\tZ4 \n'
          '&lt;&gt;\tFO\t&lt;&gt;\tZ99 \n'
          '</s>\n')),
        ("hello & another £100 <> [] André &",
         ('\n<s>\n</s>\n<s>\n'
          'hello\tUH\thello\tZ4 \n'
          '&amp;\tCC\t&amp;\tZ5 \n'
          'another\tDD1\tanother\tA6.1- N5++ \n'
          '&pound;100\tNNU\t&pound;100\tI1 \n'
          '&lt;&gt;\tFO\t&lt;&gt;\tZ99 \n'
          '&lsqb;\t(\t&lsqb;\t\n'
          '&rsqb;\t)\t&rsqb;\t\n'
          'Andr&eacute;\tNP1\tandr&eacute;\tZ99 \n'
          '&amp;\tCC\t&amp;\tZ5 \n'
          '</s>\n')),
    ]
    for sgml_text, expected_response in escaping_cases:
        # Sleep before each call so that the API is not called too often.
        sleep(0.5)
        assert TEST_API._ucrel_post_request(ENDPOINT, sgml_text, tagset=TAGSET) == expected_response

# Run the tests when this cell is executed. The first three use mocked 
# responses, the last one calls the live API.
test__timeout_ucrel_post_request()
test__status_code_ucrel_post_request()
test__general_exception_ucrel_post_request()
test__escaping_sgml_entities_ucrel_post_request()