In [None]:
# default_exp ucrel_doc

In [None]:
# hide
from nbdev.showdoc import show_doc

# UCREL Doc
> The UCREL Doc class holds text-level linguistic information, which is stored as a list of UCREL Token instances.

In [None]:
# export

from collections import abc
from typing import Any, Iterable, List, Optional, Tuple, Union

from ucrel_api.ucrel_token import UCREL_Token

class UCREL_Doc(abc.Iterable, abc.Sized):
    '''
    Class that holds text level linguistic information which is stored as
    a list of `UCREL_Token`s. A text here can be anything from one word
    to a whole book or larger. However be careful how much information
    you store in this class as it is all stored in memory.

    This class is inspired by the [Doc](https://spacy.io/api/doc)
    class from the [SpaCy API.](https://spacy.io/api)

    **inherits from**: [collections.abc.Iterable](https://docs.python.org/3/library/collections.abc.html#collections.abc.Iterable)
    and [collections.abc.Sized](https://docs.python.org/3/library/collections.abc.html#collections.abc.Sized)
    '''
    def __init__(self, text: str, tokens: List[UCREL_Token],
                 sentence_indexes: Optional[List[Tuple[int,int]]] = None
                 ) -> None:
        '''
        1. **text**: The text the Doc is representing.
        2. **tokens**: List of `UCREL_Token`s
        3. **sentence_indexes**: A List of Tuples. Where each tuple
           contains a start and an end token index representing
           the start and end of the sentence. These are used to
           create the `sentences` property. Can be accessed through
           `self._sentence_indexes`. **Optional**
        '''
        self.text = text
        # The list is stored by reference, it is not copied.
        self.tokens = tokens
        self._sentence_indexes = sentence_indexes

    def __repr__(self) -> str:
        '''
        String representation of the UCREL Doc instance:
        '''
        if self._sentence_indexes is None:
            header = 'UCREL Doc:'
        else:
            header = f'UCREL Doc ({len(self._sentence_indexes)} sentences):'
        # Only a preview is shown; a Doc with fewer than three tokens
        # reports the number of tokens it actually has.
        first_three_tokens = self.tokens[:3]
        token_lines = '\n'.join(str(token) for token in first_three_tokens)
        return f'{header}\nFirst {len(first_three_tokens)} tokens:\n{token_lines}'

    def __iter__(self) -> Iterable[UCREL_Token]:
        '''
        **returns**: Yields each token in `self.tokens`.
        '''
        yield from self.tokens

    def __len__(self) -> int:
        '''
        **returns**: The number of tokens in the Doc.
        '''
        return len(self.tokens)

    def __getitem__(self, index: Union[int, slice]
                    ) -> Union[UCREL_Token, List[UCREL_Token]]:
        '''
        1. **index**: The index of the token to return, or a slice of
           token indexes.

        **returns**: The token at the given index. When a slice is given
        a list of the tokens within that slice is returned.
        '''
        # Indexing is delegated to the underlying list, therefore negative
        # indexes and slices behave exactly like they do for a list.
        return self.tokens[index]

    def __eq__(self, other: Any) -> bool:
        '''
        Compare another instance with the current instance of this
        class.

        1. **other**: Another instance, if this instance is not of this
        class type it will raise a `NotImplementedError`.

        **returns** `True` if the two instances are the same based on
        the following attributes:

        1. text
        2. sentence_indexes
        3. tokens

        **raises NotImplementedError**: If the `other` instance is not of
        the same class type as `self`.
        '''
        # Reference:
        # https://stackoverflow.com/questions/1227121/compare-object-instances-for-equality-by-their-attributes
        if not isinstance(other, UCREL_Doc):
            error_msg = (f"Cannot compare this UCREL Doc {self}\n"
                         "With anything other than another UCREL Doc instance."
                         " The instance that is being compared is of type "
                         f"{type(other)}")
            raise NotImplementedError(error_msg)
        if self.text != other.text:
            return False
        if self._sentence_indexes != other._sentence_indexes:
            return False
        # The lengths are compared first so that pairing the tokens below
        # cannot silently ignore trailing tokens of the longer Doc.
        if len(self) != len(other):
            return False
        for token, other_token in zip(self, other):
            if token != other_token:
                return False
        return True

    @property
    def sentences(self) -> Iterable[List[UCREL_Token]]:
        '''
        **returns**: An iterable of all sentences in the text represented
        as a list of `UCREL_Token`s.

        **raises ValueError**: If the `sentence_indexes` parameter is not set
        at construction time. The error is raised as soon as the property
        is accessed, not when the returned iterable is first iterated.
        '''
        if self._sentence_indexes is None:
            error_msg = ('The `sentences` attribute cannot be accessed/created '
                         'without `sentence_indexes` at construction time')
            raise ValueError(error_msg)
        # A generator expression is returned, rather than making this
        # property itself a generator, so that the check above runs on
        # access while the sentences are still sliced lazily.
        return (self.tokens[start_index:end_index]
                for start_index, end_index in self._sentence_indexes)


In [None]:
show_doc(UCREL_Doc.__init__)

<h4 id="UCREL_Doc.__init__" class="doc_header"><code>UCREL_Doc.__init__</code><a href="__main__.py#L21" class="source_link" style="float:right">[source]</a></h4>

> <code>UCREL_Doc.__init__</code>(**`text`**:`str`, **`tokens`**:`List`\[[`UCREL_Token`](/ucrel-python-api/ucrel_token.html#UCREL_Token)\], **`sentence_indexes`**:`Optional`\[`List`\[`Tuple`\[`int`, `int`\]\]\]=*`None`*)

1. **text**: The text the Doc is representing.
2. **tokens**: List of [`UCREL_Token`](/ucrel-python-api/ucrel_token.html#UCREL_Token)s
3. **sentence_indexes**: A List of Tuples. Where each tuple 
   contains a start and an end token index representing 
   the start and end of the sentence. These are used to 
   create the `sentences` property. Can be accessed through 
   `self._sentence_indexes`. **Optional**

In [None]:
from ucrel_api.ucrel_token import UCREL_Token

# The tags are passed by keyword so that each one is stored on the intended
# attribute. NOTE(review): the rendered outputs of this notebook show tags
# that were passed positionally being displayed as `Lemma`/`POS tag` rather
# than `POS tag`/`USAS tag` -- confirm against the `UCREL_Token` signature.
DOC_TOKENS = [UCREL_Token('hello', pos_tag='UH', usas_tag='Z4'),
              UCREL_Token('how', pos_tag='RRQ', usas_tag='Z5'),
              UCREL_Token('are', pos_tag='VBR', usas_tag='A3+'),
              UCREL_Token('you', pos_tag='PPY', usas_tag='Z8mf'),
              UCREL_Token('.', pos_tag='.', usas_tag=None),
              UCREL_Token('I', pos_tag='PPIS1', usas_tag='Z8mf'),
              UCREL_Token('am', pos_tag='VBM', usas_tag='A3+'),
              UCREL_Token('great', pos_tag='JJ', usas_tag='A5.1+'),
              UCREL_Token('thanks', pos_tag='NN2', usas_tag='S1.2.4+'),
              UCREL_Token('.', pos_tag='.', usas_tag=None)]

# Two sentences of five tokens each: tokens 0-4 and tokens 5-9.
example_doc = UCREL_Doc(text='hello how are you. I am great thanks.',
                        tokens=DOC_TOKENS, sentence_indexes=[(0,5), (5,10)])

In [None]:
show_doc(UCREL_Doc.__repr__)

<h4 id="UCREL_Doc.__repr__" class="doc_header"><code>UCREL_Doc.__repr__</code><a href="__main__.py#L37" class="source_link" style="float:right">[source]</a></h4>

> <code>UCREL_Doc.__repr__</code>()

String representation of the UCREL Doc instance:

In [None]:
example_doc

UCREL Doc (2 sentences):
First 3 tokens:
UCREL Token: hello	POS tag: UH	USAS tag: Z4
UCREL Token: how	Lemma: RRQ	POS tag: Z5
UCREL Token: are	Lemma: VBR	POS tag: A3+

In [None]:
show_doc(UCREL_Doc.sentences)

<h4 id="UCREL_Doc.sentences" class="doc_header"><code>UCREL_Doc.sentences</code><a href="" class="source_link" style="float:right">[source]</a></h4>

**returns**: An iterable of all sentences in the text represented 
as a list of [`UCREL_Token`](/ucrel-python-api/ucrel_token.html#UCREL_Token)s. 

**raises ValueError**: If the `sentence_indexes` parameter is not set 
at contruction time.

In [None]:
# Print every sentence of the example Doc, one token per line.
for index, sentence in enumerate(example_doc.sentences):
    sentence_header = f'Sentence {index}:'
    print(sentence_header)
    for token in sentence:
        print(f'{token}')
    # Only the first sentence is followed by blank lines, to visually
    # separate it from the second sentence.
    is_first_sentence = index == 0
    if is_first_sentence:
        print('\n')

Sentence 0:
UCREL Token: hello	POS tag: UH	USAS tag: Z4
UCREL Token: how	Lemma: RRQ	POS tag: Z5
UCREL Token: are	Lemma: VBR	POS tag: A3+
UCREL Token: you	Lemma: PPY	POS tag: Z8mf
UCREL Token: .	Lemma: .


Sentence 1:
UCREL Token: I	Lemma: PPIS1	POS tag: Z8mf
UCREL Token: am	Lemma: VBM	POS tag: A3+
UCREL Token: great	Lemma: JJ	POS tag: A5.1+
UCREL Token: thanks	Lemma: NN2	POS tag: S1.2.4+
UCREL Token: .	Lemma: .


In [None]:
show_doc(UCREL_Doc.__iter__)

<h4 id="UCREL_Doc.__iter__" class="doc_header"><code>UCREL_Doc.__iter__</code><a href="__main__.py#L50" class="source_link" style="float:right">[source]</a></h4>

> <code>UCREL_Doc.__iter__</code>()

**returns**: Yields each token in `self.tokens`.

In [None]:
# Iterating over the Doc itself yields its tokens in order.
token_stream = iter(example_doc)
for index, token in enumerate(token_stream):
    print(f'{index} {token}')

0 UCREL Token: hello	POS tag: UH	USAS tag: Z4
1 UCREL Token: how	Lemma: RRQ	POS tag: Z5
2 UCREL Token: are	Lemma: VBR	POS tag: A3+
3 UCREL Token: you	Lemma: PPY	POS tag: Z8mf
4 UCREL Token: .	Lemma: .
5 UCREL Token: I	Lemma: PPIS1	POS tag: Z8mf
6 UCREL Token: am	Lemma: VBM	POS tag: A3+
7 UCREL Token: great	Lemma: JJ	POS tag: A5.1+
8 UCREL Token: thanks	Lemma: NN2	POS tag: S1.2.4+
9 UCREL Token: .	Lemma: .


In [None]:
show_doc(UCREL_Doc.__getitem__)

<h4 id="UCREL_Doc.__getitem__" class="doc_header"><code>UCREL_Doc.__getitem__</code><a href="__main__.py#L63" class="source_link" style="float:right">[source]</a></h4>

> <code>UCREL_Doc.__getitem__</code>(**`index`**:`int`)

1. **index**: The index of the token to return. 

**returns**: The token at the given index.

In [None]:
example_doc[-2]

UCREL Token: thanks	Lemma: NN2	POS tag: S1.2.4+

In [None]:
show_doc(UCREL_Doc.__len__)

<h4 id="UCREL_Doc.__len__" class="doc_header"><code>UCREL_Doc.__len__</code><a href="__main__.py#L57" class="source_link" style="float:right">[source]</a></h4>

> <code>UCREL_Doc.__len__</code>()

**returns**: The number of tokens in the Doc.

In [None]:
len(example_doc)

10

In [None]:
show_doc(UCREL_Doc.__eq__)

<h4 id="UCREL_Doc.__eq__" class="doc_header"><code>UCREL_Doc.__eq__</code><a href="__main__.py#L71" class="source_link" style="float:right">[source]</a></h4>

> <code>UCREL_Doc.__eq__</code>(**`other`**:`Any`)

Compare another instance with the current instance of this
class.

1. **other**: Another instance, if this instance is not of this
class type it will raise a `NotImplementedError`.

**returns** `True` if the two instances are the same based on
the following attributes:

1. text
2. sentence_indexes
3. tokens

**raises NotImplementedError**: If the `other` instance is not of 
the same class type as `self`.

In [None]:
example_text = 'hello how are you. I am great thanks.'
example_sentence_indexes = [(0,5), (5,10)]

# A Doc created from the same text, tokens, and sentence indexes is equal.
identical_doc = UCREL_Doc(text=example_text, tokens=DOC_TOKENS,
                          sentence_indexes=example_sentence_indexes)
assert example_doc == identical_doc

# Leaving out the sentence indexes makes the two Docs different.
example_without_sent_indexes = UCREL_Doc(text=example_text, tokens=DOC_TOKENS)
assert example_doc != example_without_sent_indexes

# Comparing with anything that is not a UCREL_Doc raises an error.
doc_as_dict = {'text': example_text, 'tokens': DOC_TOKENS,
               'sentence_indexes': example_sentence_indexes}
try:
    doc_as_dict == example_doc
except NotImplementedError:
    print('UCREL_Doc instances can only be compared '
          'with other UCREL_Doc instances:')


UCREL_Doc instances can only be compared with other UCREL_Doc instances:


In [None]:
# hide
import copy
import pytest

from ucrel_api.ucrel_doc import UCREL_Doc, UCREL_Token

# The tags are passed by keyword so that each one is stored on the intended
# attribute. NOTE(review): the rendered outputs of this notebook show tags
# that were passed positionally being displayed as `Lemma`/`POS tag` rather
# than `POS tag`/`USAS tag` -- confirm against the `UCREL_Token` signature.
DOC_TOKENS = [UCREL_Token('hello', pos_tag='UH', usas_tag='Z4'),
              UCREL_Token('how', pos_tag='RRQ', usas_tag='Z5'),
              UCREL_Token('are', pos_tag='VBR', usas_tag='A3+'),
              UCREL_Token('you', pos_tag='PPY', usas_tag='Z8mf')]

def test_ucrel_doc_repr() -> None:
    '''
    Checks the string representation of a `UCREL_Doc` with and without
    sentence indexes, and when the Doc has less than 3 tokens.
    '''
    def expected_repr(header: str, preview_tokens: list) -> str:
        # Mirrors the layout produced by the Doc: the header, the number
        # of previewed tokens, and then one token per line.
        token_lines = '\n'.join([str(token) for token in preview_tokens])
        return f'{header}\nFirst {len(preview_tokens)} tokens:\n' + token_lines

    doc_text = 'hello how are you'

    no_sentence_doc = UCREL_Doc(doc_text, tokens=DOC_TOKENS)
    assert str(no_sentence_doc) == expected_repr('UCREL Doc:', DOC_TOKENS[:3])

    sentence_doc = UCREL_Doc(doc_text, tokens=DOC_TOKENS,
                             sentence_indexes=[(0,1), (1,4)])
    assert str(sentence_doc) == expected_repr('UCREL Doc (2 sentences):',
                                              DOC_TOKENS[:3])

    # Test when the UCREL Doc has less than 3 tokens
    two_token_doc = UCREL_Doc('hello how', tokens=DOC_TOKENS[:2])
    assert str(two_token_doc) == expected_repr('UCREL Doc:', DOC_TOKENS[:2])

def test_ucrel_doc_iter() -> None:
    '''
    Checks that iterating over a `UCREL_Doc` yields every token it was
    created with, in the same order.
    '''
    minimum_doc = UCREL_Doc('hello how are you', tokens=DOC_TOKENS)
    yielded_tokens = [token for token in minimum_doc]
    # Without this check the test would pass if the Doc yielded no tokens,
    # or fewer tokens than it was created with.
    assert len(yielded_tokens) == len(DOC_TOKENS)
    for expected_token, token in zip(DOC_TOKENS, yielded_tokens):
        assert expected_token == token

def test_ucrel_doc_len() -> None:
    '''
    Checks that the length of a `UCREL_Doc` is its number of tokens,
    including for a Doc that has no tokens.
    '''
    docs_and_lengths = [
        (UCREL_Doc('',[]), 0),
        (UCREL_Doc('hello how are you', tokens=DOC_TOKENS), 4),
    ]
    for doc, expected_length in docs_and_lengths:
        assert len(doc) == expected_length

def test_ucrel_doc_getitem() -> None:
    '''
    Checks indexing a `UCREL_Doc` with an integer, a slice, an index that
    is out of range, and an index of the wrong type.
    '''
    doc = UCREL_Doc('hello how are you', tokens=DOC_TOKENS)

    first_token = doc[0]
    assert first_token == DOC_TOKENS[0]

    first_two_tokens = doc[:2]
    assert first_two_tokens == [DOC_TOKENS[0], DOC_TOKENS[1]]

    # The Doc has 4 tokens, so 4 is the first index that is out of range.
    with pytest.raises(IndexError):
        doc[4]

    # A string is not a valid index, even when it looks like a number.
    with pytest.raises(TypeError):
        doc['0']

def test_ucrel_doc_eq() -> None:
    '''
    Checks `UCREL_Doc` equality with respect to the text, the tokens, and
    the sentence indexes, and that comparing with an object that is not a
    `UCREL_Doc` raises a `NotImplementedError`.
    '''
    full_text = 'hello how are you'

    # Pass when just text and tokens
    reference_doc = UCREL_Doc(full_text, tokens=DOC_TOKENS)
    assert reference_doc == UCREL_Doc(full_text, tokens=DOC_TOKENS)

    # Difference based on both text and tokens
    shorter_doc = UCREL_Doc('hello how are', tokens=DOC_TOKENS[:-1])
    assert reference_doc != shorter_doc

    # Difference based on tokens and not text
    # The tokens are copied so that the shared DOC_TOKENS are not modified.
    altered_tokens = copy.deepcopy(DOC_TOKENS)
    altered_tokens[-1].pos_tag = 'DIFF'
    altered_token_doc = UCREL_Doc(full_text, tokens=altered_tokens)
    assert reference_doc != altered_token_doc

    # Difference based on token length
    fewer_token_doc = UCREL_Doc(full_text, tokens=DOC_TOKENS[:3])
    assert reference_doc != fewer_token_doc

    # Pass with text, tokens, and sentence indexes
    sentence_doc = UCREL_Doc(full_text, tokens=DOC_TOKENS,
                             sentence_indexes=[(0,1), (1,4)])
    same_sentence_doc = UCREL_Doc(full_text, tokens=DOC_TOKENS,
                                  sentence_indexes=[(0,1), (1,4)])
    assert sentence_doc == same_sentence_doc

    # Difference based on sentence indexes
    other_sentence_doc = UCREL_Doc(full_text, tokens=DOC_TOKENS,
                                   sentence_indexes=[(0,2), (2,4)])
    assert sentence_doc != other_sentence_doc

    # Comparing with something that is not a UCREL_Doc is an error
    with pytest.raises(NotImplementedError):
        sentence_doc == {'text': 'hello'}

def test_ucrel_doc_sentences() -> None:
    '''
    Checks that the `sentences` property of a `UCREL_Doc` yields the
    tokens of each sentence, that it can be used more than once on the
    same instance, and that a `ValueError` is raised when the Doc was
    created without sentence indexes.
    '''
    minimum_doc = UCREL_Doc('hello how are you', tokens=DOC_TOKENS,
                            sentence_indexes=[(0,1), (1,4)])
    sentence_1 = [DOC_TOKENS[0]]
    sentence_2 = [DOC_TOKENS[1], DOC_TOKENS[2], DOC_TOKENS[3]]
    true_sentences = [sentence_1, sentence_2]

    # Run twice to ensure the property can be used more than once for the
    # same instance.
    for _ in range(2):
        count = 0
        for index, sentence in enumerate(minimum_doc.sentences):
            count += 1
            assert sentence == true_sentences[index]
        assert count == 2

    # The Doc is created outside of the `raises` block so that only an
    # error coming from the `sentences` property can satisfy the check.
    doc_without_sentences = UCREL_Doc('hello how are you', tokens=DOC_TOKENS)
    with pytest.raises(ValueError):
        for value in doc_without_sentences.sentences:
            pass
    


# Run every test of this notebook, in the same order as before.
for test_function in (test_ucrel_doc_repr,
                      test_ucrel_doc_iter,
                      test_ucrel_doc_len,
                      test_ucrel_doc_getitem,
                      test_ucrel_doc_sentences,
                      test_ucrel_doc_eq):
    test_function()