# **Regular Languages**

This Jupyter Notebook is intended to showcase the use of the `pyfoma` finite state library. Basic information about pyfoma can be found in its [documentation](https://github.com/mhulden/pyfoma/blob/main/README.md) and the description of its [regular expression metalanguage](https://github.com/mhulden/pyfoma/blob/main/docs/RegularExpressionCompiler.ipynb).

In [1]:
%pip install -q pyfoma ipytest

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pyfoma
from pyfoma import FST
from IPython import get_ipython

In [3]:
import pytest
def _in_notebook():
    """Return True when an IPython/Jupyter shell is active.

    `get_ipython` imported from IPython returns None when no shell is running
    instead of raising, so the result has to be compared against None. The
    NameError guard covers environments where the name is not defined at all.
    """
    try:
        return get_ipython() is not None
    except NameError:
        return False


if _in_notebook():
    # Inside a notebook: run the pytest-style tests of each cell via ipytest.
    import ipytest
    ipytest.autoconfig()

    def init_test():
        """Forget tests defined by earlier cells so only the current cell's tests run."""
        ipytest.clean()

    def run_test():
        """Run the tests currently defined in the notebook namespace."""
        ipytest.run()
else:
    # Outside a notebook (e.g. the file is collected by pytest directly):
    # the helpers become no-ops and the test runner does the work itself.
    def init_test():
        """No-op outside of IPython."""
        pass

    def run_test():
        """No-op outside of IPython."""
        pass

---

## Dates

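The following regular expression accepts only dates of the form `MM/DD/YYYY`. The leading zero of the month and of the day is optional (e.g. `2/3/2023`), and the year must have exactly four digits.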

In [4]:
# Acceptor for dates: month 1-12 and day 1-31 (leading zero optional),
# followed by a four-digit year, separated by slashes.
date = FST.re(r"([1-9]|0[1-9]|1[0-2])\/([1-9]|0[1-9]|[12][0-9]|3[01])\/([0-9][0-9][0-9][0-9])")

def check_date(text):
    """Return True when `date` produces exactly one output for `text`, else False."""
    outputs = list(date.generate(text))
    return len(outputs) == 1

In [5]:
# Rejected: the year has only two digits, but the pattern requires four.
check_date('1/14/23')

False

In [6]:
# Accepted: zero-padded month and day with a four-digit year.
check_date('02/07/2024')

True

## Testing

In [7]:
init_test()

# Strings that the date pattern has to accept.
YES_DATES = ('2/3/2023', '12/14/1999', '11/4/2020', '02/03/2000')
# Strings that the date pattern has to reject: leading space, month 22, day 89,
# five-digit year, trailing slash, one-digit year.
NO_DATES = (' 2/3/2023', '22/1/2023', '5/89/1874', '2/1/20230', '2/1/2023/', '2/1/1')

@pytest.mark.parametrize(
    'text,isDate',
    [(sample, expected)
     for expected, samples in ((True, YES_DATES), (False, NO_DATES))
     for sample in samples],
)
def test_date(text, isDate):
    """check_date must classify every sample as listed above."""
    assert check_date(text) == isDate

run_test()

[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                                                   [100%][0m
[32m[32m[1m10 passed[0m[32m in 0.08s[0m[0m


---

## Numbers

Transducer that maps the integers 1–99 to English (e.g. "12" to "twelve", "46" to "forty six")

In [8]:
number_defs = dict()

# one digit numbers
number_defs['s1'] = FST.re(r"1:(one) | 2:(two) | 3:(three) | 4:(four) | 5:(five) | 6:(six) | 7:(seven) | 8:(eight) | 9:(nine)", number_defs)

# irregular teens
number_defs['s2'] = FST.re(r"(10):(ten) | (11):(eleven) | (12):(twelve) | (13):(thirteen) | (18):(eighteen)", number_defs)

# regular teens
number_defs['s3'] = FST.re(r"(1[4-7,9]) @ (1:'' $s1 '':(teen))", number_defs)

# tens
number_defs['s4'] = FST.re(r"2:(twenty) | 3:(thirty) | 4:(forty) | 5:(fifty) | 6:(sixty) | 7:(seventy) | 8:(eighty) | 9:(ninety)", number_defs)

# double digit numbers
number_defs['s5'] = FST.re(r"([2-9]0) @ ($s4 '':'' 0:'')", number_defs)
number_defs['s6'] = FST.re(r"([2-9][1-9]) @ ($s4 '':' ' ([1-9] @ $s1))", number_defs)

number = FST.re(r"$s1 | $s2 | $s3 | $s5 | $s6", number_defs)

In [9]:
list(number.generate('6'))

['six']

In [10]:
list(number.analyze('sixteen'))

['16']

Check your results:

In [11]:
init_test()

PAIRS = [('1','one'),('2','two'),('3','three'),('4','four'),('5','five'),
    ('6','six'),('7','seven'),('8','eight'),('9','nine'),('10','ten'),
    ('11','eleven'),('12','twelve'),('13','thirteen'),('14','fourteen'),
    ('15','fiveteen'),('16','sixteen'),('17','seventeen'),('18','eighteen'),
    ('19','nineteen'),('20','twenty'),('21','twenty one'),('22','twenty two'),
    ('23','twenty three'),('24','twenty four'),('25','twenty five'),
    ('26','twenty six'),('27','twenty seven'),('28','twenty eight'),
    ('29','twenty nine'),('30','thirty'),('31','thirty one'),('32','thirty two'),
    ('33','thirty three'),('34','thirty four'),('35','thirty five'),
    ('36','thirty six'),('37','thirty seven'),('38','thirty eight'),
    ('39','thirty nine'),('40','forty'),('41','forty one'),('42','forty two'),
    ('43','forty three'),('44','forty four'),('45','forty five'),
    ('46','forty six'),('47','forty seven'),('48','forty eight'),
    ('49','forty nine'),('50','fifty'),('51','fifty one'),('52','fifty two'),
    ('53','fifty three'),('54','fifty four'),('55','fifty five'),
    ('56','fifty six'),('57','fifty seven'),('58','fifty eight'),
    ('59','fifty nine'),('60','sixty'),('61','sixty one'),('62','sixty two'),
    ('63','sixty three'),('64','sixty four'),('65','sixty five'),
    ('66','sixty six'),('67','sixty seven'),('68','sixty eight'),
    ('69','sixty nine'),('70','seventy'),('71','seventy one'),
    ('72','seventy two'),('73','seventy three'),('74','seventy four'),
    ('75','seventy five'),('76','seventy six'),('77','seventy seven'),
    ('78','seventy eight'),('79','seventy nine'),('80','eighty'),
    ('81','eighty one'),('82','eighty two'),('83','eighty three'),
    ('84','eighty four'),('85','eighty five'),('86','eighty six'),
    ('87','eighty seven'),('88','eighty eight'),('89','eighty nine'),
    ('90','ninety'),('91','ninety one'),('92','ninety two'),
    ('93','ninety three'),('94','ninety four'),('95','ninety five'),
    ('96','ninety six'),('97','ninety seven'),('98','ninety eight'),
    ('99','ninety nine')]

@pytest.mark.parametrize('digits,text', PAIRS)
def test_number(digits,text):

    assert list(number.generate(digits)) == [text]
    assert list(number.analyze(text)) == [digits]

run_test()

[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m [ 92%]
[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[3

---

## Tokenization

The following cell defines a regular expression for a simple tokenizer.

It divides tokens up at spaces with some exceptions:
- punctuation marks `,!?` are tokens by themselves.
- the punctuation marks ` `` `  and `''` are single tokens
- the contractions `n't`, `'ve`, `'ll`, `'re`, and `'s` are separate tokens
- numbers are separate tokens, where:
    - a number may start with `$` or end with `%`
    - a number may start with or contain a comma, but may not end with one

**Note**: English tokenizers also usually have to worry about periods (`.`), which can be used to mark abbreviations, as decimal points, or to end a sentence (among other things). Unfortunately, pyfoma has some weird bugs in the way it handles periods, so we'll just ignore them.


In [12]:
# Rewrite rules of the tokenizer, keyed by the names used in the composition below.
# NOTE(review): no rule separates a lone single quote from a word; the recorded
# test run fails on "The word 'very' is very over-used" for that reason.
tok_patterns = {
    # put a space on both sides of !, ?, - and the opening double quote ``
    'punct': FST.re(r"$^rewrite('':' ' ([!?-] | ``) '':' ')"),
    # put a space in front of every comma
    'punct1': FST.re(r"$^rewrite('':' ' [,] '':'')"),
    # put a space on both sides of the contractions n't, 'll, 're and 's
    'contract': FST.re(r"$^rewrite('':' ' (n\'t|\'ll|\'re|\'s) '':' ')"),
    # put a space in front of the contraction 've
    'contract1': FST.re(r"$^rewrite('':' ' (\'ve) '':'')"),
    # remove the space in front of a comma that is directly followed by digits
    # (an optional $ may precede and an optional % may follow), which re-joins
    # numbers such as 120,000 after the comma rule has split them
    'numbers': FST.re(r"$^rewrite(' ':'' [$]?([,][0-9]+)[%]? '':'')"),
}

# apply the rules one after another, in this order
tokenizer = FST.re("$punct @ $punct1 @ $contract @ $contract1 @ $numbers", tok_patterns)

def tokenize(s):
    """Split `s` into tokens.

    Runs `s` through `tokenizer` and splits the single resulting string on
    whitespace. Returns None when the transducer does not yield exactly one
    output.
    """
    outputs = list(tokenizer.generate(s))
    if len(outputs) != 1:
        return None
    return outputs[0].split()

In [13]:
# The contraction n't and the final ? come out as separate tokens.
tokenize("Don't you love transducers?")

['Do', "n't", 'you', 'love', 'transducers', '?']

In [14]:
init_test()

# (input sentence, expected token list) pairs for the tokenizer.
TEST_EXAMPLES = (
    # sentence-final punctuation
    ('This is a test!', ['This','is','a','test','!']),
    ('Is this a test?', ['Is','this','a','test','?']),
    # contraction
    ("I don't think this is a test",
        ['I', 'do', "n't", 'think', 'this', 'is', 'a', 'test']),
    # non-ASCII text is split on spaces only
    ("Thủy phi cơ của tôi là đầy đủ của lươn",
        ['Thủy', 'phi', 'cơ', 'của', 'tôi', 'là', 'đầy', 'đủ', 'của', 'lươn']),
    # double quotes written as `` and ''
    ("Is it legal to shout ``Fire!'' in a crowded theater?",
        ['Is', 'it', 'legal', 'to', 'shout', "``", 'Fire', '!', "''", 'in', 'a', 'crowded', 'theater','?']),
    # single quotes and hyphen
    ("The word 'very' is very over-used",
        ['The', 'word', "'", 'very', "'", 'is', 'very', 'over', '-', 'used']),
    # stacked contractions
    ("I don't think we'll've been there yet",
        ['I', 'do', "n't", 'think', 'we', "'ll", "'ve", 'been', 'there', 'yet']),
    # comma as punctuation
    ("Give me 12 apples, please",
        ['Give', 'me', '12', 'apples', ',', 'please']),
    # numbers with % and $
    ("A 20% tip on a $30 tab is 6 dollars",
        ['A', '20%', 'tip', 'on', 'a', '$30', 'tab', 'is', '6', 'dollars']),
    # comma inside a number versus comma after a number
    ("They're going to pay us 10% of $120,000 by Jun 4, 2021",
        ['They', "'re", 'going', 'to', 'pay', 'us', '10%', 'of', '$120,000', 'by', 'Jun', '4', ',', '2021']),
)

@pytest.mark.parametrize('text,toks', TEST_EXAMPLES)
def test_tokenizer(text, toks):
    """tokenize must return exactly the expected token list for each sentence."""
    assert tokenize(text) == toks

run_test()

[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[31mF[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[31m                                                                                   [100%][0m
[31m[1m_____________________ test_tokenizer[The word 'very' is very over-used-toks5] ______________________[0m

text = "The word 'very' is very over-used", toks = ['The', 'word', "'", 'very', "'", 'is', ...]

    [37m@pytest[39;49;00m.mark.parametrize([33m'[39;49;00m[33mtext,toks[39;49;00m[33m'[39;49;00m, TEST_EXAMPLES)[90m[39;49;00m
    [94mdef[39;49;00m [92mtest_tokenizer[39;49;00m(text, toks):[90m[39;49;00m
>       [94massert[39;49;00m tokenize(text) == toks[90m[39;49;00m
[1m[31mE       assert ['The', 'word..., 'over', ...] == ['The', 'word...'", 'is', ...][0m
[1m[31mE         At index 2 diff: "'very'" != "'"[0m
[1m[31mE         Right contains 2 more items, first extra item: '-'[0m
[1m[31mE         Use -v to get more diff[0m

[1m[31m/var/folders/zp/_d