In [2]:
import nltk, re, pprint
from nltk import word_tokenize

### Regular Expressions for Detecting Word Patterns

In [3]:
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

In [4]:
# find words ending with 'ed'
# research(p, s) # try to find pattern p in string s

# $ matches end of word
[w for w in wordlist if re.search('ed$', w)][:10]

['abaissed',
 'abandoned',
 'abased',
 'abashed',
 'abatised',
 'abed',
 'aborted',
 'abridged',
 'abscessed',
 'absconded']

In [7]:
# . is a wildcard symbol
# look for 8-letter word with j as 3rd letter and t as 6th letter
[w for w in wordlist if re.search('^..j..t..$', w)][:10]
# ^ caret matches start of string, without it, can have any characters before 
# the pattern

['abjectly',
 'adjuster',
 'dejected',
 'dejectly',
 'injector',
 'majestic',
 'objectee',
 'objector',
 'rejecter',
 'rejector']

? symbol specifies that previous character is optional

`re.search('^e-?mail$', w)` will return both e-mail and email


### Ranges and Closures

In [8]:
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]

['gold', 'golf', 'hold', 'hole']

\+ means "one or more instances of the preceding item

\* means "zero or more instances of the preceding item"

\+ and \* symbols are sometimes referred to as Kleene closures, or simply closures.

In [11]:
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
[w for w in chat_words if re.search('^m+i+n+e+$', w)]

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
 'miiiiiinnnnnnnnnneeeeeeee',
 'mine',
 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']

In [12]:
[w for w in chat_words if re.search('^m*i*n*e*$', w)][:10]

['',
 'e',
 'i',
 'in',
 'm',
 'me',
 'meeeeeeeeeeeee',
 'mi',
 'miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
 'miiiiiinnnnnnnnnneeeeeeee']

The ^ operator has another function when it appears as the first character inside square brackets. For example «[^aeiouAEIOU]» matches any character other than a vowel. We can search the NPS Chat Corpus for words that are made up entirely of non-vowel characters using «^[^aeiouAEIOU]+$» to find items like 'grrrr', 'cyb3r'

In [13]:
wsj = sorted(set(nltk.corpus.treebank.words()))

In [22]:
# \ 
# \ means exact match
# \. only matches period
[w for w in wsj if re.search('^[0-9]+\.[0-9]+$', w)][:10]

['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5']

In [15]:
[w for w in wsj if re.search('^[A-Z]+\$$', w)]

['C$', 'US$']

In [17]:
# {}
# The braced expressions, like {3,5}, specify the number of repeats of the previous item.
[w for w in wsj if re.search('^[0-9]{4}$', w)][:10]

['1614',
 '1637',
 '1787',
 '1901',
 '1903',
 '1917',
 '1925',
 '1929',
 '1933',
 '1934']

In [18]:
[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)][:10]

['10-day',
 '10-lap',
 '10-year',
 '100-share',
 '12-point',
 '12-year',
 '14-hour',
 '15-day',
 '150-point',
 '190-point']

In [19]:
[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]

['black-and-white',
 'bread-and-butter',
 'father-in-law',
 'machine-gun-toting',
 'savings-and-loan']

In [28]:
# |
# The pipe character indicates a choice between the material on its left or its right. 
# ()
#  Parentheses indicate the scope of an operator: they can be used together with the pipe
[w for w in wsj if re.search('(ed|ing)$', w)][:10]

['62%-owned',
 'Absorbed',
 'According',
 'Adopting',
 'Advanced',
 'Advancing',
 'Alfred',
 'Allied',
 'Annualized',
 'Anything']

In [29]:
# no parentheses
len([w for w in wsj if re.search('ed|ing$', w)]) > len([w for w in wsj if re.search('(ed|ing)$', w)])

True

### Regex Applications

In [37]:
# re.findall() finds all non-overlapping matches
word = 'supercalifragilisticexpialidocious'
len(re.findall('[aeiou]', word))

16

In general, when using regular expressions containing backslash, we should instruct the interpreter not to look inside the string at all, but simply to pass it directly to the re library for processing. We do this by prefixing the string with the letter r, to indicate that it is a raw string. For example, the raw string r'\band\b' contains two \b symbols that are interpreted by the re library as matching word boundaries instead of backspace characters. If you get into the habit of using r'...' for regular expressions — as we will do from now on — you will avoid having to think about these complications.

In [43]:
#  sequences of two or more vowels in some text, and determine their relative frequency
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj
                for vs in re.findall('r[aeiou]{2,}', word))

In [44]:
fd.most_common(10)

[('rea', 108),
 ('rie', 63),
 ('ree', 57),
 ('ria', 46),
 ('rou', 41),
 ('rai', 39),
 ('rio', 22),
 ('roa', 19),
 ('roo', 18),
 ('rei', 11)]

In [47]:
# strip out hyphens
[int(n) for n in re.findall('[0-9]+', '2009-12-31')]

[2009, 12, 31]

In [48]:
# match initial vowel sequences, final vowel sequences, and consonants
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'
def compress(word):
    pieces = re.findall(regexp, word)
    return ''.join(pieces)

In [50]:
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and
of the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn
of frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn
rghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,
and the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and


### Finding Word Stems

When we use a web search engine, we usually don't mind (or even notice) if the words in the document differ from our search terms in having different endings. A query for laptops finds documents containing laptop and vice versa. Indeed, laptop and laptops are just two forms of the same dictionary word (or lemma). For some language processing tasks we want to ignore word endings, and just deal with word stems.

In [52]:
def stem(word):
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word

In [56]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

[('process', 'ing')]

In [54]:
stem('processing')

'process'

In [59]:
# greedy start
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('processe', 's')]

In [60]:
# correct implementation
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('process', 'es')]

Searching tokenized text