In [1]:
import nltk

# Read the whole article into one string; the context manager closes the
# handle as soon as the read finishes.
with open('desert.txt', 'r') as desert_file:
    desert_text = desert_file.read()

In [2]:
# Split the raw article text into word and punctuation tokens.
desert_tokens = nltk.word_tokenize(desert_text)

# Sanity check: total token count on the first line, then a preview of the
# first 100 tokens on the next line.
print(len(desert_tokens), desert_tokens[:100], sep='\n')

1364
['Three', 'Calgarians', 'have', 'found', 'a', 'rather', 'unusual', 'way', 'of', 'leaving', 'snow', 'and', 'ice', 'behind', '.', 'They', 'set', 'off', 'this', 'week', 'on', 'foot', 'and', 'by', 'camel', 'on', 'a', 'grueling', 'trek', 'across', 'the', 'burning', 'Arabian', 'desert', '.', 'When', 'they', 'were', 'still', 'in', 'Canada', ',', 'planning', 'their', 'trip', ',', 'they', 'expected', 'they', 'would', 'feel', 'lonely', '.', 'But', 'after', 'two', 'days', 'into', 'the', '1,200', 'kilometre', 'journey', ',', 'the', 'caravan', 'has', 'won', 'celebrity', 'status', 'among', 'the', 'native', 'Bedouin', 'people', 'and', 'government', 'officials', 'of', 'Oman', '.', 'Some', 'have', 'excitedly', 'tagged', 'along', ',', 'says', 'expedition', 'leader', 'Jamie', 'Clarke', '.', 'Mr.', 'Clarke', 'is', 'making', 'the', 'trek', 'with', 'his']


# Porter stemmer

In [3]:
# Run every token through NLTK's Porter stemmer, keeping token order.
porter = nltk.PorterStemmer()

stemmed_p = list(map(porter.stem, desert_tokens))

# Preview the first 100 stems.
print(stemmed_p[:100])

['three', 'calgarian', 'have', 'found', 'a', 'rather', 'unusu', 'way', 'of', 'leav', 'snow', 'and', 'ice', 'behind', '.', 'they', 'set', 'off', 'thi', 'week', 'on', 'foot', 'and', 'by', 'camel', 'on', 'a', 'gruel', 'trek', 'across', 'the', 'burn', 'arabian', 'desert', '.', 'when', 'they', 'were', 'still', 'in', 'canada', ',', 'plan', 'their', 'trip', ',', 'they', 'expect', 'they', 'would', 'feel', 'lone', '.', 'but', 'after', 'two', 'day', 'into', 'the', '1,200', 'kilometr', 'journey', ',', 'the', 'caravan', 'ha', 'won', 'celebr', 'statu', 'among', 'the', 'nativ', 'bedouin', 'peopl', 'and', 'govern', 'offici', 'of', 'oman', '.', 'some', 'have', 'excitedli', 'tag', 'along', ',', 'say', 'expedit', 'leader', 'jami', 'clark', '.', 'mr.', 'clark', 'is', 'make', 'the', 'trek', 'with', 'hi']


# Lancaster stemmer

In [4]:
# Run the same tokens through NLTK's Lancaster stemmer, keeping token order,
# so the result lines up index-for-index with the token list.
lancaster = nltk.LancasterStemmer()

stemmed_l = list(map(lancaster.stem, desert_tokens))

# Preview the first 100 stems.
print(stemmed_l[:100])

['three', 'calg', 'hav', 'found', 'a', 'rath', 'unus', 'way', 'of', 'leav', 'snow', 'and', 'ic', 'behind', '.', 'they', 'set', 'off', 'thi', 'week', 'on', 'foot', 'and', 'by', 'camel', 'on', 'a', 'gruel', 'trek', 'across', 'the', 'burn', 'arab', 'desert', '.', 'when', 'they', 'wer', 'stil', 'in', 'canad', ',', 'plan', 'their', 'trip', ',', 'they', 'expect', 'they', 'would', 'feel', 'lon', '.', 'but', 'aft', 'two', 'day', 'into', 'the', '1,200', 'kilomet', 'journey', ',', 'the', 'carav', 'has', 'won', 'celebr', 'stat', 'among', 'the', 'nat', 'bedouin', 'peopl', 'and', 'govern', 'off', 'of', 'om', '.', 'som', 'hav', 'excit', 'tag', 'along', ',', 'say', 'expedit', 'lead', 'jamy', 'clark', '.', 'mr.', 'clark', 'is', 'mak', 'the', 'trek', 'with', 'his']


In [6]:
# Take the Porter stem at index 1 (the second token); the bare name on the
# last line makes the notebook display its value.
porter_word = stemmed_p[1]
porter_word

'calgarian'

In [7]:
# Take the Lancaster stem at index 1 (the second token); the bare name on the
# last line makes the notebook display its value.
lancaster_word = stemmed_l[1]
lancaster_word

'calg'

# REGEX

In [24]:
# Verbose-mode tokenizer pattern for nltk.regexp_tokenize.
#
# Alternatives are tried left to right at each position, so the specific forms
# (abbreviations, titles, contractions, numbers) are listed before the generic
# word rule. All groups are non-capturing so that each match is returned as a
# whole token.
#
# Fixes relative to the previous version of this pattern:
#   * contractions used `[A-za-z]`; the `A-z` range also admits `[ \ ] ^ _` and
#     the backtick. It is now `[A-Za-z]`.
#   * the punctuation class contained `:-_`, which the regex engine reads as a
#     *range* (':' through '_'): a literal '-' was never matched while
#     '<', '=', '>', '@', '^' and '\' were matched by accident. The hyphen is
#     now the last character of the class, where it is always literal.
#   * the punctuation class contained mis-encoded bytes. NOTE(review): these
#     looked like mis-decoded curly quotes and are restored here as explicit
#     \uXXXX escapes (U+2018/2019/201C/201D) so the pattern no longer depends
#     on the file's encoding -- confirm against the original notebook.
#
# Known limitation (unchanged): the title rule matches any capitalised word
# immediately followed by '.', so a capitalised word ending a sentence keeps
# its full stop (e.g. 'White.').
pattern = r'''(?x)                  # set flag to allow verbose regexps
    (?:[A-Z]\.)+                    # abbreviations, e.g. U.S.A.
  | (?:[A-Z][a-z]+\.)+              # titles, e.g. Mr.
  | [A-Za-z]+\'[a-z]+               # contractions, e.g. wasn't
  | \$?\d+(?:\.\d+)?%?              # currency and percentages, $12.40, 50%
  | \w+(?:-\w+)*                    # words with internal hyphens
  | \.\.\.                          # ellipsis
  | [][.,;\u2018\u2019\u201c\u201d?():_%#-]   # separate tokens; includes ], [
'''

shorttext = "Mr. Black and Mrs. Brown couldn't attend the lecture by Dr. Gray, but Gov. White wasn't there in the U.S.A."

# Demonstrate the pattern on a sentence that exercises titles, contractions
# and a dotted abbreviation.
print(nltk.regexp_tokenize(shorttext, pattern))

['Mr.', 'Black', 'and', 'Mrs.', 'Brown', "couldn't", 'attend', 'the', 'lecture', 'by', 'Dr.', 'Gray', ',', 'but', 'Gov.', 'White', "wasn't", 'there', 'in', 'the', 'U.S.A.']
