## Tokenization

In [10]:
import nltk, re, pprint
from nltk import word_tokenize

In [25]:
# how to use word_tokenize
print("""
Tokenization: breaking up the string into wods and punctuations.
""")

# Load data through the url
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
# get the whole data as a string
data = response.read().decode('utf8')
# tokenization
token = word_tokenize(data)
print(token[:10])



Tokenization: breaking up the string into wods and punctuations.

['\ufeffThe', 'Project', 'Gutenberg', 'eBook', 'of', 'Crime', 'and', 'Punishment', ',', 'by']


In [26]:
# create nltk text fom tokens
text = nltk.Text(token)
text.collocations()

Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; young man; Nikodim Fomitch; Project Gutenberg; Ilya
Petrovitch; Andrey Semyonovitch; Hay Market; Dmitri Prokofitch; Good
heavens


## Regular Expressions

In [31]:
import re
wordlist = [w for w in nltk.corpus.words.words('en') if w.islower()]

In [53]:
# ended in "ed"
print([w for w in wordlist if re.search('ed$', w)][:5])
print('')

# ended in "ed" or "ing"
print([w for w in wordlist if re.search('(ed|ing)$', w)][:5])
print('')

# 8-letter words whose 3rd and 5th character is j and t, respectively.
print([w for w in wordlist if re.search('^..j..t..$', w)][:5])
print()

# matching each other with certain ranges of alphabets
print([w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)][:5])
print('')

# including 'a' and 'b' at least one; now matter how many charactors are
print([w for w in wordlist if re.search('a+b+', w)])
print('')


['abaissed', 'abandoned', 'abased', 'abashed', 'abatised']

['abaissed', 'abandoned', 'abased', 'abashed', 'abatised']

['abjectly', 'adjuster', 'dejected', 'dejectly', 'injector']

['gold', 'golf', 'hold', 'hole']

['aba', 'abac', 'abaca', 'abacate', 'abacay', 'abacinate', 'abacination', 'abaciscus', 'abacist', 'aback', 'abactinal', 'abactinally', 'abaction', 'abactor', 'abaculus', 'abacus', 'abaff', 'abaft', 'abaisance', 'abaiser', 'abaissed', 'abalienate', 'abalienation', 'abalone', 'abampere', 'abandon', 'abandonable', 'abandoned', 'abandonedly', 'abandonee', 'abandoner', 'abandonment', 'abaptiston', 'abarthrosis', 'abarticular', 'abarticulation', 'abas', 'abase', 'abased', 'abasedly', 'abasedness', 'abasement', 'abaser', 'abash', 'abashed', 'abashedly', 'abashedness', 'abashless', 'abashlessly', 'abashment', 'abasia', 'abasic', 'abask', 'abastardize', 'abatable', 'abate', 'abatement', 'abater', 'abatis', 'abatised', 'abaton', 'abator', 'abattoir', 'abature', 'abave', 'abaxial', 'a

In [58]:
wsj = sorted(set(nltk.corpus.treebank.words()))

# get float variables
#  first char is at least one of number between 0 ~ 9
#  after dot ".", char is at least one of number(s) between 0 ~ 9
print([w for w in wsj if re.search('^[0-9]+\.[0-9]+$', w)])
print('')

# 4 latters of number
print([w for w in wsj if re.search('^[0-9]{4}$', w)])
print("")

# connected latters by '-'
#  1st part is >= 5 latters, 2nd part is 2~3 latters, 3rd part is >= 6 latters
print([w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)])
print('')



['0.0085', '0.05', '0.1', '0.16', '0.2', '0.25', '0.28', '0.3', '0.4', '0.5', '0.50', '0.54', '0.56', '0.60', '0.7', '0.82', '0.84', '0.9', '0.95', '0.99', '1.01', '1.1', '1.125', '1.14', '1.1650', '1.17', '1.18', '1.19', '1.2', '1.20', '1.24', '1.25', '1.26', '1.28', '1.35', '1.39', '1.4', '1.457', '1.46', '1.49', '1.5', '1.50', '1.55', '1.56', '1.5755', '1.5805', '1.6', '1.61', '1.637', '1.64', '1.65', '1.7', '1.75', '1.76', '1.8', '1.82', '1.8415', '1.85', '1.8500', '1.9', '1.916', '1.92', '10.19', '10.2', '10.5', '107.03', '107.9', '109.73', '11.10', '11.5', '11.57', '11.6', '11.72', '11.95', '112.9', '113.2', '116.3', '116.4', '116.7', '116.9', '118.6', '12.09', '12.5', '12.52', '12.68', '12.7', '12.82', '12.97', '120.7', '1206.26', '121.6', '126.1', '126.15', '127.03', '129.91', '13.1', '13.15', '13.5', '13.50', '13.625', '13.65', '13.73', '13.8', '13.90', '130.6', '130.7', '131.01', '132.9', '133.7', '133.8', '14.00', '14.13', '14.26', '14.28', '14.43', '14.5', '14.53', '14.54',

In [63]:
print(len([w for w in wordlist if re.search('[aeiou]{2,}', w)]))

print(len([w for w in wordlist if re.findall(r'[aeiou]{2,}', w)]))

78281
78281
