In [5]:
import nltk
from nltk.tokenize import word_tokenize
# Fetch the 'punkt' tokenizer data once; later cells call word_tokenize.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Tokenization**

In [7]:
# Split a sentence into word-level tokens; the bare last expression displays the list.
sentence = "books are on the table"
tokens = word_tokenize(sentence)
tokens

['books', 'are', 'on', 'the', 'table']

**StopWords**

In [18]:
from nltk.corpus import stopwords
nltk.download('stopwords')

sentence = "Machine learning is so cool"
# Keep the stop-word set under its own name. Rebinding `stopwords` to the set
# would shadow the corpus reader imported above, and re-running this cell
# would then fail with AttributeError on `stopwords.words`.
english_stopwords = set(stopwords.words('english'))

word_tokens = word_tokenize(sentence)
# NLTK's English stop-word list is lowercase, so compare on the lowercased
# token (otherwise a capitalised "Is"/"The" would slip through) while keeping
# the original casing of the tokens that survive.
filtered_sentence = [w for w in word_tokens if w.lower() not in english_stopwords]

filtered_sentence

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['Machine', 'learning', 'cool']

**Stemming**

In [26]:
from nltk.stem import PorterStemmer

# Tokenize first; the stemmer is applied to one word at a time.
sentence = "Machine learning feels so cool"
sentence_tokens = word_tokenize(sentence)

# Porter stemming strips suffixes and lowercases, so the results need not be
# dictionary words (e.g. 'Machine' -> 'machin').
ps = PorterStemmer()
stemmed_sentence = list(map(ps.stem, sentence_tokens))
stemmed_sentence

['machin', 'learn', 'feel', 'so', 'cool']

**Lemmatizing**

In [30]:
from nltk.stem import WordNetLemmatizer
# WordNetLemmatizer looks words up in the WordNet corpus, so 'wordnet' must be
# downloaded as well as 'omw-1.4'; without it a fresh environment raises
# LookupError on the first lemmatize() call.
nltk.download('wordnet')
nltk.download('omw-1.4')

wnl = WordNetLemmatizer()
# `sentence_tokens` is the token list built in the Stemming cell.
# lemmatize() is called without a POS argument, so words are treated as nouns:
# 'feels' becomes 'feel' while 'learning' is left unchanged.
lemmatized_sentence = [wnl.lemmatize(s) for s in sentence_tokens]
lemmatized_sentence

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['Machine', 'learning', 'feel', 'so', 'cool']

**POS Tagging**

In [45]:
from nltk import pos_tag
# Model data used by pos_tag.
nltk.download('averaged_perceptron_tagger')
# NOTE: this rebinds `tokens` from the Tokenization cell.
tokens = word_tokenize("And now for something completely different")
# pos_tag returns (token, tag) pairs using Penn Treebank-style tags (e.g. CC, RB, JJ).
tokens_tags = pos_tag(tokens)
tokens_tags

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('And', 'CC'),
 ('now', 'RB'),
 ('for', 'IN'),
 ('something', 'NN'),
 ('completely', 'RB'),
 ('different', 'JJ')]

**Brown Automatic Tagging**

In [57]:
from nltk.corpus import brown
nltk.download('brown')

# Gold-tagged and plain views of the same 'news' sentences.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# Sentence counts of the plain and tagged views, one per line.
print(len(brown_sents), len(brown_tagged_sents), sep='\n')

# First five tokens of the first sentence: untagged, then tagged.
print(brown_sents[0][:5], brown_tagged_sents[0][:5], sep='\n')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\max\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


4623
4623
['The', 'Fulton', 'County', 'Grand', 'Jury']
[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL')]


**Automatic Tagging (Default Tagger)**

In [69]:
# Gather every tag in the news category, then take the most frequent one;
# it becomes the constant guess of the default tagger.
tags = [tag for _, tag in brown.tagged_words(categories='news')]
highest_freq_tag = nltk.FreqDist(tags).max()
highest_freq_tag

'NN'

In [67]:
# Baseline tagger: assigns the single most frequent news tag to every token.
default_tagger = nltk.DefaultTagger(highest_freq_tag)

raw = "I do not like green eggs and ham, I do not like them Sam I am!"
tokens = word_tokenize(raw)
default_tagger.tag(tokens)

[('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('green', 'NN'),
 ('eggs', 'NN'),
 ('and', 'NN'),
 ('ham', 'NN'),
 (',', 'NN'),
 ('I', 'NN'),
 ('do', 'NN'),
 ('not', 'NN'),
 ('like', 'NN'),
 ('them', 'NN'),
 ('Sam', 'NN'),
 ('I', 'NN'),
 ('am', 'NN'),
 ('!', 'NN')]

In [70]:
# Score the constant-tag baseline against the gold-tagged news sentences.
default_tagger.accuracy(brown_tagged_sents)

0.13089484257215028

**Automatic Tagging (Regular Expression Tagger)**

In [74]:
# Ordered (regex, tag) rules for nltk.RegexpTagger. Rules are tried top to
# bottom and the first one that matches a token wins, so the catch-all noun
# rule must stay last.
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    # The original had a stray `.*` before the `^` anchor; it could only match
    # by backtracking to the empty string, so dropping it keeps the same matches.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*$', 'NN'),                    # nouns (default)
]

In [79]:
# Build a tagger from the suffix/number rules in `patterns` and try it on the
# first ten tokens of one news sentence. The rules only look at word shape, so
# e.g. 'was' is tagged NNS because it ends in 's'.
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3][:10])

[('``', 'NN'),
 ('Only', 'NN'),
 ('a', 'NN'),
 ('relative', 'NN'),
 ('handful', 'NN'),
 ('of', 'NN'),
 ('such', 'NN'),
 ('reports', 'NNS'),
 ('was', 'NNS'),
 ('received', 'VBD')]

In [80]:
# Score the regexp rules against the gold-tagged news sentences.
regexp_tagger.accuracy(brown_tagged_sents)

0.20186168625812995

**Automatic Tagging (N-Gram Tagging: Unigram Tagger)**

In [86]:
# Train a unigram tagger on ALL tagged news sentences, then tag one of those
# same sentences as a quick sanity check.
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'QL'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [88]:
# NOTE: this evaluates on the same sentences the tagger was trained on, so the
# score is optimistic; the cells below repeat it with a held-out split.
unigram_tagger.accuracy(brown_tagged_sents)

0.9349006503968017

In [95]:
# Hold out the last 10% of the news sentences for evaluation.
size = int(0.9 * len(brown_sents))
training_set, testing_set = brown_tagged_sents[:size], brown_tagged_sents[size:]
print(testing_set)

# Retrain on the training portion only (rebinds `unigram_tagger`).
unigram_tagger = nltk.UnigramTagger(training_set)

[[('But', 'CC'), ('in', 'IN'), ('all', 'ABN'), ('its', 'PP$'), ('175', 'CD'), ('years', 'NNS'), (',', ','), ('not', '*'), ('a', 'AT'), ('single', 'AP'), ('Negro', 'NP'), ('student', 'NN'), ('has', 'HVZ'), ('entered', 'VBN'), ('its', 'PP$'), ('classrooms', 'NNS'), ('.', '.')], [('Last', 'AP'), ('week', 'NN'), ('Federal', 'JJ-TL'), ('District', 'NN-TL'), ('Judge', 'NN-TL'), ('William', 'NP'), ('A.', 'NP'), ('Bootle', 'NP'), ('ordered', 'VBD'), ('the', 'AT'), ('university', 'NN'), ('to', 'TO'), ('admit', 'VB'), ('immediately', 'RB'), ('a', 'AT'), ('``', '``'), ('qualified', 'VBN'), ("''", "''"), ('Negro', 'NP'), ('boy', 'NN'), ('and', 'CC'), ('girl', 'NN'), ('.', '.')], ...]


In [96]:
# Accuracy of the unigram tagger on the held-out sentences it was not trained on.
unigram_tagger.accuracy(testing_set)

0.8121200039868434

**Automatic Tagging (N-Gram Tagging: Bigram Tagger and Backoff)**

In [98]:
# Train a bigram tagger on the training split and tag a sentence taken from
# that split (index 2007 is below `size`).
bigram_tagger = nltk.BigramTagger(training_set)
bigram_tagger.tag(brown_sents[2007])

[('Various', 'JJ'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('apartments', 'NNS'),
 ('are', 'BER'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('terrace', 'NN'),
 ('type', 'NN'),
 (',', ','),
 ('being', 'BEG'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('ground', 'NN'),
 ('floor', 'NN'),
 ('so', 'CS'),
 ('that', 'CS'),
 ('entrance', 'NN'),
 ('is', 'BEZ'),
 ('direct', 'JJ'),
 ('.', '.')]

In [100]:
# Index 4203 lies past the 90% cut-off, so this sentence was not used for
# training. In the output, every token after the first untagged one ('13.5')
# also comes back as None.
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)

[('The', 'AT'),
 ('population', 'NN'),
 ('of', 'IN'),
 ('the', 'AT'),
 ('Congo', 'NP'),
 ('is', 'BEZ'),
 ('13.5', None),
 ('million', None),
 (',', None),
 ('divided', None),
 ('into', None),
 ('at', None),
 ('least', None),
 ('seven', None),
 ('major', None),
 ('``', None),
 ('culture', None),
 ('clusters', None),
 ("''", None),
 ('and', None),
 ('innumerable', None),
 ('tribes', None),
 ('speaking', None),
 ('400', None),
 ('separate', None),
 ('dialects', None),
 ('.', None)]

In [102]:
# Accuracy of the bigram tagger (no backoff) on the held-out sentences.
bigram_tagger.accuracy(testing_set)

0.10206319146815508

In [104]:
# Chain the taggers through `backoff`: the bigram tagger (t2) falls back to
# the unigram tagger (t1), which falls back to the constant 'NN' tagger (t0).
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(training_set, backoff=t0)
t2 = nltk.BigramTagger(training_set, backoff=t1)

# Evaluate the combined tagger on the held-out sentences.
t2.accuracy(testing_set)

0.8452108043456593