## Intro to NLP with NLTK
**Topics Covered:**
1. Tokenizing (sent_tokenize, word_tokenize)
2. Stemming/Lemmatization
3. POS Tagging
4. Chunking/Chinking
5. Named Entity Recognition

In [45]:
# imports
import sys
import nltk
import sklearn

In [5]:
# versions: print the Python interpreter, NLTK and scikit-learn versions in use
print('python {}'.format(sys.version))
print('nltk {}'.format(nltk.__version__))
print('sklearn {}'.format(sklearn.__version__))

python 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)]
nltk 3.4
sklearn 0.19.1


In [7]:
# tokenization
from nltk import sent_tokenize, word_tokenize

text = ''' Marsha is on her way. She called from the car phone I think. It sounded like the car phone, to let us know 
           that she would be delayed. I would like to welcome two people who haven't been with us before.
           Suzanne Clewell, we're delighted to have you with us today. Suzanne, would you tell us a little bit 
           about what you do? '''

print(sent_tokenize(text))
print(word_tokenize(text))

[' Marsha is on her way.', 'She called from the car phone I think.', 'It sounded like the car phone, to let us know \n           that she would be delayed.', "I would like to welcome two people who haven't been with us before.", "Suzanne Clewell, we're delighted to have you with us today.", 'Suzanne, would you tell us a little bit \n           about what you do?']
['Marsha', 'is', 'on', 'her', 'way', '.', 'She', 'called', 'from', 'the', 'car', 'phone', 'I', 'think', '.', 'It', 'sounded', 'like', 'the', 'car', 'phone', ',', 'to', 'let', 'us', 'know', 'that', 'she', 'would', 'be', 'delayed', '.', 'I', 'would', 'like', 'to', 'welcome', 'two', 'people', 'who', 'have', "n't", 'been', 'with', 'us', 'before', '.', 'Suzanne', 'Clewell', ',', 'we', "'re", 'delighted', 'to', 'have', 'you', 'with', 'us', 'today', '.', 'Suzanne', ',', 'would', 'you', 'tell', 'us', 'a', 'little', 'bit', 'about', 'what', 'you', 'do', '?']


In [13]:
# stopwords
from nltk.corpus import stopwords

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [22]:
# Stemming and Lemmatization
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

words = ['study', 'studied', 'studies', 'student']

# Create each normalizer once and reuse it; building a fresh instance for
# every word repeats the constructor work for no benefit.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

print('Stemmer :')
for w in words:
    print(stemmer.stem(w))

# lemmatize() is called without a pos argument here, so each word is looked
# up with the lemmatizer's default part of speech.
print('\nLemmatizer :')
for w in words:
    print(lemmatizer.lemmatize(w))

Stemmer :
studi
studi
studi
student

Lemmatizer :
study
studied
study
student


In [30]:
# POS tagging
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
test_text = state_union.raw('2006-GWBush.txt')

custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)
tokenized_test_text = custom_sentence_tokenizer.tokenize(test_text)

def process():
    """Print the POS-tagged tokens of the first five test sentences.

    Reads the module-level ``tokenized_test_text``. Any exception raised while
    tokenizing or tagging is caught and its message is printed instead of
    being propagated.
    """
    try:
        for sent in tokenized_test_text[:5]:
            tokens = nltk.word_tokenize(sent)
            # each entry is a (token, tag) pair
            print(nltk.pos_tag(tokens))
    except Exception as err:
        print(str(err))

process()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [31]:
# to get tagset
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [41]:
# Chunking with NLTK
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
test_text = state_union.raw('2006-GWBush.txt')

custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)
tokenized_test_text = custom_sentence_tokenizer.tokenize(test_text)

def process():
    """Chunk the first five test sentences and print every 'Chunk' subtree.

    Reads the module-level ``tokenized_test_text``. Each sentence is
    word-tokenized and POS-tagged, then parsed with a regular-expression
    chunk grammar. Any exception is caught and its message is printed
    instead of being propagated.
    """
    try:
        # The grammar is the same for every sentence, so build the parser
        # once rather than on each loop iteration.
        # Pattern: zero or more adverb tags (RB, RBR, ...), zero or more verb
        # tags (VB, VBD, ...), one or more NNP, then an optional NN.
        chunk_gram = r'''Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}'''
        chunk_parser = nltk.RegexpParser(chunk_gram)

        for sentence in tokenized_test_text[:5]:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            chunked = chunk_parser.parse(tagged)

            # print only the subtrees labelled 'Chunk'
            for subtree in chunked.subtrees(filter=lambda x: x.label() == 'Chunk'):
                print(subtree)

            # draw chunks with nltk
            # chunked.draw()

    except Exception as e:
        print(str(e))

process()

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(Chunk Mr./NNP Speaker/NNP)
(Chunk Vice/NNP President/NNP Cheney/NNP)
(Chunk Congress/NNP)
(Chunk Supreme/NNP Court/NNP)
(Chunk called/VBD America/NNP)
(Chunk Coretta/NNP Scott/NNP King/NNP)
(Chunk Applause/NNP)
(Chunk President/NNP George/NNP W./NNP Bush/NNP)
(Chunk State/NNP)
(Chunk Union/NNP Address/NNP)
(Chunk Capitol/NNP)
(Chunk Tuesday/NNP)
(Chunk Jan/NNP)


In [43]:
# Chinking with NLTK
# The main difference is the }{ vs. {} --> whatever matches the expression
# inside }{ is a chink: those tag sequences are removed from the chunk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
test_text = state_union.raw('2006-GWBush.txt')

custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)
tokenized_test_text = custom_sentence_tokenizer.tokenize(test_text)

def process():
    """Chink the first five test sentences and print every 'Chunk' subtree.

    Reads the module-level ``tokenized_test_text``. Each sentence is
    word-tokenized and POS-tagged, then parsed with a grammar that first
    chunks every token and then removes (chinks) selected tag sequences.
    Any exception is caught and its message is printed instead of being
    propagated.
    """
    try:
        # The grammar is the same for every sentence, so build the parser
        # once rather than on each loop iteration.
        # Rule 1 ({...}) chunks one or more tokens of any tag; rule 2 (}...{)
        # chinks runs of verb tags (VB, VBD, ...), IN, DT or TO out of them.
        chunk_gram = r'''Chunk: {<.*>+}
                                          }<VB.?|IN|DT|TO>+{'''
        chunk_parser = nltk.RegexpParser(chunk_gram)

        for sentence in tokenized_test_text[:5]:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            chunked = chunk_parser.parse(tagged)

            # print only the subtrees labelled 'Chunk'
            for subtree in chunked.subtrees(filter=lambda x: x.label() == 'Chunk'):
                print(subtree)

            # draw chunks with nltk
            # chunked.draw()

    except Exception as e:
        print(str(e))

process()

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk
  THE/NNP
  UNION/NNP
  January/NNP
  31/CD
  ,/,
  2006/CD
  THE/NNP
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP)
(Chunk ./.)
(Chunk
  Mr./NNP
  Speaker/NNP
  ,/,
  Vice/NNP
  President/NNP
  Cheney/NNP
  ,/,
  members/NNS)
(Chunk Congress/NNP ,/, members/NNS)
(Chunk
  Supreme/NNP
  Court/NNP
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:)
(Chunk our/PRP$ nation/NN)
(Chunk ,/, graceful/JJ ,/, courageous/JJ woman/NN who/WP)
(Chunk America/NNP)
(Chunk its/PRP$ founding/NN ideals/NNS and/CC)
(Chunk noble/JJ dream/NN ./.)
(Chunk Tonight/NN we/PRP)
(Chunk hope/NN)
(Chunk glad/JJ reunion/NN)
(Chunk husband/NN who/WP)
(Chunk so/RB long/RB ago/RB ,/, and/CC we/PRP)
(Chunk grateful/JJ)
(Chunk good/JJ life/NN)
(Chunk Coretta/NNP Scott/NNP King/NNP ./.)
(Chunk

In [44]:
# Named Entity Recognition
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
test_text = state_union.raw('2006-GWBush.txt')

custom_sentence_tokenizer = PunktSentenceTokenizer(train_text)
tokenized_test_text = custom_sentence_tokenizer.tokenize(test_text)

def process():
    """Run named-entity chunking on the first five test sentences and draw each tree.

    Reads the module-level ``tokenized_test_text``. Any exception is caught
    and its message is printed instead of being propagated.
    """
    try:
        for sent in tokenized_test_text[:5]:
            pos_tagged = nltk.pos_tag(nltk.word_tokenize(sent))
            entity_tree = nltk.ne_chunk(pos_tagged, binary=False)

            # draw the named-entity tree for this sentence with nltk
            entity_tree.draw()

    except Exception as err:
        print(str(err))

process()

## Experiment - Text Classification
**MOVIE REVIEW CLASSIFICATION**

In [46]:
# imports
import nltk
import random
from nltk.corpus import movie_reviews

In [102]:
# Build one (word_list, category) pair per review file, for every category
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# shuffle the documents
# NOTE(review): no seed is set in the visible cells, so the order (and the
# "First review" printed below) will differ from run to run -- confirm whether
# reproducibility matters here.
random.shuffle(documents)

print('Number of documents: {}'.format(len(documents)))
print('\nFirst review: {}'.format(documents[0]))

# Count lower-cased token frequencies over the whole corpus. Passing a
# generator straight to FreqDist avoids first materializing a list that
# holds every token.
all_words = nltk.FreqDist(word.lower() for word in movie_reviews.words())
print('\nvocab size: {}'.format(len(all_words)))
print('\nMost common words: {}'.format(all_words.most_common(15)))

Number of documents: 2000

First review: (['men', 'in', 'black', 'is', 'an', 'explosive', 'mix', 'of', 'science', 'fiction', ',', 'action', ',', 'and', 'comedy', 'that', 'hits', 'the', 'target', 'in', 'every', 'possible', 'way', '.', 'although', 'another', 'alien', 'movie', ',', 'men', 'in', 'black', 'succeeds', 'in', 'every', 'way', 'that', 'independence', 'day', 'didn', "'", 't', ',', 'and', 'towers', 'above', 'many', 'other', 'movies', 'of', 'its', 'type', '.', 'the', 'brilliant', 'acting', ',', 'especially', 'by', 'tommy', 'lee', 'jones', 'as', 'agent', 'kay', ',', 'is', 'also', 'as', 'good', 'as', 'it', 'gets', '.', 'director', 'barry', 'sonnenfeld', ',', 'who', 'was', 'behind', 'the', 'camera', 'for', 'the', 'addams', 'family', 'movies', 'and', 'get', 'shorty', ',', 'has', 'crafted', 'a', 'masterpiece', '.', 'the', 'story', 'behind', 'men', 'in', 'black', 'is', 'just', 'as', 'interesting', 'as', 'you', 'would', 'want', 'it', 'to', 'be', '.', 'the', 'men', 'in', 'black', ',', 'or'

In [190]:
# keep the 5000 most frequent words (dropping their counts) as the feature vocabulary
word_features = [word for word, _ in all_words.most_common(5000)]

In [192]:
# function to determine which of these common words are present in the reviews
def find_features(document, vocabulary=None):
    """Map each vocabulary word to whether it occurs in *document*.

    Args:
        document: iterable of hashable tokens, e.g. the word list of one review.
        vocabulary: iterable of words to test for. When omitted, the
            module-level ``word_features`` is used, which keeps existing
            one-argument calls working unchanged.

    Returns:
        dict with one key per vocabulary word (in vocabulary order) whose
        value is True if the word appears in *document*, else False.
    """
    if vocabulary is None:
        vocabulary = word_features
    # a set gives constant-time membership tests across the whole vocabulary
    present = set(document)
    return {word: (word in present) for word in vocabulary}

In [193]:
# form the dataset for training and testing
dataset = [(find_features(review), category) for (review, category) in documents]

In [194]:
# split the data for training and testing
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(dataset, test_size=0.25, random_state=3)
print('Train data: {}'.format(len(train_data)))
print('Test data: {}'.format(len(test_data)))

Train data: 1500
Test data: 500


In [195]:
# Build the model
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel='linear'))

In [196]:
# train the model using train_data
model.train(train_data)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))>

In [197]:
# evaluate the model using test_data
accuracy = nltk.classify.accuracy(model, test_data)
print(accuracy)

0.83
