# NLP - Introduction to NLTK

In [1]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
# Corpus- A large collection of text
from nltk.corpus import brown

In [8]:
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [9]:
data = brown.sents(categories='adventure')

In [14]:
' '.join(data[2])

"He certainly didn't want a wife who was fickle as Ann ."

In [15]:
type(data)

nltk.corpus.reader.util.ConcatenatedCorpusView

In [19]:
len(data[2]),len(data[0])

(12, 10)

# Bag of Words Pipeline
* Get the Data/Corpus
* Tokenization - Getting sentences and words out of Documents, Stopwords Removal - Removal of words       like was, the, is, etc.
* Stemming - Matching a pool of similar words to some base word.
* Building a Vocab - Creating a pool of unique words
* Vectorization - Forming a vector which serves as a feature to our classification algorithm.
* Classification

## Tokenization & Stopword Removal

In [20]:
document  = """It was a pleasant day. The weather was cool and there were light showers. I went to the
market to buy some cigarettes."""

sentence = "Send all the 50 documents related to chapters 1,2,3 at prateekbhaiya@cid.com"

In [21]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [22]:
sents = sent_tokenize(document)

In [23]:
print(sents)

['It was a pleasant day.', 'The weather was cool and there were light showers.', 'I went to the\nmarket to buy some cigarettes.']


In [24]:
print(len(sents))

3


In [25]:
sents[0]

'It was a pleasant day.'

In [26]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'at',
 'prateekbhaiya@cid.com']

In [27]:
words = word_tokenize(sentence)

In [28]:
print(words)

['Send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapters', '1,2,3', 'at', 'prateekbhaiya', '@', 'cid.com']


## Stopwords Removal

In [29]:
from nltk.corpus import stopwords

In [31]:
sw = set(stopwords.words('english'))

In [32]:
print(sw)

{'isn', 's', "wasn't", 'whom', 'such', 'or', 'then', 'through', 'did', 'not', 'only', 'when', 'you', 't', 'ours', 'your', 'they', 'aren', 'too', 'hers', 'an', 'myself', 'she', 'have', 'can', 'now', 'here', 'him', 'just', 'been', "don't", 'mustn', "that'll", 'my', 'because', 'as', "you're", 'no', "couldn't", 'be', 'our', 'into', 'very', 'll', 'for', 'with', 'wouldn', 'other', 'these', 'against', 'same', 'above', 'why', 'had', 'ain', 'y', "hadn't", 'was', 'if', 'more', 'hadn', 'of', 'that', 'below', 'ourselves', 'most', 'doesn', 'out', 'before', 'from', 'how', 'o', 'its', 'haven', "weren't", 'he', 'her', 'nor', 'and', 'will', 'any', 'on', "doesn't", 'further', "hasn't", 'itself', 'all', 'to', 'am', 'i', 'being', "you'll", 'those', "mustn't", 'his', 'but', 'in', 'down', 'until', 'herself', 'over', 'some', 'than', "mightn't", "haven't", 'by', "needn't", 'so', 'couldn', 'the', 'under', 'few', "should've", 'hasn', 'off', 'this', 'mightn', "won't", 'needn', 'yourself', 'after', "she's", 'them

In [33]:
len(sw)

179

In [36]:
def remove_stopwords(doc,stopwords):
    useful_words = [w for w in doc if w not in stopwords]
    return useful_words

In [37]:
doc = "Please don't even try to bother me!"
useful_words = remove_stopwords(doc.split(),sw)

In [38]:
useful_words

['Please', 'even', 'try', 'bother', 'me!']

In [39]:
'try' in sw

False

## Tokenization with Regular Expression

In [41]:
sentence = "Send all the 50 documents related to chapters 1,2,3 at prateekbhaiya@cid.com"

In [42]:
from nltk.tokenize import RegexpTokenizer

In [47]:
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)

In [48]:
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'prateekbhaiya@cid.com']

## Stemming
* Process that transforms particular words(verbs,plurals) into their radical form.
* Preserve the semantics of the sentence without increasing the number of unique tokens.
* Example : jumps, jumping, jumped, jump ==> jump

In [49]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [50]:
ps = PorterStemmer()

In [51]:
ps.stem('jumping')

'jump'

In [53]:
ps.stem('jumped')

'jump'

In [54]:
ps.stem('lovely')

'love'

In [55]:
ss = SnowballStemmer('english')

In [56]:
ss.stem('lovely')

'love'

## Lemmatization

In [57]:
from nltk.stem import WordNetLemmatizer

In [63]:
wn = WordNetLemmatizer()

In [67]:
wn.lemmatize('jumps')

'jump'

## Building a Vocab and Vectorization

In [68]:
# Sample Corpus -  contains 4 documents, each document can have 1 or more sentences. 
corpus = [
    "Indian cricket team is about to win this game says Capt. Virat Kohli.",
    "All the leaders of Congress and BJP admits that they all are of no use of the public.",
    "The nobel prize will be given to the man who saved his neighbour's wife from drowning.",
    "The movie 3-idiots still tops the list of my favourite movies."
]

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [70]:
cv = CountVectorizer()

In [71]:
vectorized_corpus = cv.fit_transform(corpus)

In [72]:
vectorized_corpus

<4x48 sparse matrix of type '<class 'numpy.int64'>'
	with 52 stored elements in Compressed Sparse Row format>

In [73]:
vectorized_corpus = vectorized_corpus.toarray()

In [76]:
vectorized_corpus

array([[1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
        0, 0, 0, 1],
       [0, 1, 2, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 1, 0, 3, 0, 1, 0, 0, 0, 0, 1, 2, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0]], dtype=int64)

In [78]:
len(vectorized_corpus[0])

48

In [79]:
cv.vocabulary_

{'indian': 17,
 'cricket': 9,
 'team': 35,
 'is': 18,
 'about': 0,
 'to': 40,
 'win': 47,
 'this': 39,
 'game': 13,
 'says': 33,
 'capt': 7,
 'virat': 43,
 'kohli': 19,
 'all': 2,
 'the': 37,
 'leaders': 20,
 'of': 29,
 'congress': 8,
 'and': 3,
 'bjp': 6,
 'admits': 1,
 'that': 36,
 'they': 38,
 'are': 4,
 'no': 27,
 'use': 42,
 'public': 31,
 'nobel': 28,
 'prize': 30,
 'will': 46,
 'be': 5,
 'given': 14,
 'man': 22,
 'who': 44,
 'saved': 32,
 'his': 15,
 'neighbour': 26,
 'wife': 45,
 'from': 12,
 'drowning': 10,
 'movie': 23,
 'idiots': 16,
 'still': 34,
 'tops': 41,
 'list': 21,
 'my': 25,
 'favourite': 11,
 'movies': 24}

In [80]:
print(cv.vocabulary_)

{'indian': 17, 'cricket': 9, 'team': 35, 'is': 18, 'about': 0, 'to': 40, 'win': 47, 'this': 39, 'game': 13, 'says': 33, 'capt': 7, 'virat': 43, 'kohli': 19, 'all': 2, 'the': 37, 'leaders': 20, 'of': 29, 'congress': 8, 'and': 3, 'bjp': 6, 'admits': 1, 'that': 36, 'they': 38, 'are': 4, 'no': 27, 'use': 42, 'public': 31, 'nobel': 28, 'prize': 30, 'will': 46, 'be': 5, 'given': 14, 'man': 22, 'who': 44, 'saved': 32, 'his': 15, 'neighbour': 26, 'wife': 45, 'from': 12, 'drowning': 10, 'movie': 23, 'idiots': 16, 'still': 34, 'tops': 41, 'list': 21, 'my': 25, 'favourite': 11, 'movies': 24}


In [81]:
len(cv.vocabulary_.keys())

48

In [82]:
# Reverse Mapping - Making sentences from these feature matrix

numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0], dtype=int64)

In [83]:
s = cv.inverse_transform(numbers)

In [87]:
s[0]

array(['be', 'drowning', 'from', 'given', 'his', 'man', 'neighbour',
       'nobel', 'prize', 'saved', 'the', 'to', 'who', 'wife', 'will'],
      dtype='<U9')

In [88]:
print(s)

[array(['be', 'drowning', 'from', 'given', 'his', 'man', 'neighbour',
       'nobel', 'prize', 'saved', 'the', 'to', 'who', 'wife', 'will'],
      dtype='<U9')]


## Vectorization with Stopword Removal

In [89]:
def myTokenizer(doc):
    words = tokenizer.tokenize(doc.lower())
#   Remove stopwords
    words = remove_stopwords(words,sw)
    return words

In [90]:
myTokenizer(sentence)

['send', 'documents', 'related', 'chapters', 'prateekbhaiya@cid.com']

In [91]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [96]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [97]:
print(vectorized_corpus)

[[0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1]
 [1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0]]


In [98]:
len(vectorized_corpus[0])

30

In [99]:
cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'game', 'indian', 'kohli.', 'says', 'team',
        'virat', 'win'], dtype='<U9'),
 array(['admits', 'bjp', 'congress', 'leaders', 'public.', 'use'],
       dtype='<U9'),
 array(['drowning.', 'given', 'man', 'neighbour', 'nobel', 'prize',
        'saved', 'wife'], dtype='<U9'),
 array(['favourite', 'idiots', 'list', 'movie', 'movies.', 'still', 'tops'],
       dtype='<U9')]

In [100]:
#  Using on Test corpus
test_corpus = [
    "India is going to rock!"
]

In [102]:
# Remember for test corpus use only transform function not fit_transform
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [103]:
cv.vocabulary_

{'indian': 10,
 'cricket': 4,
 'team': 24,
 'win': 29,
 'game': 7,
 'says': 22,
 'capt.': 2,
 'virat': 27,
 'kohli.': 11,
 'leaders': 12,
 'congress': 3,
 'bjp': 1,
 'admits': 0,
 'use': 26,
 'public.': 20,
 'nobel': 18,
 'prize': 19,
 'given': 8,
 'man': 14,
 'saved': 21,
 'neighbour': 17,
 'wife': 28,
 'drowning.': 5,
 'movie': 15,
 'idiots': 9,
 'still': 23,
 'tops': 25,
 'list': 13,
 'favourite': 6,
 'movies.': 16}

## Bag of words - Bigrams, Trigrams, Ngrams
* Unigram - Every word as a feature
* Bigrams
* Trigrams
* Ngrams
* TF-IDF Normalization

In [2]:
sent_1 = ["This is a good movie"]
sent_2 = ["This is a good movie but the actor was not that good"]
sent_3 = ["This is not a good movie"]

In [4]:
cv = CountVectorizer(ngram_range=(2,2)) # Here changing (2,2) to (1,3) or (2,3) or any other combination 
# will them Ngrams

In [7]:
docs = [sent_1[0],sent_2[0],sent_3[0]]

In [8]:
cv.fit_transform(docs).toarray()

array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1],
       [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0]], dtype=int64)

In [9]:
cv.vocabulary_

{'this is': 10,
 'is good': 3,
 'good movie': 2,
 'movie but': 5,
 'but the': 1,
 'the actor': 9,
 'actor was': 0,
 'was not': 11,
 'not that': 7,
 'that good': 8,
 'is not': 4,
 'not good': 6}

## TF-IDF Normalization - Term Document Frequency-Inverse Document Frequency
* Avoid features that occur very often or in almost all type of documents like sports, politics etc.
  as they contain very less information.
* Information decreases as the no. of occurences increases across diff. types of documents.
* So we define another term - TF-IDf which associates a weight with every term.

In [15]:
sent_1 = ["This is a good movie"]
sent_2 = ["That was a good movie"]
sent_3 = ["This is not a good movie"]

corpus = [sent_1[0],sent_2[0],sent_3[0]]

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [17]:
tfidf = TfidfVectorizer()

In [18]:
vc = tfidf.fit_transform(corpus).toarray()

In [19]:
tfidf.vocabulary_

{'this': 5, 'is': 1, 'good': 0, 'movie': 2, 'that': 4, 'was': 6, 'not': 3}

In [21]:
print(vc)

[[0.43370786 0.55847784 0.43370786 0.         0.         0.55847784
  0.        ]
 [0.35959372 0.         0.35959372 0.         0.6088451  0.
  0.6088451 ]
 [0.34957775 0.45014501 0.34957775 0.59188659 0.         0.45014501
  0.        ]]
