# Introduction to Natural Language Processing

### Install NLTK
 pip install nltk

In [1]:
import nltk

In [2]:
#nltk.download()

In [3]:
#Corpus -> a large collection of text

from nltk.corpus import brown

In [4]:
#brown? 
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [5]:
print(len(brown.categories()))

15


In [6]:
data=brown.sents(categories='adventure')
#sents()->used to get sentences

In [7]:
print(data)  #it is a list of lists

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]


In [8]:
len(data)  #no of adventure sentences present

4637

In [9]:
data=brown.sents(categories='fiction')
print(len(data))

4249


In [10]:
data[1]  #Sentence is in form of a list of words
#second sentence (index 1) in the fiction category

['Scotty', 'did', 'not', 'go', 'back', 'to', 'school', '.']

In [11]:
#join() -> glues the list of words back into one string, placing the
#separator it is called on between consecutive words
for separator in ('', ' ', '*', ' and '):
    print(separator.join(data[1]))

Scottydidnotgobacktoschool.
Scotty did not go back to school .
Scotty*did*not*go*back*to*school*.
Scotty and did and not and go and back and to and school and .


# Bag of Words Pipeline

 * Get the Data/Corpus
 * Tokenisation, Stopword Removal
 * Stemming
 * Building a Vocab
 * Vectorisation
 * Classification

In [12]:
#Plan in NLP
#Text-> Numbers ->Classifier 
#Classifier can be used for predictions

## Tokenisation and Stopword Removal

In [21]:
document="""It was a very plesant day. The weather was cool and there were light showers. I went to the market to buy some fruits."""

sentence="Send all the 50 documents related to chapters 1,2,3 to shubhikabhardwaj@gmail.com"

In [22]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [23]:
#import nltk
#nltk.download('punkt')

In [25]:
sents=sent_tokenize(document)
print(sents)
print(len(sents))

['It was a very plesant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']
3


In [26]:
sents[0]   #first sentence

'It was a very plesant day.'

In [28]:
sentence.split()
#split() ->unable to separate 1,2,3
# we can use the separator as , comma to avoid this

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'to',
 'shubhikabhardwaj@gmail.com']

In [31]:
words=word_tokenize(sentence)

In [34]:
words
#word_tokenize() -> also splits around special characters like @

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'to',
 'shubhikabhardwaj',
 '@',
 'gmail.com']

In [35]:
#word_tokenize() -> can't be customized;
#for custom rules you must write your own tokenizer function!

## Stopwords

In [36]:
from nltk.corpus import stopwords

#common stopwords are present in corpus depending upon the language like English

In [37]:
sw= set(stopwords.words('english'))
print(sw)

{'theirs', 'any', 'was', 'than', 'd', 'or', 'there', 'not', "hasn't", 'how', 'at', 'him', 'a', 'no', 'shouldn', 'with', 'mustn', 'until', 'such', 'them', 'they', "you'd", "you've", 'before', 'she', 'you', 'if', 'while', 'm', 'our', "you'll", "shan't", 'when', 'against', 'yourself', 'so', 'it', 'down', "isn't", "mightn't", 'has', 'of', 'once', 'doing', 'but', 'haven', 'your', 'this', "didn't", 'its', 'll', 'off', 'ain', "wouldn't", "she's", 'being', 'those', 'y', 'do', 'then', 'some', 'very', 'over', 'each', 'here', "haven't", 'isn', 'now', 'their', 'an', 'having', 've', "should've", 'be', 'between', "mustn't", 'shan', 'won', 'same', 'the', 'up', 'aren', 'couldn', 'why', "don't", 're', 'o', 'did', 'whom', 'her', "wasn't", 'few', 'wasn', "shouldn't", 'me', 'these', 'i', 'can', 'yours', 'to', 'himself', 'themselves', 'below', 'under', 'other', 'about', 'weren', 'only', 'all', 'further', 'again', 'which', 'mightn', 'is', 'were', 'am', "that'll", 'as', 'are', 'don', 'been', 'most', 'should'

In [52]:
def remove_stopwords(text,stopwords):
    """Return the items of `text` that are not in `stopwords`, in original order.

    text      -- iterable of tokens; a plain string is iterated character by
                 character, so pass a list of words instead.
    stopwords -- container supporting `in` (e.g. a set of words to drop).
                 Matching is exact, so it is case-sensitive.

    Returns a new list; `text` itself is not modified.
    """
    kept=[]
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [53]:
text="I am not bothered about her very much"
useful_text=remove_stopwords(text,sw)
print(useful_text)

['I', ' ', ' ', 'n', ' ', 'b', 'h', 'e', 'r', 'e', ' ', 'b', 'u', ' ', 'h', 'e', 'r', ' ', 'v', 'e', 'r', ' ', 'u', 'c', 'h']


In [54]:
#we should send a list of words
#else -> a string is iterated character by character by default

In [55]:
#use split() to split sentence into a list of words
#the list of words can be passed to the remove_stopwords() function

In [57]:
text="I am not bothered about her very much".split()
useful_text=remove_stopwords(text,sw)
print(useful_text)

['I', 'bothered', 'much']


In [58]:
# 'not' is also removed -> this is bad!

'not' in sw

True

# Tokenisation using Regular Expression

In [59]:
sentence="Send all the 50 documents related to chapters 1,2,3 to shubhikabhardwaj@gmail.com"

In [60]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1,2,3',
 'to',
 'shubhikabhardwaj@gmail.com']

In [61]:
from nltk.tokenize import RegexpTokenizer

In [67]:
#RegexpTokenizer's constructor takes the pattern that a token must match:
#here a token is any run of letters, '@' or '.', so digits and commas are
#dropped while the e-mail address stays in one piece.
#NOTE: because '.' is allowed inside a token, a full stop that directly
#follows a word stays attached to that word.
tokenizer=RegexpTokenizer('[a-zA-Z@.]+')
useful_text=tokenizer.tokenize(sentence)

In [68]:
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'to',
 'shubhikabhardwaj@gmail.com']

In [69]:
#cheat sheet on regexpal.com

# Stemming
* Process that transforms particular words (verbs, plurals) into their radical form
* Preserves the semantics of the sentence without increasing the number of unique tokens
* Ex- jumps, jumping, jumped, jump ==> jump

In [70]:
text="""Foxes love to make jumps.The quick brown fox was seen jumping over the lovely dog from a 6ft high wall"""


### 3 Types of Stemmer -> Snowball, Porter, Lancaster Stemmer

In [71]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer


In [72]:
ps=PorterStemmer()
ps.stem('jumping')

'jump'

In [73]:
ps.stem('jumps')

'jump'

In [74]:
ps.stem('lovely')

'love'

In [75]:
ps.stem('loving')

'love'

In [76]:
#Snowball Stemmer-> multi-lingual
ss=SnowballStemmer('english')

In [77]:
ss.stem('lovely')

'love'

In [79]:
ss.stem('jumping')  #remove ing

'jump'

In [81]:
ss.stem('dancing')

'danc'

## Lemmatization

In [82]:
from nltk.stem import WordNetLemmatizer

wn=WordNetLemmatizer()
#no part of speech is passed, so 'jumping' is looked up with NLTK's default
#pos (noun) and comes back unchanged; wn.lemmatize('jumping', pos='v') is the
#call that should reduce it to 'jump'
wn.lemmatize('jumping')

'jumping'

## Building a Vocab & Vectorisation

In [84]:
##Based on bag of words -Model

#Sample corpus-> Contains 4 documents, each document can have 1 or more sentences
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story'
]

In [86]:
from sklearn.feature_extraction.text import CountVectorizer


In [88]:
cv=CountVectorizer()

In [89]:
vectorized_corpus=cv.fit_transform(corpus)

In [90]:
#vectorized_corpus?

<4x42 sparse matrix of type '<class 'numpy.int64'>'
	with 47 stored elements in Compressed Sparse Row format>

In [91]:
vectorized_corpus=vectorized_corpus.toarray()

In [97]:
print(vectorized_corpus[0])
print(len(vectorized_corpus[0]))
#42 unique words

[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 0 1 0 2]
42


In [94]:
 print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [101]:
len(cv.vocabulary_.keys())


42

In [102]:
#reverse mapping
numbers=vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0],
      dtype=int64)

In [104]:
#inverse_transform() expects a 2-D (n_samples, n_features) input, so the single
#document vector is wrapped in a list to form a one-row batch; passing the bare
#1-D array is rejected by recent scikit-learn releases
s=cv.inverse_transform([numbers])
print(s)  #jumbled ->bag of words model (word order is not kept)

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
      dtype='<U9')]


# Vectorisation with Stopword Removal

In [106]:
def myTokenizer(document):
    """Lower-case `document`, tokenize it and drop the stopwords.

    Relies on two notebook-level names that must exist before this is called:
    `tokenizer` (the RegexpTokenizer instance) and `sw` (the stopword set).

    Returns the list of remaining tokens produced by remove_stopwords().
    """
    lowered=document.lower()
    return remove_stopwords(tokenizer.tokenize(lowered),sw)
    

In [107]:
myTokenizer(sentence)


['send', 'documents', 'related', 'chapters', 'shubhikabhardwaj@gmail.com']

In [118]:
cv=CountVectorizer(tokenizer=myTokenizer)

In [119]:
vectorized_corpus=cv.fit_transform(corpus).toarray()

In [120]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [121]:
print(len(vectorized_corpus[0]))  #reduced from 42 to 33

33


In [122]:
 cv.inverse_transform(vectorized_corpus)

[array(['capt.', 'cricket', 'cup', 'held', 'indian', 'kohli.', 'lanka.',
        'says', 'sri', 'team', 'virat', 'wins', 'world'], dtype='<U9'),
 array(['confident', 'elections', 'indian', 'lok', 'next', 'pm', 'sabha',
        'says', 'win'], dtype='<U9'),
 array(['hearts', 'laurate', 'nobel', 'people'], dtype='<U9'),
 array(['based', 'exciting', 'indian', 'movie', 'raazi', 'real', 'spy',
        'story', 'thriller', 'upon'], dtype='<U9')]

In [123]:
#For test data -> call transform() (reuses the vocab learned from the train data)
#For train data -> call fit_transform()
#-> don't call fit_transform() on test data -> it overwrites the vocab
test_corpus=[
     'Indian Cricket rock!'
]


In [124]:
#cv=CountVectorizer(tokenizer=myTokenizer)

In [125]:
#NOTE(review): this re-vectorizes the training `corpus` with the already fitted
#vocabulary, which is why the matrix printed next equals the fit_transform()
#result; `test_corpus` is presumably what was meant to be passed here -- confirm
vectorized_corpus=cv.transform(corpus).toarray()

In [126]:
print(vectorized_corpus)

[[0 1 0 1 2 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 2]
 [0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0]]


In [127]:
cv.vocabulary_

{'indian': 9,
 'cricket': 3,
 'team': 26,
 'wins': 31,
 'world': 32,
 'cup': 4,
 'says': 22,
 'capt.': 1,
 'virat': 29,
 'kohli.': 10,
 'held': 8,
 'sri': 24,
 'lanka.': 11,
 'win': 30,
 'next': 15,
 'lok': 13,
 'sabha': 21,
 'elections': 5,
 'confident': 2,
 'pm': 18,
 'nobel': 16,
 'laurate': 12,
 'hearts': 7,
 'people': 17,
 'movie': 14,
 'raazi': 19,
 'exciting': 6,
 'spy': 23,
 'thriller': 27,
 'based': 0,
 'upon': 28,
 'real': 20,
 'story': 25}

In [128]:
cv.fit_transform(test_corpus).toarray()

array([[1, 1, 1]], dtype=int64)

In [130]:
cv.vocabulary_  #overwritten! ->avoid this

{'indian': 1, 'cricket': 0, 'rock': 2}

# More ways to create features

* Unigram- every word is a feature
* Bigrams
* Trigrams
* n-grams
* TF-IDF Normalisation

In [142]:
sent_1=["this is a good movie"]
sent_2=["this is not a good movie"]
sent_3=["this is not good movie"]

In [143]:
cv=CountVectorizer()

In [144]:
docs=[sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[1, 1, 1, 0, 1],
       [1, 1, 1, 1, 1]], dtype=int64)

In [145]:
#This causes confusion for the classifier
# both sentence contain 'good'
#'not good'->capture as a negation feature

In [151]:
cv=CountVectorizer(ngram_range=(2,3))

In [152]:
docs=[sent_1[0],sent_2[0]]
cv.fit_transform(docs).toarray()

array([[1, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [1, 0, 0, 1, 1, 1, 1, 1, 0, 1]], dtype=int64)

In [153]:
print(cv.vocabulary_)

{'this is': 7, 'is good': 1, 'good movie': 0, 'this is good': 8, 'is good movie': 2, 'is not': 3, 'not good': 5, 'this is not': 9, 'is not good': 4, 'not good movie': 6}


# TF-IDF Normalisation

* Avoid features that occur very often, because they contain less information
* Information decreases as the number of occurrences increases across different types of documents
* So we define another term - TF-IDF (term frequency-inverse document frequency) - which associates a weight with every term

In [163]:
sent_1="this is a good movie"
sent_2="this is not a good movie"
sent_3="this is not good movie"

corpus=[sent_1,sent_2,sent_3]

In [164]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [165]:
tfidf=TfidfVectorizer()


In [166]:
vc=tfidf.fit_transform(corpus).toarray()


In [167]:
print(vc)

[[0.5        0.5        0.5        0.         0.5       ]
 [0.42040099 0.42040099 0.42040099 0.54134281 0.42040099]
 [0.42040099 0.42040099 0.42040099 0.54134281 0.42040099]]


In [162]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'not': 3}

In [169]:
#'not' has highest wt at index 3