# Natural Language Toolkit(NLTK)

In [1]:
# Steps in NLP:
#- Data Collection
#- Tokenization, Stopwords Removal, Stemming
#- Building a common vocab
#- Vectorize the documents
#- Performing Classification/Clustering

In [2]:
# NLTK works with data in the form of text (words and sentences) with the help of corpora

In [3]:
# A corpus is a collection of written texts, especially the entire works of a particular author or a body of writing on a particular subject.
# The plural of corpus is corpora (several such collections)
# A corpus is like the training dataset in text form

In [4]:
import nltk
# nltk.download() with no arguments lets you browse all the packages available for NLTK
nltk.download('punkt')  # 'punkt' is the resource package the tokenizers used later in this notebook rely on (tokenizer data, not a text corpus)

[nltk_data] Downloading package punkt to C:\Users\lavanya
[nltk_data]     rajeswari\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
nltk.download('brown')
from nltk.corpus import brown  # the Brown corpus supplies the sample text data explored in the next cells

[nltk_data] Downloading package brown to C:\Users\lavanya
[nltk_data]     rajeswari\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


# Data Collection

In [6]:
# List the categories into which the Brown corpus is divided.
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [7]:
# brown.sents(categories=...) returns the sentences of the given category, each as a list of tokens;
# the [:100] slice keeps only the first 100 of them (it selects, it does not print).
data = brown.sents(categories='editorial')[:100]
print(type(data))
print(data)  # the printed form is abbreviated with "..." rather than listing all 100 sentences
print(len(data))

<class 'nltk.collections.LazySubsequence'>
[['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...]
100


# Tokenization and Stopword removal

In [8]:
# Sample text (two sentences) used to demonstrate tokenization.
text = "It was a very pleasant day, the weather was cool and there were showers. I went to market to buy some fruits."

In [9]:
# Tokenization means splitting text into smaller units (tokens), e.g. splitting a sentence into a list of words

In [10]:
#sent_tokenize splits the data into sentences
#word_tokenize splits the data into words
from nltk.tokenize import sent_tokenize, word_tokenize

In [11]:
# Split the text into sentences.
# NOTE(review): sentence boundaries presumably come from the 'punkt' tokenizer data downloaded earlier,
# not from a plain split on the full stop (.) — confirm against the NLTK documentation.
sents = sent_tokenize(text)
sents

['It was a very pleasant day, the weather was cool and there were showers.',
 'I went to market to buy some fruits.']

In [12]:
# Lower-case the 1st sentence, then split it into word and punctuation tokens.
word_list = word_tokenize(sents[0].lower())

In [13]:
# Show the tokens; punctuation marks (',' and '.') appear as separate tokens.
word_list

['it',
 'was',
 'a',
 'very',
 'pleasant',
 'day',
 ',',
 'the',
 'weather',
 'was',
 'cool',
 'and',
 'there',
 'were',
 'showers',
 '.']

# Stopword Removal

In [14]:
# Stopwords are very common words (e.g. "if", "of", "the") that contribute little to the meaning of a text.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to C:\Users\lavanya
[nltk_data]     rajeswari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Load the English stopword list into a set (nothing is displayed here); filter_words tests membership against it.
sw = set(stopwords.words('english'))

In [16]:
# Print the stopword set and its size (179 built-in stopwords when this notebook was run;
# NOTE(review): the count may differ between NLTK versions).
print(sw,len(sw))

{'above', 'are', "it's", 'will', "hasn't", 'each', 'than', 'it', 'until', 't', "you'll", "didn't", 'does', 'mustn', 'before', 'doing', 'hers', 'is', 'up', 'then', 'shan', 'against', 'my', 'hasn', 'should', 'your', "wasn't", 'he', 'i', 'himself', 'down', 'by', 'own', 'but', 'only', 'between', 've', 'to', 'other', "shouldn't", "wouldn't", "aren't", 'which', 'being', 'why', 'any', 'once', 'most', 'its', 'not', 'those', 'm', 'their', 'whom', 'weren', 'couldn', 'our', 'haven', 'mightn', 'very', 'such', "doesn't", 'under', 'herself', 'below', 'aren', 'at', 'no', 'her', "hadn't", 'were', 'as', 'this', 'd', 'because', 'from', 'and', 'has', 'so', 'both', 'wasn', 'yourself', 'isn', 'out', 'that', "couldn't", 'didn', 'ma', "weren't", 'more', 'there', 'further', 'in', 'an', 'into', 'wouldn', 'nor', 'ourselves', "she's", 'where', 'have', 'his', 'all', "needn't", 'y', "haven't", 'we', 'll', 'do', 'or', 'been', 'a', 'am', 'while', 'yourselves', "shan't", 'shouldn', 'be', 'of', "you've", 'theirs', 'fe

In [17]:
def filter_words(word_list, stop_words=None):
    """Return the tokens of ``word_list`` that are not stopwords.

    Args:
        word_list: Iterable of tokens (strings). Matching is exact and
            case-sensitive, so tokens should already be lower-cased when
            filtering against a lower-case stopword collection.
        stop_words: Optional collection of words to drop. When omitted
            (``None``), the notebook-level stopword set ``sw`` is used, which
            is the behaviour existing callers rely on.

    Returns:
        A new list with the remaining tokens, in their original order.
    """
    if stop_words is None:
        # Fall back to the notebook-wide stopword set defined in an earlier cell.
        stop_words = sw
    return [w for w in word_list if w not in stop_words]

In [18]:
# Apply the stopword filter to the tokens of the 1st sentence; punctuation tokens are not stopwords, so they remain.
useful_words = filter_words(word_list)
useful_words

['pleasant', 'day', ',', 'weather', 'cool', 'showers', '.']

In [19]:
#RegexpTokenizer tokenizes the text based on regular expressions
from nltk.tokenize import RegexpTokenizer

In [20]:
# The pattern matches runs of one or more ASCII letters or digits; each such run becomes one token.
tokenizer = RegexpTokenizer("[a-zA-Z0-9]+")

In [21]:
# NOTE(review): this rebinds `sents` (previously the list of sentences from sent_tokenize) to a plain string;
# re-running the earlier word_tokenize(sents[0].lower()) cell after this one would tokenize only the first character.
sents = "send the 50 documents to abc, def, ghi."
print(tokenizer.tokenize(sents))  # keeps the letter/digit tokens; the commas and the full stop are dropped

['send', 'the', '50', 'documents', 'to', 'abc', 'def', 'ghi']


# Stemming
- Stemming transforms a word (a verb or any other word) into its root form (stem)
- Ex: jumping, jumps and jump are all transformed into jump

## Types of Stemmers
- Snowball Stemmer (Multilingual)
- PorterStemmer, LancasterStemmer(English language only)

In [22]:
#importing different types of stemmer
from nltk.stem.snowball import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

In [23]:
# PorterStemmer and LancasterStemmer use different stemming algorithms, so their results can differ.
ps = PorterStemmer()

In [24]:
# Only the value of the last expression in a cell is echoed, so the stem of "jumped" is computed
# but not shown; the displayed result belongs to "jumping".
ps.stem("jumped")
ps.stem("jumping")

'jump'

In [25]:
# The Porter stemmer strips the suffix here: "lovely" -> "love".
ps.stem("lovely")

'love'

In [26]:
# Compare the two English stemmers on the same words.
print(ps.stem("awesome"))  # Porter
ls = LancasterStemmer()
print(ls.stem("awesome"))  # Lancaster: same stem as Porter for this word

print(ls.stem("teenager"))  # Lancaster cuts further: "teen"
print(ps.stem("teenager"))  # Porter: "teenag"

awesom
awesom
teen
teenag


In [27]:
# SnowballStemmer takes the language name as its argument; here a Spanish stemmer is built.
ss = SnowballStemmer('spanish')
ss.stem('buano')  # NOTE(review): 'buano' may be a typo for 'bueno' — confirm the intended word

'buan'

# Bag of Words

- The Bag of Words approach stores, for each document, how often each vocabulary word occurs, together with each word's index in the vocabulary
- The vocabulary is collected from the given corpus after stopword removal, text filtering, etc.
- Each word in the vocabulary is represented by one column
- The encoding is similar to one-hot encoding: when a word is found in a document, 1 is placed in the corresponding column of the document's word vector (or the word's count, if it occurs more than once)
- The similarity between the word vectors is found by the pairwise Euclidean distance between the rows
- For example, in a recommendation setting, the k products (rows) with the smallest distance are the recommended products

In [28]:
# Toy corpus: four short documents used for the Bag of Words and TF-IDF examples below.
corpus = [
    'Indian team will win today',
    'Lockdown expected to end by May 2020',
    'Colleges and schools are closed due to coronavirus pandemic',
    'There is nothing to talk about other than corona virus'
]

In [29]:
# Show the raw documents before they are vectorized.
print(corpus)

['Indian team will win today', 'Lockdown expected to end by May 2020', 'Colleges and schools are closed due to coronavirus pandemic', 'There is nothing to talk about other than corona virus']


In [30]:
# to convert words into numerical features
# Building a common vocabulary and vectorize the documents

In [31]:
def myTokenizer(sentence):
    """Turn one sentence into its list of useful tokens.

    The sentence is lower-cased, split into tokens by the notebook-level
    ``tokenizer``, and the result is passed through ``filter_words`` to
    remove stopwords.
    """
    lowered = sentence.lower()
    tokens = tokenizer.tokenize(lowered)
    return filter_words(tokens)

# Sanity check on the 1st document: "will" is a stopword, so 4 of its 5 words remain.
list_words = myTokenizer(corpus[0])
print(len(list_words))

4


In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
# tokenizer: user-defined callable used to split each document; myTokenizer lower-cases the text,
#   tokenizes it and removes the stopwords.
# token_pattern=None: the built-in token regex is not used once a custom tokenizer is supplied; passing
#   None states that explicitly and avoids scikit-learn's "token_pattern will not be used" warning.
# ngram_range=(1, 1): build features from single words only (try (1, 2) to also get two-word sequences).
cv = CountVectorizer(tokenizer=myTokenizer, token_pattern=None, ngram_range=(1, 1))

In [34]:
# Learn the vocabulary from the corpus and return the word vectors of the documents as a sparse matrix.
vectorized_corpus = cv.fit_transform(corpus)

In [35]:
# Convert the sparse matrix into a dense array so it can be printed and inspected.
# Note: most entries are 0, and the dense array stores every one of them explicitly, so it takes
# more memory than the sparse matrix, not less.
vc = vectorized_corpus.toarray()

In [36]:
# Each row is one document and each column one vocabulary word; vocabulary_ maps every word to its column index.
print(vc)
print(cv.vocabulary_)

[[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1]
 [1 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0]
 [0 1 1 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0]]
{'indian': 8, 'team': 15, 'win': 18, 'today': 16, 'lockdown': 9, 'expected': 7, 'end': 6, 'may': 10, '2020': 0, 'colleges': 2, 'schools': 13, 'closed': 1, 'due': 5, 'coronavirus': 4, 'pandemic': 12, 'nothing': 11, 'talk': 14, 'corona': 3, 'virus': 17}


In [37]:
# Map each row back to the vocabulary words it contains. The original word order and the removed
# stopwords are not recovered.
cv.inverse_transform(vc)

[array(['indian', 'team', 'today', 'win'], dtype='<U11'),
 array(['2020', 'end', 'expected', 'lockdown', 'may'], dtype='<U11'),
 array(['closed', 'colleges', 'coronavirus', 'due', 'pandemic', 'schools'],
       dtype='<U11'),
 array(['corona', 'nothing', 'talk', 'virus'], dtype='<U11')]

### Bag of Words drawbacks:
- It does not preserve the order in which words appear in the sentences
- It cannot tell that two different word forms have the same meaning (Ex: worldcup, worldcups), since each form gets its own column
- It takes a lot of memory (ex: if the vocabulary has 10k words and the 1st sentence contains only 3 of them, the rest of its vector is wasted on storing 0s)

# TF-IDF

- It often gives better results than the BOW approach
- But it also fails to capture context

- TF means Term Frequency, IDF means Inverse Document Frequency
- TF for a word in a document is the number of times the word occurs in that document (the count vectorizer approach)
- IDF for a word over the entire corpus is computed by the formula:
   $\mathrm{IDF}(w) = \log\left(\dfrac{N}{n_w}\right)$, where $N$ is the number of documents (sentences) we have and $n_w$ is the number of documents that contain the word $w$
- IDF measures the rareness of a word in the corpus: the fewer documents a word occurs in, the higher its IDF
   Ex: 'The world cup held in india', 'this name is awesome'
   IDF for the word 'world' in the above corpus is log(2/1)
- The final weight assigned to each word is the product of the TF and IDF of that word

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
# Reuse the custom tokenizer; token_pattern=None states explicitly that the built-in token regex is not
# used and avoids scikit-learn's "token_pattern will not be used" warning.
# ngram_range=(1, 2): build features from single words and from two-word sequences.
tfidf_vectorizer = TfidfVectorizer(tokenizer=myTokenizer, token_pattern=None, ngram_range=(1, 2))

- In BOW, Euclidean distance is calculated to describe similarity
- In TF-IDF, cosine similarity is used to calculate similarity
- Cosine similarity is the cosine of the angle between 2 vectors
- $\text{cosine similarity} = \dfrac{a \cdot b}{\lVert a \rVert \, \lVert b \rVert}$, i.e. the dot product of the 2 vectors divided by the product of their magnitudes
- If the cosine similarity is close to 1, the angle between the vectors is close to 0, which means the vectors are very similar

In [40]:
# Fit TF-IDF on the corpus and convert the result to a dense array for printing.
# NOTE(review): this rebinds `vectorized_corpus`, which earlier held the CountVectorizer result;
# re-running the earlier `vc = vectorized_corpus.toarray()` cell after this one would act on this array instead.
vectorized_corpus = tfidf_vectorizer.fit_transform(corpus).toarray()
print(vectorized_corpus)
print(tfidf_vectorizer.vocabulary_)

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.37796447 0.37796447 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.37796447 0.37796447
  0.37796447 0.         0.37796447 0.37796447]
 [0.33333333 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.33333333
  0.33333333 0.33333333 0.33333333 0.         0.         0.33333333
  0.33333333 0.33333333 0.33333333 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.30151134 0.30151134 0.30151134 0.30151134 0.
  0.         0.30151134 0.30151134 0.30151134 0.30151134 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.30151134
  0.30151134 0.30151134 0.        