# Bag of words pipeline

## Tokenization

In [1]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
sentence = "Pritish is a peaceful soul"

In [4]:
wo = word_tokenize(sentence)
print(wo)

['Pritish', 'is', 'a', 'peaceful', 'soul']


In [5]:
document = """It was a very pleasant day. The weather was cool and windy. I went to market to buy grocery."""

In [6]:
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['It was a very pleasant day.', 'The weather was cool and windy.', 'I went to market to buy grocery.']
3


In [7]:
sents[0]

'It was a very pleasant day.'

In [8]:
sentence.split()

['Pritish', 'is', 'a', 'peaceful', 'soul']

## Stopwords

In [9]:
from nltk.corpus import stopwords

In [10]:
sw = set(stopwords.words('english'))

In [11]:
print(sw)

{"she's", 'to', 'doing', 'only', 'her', 'any', 'here', 'not', 'yours', "doesn't", 'mightn', 'its', 'can', 'after', 'where', "it's", 'it', 'they', 'herself', "aren't", "won't", 'ourselves', 'theirs', "you'll", 'whom', 'were', "mightn't", 'this', 'hadn', 'why', 'ain', 'couldn', 'but', 'having', 'at', 'off', "that'll", 'how', 'own', 'she', 'as', 'needn', "you'd", "couldn't", 'during', "wouldn't", 'over', 'once', 'from', "hadn't", "shan't", 'up', 'shouldn', 'that', 'my', 'in', 'am', 'such', 'will', "don't", "weren't", 'than', 'who', 'and', 'below', 'ma', 're', 'his', 'against', 'are', 'be', 'an', 'same', 'mustn', 'wouldn', 'under', "hasn't", 'on', 'just', 'yourself', 'our', 'themselves', 'all', 'too', 'o', 'there', 'hasn', 'should', 'is', 'again', "should've", 'isn', 'hers', 'if', 'nor', 'didn', 'he', 'has', 'down', 'those', 'i', 'no', 'itself', 'because', 'being', 'we', 'a', 'each', 'doesn', 'between', 'through', 'll', 'yourselves', 'their', 'for', 'both', 'haven', 'the', 'or', 'what', 'o

In [12]:
# sw holds the common English words (stopwords) that we can safely skip

In [13]:
def removeStopwords(text, stopwords):
    """Return the tokens of ``text`` that do not appear in ``stopwords``.

    Parameters
    ----------
    text : iterable
        Tokens to filter, visited in order.
    stopwords : container
        Collection checked with ``in`` for every token.

    Returns
    -------
    list
        The surviving tokens, in their original order (duplicates kept).
    """
    kept = []
    for token in text:
        if token in stopwords:
            # Stopword: leave it out of the result.
            continue
        kept.append(token)
    return kept

In [14]:
text = "i am not bothered about her so much".split()
useful_text = removeStopwords(text, sw)
print(useful_text)

['bothered', 'much']


## Tokenization using Regular Expression

In [15]:
sentence1 = "Send all the 50 documents of chapter 1,2,3 to pritishpattnaik7@gmail.com"

In [16]:
from nltk.tokenize import RegexpTokenizer

In [17]:
# The pattern keeps only runs of ASCII letters and '@'. Digits and punctuation act as
# separators, so "50" and "1,2,3" are dropped and the e-mail address is split into pieces.
tokenizer = RegexpTokenizer('[a-zA-Z@]+')
useful_texts = tokenizer.tokenize(sentence1)
print(useful_texts)

['Send', 'all', 'the', 'documents', 'of', 'chapter', 'to', 'pritishpattnaik', '@gmail', 'com']


## Stemming

In [18]:
txt = """Foxes loves to make jumps. A quick brown fox was seen jumping over the lovely dog from a 6ft high wall."""

In [19]:
# Types of stemmers: Snowball, Porter, Lancaster

In [20]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [21]:
ps = PorterStemmer()   #object creation

In [22]:
ps.stem('jumping')

'jump'

In [23]:
ps.stem('jumps')

'jump'

In [24]:
ps.stem('lovely')

'love'

In [25]:
ps.stem('loving')

'love'

In [26]:
ss = SnowballStemmer('english')

In [27]:
ss.stem('lovely')

'love'

In [28]:
ss.stem('playful')

'play'

In [29]:
#Lemmatization

In [30]:
from nltk.stem import WordNetLemmatizer

In [31]:
wn = WordNetLemmatizer()
wn.lemmatize('playful')

'playful'

## Building vocab and Vectorization

In [50]:
corpus = [
    'Indian cricket team will win the world cup says capt. Virat Kohili. World cup will be held at Srilanka.',
    'We will win the next lok sabha eelections, says confidient pm Narendra Modi.',
    'The nobel laurate won the hearts of people.',
    'The movie Raazi is an excelent spy movie.'
]

In [51]:
from sklearn.feature_extraction.text import CountVectorizer

In [52]:
cv = CountVectorizer()

In [53]:
vectorized_corpus = cv.fit_transform(corpus)

In [54]:
vectorized_corpus = vectorized_corpus.toarray()

In [62]:
vectorized_corpus[0]

array([0, 1, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 2, 1, 0, 2], dtype=int64)

In [56]:
cv.vocabulary_

{'indian': 11,
 'cricket': 5,
 'team': 29,
 'will': 33,
 'win': 34,
 'the': 30,
 'world': 36,
 'cup': 6,
 'says': 26,
 'capt': 3,
 'virat': 31,
 'kohili': 13,
 'be': 2,
 'held': 10,
 'at': 1,
 'srilanka': 28,
 'we': 32,
 'next': 19,
 'lok': 15,
 'sabha': 25,
 'eelections': 7,
 'confidient': 4,
 'pm': 23,
 'narendra': 18,
 'modi': 16,
 'nobel': 20,
 'laurate': 14,
 'won': 35,
 'hearts': 9,
 'of': 21,
 'people': 22,
 'movie': 17,
 'raazi': 24,
 'is': 12,
 'an': 0,
 'excelent': 8,
 'spy': 27}

In [57]:
# Vocabulary size = number of distinct tokens learned by the vectorizer.
print(len(cv.vocabulary_))

37


In [58]:
# Reverse mapping: go from a count vector back to the words it contains (see cv.inverse_transform)

In [68]:
numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0], dtype=int64)

## Vectorization with Stopword removal 

In [72]:
def myTokenizer(doc):
    """Split ``doc`` into lowercase tokens and drop the stopwords.

    Uses the notebook-level ``tokenizer``, ``removeStopwords`` and ``sw``.
    Returns the list of remaining tokens in their original order.
    """
    # Lowercase first so that "Indian" and "indian" map to the same token.
    lowered = doc.lower()
    tokens = tokenizer.tokenize(lowered)
    # Filter out every token that is present in the stopword collection.
    return removeStopwords(tokens, sw)

In [74]:
myTokenizer(sentence)

['pritish', 'peaceful', 'soul']

In [77]:
# A custom tokenizer replaces the default token_pattern entirely. Passing
# token_pattern=None states that explicitly and avoids scikit-learn's
# "'token_pattern' will not be used" UserWarning when the vectorizer is fitted.
cv1 = CountVectorizer(tokenizer=myTokenizer, token_pattern=None)

In [78]:
vectorized_corpus1 = cv1.fit_transform(corpus).toarray()

In [80]:
print(vectorized_corpus1)

[[1 0 1 2 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 1 1 2]
 [0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1 1 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 1 0 0 1 0 0 0 0 0]]


In [84]:
len(vectorized_corpus1[0])

28

In [85]:
# The vector length drops from 37 to 28 here: removing stopwords shrinks the vocabulary, so each document vector is more compact.