`pip install nltk`

In [1]:
import nltk

In [2]:
# nltk.download()

In [3]:
from nltk.corpus import brown

In [4]:
brown

<CategorizedTaggedCorpusReader in '/Users/parvbudhiraja/nltk_data/corpora/brown'>

In [5]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [6]:
data = brown.sents(categories=['adventure'])

In [7]:
print(data)

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]


In [8]:
print(type(data))

<class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>


In [9]:
len(data)

4637

In [10]:
' '.join(data[0])

'Dan Morgan told himself he would forget Ann Turner .'

In [11]:
data = brown.sents(categories=['fiction'])

In [12]:
' '.join(data[1])

'Scotty did not go back to school .'

## Bag of Words Pipeline
**Steps :**

---
1. Get the Data/Corpus
2. Tokenisation, Stopword Removal (Stopwords -> words that carry little information, e.g. a, an, the, you, his)
3. Stemming
4. Building a Vocab
5. Vectorisation
6. Classification

### Step 2 - Tokenisation and Stopword Removal

In [13]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [14]:
document = """ It was a pleasant day. The weather was cool and there were light showers.
I went to the market to buy some fruits."""

In [15]:
sentence = "Send all the 50 documents related to chapter 1,2,3 at sample@gmial.com"

In [16]:
sents = sent_tokenize(document)

In [17]:
print(sents)

[' It was a pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [18]:
len(sents)

3

In [19]:
sents[0]

' It was a pleasant day.'

In [20]:
words = word_tokenize(document)

In [21]:
print(len(words))

26


In [22]:
words

['It',
 'was',
 'a',
 'pleasant',
 'day',
 '.',
 'The',
 'weather',
 'was',
 'cool',
 'and',
 'there',
 'were',
 'light',
 'showers',
 '.',
 'I',
 'went',
 'to',
 'the',
 'market',
 'to',
 'buy',
 'some',
 'fruits',
 '.']

In [23]:
word_tokenize(sentence)

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapter',
 '1,2,3',
 'at',
 'sample',
 '@',
 'gmial.com']

In [24]:
sentence.split()

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapter',
 '1,2,3',
 'at',
 'sample@gmial.com']

In [25]:
word_tokenize?

> **custom tokenisation using split in python using custom separators**

### Stopwords removal

In [26]:
from nltk.corpus import stopwords

In [27]:
sw = set(stopwords.words('english'))

In [28]:
print(sw)

{'or', 'from', 'been', 'of', 'myself', "haven't", 'very', 'did', "shouldn't", 'having', 'her', 'as', "should've", 'what', 'the', 'shouldn', 'how', 'we', 'when', "didn't", 'before', 'doesn', 'mustn', 'wouldn', 'again', 'itself', 'isn', 'shan', 'these', 'and', "mightn't", 'hadn', 'ours', 'will', 'each', 'those', 'once', 'don', 'into', 'down', 'didn', 'for', 'ma', 'had', 'yourself', 'them', 'most', 'they', 'are', 'i', 'over', 'his', 'she', 'll', 'my', 'after', 'few', 'this', "weren't", 'to', "it's", 'above', 'in', 'here', 'am', 'ourselves', 'hers', 'further', 'he', 'ain', 'be', 'only', 'y', "she's", 'who', "you've", 'on', 'our', "wasn't", 'now', 'do', 'such', 'out', 'against', 'while', 'by', 'under', 'have', "shan't", 'just', "mustn't", 'because', 'why', "aren't", "you're", 'there', 'same', 'between', "doesn't", "you'd", 'then', 'more', 'but', 'no', 'during', 'that', 'couldn', 'their', "won't", 'nor', 'so', 'm', 'other', "hasn't", 'too', 'me', 'herself', 'has', 'any', 'whom', 've', 'up', 

In [29]:
def remove_stopwords(text,sw):
    """Filter stopwords out of a sequence of tokens.

    Args:
        text: iterable of tokens (e.g. the list produced by ``str.split``).
        sw: container of stopwords; membership is tested with ``in``.

    Returns:
        A new list holding, in their original order, the tokens of ``text``
        that are not members of ``sw``.
    """
    kept_tokens = []
    for token in text:
        if token in sw:
            # Stopword: drop it from the result.
            continue
        kept_tokens.append(token)
    return kept_tokens

In [30]:
text = "i am not bothered about her very much"
useful_text = remove_stopwords(text.split(),sw)

In [31]:
print(useful_text)

['bothered', 'much']


In [32]:
'not' in sw

True

Removing `not` changed the sentiment of the sentence. We may therefore need to filter the stopword list itself (e.g. keep negations such as `not`) before using it.

### Tokenisation using a Regular Expression
- we can also customise our tokenising using regex
- To learn check this [site](https://www.regexpal.com/)

In [33]:
from nltk.tokenize import RegexpTokenizer

In [34]:
# Keep runs of ASCII letters, '@' and '.' as tokens; digits and commas are dropped.
# The upper-case range must be `A-Z` (not `A-z`): `A-z` would also match the
# ASCII punctuation that sits between 'Z' and 'a' ([ \ ] ^ _ `).
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text = tokenizer.tokenize(sentence)

In [35]:
useful_text

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapter',
 'at',
 'sample@gmial.com']

### Step 3 - Stemming
- Process that transforms particular words into their radical form
- Preserve the semantics of the sentence without increasing the number of unique tokens
- Example -> jumps,jumping,jumped,jump => jump

In [36]:
text = """Foxes love to make jumps.The quick brown fox was seen jumping over the 
lovely dog from a 6ft high wall"""

Snowball stemmer, Porter, Lancaster Stemmer

In [37]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer

In [38]:
from nltk.stem.lancaster import LancasterStemmer

In [39]:
ps = PorterStemmer()

In [40]:
ps.stem('jumping')

'jump'

In [41]:
ps.stem('jumps')

'jump'

In [42]:
ps.stem('lovely')

'love'

In [43]:
ps.stem('loving')

'love'

In [44]:
# Snowball Stemmer
ss = SnowballStemmer('english')

In [45]:
ss.stem('lovely')

'love'

In [46]:
ss.stem('jumping')

'jump'

In [47]:
ss.stem('jumpful')

'jump'

In [48]:
## Lemmatization
from nltk.stem.wordnet import WordNetLemmatizer

In [49]:
wn = WordNetLemmatizer()

In [50]:
wn.lemmatize('jumps')

'jump'

In [51]:
wn.lemmatize('jumping')

'jumping'

## Building Vocab and Vectorization
---

In [52]:
corpus = [
    "Indian cricket team will win World Cup, says Capt. Virat Kohli",
    "We will win next Lok Sabha Elections, says confident Indian PM",
    "The nobel laurate won the hearts of the people",
    "The movie Raazi is an exciting Indian Spy movie based upon a real story"
]

In [53]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
cv = CountVectorizer()

In [55]:
vectorized_corpus = cv.fit_transform(corpus)

In [56]:
vectorized_corpus = vectorized_corpus.toarray()

In [57]:
vectorized_corpus[0]

array([0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1])

In [58]:
vocab = cv.vocabulary_

In [59]:
len(vectorized_corpus[0])

35

In [60]:
len(vocab.keys())

35

In [61]:
# Reverse Mapping
numbers = vectorized_corpus[2]

In [62]:
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0])

In [63]:
# to get back all the unique words
# inverse_transform expects a 2-D (n_samples, n_features) input, so present the
# single document vector as a one-row matrix instead of a 1-D array.
cv.inverse_transform(numbers.reshape(1, -1))

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
       dtype='<U9')]

## Vectorization with stopword removal

In [64]:
def myTokenizer(document):
    """Tokenize a document for ``CountVectorizer`` with stopwords removed.

    The document is lower-cased, split with the notebook-level regex
    ``tokenizer``, and then filtered through ``remove_stopwords`` using the
    notebook-level stopword set ``sw``.

    Args:
        document: the raw text of one document (a string).

    Returns:
        The list of remaining tokens, as returned by ``remove_stopwords``.
    """
    lowered = document.lower()
    return remove_stopwords(tokenizer.tokenize(lowered), sw)

In [65]:
cv = CountVectorizer(tokenizer=myTokenizer)

In [66]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [67]:
len(vectorized_corpus[0])

28

In [68]:
print(vectorized_corpus)

[[0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 1]
 [0 0 1 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 0 1 0 0 0 2 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0]]


> When we have training and test data, the test data is vectorized with the same vocabulary that was learned from the training data.

In [69]:
test_corpus = [
    'Indian team rocks!'
]

In [70]:
cv.transform(test_corpus).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0]])

- for test data `transform()` method
- for train data `fit_transform()` method

## More ways to Create Features
- Unigrams - every word is a feature
- Bigrams - two words can be combined to form a single feature
- Trigrams
- n-grams
- TF-IDF Normalisation

In [73]:
sent1 = ['this is good movie']
sent2 = ['this is not good movie']

In [74]:
doc = [sent1[0],sent2[0]]

In [76]:
cv = CountVectorizer()

In [77]:
cv.fit_transform(doc).toarray()

array([[1, 1, 1, 0, 1],
       [1, 1, 1, 1, 1]])

> **The above vectorization might be confusing for the classifier, because we need to capture negations discretely**

In [79]:
cv = CountVectorizer(ngram_range=(2,2))

In [81]:
cv.fit_transform(doc)

<2x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [82]:
cv.vocabulary_

{'this is': 4, 'is good': 1, 'good movie': 0, 'is not': 2, 'not good': 3}

In [83]:
cv = CountVectorizer(ngram_range=(3,3))
cv.fit_transform(doc)
cv.vocabulary_

{'this is good': 3,
 'is good movie': 0,
 'this is not': 4,
 'is not good': 1,
 'not good movie': 2}

#### To combine unigrams and bigrams we pass `ngram_range=(1,2)`

In [84]:
cv = CountVectorizer(ngram_range=(1,2))

In [85]:
cv.fit_transform(doc)

<2x10 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [86]:
cv.vocabulary_

{'this': 8,
 'is': 2,
 'good': 0,
 'movie': 5,
 'this is': 9,
 'is good': 3,
 'good movie': 1,
 'not': 6,
 'is not': 4,
 'not good': 7}

## TF-IDF Normalization 
---
**(Term Frequency - Inverse Document Frequency)**
- avoid features that occur very often, because they contain less information
- Information decreases as the number of occurrences increases across different types of documents
- So we define another term - inverse document frequency - which associates a weight with each feature; the weight is lower for features that appear in more documents

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [97]:
tfidf = TfidfVectorizer()

In [98]:
vc = tfidf.fit_transform(doc).toarray()

In [99]:
# TF-IDF features
print(vc)

[[0.5        0.5        0.5        0.         0.5       ]
 [0.4090901  0.4090901  0.4090901  0.57496187 0.4090901 ]]


In [100]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'not': 3}