In [1]:
import nltk

In [2]:
# nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# Corpus - A large collection of text
from nltk.corpus import brown

In [4]:
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [5]:
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [6]:
data = brown.sents(categories = 'adventure')
data

[['Dan', 'Morgan', 'told', 'himself', 'he', 'would', 'forget', 'Ann', 'Turner', '.'], ['He', 'was', 'well', 'rid', 'of', 'her', '.'], ...]

In [7]:
len(data)

4637

In [8]:
' '.join(data[0])

'Dan Morgan told himself he would forget Ann Turner .'

# Bag of Words Pipeline
- Get the Data/Corpus
- Tokenisation, Stopword Removal
- Stemming
- Building a Vocab
- Vectorization
- Classification

### Tokenisation, Stopword Removal

In [9]:
document = """It was a very pleasant day. The weather was cool and there were light showers.
I went to the market to buy some fruits."""

sentence = "Send all the 50 documents related to chapters 1,2,3 at prateek@cb.com"

In [10]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [11]:
sents = sent_tokenize(document)
print(sents)
print(len(sents))

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']
3


In [12]:
words = word_tokenize(sentence)

In [13]:
print(words)
print(len(words))

['Send', 'all', 'the', '50', 'documents', 'related', 'to', 'chapters', '1,2,3', 'at', 'prateek', '@', 'cb.com']
13


### Stopwords

In [14]:
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

In [15]:
print(sw)

{'should', 'shan', "weren't", 'no', 'mightn', 'this', 'over', 'how', 'won', 'any', 'before', 'had', "won't", 'against', 'into', 'off', 'after', 'both', 'has', 'them', 'your', 'but', 'with', "mustn't", 'not', "doesn't", 'nor', 'do', 'ours', 'which', 'll', "you're", 'himself', 're', "needn't", "wouldn't", 'herself', 'a', 'will', "you'll", 'during', 'myself', 'below', 'on', 'for', 'some', 'theirs', 'y', 'at', 'ma', 'we', 'in', 'out', 'these', 'it', 'themselves', 'further', 'd', 's', 'very', 'above', 'that', 'are', 'ourselves', 'm', 'he', 'to', 'having', 'between', "she's", 'hers', 'have', 't', 'same', "couldn't", 'again', 'and', 'their', "don't", 'where', 'me', 'than', 'isn', 'needn', 'down', "didn't", 'him', 'what', "hasn't", 'from', 'other', 'you', 'doesn', 'up', 'ain', 'yourselves', 'weren', 'most', 'why', "aren't", 'about', "shouldn't", "you've", 'they', 'own', 'when', 'those', 'her', 'while', 've', 'because', 'under', 'now', 'don', 'such', 'wouldn', 'few', 'too', 'was', "mightn't", '

In [16]:
def remove_stopwords(text,stopwords):
    """Filter stopwords out of a sequence of tokens.

    Args:
        text: Iterable of tokens (e.g. the result of ``str.split()``).
        stopwords: Container of tokens to drop; anything supporting ``in``.

    Returns:
        A new list with the tokens of ``text`` that are not in ``stopwords``,
        in their original order.

    Note:
        Membership is tested on the token exactly as given, so the match is
        case-sensitive ('I' is kept when only 'i' is in ``stopwords``).
    """
    kept = []
    for token in text:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

In [17]:
text = "I am not bothered about her very much".split()
useful_text = remove_stopwords(text,sw)
print(useful_text)

['I', 'bothered', 'much']


In [18]:
'I' in sw

False

### Tokenization using Regular Expression

In [19]:
from nltk.tokenize import RegexpTokenizer

In [20]:
# Each token is a run of ASCII letters, '@' or '.', so an e-mail address such as
# 'prateek@cb.com' stays in one piece. Digits and commas are not in the character
# class, which is why '50' and '1,2,3' disappear from the result.
tokenizer = RegexpTokenizer('[a-zA-Z@.]+')
useful_text_ = tokenizer.tokenize(sentence)

In [21]:
useful_text_

['Send',
 'all',
 'the',
 'documents',
 'related',
 'to',
 'chapters',
 'at',
 'prateek@cb.com']

### Stemming
- Process that reduces inflected words (verb forms, plurals) to their root (stem) form
- Preserve the semantics of the sentence without increasing the number of unique tokens
- Example : jumps, jumping, jumped, jump ==> jump

In [22]:
t = """Foxes love to make jumps.The quick brown fox was seen jumping over the
        lovely dog from a 6ft high wall"""

In [23]:
from nltk.stem.snowball import SnowballStemmer,PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [24]:
ps = PorterStemmer()

In [25]:
ps.stem('jumping')

'jump'

In [26]:
ps.stem('lovely')

'love'

In [27]:
# Snowball Stemmer
ss = SnowballStemmer('english')

In [28]:
ss.stem('lovely')

'love'

In [29]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

In [30]:
# lemmatize() treats the word as a noun unless a part of speech is supplied
# (pos defaults to 'n'), so 'jumping' is returned unchanged here.
# Passing pos='v' would lemmatize it as a verb instead.
wn = WordNetLemmatizer()
wn.lemmatize('jumping')

'jumping'

### Building a Vocab & Vectorization

In [31]:
# Sample Corpus - Contains 4 documents; each document can have 1 or more sentences
corpus = [
    'Indian cricket team will win World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
    'We will win the next Lok Sabha elections, says confident Indian PM.',
    'The nobel laurate won the hearts of the people.',
    'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

In [33]:
cv = CountVectorizer()

In [34]:
vectorized_corpus = cv.fit_transform(corpus)

In [35]:
vectorized_corpus = vectorized_corpus.toarray()

In [36]:
vectorized_corpus[0]

array([0, 1, 0, 1, 1, 0, 1, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 2, 1, 0, 2],
      dtype=int64)

In [37]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'win': 38, 'world': 40, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'the': 32, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'nobel': 20, 'laurate': 16, 'won': 39, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [38]:
print(len(vectorized_corpus[0]))
print(len(cv.vocabulary_.keys()))

41
41


In [39]:
# Reverse Mapping
numbers = vectorized_corpus[2]
numbers

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 1, 0],
      dtype=int64)

In [41]:
# inverse_transform works on a 2-D (n_documents, n_features) matrix, so the single
# count vector is reshaped into one row before mapping counts back to vocabulary terms.
cv.inverse_transform(numbers.reshape((1,-1)))

[array(['hearts', 'laurate', 'nobel', 'of', 'people', 'the', 'won'],
       dtype='<U9')]

In [None]:
def myTokenizer(document):
    """Turn a raw document string into a list of lower-cased, stopword-free tokens.

    The document is lower-cased, split with the notebook-level ``tokenizer``
    and then filtered through ``remove_stopwords`` using the notebook-level
    stopword set ``sw``.

    Args:
        document: The text to tokenize.

    Returns:
        Whatever ``remove_stopwords`` returns for the tokens of ``document``.

    Note:
        ``tokenizer``, ``remove_stopwords`` and ``sw`` must already be defined
        by earlier cells. If ``tokenizer`` is still the
        ``RegexpTokenizer('[a-zA-Z@.]+')`` created earlier, '.' is part of a
        token, so a word directly followed by a period keeps it.
    """
    lowered = document.lower()
    tokens = tokenizer.tokenize(lowered)
    # Remove Stopwords
    return remove_stopwords(tokens, sw)

In [None]:
#myTokenizer(sentence)
#print(sentence)

In [None]:
# Use the custom tokenizer in place of CountVectorizer's built-in tokenization.
# NOTE: this rebinds `cv`, replacing the vectorizer fitted earlier in the notebook.
cv = CountVectorizer(tokenizer=myTokenizer)

In [None]:
vectorized_corpus = cv.fit_transform(corpus).toarray()

In [None]:
print(vectorized_corpus)

In [None]:
print(len(vectorized_corpus[0]))

### More Ways to Create Features
- Unigram - every word as a feature
- Bigrams
- Trigrams
- n-grams
- TF-IDF Normalization

In [42]:
sent_1 = ["this is a good movie"]
sent_2 = ["this is not a good movie"]

In [44]:
# ngram_range=(2,2): only bigrams (pairs of adjacent tokens) become features.
# The default token pattern skips one-letter words, so 'a' is ignored and
# 'is good' / 'not good' show up as adjacent pairs in the vocabulary.
cv_ = CountVectorizer(ngram_range=(2,2))

In [47]:
docs = [sent_1[0],sent_2[0]]
cv_.fit_transform(docs).toarray()

array([[1, 1, 0, 0, 1],
       [1, 0, 1, 1, 1]], dtype=int64)

In [48]:
cv_.vocabulary_

{'this is': 4, 'is good': 1, 'good movie': 0, 'is not': 2, 'not good': 3}

In [49]:
# ngram_range=(1,3): unigrams, bigrams and trigrams are all used as features.
cv1 = CountVectorizer(ngram_range=(1,3))

In [50]:
docs = [sent_1[0],sent_2[0]]
cv1.fit_transform(docs).toarray()

array([[1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0],
       [1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1]], dtype=int64)

In [51]:
cv1.vocabulary_

{'this': 11,
 'is': 2,
 'good': 0,
 'movie': 7,
 'this is': 12,
 'is good': 3,
 'good movie': 1,
 'this is good': 13,
 'is good movie': 4,
 'not': 8,
 'is not': 5,
 'not good': 9,
 'this is not': 14,
 'is not good': 6,
 'not good movie': 10}

### Tf-idf Normalization
- Down-weight features that occur very often, because they carry less information
- The information a term carries decreases as the number of documents it occurs in increases
- So we define another quantity - inverse document frequency (IDF) - which assigns a weight to every term; the TF-IDF score is the term frequency multiplied by this weight

In [52]:
sent_1  = "this is good movie"
sent_2 = "this was good movie"
sent_3 = "this is not good movie"

corpus = [sent_1,sent_2,sent_3]

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [54]:
tfidf = TfidfVectorizer()

In [56]:
vc = tfidf.fit_transform(corpus).toarray()

In [57]:
print(vc)

[[0.46333427 0.59662724 0.46333427 0.         0.46333427 0.        ]
 [0.41285857 0.         0.41285857 0.         0.41285857 0.69903033]
 [0.3645444  0.46941728 0.3645444  0.61722732 0.3645444  0.        ]]


In [58]:
tfidf.vocabulary_

{'this': 4, 'is': 1, 'good': 0, 'movie': 2, 'was': 5, 'not': 3}