In [5]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
document = """It was a very pleasant day. The weather was cool and there were light showers. I went to the market to buy some fruits"""

In [9]:
sents = sent_tokenize(document)
print(sents)

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits']


In [8]:
# Download the 'punkt' tokenizer data package.
# NOTE(review): the execution counts show this cell ran (In [8]) before the
# sent_tokenize cell listed above it (In [9]); presumably sent_tokenize needs
# this data, so on a fresh kernel run this cell first - confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [10]:
print(len(sents))

3


In [11]:
sents[0]

'It was a very pleasant day.'

In [13]:
sents[0].split()

['It', 'was', 'a', 'very', 'pleasant', 'day.']

## Stopwords

In [6]:
from nltk.corpus import stopwords

sw = set(stopwords.words('english'))

In [24]:
print(sw)

{"isn't", 'very', "shouldn't", 'o', 'mightn', 'hers', 'm', 'below', 'above', 'again', 'once', 'just', 'weren', 'won', 've', 'into', 'should', 'that', 's', 'yourself', 'or', 'most', 'having', 'theirs', 'down', "hasn't", 'yours', 'all', 'couldn', 'it', 'during', 'few', 'some', 'our', 'what', 'but', 'him', 'isn', "mightn't", 'wouldn', "don't", 'has', 'shouldn', 'same', 'being', 'how', "wasn't", 'her', 'through', 'am', 'needn', 'my', 'didn', 'now', 'up', 'doesn', 'by', 'in', "doesn't", "wouldn't", "she's", 'do', 'only', 'who', 'are', 't', 'with', 'll', 'they', 'before', "mustn't", 'herself', 'each', 'other', 'those', 'more', 'she', 'be', 'he', 'yourselves', 'and', 'after', 'under', 'nor', 'been', 'hadn', 'itself', "hadn't", 'i', 'of', 'because', 'while', 'd', 'me', 'can', 'myself', 'were', 'had', 'doing', 'if', 'to', "needn't", 'wasn', 'too', 'ours', 'don', 'on', 'your', 'for', 'there', "haven't", 'why', 'against', 'does', 'where', 'was', 'both', 'aren', 'these', 'mustn', 'until', 'when', 

In [25]:
def remove_stopwords(text, stopwords):
    """Return the words of ``text`` that are not stopwords, in original order.

    Args:
        text: An iterable of word tokens, or a single string. A string is
            split on whitespace first; iterating it directly would compare
            individual characters against the stopword collection.
        stopwords: A collection of words to drop (anything supporting ``in``).
            Matching is exact and case-sensitive.

    Returns:
        A new list with the tokens of ``text`` that are not in ``stopwords``.
    """
    # Guard against the easy mistake of passing a raw sentence: without this,
    # the comprehension below would walk the string character by character.
    if isinstance(text, str):
        text = text.split()
    return [word for word in text if word not in stopwords]

In [32]:
# Split the sentence into a list of words so the function filters word by word rather than character by character
text = "i am not bothered about her very much".split()
useful_text = remove_stopwords(text, sw)

In [33]:
print(useful_text)

['bothered', 'much']


## Tokenization using REGEXP
 - regexpal.com

In [38]:
sentence = "Send all the chapter 1,2,3 to hello@hotmail.com"

In [39]:
from nltk.tokenize import RegexpTokenizer

In [61]:
# Keep only e-mail-like tokens: one or more letters/digits/'.'/'_', then '@',
# then one or more letters, a literal '.', and a 2-4 letter suffix.
tokenizer = RegexpTokenizer('[a-zA-Z0-9._]+@[a-zA-Z]+[.][a-zA-Z]{2,4}')
# '+' means "one or more" of the preceding character class, so whole runs of characters are matched
useful_text = tokenizer.tokenize(sentence)

In [62]:
print(useful_text)

['hello@hotmail.com']


## Stemming
 - Snowball (multilingual), Porter, Lancaster
 - e.g. jump, jumping, jumps, jumped ==> jump

In [45]:
text = """Foxes love to make jumps. The quick brown fox was seen jumping over the lazy dog from a 6 ft high wall"""

In [34]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [35]:
ps = PorterStemmer()

In [36]:
ps.stem('jumping')

'jump'

In [37]:
ps.stem('quickly')

'quickli'

In [51]:
ps.stem('loving')

'love'

In [58]:
## Lemmatization
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
wn.lemmatize('jumpss')

'jump'

## Building a Vocab and Vectorization

In [10]:
# Sample Corpus - Contains 4 Documents, each document can have 1 or more sentences
corpus = [
        'Indian cricket team will wins World Cup, says Capt. Virat Kohli. World cup will be held at Sri Lanka.',
        'We will win next Lok Sabha Elections, says confident Indian PM',
        'The nobel laurate won the hearts of the people.',
        'The movie Raazi is an exciting Indian Spy thriller based upon a real story.'
]

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
cv = CountVectorizer()

In [3]:
CountVectorizer?

[1;31mInit signature:[0m
[0mCountVectorizer[0m[1;33m([0m[1;33m
[0m    [0minput[0m[1;33m=[0m[1;34m'content'[0m[1;33m,[0m[1;33m
[0m    [0mencoding[0m[1;33m=[0m[1;34m'utf-8'[0m[1;33m,[0m[1;33m
[0m    [0mdecode_error[0m[1;33m=[0m[1;34m'strict'[0m[1;33m,[0m[1;33m
[0m    [0mstrip_accents[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlowercase[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mpreprocessor[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtokenizer[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mstop_words[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mtoken_pattern[0m[1;33m=[0m[1;34m'(?u)\\b\\w\\w+\\b'[0m[1;33m,[0m[1;33m
[0m    [0mngram_range[0m[1;33m=[0m[1;33m([0m[1;36m1[0m[1;33m,[0m [1;36m1[0m[1;33m)[0m[1;33m,[0m[1;33m
[0m    [0manalyzer[0m[1;33m=[0m[1;34m'word'[0m[1;33m,[0m[1;33m
[0m    [0mmax_df[0m[1;33m=[0m[1;36m1.0[0m[1;

In [11]:
vectorized_corpus = cv.fit_transform(corpus)

In [17]:
vectorized_corpus[0]

<1x42 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [19]:
 vectorized_corpus = vectorized_corpus.toarray()

In [20]:
print(vectorized_corpus)

[[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1
  0 2 0 1 0 2]
 [0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
  1 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 3 0 0 0
  0 0 0 0 1 0]
 [1 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 0
  0 0 0 0 0 0]]


In [22]:
print(cv.vocabulary_)

{'indian': 12, 'cricket': 6, 'team': 31, 'will': 37, 'wins': 39, 'world': 41, 'cup': 7, 'says': 27, 'capt': 4, 'virat': 35, 'kohli': 14, 'be': 3, 'held': 11, 'at': 1, 'sri': 29, 'lanka': 15, 'we': 36, 'win': 38, 'next': 19, 'lok': 17, 'sabha': 26, 'elections': 8, 'confident': 5, 'pm': 23, 'the': 32, 'nobel': 20, 'laurate': 16, 'won': 40, 'hearts': 10, 'of': 21, 'people': 22, 'movie': 18, 'raazi': 24, 'is': 13, 'an': 0, 'exciting': 9, 'spy': 28, 'thriller': 33, 'based': 2, 'upon': 34, 'real': 25, 'story': 30}


In [24]:
len(vectorized_corpus[0])

42

In [25]:
len(cv.vocabulary_.keys())

42

In [27]:
## Reverse Mapping
numbers = vectorized_corpus[0]
print(numbers)

[0 1 0 1 1 0 1 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0
 2 0 1 0 2]


In [29]:
## Word order is lost because this is a bag-of-words representation
# inverse_transform expects a 2-D (n_samples, n_features) input, so reshape the
# single document vector into one row before mapping it back to its terms.
cv.inverse_transform(numbers.reshape(1, -1))

[array(['at', 'be', 'capt', 'cricket', 'cup', 'held', 'indian', 'kohli',
        'lanka', 'says', 'sri', 'team', 'virat', 'will', 'wins', 'world'],
       dtype='<U9')]

## Vectorization with Stopword Removal

In [30]:
# NOTE(review): `myTokenizer` is not defined at this point, so this cell raises
# the NameError recorded below. Define the tokenizer callable in an earlier
# cell before constructing the vectorizer.
cv = CountVectorizer(tokenizer= myTokenizer)

NameError: name 'myTokenizer' is not defined

In [31]:
from nltk import word_tokenize
sent = "Hey! Welcome to Coding Blocks ?."
words = set(word_tokenize(sent))

In [32]:
words

{'!', '.', '?', 'Blocks', 'Coding', 'Hey', 'Welcome', 'to'}