### CountVectorizer 

In [1]:
# Two tiny corpora: one to learn the vocabulary from, one to encode afterwards.
train_set = (
    "The sky is blue.",
    "The sun is bright",
)
test_set = (
    "The sun is shining.",
    "The sun is shining brightly.",
)

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [3]:
# fit() learns the vocabulary from the training sentences and returns the
# fitted vectorizer itself, so the term -> column-index mapping can be read
# straight off the result.
vectorizer.fit(train_set).vocabulary_

{'the': 5, 'sky': 3, 'is': 2, 'blue': 0, 'sun': 4, 'bright': 1}

In [4]:
# Work on a shallow copy so the fitted vectorizer's own mapping stays untouched.
vocab_dict = dict(vectorizer.vocabulary_)
# Re-order the entries by column index so the mapping lines up with the
# columns of the count matrices shown further down.
{term: index for term, index in sorted(vocab_dict.items(), key=lambda entry: entry[1])}

{'blue': 0, 'bright': 1, 'is': 2, 'sky': 3, 'sun': 4, 'the': 5}

In [5]:
# Show the test sentences that are vectorized in the next cell.
test_set

('The sun is shining.', 'The sun is shining brightly.')

In [6]:
# Encode the test sentences using the vocabulary learned from train_set.
test_vec = vectorizer.transform(test_set)
test_vec.toarray()  # Convert to a dense NumPy array so the counts can be displayed

array([[0, 0, 1, 0, 1, 1],
       [0, 0, 1, 0, 1, 1]])

In [7]:
# 'ball' and 'red' are not in the fitted vocabulary, so only 'is' and 'the' are counted.
vectorizer.transform(['The ball is red']).toarray()

array([[0, 0, 1, 0, 0, 1]])

In [8]:
# Map each encoded row back to the vocabulary terms it contains.
vectorizer.inverse_transform(test_vec)

[array(['is', 'sun', 'the'], dtype='<U6'),
 array(['is', 'sun', 'the'], dtype='<U6')]

### Tokenization
In natural language processing, tokenization is the process of breaking a given text down into its smallest units, called tokens. Words, numbers, and punctuation marks can all be considered tokens.

In [9]:
# Sample paragraph reused throughout the notebook (adjacent string literals
# are concatenated into one string).
text = (
    'Hi Everyone! This is Hackers Realm. '
    'We are learning Natural Language Processing. '
    'We reached 1000000 views.'
)

In [10]:
# Naive tokenization: splitting on single spaces leaves punctuation attached
# to the words (e.g. 'Everyone!', 'Realm.').
text.split(' ')

['Hi',
 'Everyone!',
 'This',
 'is',
 'Hackers',
 'Realm.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing.',
 'We',
 'reached',
 '1000000',
 'views.']

In [11]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [15]:
# Fetch the 'punkt_tab' tokenizer data (the log below shows it unzipped under
# tokenizers/); presumably what sent_tokenize/word_tokenize rely on - verify
# against your NLTK version.
import nltk  # already imported in the previous cell; repeated here
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [16]:
# Split the paragraph into individual sentences.
sent_tokens = sent_tokenize(text)
sent_tokens

['Hi Everyone!',
 'This is Hackers Realm.',
 'We are learning Natural Language Processing.',
 'We reached 1000000 views.']

In [17]:
# Split the paragraph into word-level tokens; unlike str.split, punctuation
# marks come out as separate tokens.
word_tokens = word_tokenize(text)
word_tokens

['Hi',
 'Everyone',
 '!',
 'This',
 'is',
 'Hackers',
 'Realm',
 '.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing',
 '.',
 'We',
 'reached',
 '1000000',
 'views',
 '.']

### Stemming
Stemming is the process of reducing words to their root form (stem). A stem need not be the same as the dictionary-based morphological root; it is simply a form that is equal to or shorter than the original word.

In [18]:
# PorterStemmer is the stemmer used in this section; SnowballStemmer is
# imported as well but is not used in the cells shown here.
from nltk.stem import PorterStemmer, SnowballStemmer
ps = PorterStemmer()

In [19]:
# Plain string literal (no wrapping parentheses); the trailing "s" is stripped.
word = 'eats'
ps.stem(word)

'eat'

In [20]:
# Plain string literal (no wrapping parentheses); the "-ing" suffix is stripped.
word = 'eating'
ps.stem(word)

'eat'

In [21]:
# Plain string literal (no wrapping parentheses); the irregular form 'eaten'
# is left unchanged by the stemmer.
word = 'eaten'
ps.stem(word)

'eaten'

In [22]:
# Re-create the sample paragraph and its tokens so this section can be run
# without relying on the tokenization cells above.
text = (
    'Hi Everyone! This is Hackers Realm. '
    'We are learning Natural Language Processing. '
    'We reached 1000000 views.'
)
word_tokens = word_tokenize(text)

In [25]:
# Stem every token and stitch the stems back together with single spaces.
stemmed_sentence = " ".join(map(ps.stem, word_tokens))
stemmed_sentence

'hi everyon ! thi is hacker realm . we are learn natur languag process . we reach 1000000 view .'

### Lemmatization
Lemmatization is the process of finding the dictionary form (lemma) of a word. It differs from stemming and is more computationally expensive.


In [26]:
from nltk.stem import WordNetLemmatizer
# WordNetLemmatizer looks words up in the WordNet corpus, which is a separate
# NLTK data package. Fetch it first so the lemmatize() calls below also work
# on a fresh environment instead of raising LookupError.
# quiet=True keeps this cell free of download log output.
nltk.download('wordnet', quiet=True)
lemmatizer = WordNetLemmatizer()

In [27]:
# Called without a part-of-speech argument; 'worker' comes back unchanged.
lemmatizer.lemmatize('worker')

'worker'

In [28]:
# A regular plural is reduced to its singular form.
lemmatizer.lemmatize('words')

'word'

In [29]:
# An irregular plural: 'feet' maps to 'foot'.
lemmatizer.lemmatize('feet')

'foot'

In [30]:
# Treated as a verb, 'stripes' lemmatizes to 'strip'.
lemmatizer.lemmatize('stripes', pos='v')

'strip'

In [31]:
# Treated as a noun, the same word lemmatizes to 'stripe'.
lemmatizer.lemmatize('stripes', pos='n')

'stripe'

In [32]:
# Same sample paragraph as before, restated for the lemmatization demo.
text = (
    'Hi Everyone! This is Hackers Realm. '
    'We are learning Natural Language Processing. '
    'We reached 1000000 views.'
)

In [33]:
# Tokenize the paragraph into words and punctuation marks.
word_tokens = word_tokenize(text)

In [35]:
# Lower-case each token, lemmatize it, and join the results with spaces.
# No part of speech is passed, so verb forms such as 'learning' and 'reached'
# stay as they are in the output.
lemmatized_sentence = " ".join(map(lemmatizer.lemmatize, map(str.lower, word_tokens)))
lemmatized_sentence

'hi everyone ! this is hacker realm . we are learning natural language processing . we reached 1000000 view .'

### Part of Speech Tagging (POS)
Part-of-speech tagging is the process of converting a sentence, given as a list of words, into a list of tuples, where each tuple has the form (word, tag). The tag is a part-of-speech tag that signifies whether the word is a noun, adjective, verb, and so on.

<a href='https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html' target="_blank">POS Tags</a>

In [37]:
# Fetch the English averaged-perceptron tagger data (the log below shows it
# unzipped under taggers/); presumably what pos_tag uses in the next cells -
# verify against your NLTK version.
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [38]:
from nltk import pos_tag
# Tag a single word with no surrounding context; pos_tag takes a list of tokens.
pos_tag(['fighting'])

[('fighting', 'VBG')]

In [39]:
# Same sample paragraph as before, restated for the POS-tagging demo.
text = (
    'Hi Everyone! This is Hackers Realm. '
    'We are learning Natural Language Processing. '
    'We reached 1000000 views.'
)

In [40]:
# Tokenize the paragraph; the tagger below works on this list of tokens.
word_tokens = word_tokenize(text)

In [41]:
# Tag every token; the result is a list of (token, tag) tuples.
pos_tag(word_tokens)

[('Hi', 'NNP'),
 ('Everyone', 'NN'),
 ('!', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('Hackers', 'NNP'),
 ('Realm', 'NNP'),
 ('.', '.'),
 ('We', 'PRP'),
 ('are', 'VBP'),
 ('learning', 'VBG'),
 ('Natural', 'NNP'),
 ('Language', 'NNP'),
 ('Processing', 'NNP'),
 ('.', '.'),
 ('We', 'PRP'),
 ('reached', 'VBD'),
 ('1000000', 'CD'),
 ('views', 'NNS'),
 ('.', '.')]