# <center><b>Basic Feature Extraction Methods</b></center>

In [1]:
# Toy corpus: three short sentences used by the vectorizer examples below.
sample_documents = [
    "This is the NLP notebook",
    "This is basic NLP. NLP is easy",
    "NLP is awesome",
]

## BOW (Bag of Words)

In [2]:
# Bag-of-words: one column per distinct word, each cell holds that word's count.
from sklearn.feature_extraction.text import CountVectorizer

# lowercase=True folds case; ngram_range=(1,1) keeps single words (unigrams) only
bow_vec = CountVectorizer(lowercase=True, ngram_range=(1,1), analyzer='word')
# learn the vocabulary from the sample documents
bow_vec.fit(sample_documents)
# encode every document as a row of term counts (sparse matrix)
sample_bow_metrix = bow_vec.transform(sample_documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. .tolist() converts the returned array back to a plain list of str
# so the printed output keeps its original list form.
print("Unique words -->", bow_vec.get_feature_names_out().tolist())
print("BOW Matrix -->",sample_bow_metrix.toarray())
print("vocab to index dict -->", bow_vec.vocabulary_)

Unique words --> ['awesome', 'basic', 'easy', 'is', 'nlp', 'notebook', 'the', 'this']
BOW Matrix --> [[0 0 0 1 1 1 1 1]
 [0 1 1 2 2 0 0 1]
 [1 0 0 1 1 0 0 0]]
vocab to index dict --> {'this': 7, 'is': 3, 'the': 6, 'nlp': 4, 'notebook': 5, 'basic': 1, 'easy': 2, 'awesome': 0}


## Bi-gram BOW

In [3]:
# ngram_range=(1,2) adds two-word sequences (bi-grams) on top of the single words
bow_vec = CountVectorizer(lowercase=True, ngram_range=(1,2), analyzer='word')
# learn the uni-gram + bi-gram vocabulary from the sample documents
bow_vec.fit(sample_documents)
# encode every document as a row of n-gram counts (sparse matrix)
sample_bow_metrix = bow_vec.transform(sample_documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. .tolist() converts the returned array back to a plain list of str
# so the printed output keeps its original list form.
print("Unique words -->", bow_vec.get_feature_names_out().tolist())
print("BOW Matrix -->",sample_bow_metrix.toarray())
print("vocab to index dict -->", bow_vec.vocabulary_)

Unique words --> ['awesome', 'basic', 'basic nlp', 'easy', 'is', 'is awesome', 'is basic', 'is easy', 'is the', 'nlp', 'nlp is', 'nlp nlp', 'nlp notebook', 'notebook', 'the', 'the nlp', 'this', 'this is']
BOW Matrix --> [[0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 1]
 [0 1 1 1 2 0 1 1 0 2 1 1 0 0 0 0 1 1]
 [1 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0]]
vocab to index dict --> {'this': 16, 'is': 4, 'the': 14, 'nlp': 9, 'notebook': 13, 'this is': 17, 'is the': 8, 'the nlp': 15, 'nlp notebook': 12, 'basic': 1, 'easy': 3, 'is basic': 6, 'basic nlp': 2, 'nlp nlp': 11, 'nlp is': 10, 'is easy': 7, 'awesome': 0, 'is awesome': 5}


## TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF with the vectorizer's default settings
tfidf_vec = TfidfVectorizer()
# learn the vocabulary and the IDF weights from the sample documents
tfidf_vec.fit(sample_documents)
# encode every document as a row of TF-IDF weights (sparse matrix)
sample_tfidf_metrix = tfidf_vec.transform(sample_documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. .tolist() converts the returned array back to a plain list of str
# so the printed output keeps its original list form.
print("Unique words -->", tfidf_vec.get_feature_names_out().tolist())
print("TFIDF Matrix -->", '\n',sample_tfidf_metrix.toarray())
print("vocab to index dict -->", tfidf_vec.vocabulary_)

Unique words --> ['awesome', 'basic', 'easy', 'is', 'nlp', 'notebook', 'the', 'this']
TFIDF Matrix --> 
 [[0.         0.         0.         0.32630952 0.32630952 0.55249005
  0.55249005 0.42018292]
 [0.         0.43157129 0.43157129 0.50978591 0.50978591 0.
  0.         0.32822109]
 [0.76749457 0.         0.         0.45329466 0.45329466 0.
  0.         0.        ]]
vocab to index dict --> {'this': 7, 'is': 3, 'the': 6, 'nlp': 4, 'notebook': 5, 'basic': 1, 'easy': 2, 'awesome': 0}


## Reducing the vocabulary size in BOW

In [5]:
# max_features=4 caps the vocabulary at four terms
bow_vec = CountVectorizer(lowercase=True, ngram_range=(1,1), analyzer='word', max_features=4)
# learn the (capped) vocabulary from the sample documents
bow_vec.fit(sample_documents)
# encode every document as a row of term counts over the kept terms
sample_bow_metrix = bow_vec.transform(sample_documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. .tolist() converts the returned array back to a plain list of str
# so the printed output keeps its original list form.
print("Unique words -->", bow_vec.get_feature_names_out().tolist())
print("BOW Matrix -->",sample_bow_metrix.toarray())
print("vocab to index dict -->", bow_vec.vocabulary_)

Unique words --> ['awesome', 'is', 'nlp', 'this']
BOW Matrix --> [[0 1 1 1]
 [0 2 2 1]
 [1 1 1 0]]
vocab to index dict --> {'this': 3, 'is': 1, 'nlp': 2, 'awesome': 0}


## Reducing the vocabulary size in TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# max_features=3 caps the vocabulary at three terms
tfidf_vec = TfidfVectorizer(max_features=3)
# learn the (capped) vocabulary and the IDF weights from the sample documents
tfidf_vec.fit(sample_documents)
# encode every document as a row of TF-IDF weights over the kept terms
sample_tfidf_metrix = tfidf_vec.transform(sample_documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. .tolist() converts the returned array back to a plain list of str
# so the printed output keeps its original list form.
print("Unique words -->", tfidf_vec.get_feature_names_out().tolist())
print("TFIDF Matrix -->", '\n',sample_tfidf_metrix.toarray())
print("vocab to index dict -->", tfidf_vec.vocabulary_)

Unique words --> ['is', 'nlp', 'this']
TFIDF Matrix --> 
 [[0.52284231 0.52284231 0.67325467]
 [0.64355191 0.64355191 0.41434513]
 [0.70710678 0.70710678 0.        ]]
vocab to index dict --> {'this': 2, 'is': 0, 'nlp': 1}


## Creating the BOW and TF-IDF for the same data using an iterator over the data

In [25]:
# tokenizer used to collect the vocabulary
# NOTE(review): word_tokenize relies on NLTK tokenizer data being installed
# locally (see nltk.download) - confirm it is available in your environment.
import nltk
# vertical stacking of sparse matrices
from scipy.sparse import vstack

# Pass 1: stream over the documents once and collect the vocabulary.
# (For huge data the documents would come from disk or a table instead of a list.)
vocab_set = set()
for data_point in sample_documents:
    for word in nltk.tokenize.word_tokenize(data_point):
        # keep purely alphabetic tokens only, lower-cased
        if word.isalpha():
            vocab_set.add(word.lower())

# Sort explicitly so the column order is deterministic and does not depend on
# set iteration order.
vectorizer_bow = CountVectorizer(vocabulary=sorted(vocab_set))

# Pass 2: vectorize one document at a time and keep the sparse rows.
bow_data = []
for data_point in sample_documents: # use a generator
    # with a fixed vocabulary nothing is learned from the data, so there is no
    # data leakage in calling fit_transform per document
    bow_data.append(vectorizer_bow.fit_transform([data_point]))

# stack the per-document rows into one sparse document-term matrix
final_bow = vstack(bow_data)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it. .tolist() converts the returned array back to a plain list of str
# so the printed output keeps its original list form.
print("Unique words -->", vectorizer_bow.get_feature_names_out().tolist())
print("BOW Matrix -->",final_bow.toarray())
print("vocab to index dict -->", vectorizer_bow.vocabulary_)

Unique words --> ['awesome', 'basic', 'easy', 'is', 'nlp', 'notebook', 'the', 'this']
BOW Matrix --> [[0 0 0 1 1 1 1 1]
 [0 1 1 2 2 0 0 1]
 [1 0 0 1 1 0 0 0]]
vocab to index dict --> {'awesome': 0, 'basic': 1, 'easy': 2, 'is': 3, 'nlp': 4, 'notebook': 5, 'the': 6, 'this': 7}


## TFIDF using above BOW matrix

In [28]:
# TfidfTransformer re-weights an existing count matrix rather than raw text
from sklearn.feature_extraction.text import TfidfTransformer

vec_tfidftransformer = TfidfTransformer()
# learn the IDF weights from the stacked BOW counts, then apply them to the
# same matrix (fit() returns the transformer itself, so the calls can be chained)
vec_tfidf = vec_tfidftransformer.fit(final_bow).transform(final_bow)
print(vec_tfidf.toarray())

[[0.         0.         0.         0.32630952 0.32630952 0.55249005
  0.55249005 0.42018292]
 [0.         0.43157129 0.43157129 0.50978591 0.50978591 0.
  0.         0.32822109]
 [0.76749457 0.         0.         0.45329466 0.45329466 0.
  0.         0.        ]]


## Hashing Vectorizer

In [36]:
from sklearn.feature_extraction.text import HashingVectorizer

# 5 hash buckets, no row normalisation, no alternating sign on the hashed counts
hash_vectorizer = HashingVectorizer(
    alternate_sign=False,
    norm=None,
    n_features=5,
)
# the hashing vectorizer is stateless, so transform() is called without fit()
hash_vector = hash_vectorizer.transform(sample_documents)
# show the dense form of the hashed count matrix
print("Hash vectors -->", hash_vector.toarray())

Hash vectors --> [[0. 1. 3. 1. 0.]
 [0. 1. 5. 1. 0.]
 [0. 0. 3. 0. 0.]]


## TFIDF using above Hash matrix

In [38]:
# TfidfTransformer is expected to be already imported by an earlier cell
vec_idftrans = TfidfTransformer()
# learn the IDF weights from the hashed count matrix and re-weight that same
# matrix (fit() returns the transformer itself, so the calls can be chained)
vec_tfidf2 = vec_idftrans.fit(hash_vector).transform(hash_vector)
print("tfidf using hash BOW -->", vec_tfidf2.toarray())

tfidf using hash BOW --> [[0.         0.36691832 0.85483442 0.36691832 0.        ]
 [0.         0.2419863  0.93961974 0.2419863  0.        ]
 [0.         0.         1.         0.         0.        ]]
