## N Grams : Feature Extraction method in NLP

In [3]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

#### Create data manually

In [4]:
# Toy sentiment corpus kept as (sentence, label) pairs so each sentence sits
# next to its label; 1 marks a positive sentence, 0 a negative one.
labelled_reviews = [
    ('This Pasta is tasty', 1),
    ('This pasta is not tasty', 0),
    ('Pasta is delicious', 1),
    ('I like pasta very much', 1),
    ('Pasta is good but cost is little high', 0),
    ('Pasta taste is good and affordable', 1),
    ('Pasta taste is really nice however price is not affordable', 0),
]

# Split the pairs back into the two columns used by the rest of the notebook.
df = pd.DataFrame({
    'Text': [sentence for sentence, _label in labelled_reviews],
    'Output': [label for _sentence, label in labelled_reviews],
})

In [5]:
# Display the whole toy corpus: one row per sentence with its sentiment label.
df

Unnamed: 0,Text,Output
0,This Pasta is tasty,1
1,This pasta is not tasty,0
2,Pasta is delicious,1
3,I like pasta very much,1
4,Pasta is good but cost is little high,0
5,Pasta taste is good and affordable,1
6,Pasta taste is really nice however price is no...,0


##### 1 means a positive sentence and 0 means a negative sentence

### Uni Gram

#### Unigrams (`ngram_range=(1, 1)`) are the plain bag-of-words model: each feature is a single word

In [6]:
# Unigram-only vectorizer: both n-gram bounds are 1, so every feature is a single token.
cv1 = CountVectorizer(ngram_range=(1, 1))

#### Fit the data with CountVectorizer (BoW / unigrams)

In [7]:
# Learn the vocabulary from the 'Text' column and encode every sentence as a row
# of token counts; the result is a sparse matrix with one row per sentence.
uni_gram = cv1.fit_transform(df['Text'])

In [8]:
# Show the sparse-matrix summary (shape, dtype, number of stored non-zero counts).
uni_gram

<7x21 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

#### Vocabulary in uni gram

In [9]:
# vocabulary_ maps each learned term to its column index in the count matrix.
# Terms are lower-cased ('Pasta' -> 'pasta') and the one-letter token 'I' is absent.
print(cv1.vocabulary_)

{'this': 19, 'pasta': 14, 'is': 8, 'tasty': 18, 'not': 13, 'delicious': 4, 'like': 9, 'very': 20, 'much': 11, 'good': 5, 'but': 2, 'cost': 3, 'little': 10, 'high': 6, 'taste': 17, 'and': 1, 'affordable': 0, 'really': 16, 'nice': 12, 'however': 7, 'price': 15}


In [11]:
# Number of distinct unigrams learned, i.e. the number of feature columns.
len(cv1.vocabulary_)

21

#### We get 21 vocabulary terms (unigrams) from one corpus of 7 documents

#### Matrix to array

In [12]:
# Densify the sparse counts: rows are sentences, columns follow the vocabulary indices
# (e.g. column 8 is 'is', which occurs twice in the fifth and seventh sentences).
uni_gram.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 1, 0, 1, 1, 0, 2, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0]],
      dtype=int64)

### Uni Gram + Bi gram

In [13]:
# Vectorizer over n-grams with n from 1 to 2: single words plus pairs of consecutive words.
cv2 = CountVectorizer(ngram_range=(1, 2))

#### Here we create features (vocabulary terms) from unigrams (single words) and bigrams (pairs of 2 consecutive words)

In [14]:
# Learn the unigram + bigram vocabulary from the 'Text' column and return the
# sparse count matrix (one row per sentence, one column per term).
uni_bi_gram = cv2.fit_transform(df['Text'])

In [15]:
# Show the sparse-matrix summary (shape, dtype, number of stored non-zero counts).
uni_bi_gram

<7x46 sparse matrix of type '<class 'numpy.int64'>'
	with 71 stored elements in Compressed Sparse Row format>

#### Vocabs

In [16]:
# Term -> column-index mapping; now contains single words and two-word phrases.
print(cv2.vocabulary_)

{'this': 42, 'pasta': 31, 'is': 14, 'tasty': 41, 'this pasta': 43, 'pasta is': 32, 'is tasty': 20, 'not': 28, 'is not': 18, 'not tasty': 30, 'delicious': 7, 'is delicious': 15, 'like': 21, 'very': 44, 'much': 25, 'like pasta': 22, 'pasta very': 34, 'very much': 45, 'good': 8, 'but': 3, 'cost': 5, 'little': 23, 'high': 11, 'is good': 16, 'good but': 10, 'but cost': 4, 'cost is': 6, 'is little': 17, 'little high': 24, 'taste': 39, 'and': 1, 'affordable': 0, 'pasta taste': 33, 'taste is': 40, 'good and': 9, 'and affordable': 2, 'really': 37, 'nice': 26, 'however': 12, 'price': 35, 'is really': 19, 'really nice': 38, 'nice however': 27, 'however price': 13, 'price is': 36, 'not affordable': 29}


In [17]:
# Number of distinct unigram + bigram terms, i.e. the number of feature columns.
len(cv2.vocabulary_)

46

#### Here it creates 46 vocabulary terms: the 21 unigrams (single words) plus 25 bigrams (pairs of consecutive words)

#### Matrix to array

In [18]:
# Densify the sparse counts: rows are sentences, columns follow the vocabulary indices.
uni_bi_gram.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1],
       [0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 0],
       [1, 0, 0, 0, 0,

### Bi Gram

In [19]:
# Bigram-only vectorizer: both n-gram bounds are 2, so every feature is a pair of consecutive words.
cv3 = CountVectorizer(ngram_range=(2, 2))

##### Here it will create features (vocabulary terms) only from pairs of words (2 consecutive words)

In [20]:
# Learn the bigram vocabulary from the 'Text' column and return the sparse
# count matrix (one row per sentence, one column per bigram).
bi_gram = cv3.fit_transform(df['Text'])

In [21]:
# Show the sparse-matrix summary (shape, dtype, number of stored non-zero counts).
bi_gram

<7x25 sparse matrix of type '<class 'numpy.int64'>'
	with 33 stored elements in Compressed Sparse Row format>

#### Vocabs

In [22]:
# Term -> column-index mapping; every term here is a two-word phrase.
print(cv3.vocabulary_)

{'this pasta': 23, 'pasta is': 17, 'is tasty': 11, 'is not': 9, 'not tasty': 16, 'is delicious': 6, 'like pasta': 12, 'pasta very': 19, 'very much': 24, 'is good': 7, 'good but': 4, 'but cost': 1, 'cost is': 2, 'is little': 8, 'little high': 13, 'pasta taste': 18, 'taste is': 22, 'good and': 3, 'and affordable': 0, 'is really': 10, 'really nice': 21, 'nice however': 14, 'however price': 5, 'price is': 20, 'not affordable': 15}


In [23]:
# Number of distinct bigrams learned, i.e. the number of feature columns.
len(cv3.vocabulary_)

25

##### Here it builds only 25 features (vocabulary terms)

###### Matrix to array

In [24]:
# Densify the sparse counts: rows are sentences, columns follow the vocabulary indices.
bi_gram.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
        0, 1, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 1],
       [0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
        1, 0, 0]], dtype=int64)

### Uni gram + Bi gram + Tri Gram

In [25]:
# Vectorizer over n-grams with n from 1 to 3: single words, pairs and triples of consecutive words.
cv4 = CountVectorizer(ngram_range=(1, 3))

#### Here it creates features (vocabulary terms) from single words, pairs of consecutive words, and triples of consecutive words

In [26]:
# Learn the unigram + bigram + trigram vocabulary from the 'Text' column and
# return the sparse count matrix (one row per sentence, one column per term).
uni_bi_tri_gram = cv4.fit_transform(df['Text'])

In [27]:
# Show the sparse-matrix summary (shape, dtype, number of stored non-zero counts).
uni_bi_tri_gram

<7x70 sparse matrix of type '<class 'numpy.int64'>'
	with 97 stored elements in Compressed Sparse Row format>

#### Vocabs

In [28]:
# Term -> column-index mapping; contains one-, two- and three-word terms.
print(cv4.vocabulary_)

{'this': 65, 'pasta': 44, 'is': 19, 'tasty': 64, 'this pasta': 66, 'pasta is': 45, 'is tasty': 31, 'this pasta is': 67, 'pasta is tasty': 49, 'not': 41, 'is not': 26, 'not tasty': 43, 'pasta is not': 48, 'is not tasty': 28, 'delicious': 9, 'is delicious': 20, 'pasta is delicious': 46, 'like': 32, 'very': 68, 'much': 37, 'like pasta': 33, 'pasta very': 52, 'very much': 69, 'like pasta very': 34, 'pasta very much': 53, 'good': 10, 'but': 3, 'cost': 6, 'little': 35, 'high': 15, 'is good': 21, 'good but': 13, 'but cost': 4, 'cost is': 7, 'is little': 24, 'little high': 36, 'pasta is good': 47, 'is good but': 23, 'good but cost': 14, 'but cost is': 5, 'cost is little': 8, 'is little high': 25, 'taste': 60, 'and': 1, 'affordable': 0, 'pasta taste': 50, 'taste is': 61, 'good and': 11, 'and affordable': 2, 'pasta taste is': 51, 'taste is good': 62, 'is good and': 22, 'good and affordable': 12, 'really': 57, 'nice': 38, 'however': 16, 'price': 54, 'is really': 29, 'really nice': 58, 'nice howev

In [29]:
# Number of distinct 1- to 3-gram terms, i.e. the number of feature columns.
len(cv4.vocabulary_)

70

##### Here it creates 70 vocabulary terms (21 unigrams + 25 bigrams + 24 trigrams).

#### Matrix to array

In [30]:
# Densify the sparse counts: rows are sentences, columns follow the vocabulary indices.
uni_bi_tri_gram.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
        1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1],
       [0, 0, 0, 1, 1, 1, 1,

### Tri Gram

In [31]:
# Trigram-only vectorizer: both n-gram bounds are 3, so every feature is three consecutive words.
cv5 = CountVectorizer(ngram_range=(3, 3))

#### Here it creates features (vocabulary terms) from 3 consecutive words only

In [32]:
# Learn the trigram vocabulary from the 'Text' column and return the sparse
# count matrix (one row per sentence, one column per trigram).
tri_gram = cv5.fit_transform(df['Text'])

In [33]:
# Show the sparse-matrix summary (shape, dtype, number of stored non-zero counts).
tri_gram

<7x24 sparse matrix of type '<class 'numpy.int64'>'
	with 26 stored elements in Compressed Sparse Row format>

#### Vocabs

In [34]:
# Term -> column-index mapping; every term here is a three-word phrase.
print(cv5.vocabulary_)

{'this pasta is': 23, 'pasta is tasty': 16, 'pasta is not': 15, 'is not tasty': 9, 'pasta is delicious': 13, 'like pasta very': 11, 'pasta very much': 18, 'pasta is good': 14, 'is good but': 6, 'good but cost': 3, 'but cost is': 0, 'cost is little': 1, 'is little high': 7, 'pasta taste is': 17, 'taste is good': 21, 'is good and': 5, 'good and affordable': 2, 'taste is really': 22, 'is really nice': 10, 'really nice however': 20, 'nice however price': 12, 'however price is': 4, 'price is not': 19, 'is not affordable': 8}


In [35]:
# Number of distinct trigrams learned, i.e. the number of feature columns.
len(cv5.vocabulary_)

24

#### It creates only 24 vocabulary terms

#### Matrix to array

In [36]:
# Densify the sparse counts: rows are sentences, columns follow the vocabulary indices.
tri_gram.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0],
       [1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
        0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0,
        1, 0]], dtype=int64)

##### Even if we remove stop words, the meaning does not change (no meaning is lost)

#### But n-grams have some limitations

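#### 1. Sparsity when the data is large (we noticed that, compared with a 2-sentence corpus, this one already produces a much larger sparse matrix)
#### 2. It cannot tell which words are significant
#### 3. OOV (out-of-vocabulary) words: words not seen while fitting are ignored when new text is transformed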

### To avoid these problems, TF-IDF is used