## N-Grams: A Feature Extraction Method in NLP

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

#### Create data manually

In [2]:
# Toy corpus: two short review sentences, each with a binary label in 'Output'.
df = pd.DataFrame(
    {
        'Text': ['This Pasta is tasty', 'This pasta is not tasty'],
        'Output': [1, 0],
    }
)

In [3]:
# Display the two-row corpus to confirm the 'Text' and 'Output' columns.
df

Unnamed: 0,Text,Output
0,This Pasta is tasty,1
1,This pasta is not tasty,0


##### 1 means the sentence is positive and 0 means the sentence is negative

### Uni Gram

#### Unigrams (`ngram_range=(1, 1)`) are the same as bag of words: features are extracted from single words only

In [5]:
# Unigram-only vectorizer: the lower and upper n-gram bounds are both 1.
cv1 = CountVectorizer(ngram_range=(1, 1))

#### Fit the data with CountVectorizer (bag of words / unigrams)

In [8]:
# Learn the unigram vocabulary from the 'Text' column and build the
# document-term matrix in a single step.
uni_gram = cv1.fit_transform(df['Text'])

In [9]:
# Display the sparse-matrix summary (shape, dtype, stored-element count).
uni_gram

<2x5 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

#### Vocabulary in uni gram

In [11]:
# vocabulary_ maps each learned token to its column index in the matrix.
# Tokens are lowercased, so 'Pasta' and 'pasta' share a single entry.
print(cv1.vocabulary_)

{'this': 4, 'pasta': 2, 'is': 0, 'tasty': 3, 'not': 1}


#### We get five vocabulary entries (unigrams) from one corpus of 2 documents

#### Matrix to array

In [22]:
# Convert the sparse matrix to a dense array: one row per document, one
# column per vocabulary entry (column order follows the vocabulary_ indices).
uni_gram.toarray()

array([[1, 0, 1, 1, 1],
       [1, 1, 1, 1, 1]], dtype=int64)

### Uni Gram + Bi gram

In [12]:
# Vectorizer covering n-gram sizes 1 through 2 (unigrams and bigrams).
cv2 = CountVectorizer(ngram_range=(1, 2))

#### Here we create features/vocabs from unigrams (single words) and bigrams (pairs of words, i.e. 2 consecutive words)

In [13]:
# Learn the unigram + bigram vocabulary from the 'Text' column and build the
# document-term matrix in a single step.
uni_bi_gram = cv2.fit_transform(df['Text'])

In [14]:
# Display the sparse-matrix summary (shape, dtype, stored-element count).
uni_bi_gram

<2x10 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

#### Vocabs

In [15]:
# vocabulary_ maps each learned unigram or bigram to its column index.
print(cv2.vocabulary_)

{'this': 8, 'pasta': 5, 'is': 0, 'tasty': 7, 'this pasta': 9, 'pasta is': 6, 'is tasty': 2, 'not': 3, 'is not': 1, 'not tasty': 4}


#### Here it creates 10 vocabs: first from unigrams (single words) and then from bigrams (pairs of words)
#### 5 vocabs come from unigrams and another 5 come from bigrams

#### Matrix to array

In [21]:
# Convert the sparse matrix to a dense array: one row per document, one
# column per unigram/bigram (column order follows the vocabulary_ indices).
uni_bi_gram.toarray()

array([[1, 0, 1, 0, 0, 1, 1, 1, 1, 1],
       [1, 1, 0, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

### Bi gram

In [16]:
# Bigram-only vectorizer: the lower and upper n-gram bounds are both 2.
cv3 = CountVectorizer(ngram_range=(2, 2))

##### Here it will create features or vocabs only from pairs of words (2 consecutive words)

In [17]:
# Learn the bigram vocabulary from the 'Text' column and build the
# document-term matrix in a single step.
bi_gram = cv3.fit_transform(df['Text'])

In [18]:
# Display the sparse-matrix summary (shape, dtype, stored-element count).
bi_gram

<2x5 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

#### Vocabs

In [19]:
# vocabulary_ maps each learned bigram to its column index in the matrix.
print(cv3.vocabulary_)

{'this pasta': 4, 'pasta is': 3, 'is tasty': 1, 'is not': 0, 'not tasty': 2}


##### Here it builds only 5 features/vocabs

###### Matrix to array

In [20]:
# Convert the sparse matrix to a dense array: one row per document, one
# column per bigram (column order follows the vocabulary_ indices).
bi_gram.toarray()

array([[0, 1, 0, 1, 1],
       [1, 0, 1, 1, 1]], dtype=int64)

### Uni gram + Bi gram + Tri Gram

In [24]:
# Vectorizer covering n-gram sizes 1 through 3 (unigrams, bigrams, trigrams).
cv4 = CountVectorizer(ngram_range=(1, 3))

#### Here it creates features or vocabs from single words, 2 consecutive words, and 3 consecutive words

In [26]:
# Learn the unigram + bigram + trigram vocabulary from the 'Text' column and
# build the document-term matrix in a single step.
uni_bi_tri_gram = cv4.fit_transform(df['Text'])

In [28]:
# Display the sparse-matrix summary (shape, dtype, stored-element count).
uni_bi_tri_gram

<2x14 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

#### Vocabs

In [29]:
# vocabulary_ maps each learned unigram, bigram or trigram to its column index.
print(cv4.vocabulary_)

{'this': 11, 'pasta': 6, 'is': 0, 'tasty': 10, 'this pasta': 12, 'pasta is': 7, 'is tasty': 3, 'this pasta is': 13, 'pasta is tasty': 9, 'not': 4, 'is not': 1, 'not tasty': 5, 'pasta is not': 8, 'is not tasty': 2}


##### Here it creates 14 vocabs: 5 from unigrams, 5 from bigrams, and 4 from trigrams

#### Matrix to array

In [30]:
# Convert the sparse matrix to a dense array: one row per document, one
# column per n-gram (column order follows the vocabulary_ indices).
uni_bi_tri_gram.toarray()

array([[1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1],
       [1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1]], dtype=int64)

### Tri Gram

In [31]:
# Trigram-only vectorizer: the lower and upper n-gram bounds are both 3.
cv5 = CountVectorizer(ngram_range=(3, 3))

#### Here it creates vocabs or features from 3 consecutive words only

In [33]:
# Learn the trigram vocabulary from the 'Text' column and build the
# document-term matrix in a single step.
tri_gram = cv5.fit_transform(df['Text'])

In [34]:
# Display the sparse-matrix summary (shape, dtype, stored-element count).
tri_gram

<2x4 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

#### Vocabs

In [36]:
# vocabulary_ maps each learned trigram to its column index in the matrix.
print(cv5.vocabulary_)

{'this pasta is': 3, 'pasta is tasty': 2, 'pasta is not': 1, 'is not tasty': 0}


#### It creates only 4 vocabs

#### Matrix to array

In [37]:
# Convert the sparse matrix to a dense array: one row per document, one
# column per trigram (column order follows the vocabulary_ indices).
tri_gram.toarray()

array([[0, 0, 1, 1],
       [1, 1, 0, 1]], dtype=int64)

##### Even if stop words are removed, the meaning is not changed or lost

#### But n-grams have some limitations

#### 1. Sparsity when the data is large
#### 2. It is not able to tell which word is significant
#### 3. OOV (out-of-vocabulary) words