### Term Frequency - Inverse Document Frequency

In [58]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

#### Create Data manually

In [59]:
# Toy corpus: one (sentence, label) record per document.
df = pd.DataFrame(
    [
        ('This pasta is tasty', 1),
        ('This pasta is not tasty', 0),
    ],
    columns=['Text', 'Output'],
)

In [60]:
# Display the two-row corpus to confirm the Text / Output columns.
df

Unnamed: 0,Text,Output
0,This pasta is tasty,1
1,This pasta is not tasty,0


#### In the `Output` column, 1 marks a positive sentence and 0 marks a negative sentence

In [61]:
# Unigram-only vectorizer: ngram_range=(1, 1) limits the features to single words.
tfidf = TfidfVectorizer(ngram_range=(1, 1))

In [62]:
# Display the vectorizer to confirm how it is configured.
tfidf

#### Here TF-IDF works on unigrams because we explicitly set `ngram_range=(1, 1)`: every unique word in the corpus vocabulary becomes a separate feature. Unlike a plain unigram (bag-of-words) count, it also gives each word a TF-IDF score, so we can see which words are significant.

#### Fit and transform data to tfidf 

In [63]:
# Learn the vocabulary from the Text column and return the TF-IDF matrix in one step.
features= tfidf.fit_transform(df['Text'])

In [64]:
# Display the sparse-matrix summary (shape and number of stored values).
features

<2x5 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

#### Vocabs

In [65]:
# vocabulary_ maps each learned term to its column index in the TF-IDF matrix.
print(tfidf.vocabulary_)

{'this': 4, 'pasta': 2, 'is': 0, 'tasty': 3, 'not': 1}


#### With `ngram_range=(1, 1)` the vectorizer creates 5 vocabulary entries, one per unique word

#### Matrix to array

In [66]:
# Convert the sparse matrix to a dense array so every score is visible
# (rows are documents, columns follow the vocabulary indices).
features.toarray()

array([[0.5       , 0.        , 0.5       , 0.5       , 0.5       ],
       [0.4090901 , 0.57496187, 0.4090901 , 0.4090901 , 0.4090901 ]])

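#### In the first document the 4 words present all get the same score (0.5), so they are equally significant. In the second document one word, 'not' (column 1), has the highest score (0.57496187), because it is the only word that does not occur in both documents. Each value is the product TF × IDF, so the feature with the highest TF-IDF score is the most significant one.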

## TFIDF  on  uni and Bi gram (pairs of 2 consecutive words)

In [67]:
# Unigram + bigram vectorizer: ngram_range=(1, 2) covers single words and pairs of consecutive words.
tfidf_1 = TfidfVectorizer(ngram_range = (1,2))

In [68]:
# Display the vectorizer to confirm how it is configured.
tfidf_1

#### Here `ngram_range=(1, 2)` means the vocabulary is built from single unique words and from pairs of consecutive words as well.

In [69]:
# Learn the unigram + bigram vocabulary from the Text column and return the TF-IDF matrix.
features_1 = tfidf_1.fit_transform(df['Text'])

In [70]:
# Display the sparse-matrix summary (shape and number of stored values).
features_1

<2x10 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

#### Vocabs

In [71]:
# vocabulary_ maps each learned unigram/bigram to its column index in the TF-IDF matrix.
print(tfidf_1.vocabulary_)

{'this': 8, 'pasta': 5, 'is': 0, 'tasty': 7, 'this pasta': 9, 'pasta is': 6, 'is tasty': 2, 'not': 3, 'is not': 1, 'not tasty': 4}


#### Here it creates 10 vocabulary entries: 5 unigrams and 5 bigrams

#### Matrix to array

In [72]:
# Convert the sparse matrix to a dense array so every score is visible
# (rows are documents, columns follow the vocabulary indices).
features_1.toarray()

array([[0.35409974, 0.        , 0.49767483, 0.        , 0.        ,
        0.35409974, 0.35409974, 0.35409974, 0.35409974, 0.35409974],
       [0.2895694 , 0.40697968, 0.        , 0.40697968, 0.40697968,
        0.2895694 , 0.2895694 , 0.2895694 , 0.2895694 , 0.2895694 ]])

#### 0.49767483 (the bigram 'is tasty', column 2) is the highest score in the first document, so it is the most significant feature there

### TFIDF on Bi gram

In [83]:
# Bigram-only vectorizer: ngram_range=(2, 2) keeps pairs of consecutive
# words and nothing else.
tfidf_2 = TfidfVectorizer(ngram_range=(2, 2))
print(tfidf_2)

# Learn the bigram vocabulary from the corpus and build the TF-IDF matrix
# in a single call.
features_2 = tfidf_2.fit_transform(df['Text'])

# Vocabulary (bigram -> column index), framed by separator lines.
# This corpus yields 5 bigrams.
print(
    '*****' * 20,
    'Vocabulary by TFIDF(Bi gram) :------>',
    tfidf_2.vocabulary_,
    '*****' * 20,
    sep='\n',
)

# Dense view of the matrix: one row per document, one column per bigram.
print(features_2.toarray())

TfidfVectorizer(ngram_range=(2, 2))
****************************************************************************************************
Vocabulary by TFIDF(Bi gram) :------>
{'this pasta': 4, 'pasta is': 3, 'is tasty': 1, 'is not': 0, 'not tasty': 2}
****************************************************************************************************
[[0.         0.70490949 0.         0.50154891 0.50154891]
 [0.57615236 0.         0.57615236 0.40993715 0.40993715]]


#### In the 1st document the highest TF-IDF score is 0.70490949 (the bigram 'is tasty'), so it is the most significant feature in that document

### TFIDF on uni , Bi ,Tri gram

In [84]:
# Uni-, bi- and trigram vectorizer: ngram_range=(1, 3) covers every n-gram
# length from 1 to 3 (single words, word pairs and 3 consecutive words).
tfidf_3 = TfidfVectorizer(ngram_range=(1, 3))
print(tfidf_3)

# Fit on the corpus and compute the TF-IDF matrix in a single call.
features_3 = tfidf_3.fit_transform(df['Text'])

# Vocabulary (n-gram -> column index), framed by separator lines.
# This corpus yields 14 entries: 5 unigrams + 5 bigrams + 4 trigrams.
print(
    '*****' * 20,
    'Vocabulary by TFIDF(1,3) :------>',
    tfidf_3.vocabulary_,
    '*****' * 20,
    sep='\n',
)

# Dense view of the matrix: rows are documents, columns follow the
# vocabulary indices.
print(features_3.toarray())

TfidfVectorizer(ngram_range=(1, 3))
****************************************************************************************************
Vocabulary by TFIDF(1,3) :------>
{'this': 11, 'pasta': 6, 'is': 0, 'tasty': 10, 'this pasta': 12, 'pasta is': 7, 'is tasty': 3, 'this pasta is': 13, 'pasta is tasty': 9, 'not': 4, 'is not': 1, 'not tasty': 5, 'pasta is not': 8, 'is not tasty': 2}
****************************************************************************************************
[[0.30218978 0.         0.         0.42471719 0.         0.
  0.30218978 0.30218978 0.         0.42471719 0.30218978 0.30218978
  0.30218978 0.30218978]
 [0.24342027 0.3421187  0.3421187  0.         0.3421187  0.3421187
  0.24342027 0.24342027 0.3421187  0.         0.24342027 0.24342027
  0.24342027 0.24342027]]


#### In the 1st document, 0.42471719 is the highest TF-IDF score (shared by 'is tasty' and 'pasta is tasty'), so those features are the most significant

### TFIDF on Tri gram

In [87]:
# Trigram-only vectorizer: ngram_range=(3, 3) keeps runs of 3 consecutive
# words and nothing else.
tfidf_4 = TfidfVectorizer(ngram_range=(3, 3))
print(tfidf_4)

# Fit on the corpus and compute the TF-IDF matrix in a single call.
features_4 = tfidf_4.fit_transform(df['Text'])

# Vocabulary (trigram -> column index), framed by separator lines.
# This corpus yields 4 trigrams.
print(
    '*****' * 20,
    'Vocabulary by TFIDF(3,3) :------>',
    tfidf_4.vocabulary_,
    '*****' * 20,
    sep='\n',
)

# Dense view of the matrix: one row per document, one column per trigram.
print(features_4.toarray())

TfidfVectorizer(ngram_range=(3, 3))
****************************************************************************************************
Vocabulary by TFIDF(3,3) :------>
{'this pasta is': 3, 'pasta is tasty': 2, 'pasta is not': 1, 'is not tasty': 0}
****************************************************************************************************
[[0.         0.         0.81480247 0.57973867]
 [0.6316672  0.6316672  0.         0.44943642]]


#### In the 1st document 0.81480247 ('pasta is tasty') and in the 2nd document 0.6316672 ('is not tasty' and 'pasta is not') are the most significant scores

### TFIDF on uni , bi , tri and tetra

In [90]:
# Uni- to 4-gram vectorizer: ngram_range=(1, 4) covers single words, word
# pairs, 3 consecutive words and 4 consecutive words.
tfidf_5 = TfidfVectorizer(ngram_range=(1, 4))
print(tfidf_5)

# Fit on the corpus and compute the TF-IDF matrix in a single call.
features_5 = tfidf_5.fit_transform(df['Text'])

# Vocabulary (n-gram -> column index) and its size, framed by separator lines.
# This corpus yields 17 entries: 5 unigrams + 5 bigrams + 4 trigrams + 3 four-grams.
print(
    '*****' * 20,
    'Vocabulary by TFIDF(1,4) :------>',
    tfidf_5.vocabulary_,
    'Length of vocab : ',
    len(tfidf_5.vocabulary_),
    '*****' * 20,
    sep='\n',
)

# Dense view of the matrix: rows are documents, columns follow the
# vocabulary indices.
print(features_5.toarray())

TfidfVectorizer(ngram_range=(1, 4))
****************************************************************************************************
Vocabulary by TFIDF(1,4) :------>
{'this': 12, 'pasta': 6, 'is': 0, 'tasty': 11, 'this pasta': 13, 'pasta is': 7, 'is tasty': 3, 'this pasta is': 14, 'pasta is tasty': 10, 'this pasta is tasty': 16, 'not': 4, 'is not': 1, 'not tasty': 5, 'pasta is not': 8, 'is not tasty': 2, 'this pasta is not': 15, 'pasta is not tasty': 9}
Length of vocab : 
17
****************************************************************************************************
[[0.2781429  0.         0.         0.39092014 0.         0.
  0.2781429  0.2781429  0.         0.         0.39092014 0.2781429
  0.2781429  0.2781429  0.2781429  0.         0.39092014]
 [0.21912062 0.30796639 0.30796639 0.         0.30796639 0.30796639
  0.21912062 0.21912062 0.30796639 0.30796639 0.         0.21912062
  0.21912062 0.21912062 0.21912062 0.30796639 0.        ]]


### TFIDF on tetra gram

In [91]:
# 4-gram-only vectorizer: ngram_range=(4, 4) keeps runs of 4 consecutive
# words and nothing else.
# NOTE: this rebinds tfidf_5 / features_5, which the (1, 4) cell above also
# assigns; after this cell both names refer to the 4-gram model.
tfidf_5 = TfidfVectorizer(ngram_range=(4, 4))
print(tfidf_5)

# Fit on the corpus and compute the TF-IDF matrix in a single call.
features_5 = tfidf_5.fit_transform(df['Text'])

# Vocabulary (4-gram -> column index), framed by separator lines.
# This corpus yields 3 four-grams.
print(
    '*****' * 20,
    'Vocabulary by TFIDF(4,4) :------>',
    tfidf_5.vocabulary_,
    '*****' * 20,
    sep='\n',
)

# Dense view of the matrix: one row per document, one column per 4-gram.
print(features_5.toarray())

TfidfVectorizer(ngram_range=(4, 4))
****************************************************************************************************
Vocabulary by TFIDF(4,4) :------>
{'this pasta is tasty': 2, 'this pasta is not': 1, 'pasta is not tasty': 0}
****************************************************************************************************
[[0.         0.         1.        ]
 [0.70710678 0.70710678 0.        ]]
