In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from nltk.corpus import stopwords # nltk- natural language toolkit
from sklearn.feature_extraction.text import CountVectorizer # Convert a collection of text documents to a matrix of token counts. #document matrix
from sklearn.feature_extraction.text import TfidfVectorizer # Convert a collection of raw documents to a matrix of TF-IDF features. #encoded vecor(inverse document frequency)



**Data**

In [None]:
# Toy corpus: three short documents, one string per document.
# The leading spaces in the 2nd and 3rd strings are kept exactly as in the source data.
txt1 = [
    "Basketball is a good game, I love basketball",
    " Soccer is a popular game, I play soccer",
    " Swimming is not a game, Swimming is good for health",
]
txt1

['Basketball is a good game, I love basketball',
 ' Soccer is a popular game, I play soccer',
 ' Swimming is not a game, Swimming is good for health']

**Finding the Document Vector**

In [None]:
# Bag-of-words vectorizer: splits text into word tokens, drops English stop words
# and keeps single words only (unigrams). Every occurrence of a term is counted,
# so a word repeated inside one document gets a count greater than 1.
count_vec = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 1),
    stop_words="english",
)

# Learn the vocabulary (the set of distinct terms) from the corpus.
# fit() hands back the vectorizer itself, so `tf` is the fitted `count_vec`.
tf = count_vec.fit(txt1)

# Mapping of each vocabulary term to its column index in the count matrix
tf.vocabulary_

{'basketball': 0,
 'game': 1,
 'good': 2,
 'health': 3,
 'love': 4,
 'play': 5,
 'popular': 6,
 'soccer': 7,
 'swimming': 8}

In [None]:
# List the vocabulary terms, ordered by their column index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns a NumPy array, so
# .tolist() is used to keep the plain-list display.
count_vec.get_feature_names_out().tolist()



['basketball',
 'game',
 'good',
 'health',
 'love',
 'play',
 'popular',
 'soccer',
 'swimming']

In [None]:
# Encode the corpus as term counts. transform() returns a sparse matrix,
# which stores only the non-zero counts.
newvector = tf.transform(txt1)

# One row per document and one column per vocabulary term (3 x 9 for this corpus).
# A bare `newvector.shape` in the middle of a cell is never displayed, so the
# expectation is asserted instead of being silently discarded.
assert newvector.shape == (len(txt1), len(tf.vocabulary_))

# print() shows the (row, column) -> count entries; a bare expression would only
# show the sparse-matrix summary repr.
print(newvector)

  (0, 0)	2
  (0, 1)	1
  (0, 2)	1
  (0, 4)	1
  (1, 1)	1
  (1, 5)	1
  (1, 6)	1
  (1, 7)	2
  (2, 1)	1
  (2, 2)	1
  (2, 3)	1
  (2, 8)	2


In [None]:
# Document-term matrix as a DataFrame: one row per document, one column per term.
# toarray() densifies the sparse count matrix into a 2-D NumPy array.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
df = pd.DataFrame(newvector.toarray(), columns=count_vec.get_feature_names_out())
df  # the document-term matrix



Unnamed: 0,basketball,game,good,health,love,play,popular,soccer,swimming
0,2,1,1,0,1,0,0,0,0
1,0,1,0,0,0,1,1,2,0
2,0,1,1,1,0,0,0,0,2


**Finding tf-idf of document vector**

In [None]:
# No separate count step is needed here: TfidfVectorizer tokenizes and counts the
# documents itself and then applies the IDF weighting.
# With smooth_idf=False the weight is idf(t) = ln(n_docs / df(t)) + 1, and
# norm=None leaves the rows un-normalised, so each cell is simply count * idf.
idfv = TfidfVectorizer(
    analyzer='word',
    norm=None,
    smooth_idf=False,
    stop_words="english",
)

# Learn the vocabulary and the per-term IDF weights from the corpus
idfv_fitted = idfv.fit(txt1)

In [None]:
# Mapping from each learned vocabulary term to its feature (column) index
idfv_fitted.vocabulary_

{'basketball': 0,
 'game': 1,
 'good': 2,
 'health': 3,
 'love': 4,
 'play': 5,
 'popular': 6,
 'soccer': 7,
 'swimming': 8}

In [None]:
# Per-term IDF weights learned during fit. Terms that occur in fewer documents
# get a larger weight; a term present in every document gets the minimum of 1.0.
idf = idfv_fitted.idf_

# One row per vocabulary term, labelled by the term itself.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
df1 = pd.DataFrame(idf, index=idfv.get_feature_names_out())
df1




Unnamed: 0,0
basketball,2.098612
game,1.0
good,1.405465
health,2.098612
love,2.098612
play,2.098612
popular,2.098612
soccer,2.098612
swimming,2.098612


In [None]:
# Encode the documents in txt1 with the fitted TF-IDF vectorizer
# (one row per document, one weighted column per vocabulary term).
idfv_transformed = idfv_fitted.transform(txt1)

In [None]:
# TF-IDF matrix as a DataFrame: one row per document, one column per term.
# get_feature_names_out() replaces get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2.
df2 = pd.DataFrame(idfv_transformed.toarray(), columns=idfv.get_feature_names_out())
df2



Unnamed: 0,basketball,game,good,health,love,play,popular,soccer,swimming
0,4.197225,1.0,1.405465,0.0,2.098612,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,2.098612,2.098612,4.197225,0.0
2,0.0,1.0,1.405465,2.098612,0.0,0.0,0.0,0.0,4.197225
