In [23]:
import pandas as pd 
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


from nltk.tokenize import word_tokenize

<h1>Creating bag of words using pandas</h1>

In [24]:
sent1 = 'It is good practice for us'
sent2 = 'It was also good to know about it'

In [25]:
arr1 = (sent1.lower().split())
arr = arr1.copy()
arr2 = sent2.lower().split()
arr.extend(arr2)
arr = set(arr)
arr,arr1,arr2

({'about',
  'also',
  'for',
  'good',
  'is',
  'it',
  'know',
  'practice',
  'to',
  'us',
  'was'},
 ['it', 'is', 'good', 'practice', 'for', 'us'],
 ['it', 'was', 'also', 'good', 'to', 'know', 'about', 'it'])

In [26]:
count1 = [arr1.count(x) for x in arr]
count1

[1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1]

In [27]:
count2 = [arr2.count(x) for x in arr]
count2

[0, 1, 1, 2, 0, 1, 1, 1, 0, 1, 0]

In [28]:
pd.DataFrame([count1,count2],columns=list(arr))

Unnamed: 0,practice,was,to,it,is,good,also,know,for,about,us
0,1,0,0,1,1,1,0,0,1,0,1
1,0,1,1,2,0,1,1,1,0,1,0


<h1>Creating bags using tokenizers</h1>

In [29]:
sent1 = 'It is good practice for us.'
sent2 = 'It was also good to know about it.'

In [30]:
#creating tokens
token1 = word_tokenize(sent1.lower())
token2 = word_tokenize(sent2.lower())

tokens = set(token1 + token2)
tokens = [x for x in tokens if x.isalnum()]

In [31]:
# creating bag dataframe
df = pd.DataFrame({},index=[1,2],columns=tokens)
df

Unnamed: 0,practice,was,to,it,is,good,also,know,for,about,us
1,,,,,,,,,,,
2,,,,,,,,,,,


In [32]:
#counting frequency of each words of both sentence present inside bags
count1 = [token1.count(x) for x in tokens]
count2 = [token2.count(x) for x in tokens]


#updating counts inside bag
df.iloc[0] = count1
df.iloc[1] = count2
df


Unnamed: 0,practice,was,to,it,is,good,also,know,for,about,us
1,1,0,0,1,1,1,0,0,1,0,1
2,0,1,1,2,0,1,1,1,0,1,0


<h1>Count Vectorizers</h1>

In [33]:
sent1 = 'It is good practice for us.'
sent2 = 'It was also good to know about it.'

cvt = CountVectorizer() #count vectorizer object

new_data = cvt.fit_transform([sent1,sent2]) #for bag of words 
data = new_data.toarray() #extract frequency in list of list format)
columns = cvt.get_feature_names_out() #get features name from cvt
columns

array(['about', 'also', 'for', 'good', 'is', 'it', 'know', 'practice',
       'to', 'us', 'was'], dtype=object)

In [34]:
#Creating dataframe of BoG
bag = pd.DataFrame(data,columns=columns,index = [1,2])
bag

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
1,0,0,1,1,1,1,0,1,0,1,0
2,1,1,0,1,0,2,1,0,1,0,1


In [35]:
#adding new sentence with words present already inside bag
new_sent = 'It was about good practice'

new_row = cvt.transform([new_sent])
d2 = pd.DataFrame(new_row.toarray(),columns=columns,index = [3])
pd.concat([bag,d2])

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
1,0,0,1,1,1,1,0,1,0,1,0
2,1,1,0,1,0,2,1,0,1,0,1
3,1,0,0,1,0,1,0,1,0,0,1


<h1>N gram</h1>

In [36]:
sent1 = 'It is good practice for us.'
sent2 = 'It was also good to know about it.'

ngram_cvt = CountVectorizer(ngram_range=(2,2))

new_data = ngram_cvt.fit_transform([sent1,sent2])
(ngram_cvt.get_feature_names_out())


array(['about it', 'also good', 'for us', 'good practice', 'good to',
       'is good', 'it is', 'it was', 'know about', 'practice for',
       'to know', 'was also'], dtype=object)

In [37]:

bag_2gram = pd.DataFrame(new_data.toarray(),columns = ngram_cvt.get_feature_names_out())
bag_2gram

Unnamed: 0,about it,also good,for us,good practice,good to,is good,it is,it was,know about,practice for,to know,was also
0,0,0,1,1,0,1,1,0,0,1,0,0
1,1,1,0,0,1,0,0,1,1,0,1,1


<h1>TF-IDF scoring</h1>

In [51]:
sent1 = 'It is good practice for us.'
sent2 = 'It was also good to know about it.'


tfidf = TfidfVectorizer()
new_text = tfidf.fit_transform([sent1,sent2])


In [59]:
bag = pd.DataFrame(new_text.toarray(),columns=tfidf.get_feature_names_out(),index = [0,1])
bag

Unnamed: 0,about,also,for,good,is,it,know,practice,to,us,was
0,0.0,0.0,0.446656,0.3178,0.446656,0.3178,0.0,0.446656,0.0,0.446656,0.0
1,0.364391,0.364391,0.0,0.259267,0.0,0.518534,0.364391,0.0,0.364391,0.0,0.364391


In [57]:
bag2 =pd.DataFrame({0 : new_data.toarray()[0],1 : new_data.toarray()[1]},index =t fidf.get_feature_names_out())

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2180930906.py, line 1)

In [56]:
new_data.toarray().shape,tfidf.get_feature_names_out().shape

((2, 12), (11,))

<h1>Sample dataset</h1>

In [68]:
with open('../datasets/sample_sentences.txt','r') as f:
    tfidf = TfidfVectorizer()
    new_data = tfidf.fit_transform(f.readlines())
    bag = pd.DataFrame(new_data.toarray(),columns = tfidf.get_feature_names_out())
bag

Unnamed: 0,30,at,before,can,deadline,do,does,door,finish,forget,...,the,this,time,to,tonight,what,when,will,work,you
0,0.0,0.0,0.0,0.433291,0.0,0.0,0.0,0.0,0.0,0.0,...,0.206465,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.29018
1,0.360948,0.360948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.171994,0.0,0.0,0.0,0.0,0.0,0.0,0.360948,0.0,0.241731
2,0.0,0.0,0.381303,0.0,0.381303,0.0,0.0,0.0,0.381303,0.0,...,0.181693,0.381303,0.0,0.307633,0.0,0.0,0.0,0.0,0.381303,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.400736,0.0,0.0,0.0,...,0.190953,0.0,0.400736,0.0,0.400736,0.400736,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.333753,0.0,0.333753,0.0,0.333753,...,0.159035,0.0,0.0,0.269269,0.0,0.0,0.333753,0.0,0.0,0.223518
