In [29]:
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
import re

In [22]:
text = "The US Supreme Court has cleared the way for ex-President Donald Trump's tax forms to be released to a Democratic-controlled congressional committee."

In [23]:
tokenized = word_tokenize(text)
print(tokenized)

['The', 'US', 'Supreme', 'Court', 'has', 'cleared', 'the', 'way', 'for', 'ex-President', 'Donald', 'Trump', "'s", 'tax', 'forms', 'to', 'be', 'released', 'to', 'a', 'Democratic-controlled', 'congressional', 'committee', '.']


In [24]:
tokenizer_integer = Tokenizer()
tokenizer_integer.fit_on_texts(tokenized)
print(tokenizer_integer.word_index)
print(tokenizer_integer.word_counts)

{'the': 1, 'to': 2, 'us': 3, 'supreme': 4, 'court': 5, 'has': 6, 'cleared': 7, 'way': 8, 'for': 9, 'ex': 10, 'president': 11, 'donald': 12, 'trump': 13, "'s": 14, 'tax': 15, 'forms': 16, 'be': 17, 'released': 18, 'a': 19, 'democratic': 20, 'controlled': 21, 'congressional': 22, 'committee': 23}
OrderedDict([('the', 2), ('us', 1), ('supreme', 1), ('court', 1), ('has', 1), ('cleared', 1), ('way', 1), ('for', 1), ('ex', 1), ('president', 1), ('donald', 1), ('trump', 1), ("'s", 1), ('tax', 1), ('forms', 1), ('to', 2), ('be', 1), ('released', 1), ('a', 1), ('democratic', 1), ('controlled', 1), ('congressional', 1), ('committee', 1)])


In [47]:
def build_bag_of_words(documnet):
    document = documnet.replace('.', '')
    document = document.lower()
    tokenized_document = word_tokenize(document)
    
    word_to_index = {}
    bow = []
    
    for word in tokenized_document:
        if word not in word_to_index.keys():
            word_to_index[word] = len(word_to_index)
            #initialize BoW with value 1
            bow.insert(len(word_to_index) - 1, 1)
        else :
            #index for be repeated
            index = word_to_index.get(word)
            #counting (plus 1)
            bow[index] += 1
            
    return word_to_index, bow

In [48]:
vocab, bow = build_bag_of_words(text)
print('Vocabulary :',vocab)
print('bag of words vecotr :', bow)

Vocabulary : {'the': 0, 'us': 1, 'supreme': 2, 'court': 3, 'has': 4, 'cleared': 5, 'way': 6, 'for': 7, 'ex-president': 8, 'donald': 9, 'trump': 10, "'s": 11, 'tax': 12, 'forms': 13, 'to': 14, 'be': 15, 'released': 16, 'a': 17, 'democratic-controlled': 18, 'congressional': 19, 'committee': 20}
bag of words vecotr : [2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]


In [50]:
from sklearn.feature_extraction.text import CountVectorizer

In [54]:
corpus = ["The US Supreme Court has cleared the way for ex-President Donald Trump's tax forms to be released to a Democratic-controlled congressional committee."]
vector = CountVectorizer()
corpus[0].lower()

print('bag of words vector :', vector.fit_transform(corpus).toarray())
print('vocabulary :',vector.vocabulary_)

bag of words vector : [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1]]
vocabulary : {'the': 16, 'us': 19, 'supreme': 14, 'court': 5, 'has': 11, 'cleared': 1, 'way': 20, 'for': 9, 'ex': 8, 'president': 12, 'donald': 7, 'trump': 18, 'tax': 15, 'forms': 10, 'to': 17, 'be': 0, 'released': 13, 'democratic': 6, 'controlled': 4, 'congressional': 3, 'committee': 2}


In [55]:
from nltk.corpus import stopwords

In [56]:
text= ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words=["the","a","an","is","not"])
print('Bag of Words vector :', vect.fit_transform(text).toarray())
print('Vocabulary :',vect.vocabulary_)

Bag of Words vector : [[1 1 1 1 1]]
Vocabulary : {'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [57]:
text = ["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words="english")
print('Bag of Words vector :', vect.fit_transform(text).toarray())
print('Vocabulary :',vect.vocabulary_)

Bag of Words vector : [[1 1 1]]
Vocabulary : {'family': 0, 'important': 1, 'thing': 2}


In [58]:
text = ["Family is not an important thing. It's everything."]
stop_words = stopwords.words("english")
vect = CountVectorizer(stop_words=stop_words)
print('Bag of Words vector :', vect.fit_transform(text).toarray())
print('Vocabulary :',vect.vocabulary_)

Bag of Words vector : [[1 1 1 1]]
Vocabulary : {'family': 1, 'important': 2, 'thing': 3, 'everything': 0}
