In [7]:
import nltk 
import re
import numpy
from nltk.corpus import stopwords

In [8]:
def word_extraction(sentence):
    words = re.sub("[^\w]", " ",  sentence).split()
    cleaned_text = [w.lower() for w in words if w not in stopwords.words('english')]
    return cleaned_text

In [9]:
def tokenize(sentences):
    words = []
    for sentence in sentences:
        w = word_extraction(sentence)
        words.extend(w)
    words = sorted(list(set(words)))
    return words

In [10]:
text = "Mary and Samantha arrived at the bus station early but waited until noon for the bus"

In [11]:
word_extraction(text)

['mary',
 'samantha',
 'arrived',
 'bus',
 'station',
 'early',
 'waited',
 'noon',
 'bus']

In [12]:
tokenize(word_extraction(text))

['arrived', 'bus', 'early', 'mary', 'noon', 'samantha', 'station', 'waited']

In [13]:
def generate_bow(allsentences):
    vocab = tokenize(allsentences)
    print("Word List for Document \n{0} \n".format(vocab))
    for sentence in allsentences:
        words = word_extraction(sentence)
        bag_vector = numpy.zeros(len(vocab))
        for w in words:
            for i,word in enumerate(vocab):
                if word == w:
                    bag_vector[i] += 1
        print("{0}\n{1}\n".format(sentence,numpy.array(bag_vector)))
        print('---------------------------------------------------')

In [14]:
generate_bow(word_extraction(text))

Word List for Document 
['arrived', 'bus', 'early', 'mary', 'noon', 'samantha', 'station', 'waited'] 

mary
[0. 0. 0. 1. 0. 0. 0. 0.]

---------------------------------------------------
samantha
[0. 0. 0. 0. 0. 1. 0. 0.]

---------------------------------------------------
arrived
[1. 0. 0. 0. 0. 0. 0. 0.]

---------------------------------------------------
bus
[0. 1. 0. 0. 0. 0. 0. 0.]

---------------------------------------------------
station
[0. 0. 0. 0. 0. 0. 1. 0.]

---------------------------------------------------
early
[0. 0. 1. 0. 0. 0. 0. 0.]

---------------------------------------------------
waited
[0. 0. 0. 0. 0. 0. 0. 1.]

---------------------------------------------------
noon
[0. 0. 0. 0. 1. 0. 0. 0.]

---------------------------------------------------
bus
[0. 1. 0. 0. 0. 0. 0. 0.]

---------------------------------------------------


In [15]:
allsentences = ["Joe waited for the train",
                "The train was late",
                "Mary and Samantha took the bus",
               "I looked for Mary and Samantha at the bus station",
                "Mary and Samantha arrived at the bus station early but waited until noon for the bus"]

In [16]:
generate_bow(allsentences)

Word List for Document 
['arrived', 'bus', 'early', 'i', 'joe', 'late', 'looked', 'mary', 'noon', 'samantha', 'station', 'the', 'took', 'train', 'waited'] 

Joe waited for the train
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]

---------------------------------------------------
The train was late
[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0.]

---------------------------------------------------
Mary and Samantha took the bus
[0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.]

---------------------------------------------------
I looked for Mary and Samantha at the bus station
[0. 1. 0. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0.]

---------------------------------------------------
Mary and Samantha arrived at the bus station early but waited until noon for the bus
[1. 2. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 0. 1.]

---------------------------------------------------


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(allsentences)

for i in range(len(allsentences)) : 
    print(allsentences[i])
    print(list(X.toarray()[i]))
    
    print('---------------------------------------------------')

Joe waited for the train
[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0]
---------------------------------------------------
The train was late
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1]
---------------------------------------------------
Mary and Samantha took the bus
[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0]
---------------------------------------------------
I looked for Mary and Samantha at the bus station
[1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0]
---------------------------------------------------
Mary and Samantha arrived at the bus station early but waited until noon for the bus
[1, 1, 1, 2, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 2, 0, 0, 1, 1, 0]
---------------------------------------------------
