In [2]:
# Implementing Bag of Words from scratch

In [3]:
documents = ['I am Saiban',
             'I love Paksitan',
             'I love java',
             'and python too']

lower_case_documents = []
for i in documents:
    lower_case_documents.append(i.lower())
print(lower_case_documents)

['i am saiban', 'i love paksitan', 'i love java', 'and python too']


In [4]:
sans_punctuation_documents = []
import string

for i in lower_case_documents:
    sans_punctuation_documents.append(''.join(c for c in i if c not in string.punctuation))
    
print(sans_punctuation_documents)

['i am saiban', 'i love paksitan', 'i love java', 'and python too']


In [5]:
preprocessed_documents = []
for i in sans_punctuation_documents:
    preprocessed_documents.append(i.split(' '))
print(preprocessed_documents)

[['i', 'am', 'saiban'], ['i', 'love', 'paksitan'], ['i', 'love', 'java'], ['and', 'python', 'too']]


In [6]:
frequency_list = []
import pprint
from collections import Counter

for i in preprocessed_documents:
    frequency_list.append(Counter(i))
    
pprint.pprint(frequency_list)

[Counter({'i': 1, 'am': 1, 'saiban': 1}),
 Counter({'i': 1, 'love': 1, 'paksitan': 1}),
 Counter({'i': 1, 'love': 1, 'java': 1}),
 Counter({'and': 1, 'python': 1, 'too': 1})]


In [7]:
# Implementing Bag of Words in scikit-learn

In [8]:
import pandas as pd
documents = ['I am Saiban',
             'I love Paksitan',
             'I love java',
             'and python too']

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(documents)

In [10]:
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8',
                input=['I am Saiban', 'I love Paksitan', 'I love java',
                       'and python too'],
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [11]:
count_vector.fit(documents)
count_vector.get_feature_names()

['am', 'and', 'java', 'love', 'paksitan', 'python', 'saiban', 'too']

In [12]:
doc_array = count_vector.transform(documents).toarray()
doc_array

array([[1, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [13]:
frequency_matrix = pd.DataFrame(doc_array,index=documents,columns=count_vector.get_feature_names())
frequency_matrix

Unnamed: 0,am,and,java,love,paksitan,python,saiban,too
I am Saiban,1,0,0,0,0,0,1,0
I love Paksitan,0,0,0,1,1,0,0,0
I love java,0,0,1,1,0,0,0,0
and python too,0,1,0,0,0,1,0,1
