In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
def vectorize_text(corpus):
    """
    Build a bag-of-words count matrix for a text corpus.

    Every row of the returned dataframe is the vector representation of
    one document in ``corpus``; every column is one vocabulary term and
    each cell holds how many times that term occurs in the document.

    :param corpus: input text corpus (iterable of document strings)
    :return: dataframe of count vectors, columns labelled with the terms
    """
    bag_of_words_model = CountVectorizer()

    # fit_transform learns the vocabulary from the corpus and encodes
    # every document against it in a single call.
    # toarray() yields a plain ndarray; todense() would yield the legacy
    # np.matrix type, which NumPy discourages for new code.
    count_matrix = bag_of_words_model.fit_transform(corpus).toarray()

    # vocabulary_ maps term -> column index, so ordering the terms by
    # that index labels every column with the term it actually counts.
    vocabulary = bag_of_words_model.vocabulary_
    column_terms = sorted(vocabulary, key=vocabulary.get)

    return pd.DataFrame(count_matrix, columns=column_terms)

# Small demo corpus: four short documents, one string each.
corpus = [
    "Data Science is an overlap between Arts and Science",
    "Generally, Arts graduates are right-brained and Science graduates are left-brained",
    "Excelling in both Arts and Science at a time becomes difficult",
    "Natural Language Processing is a part of Data Science",
]

# One row per document, one column per vocabulary term.
df = vectorize_text(corpus)

# Peek at a single term: how often 'and' occurs in each document.
df["and"].head()

0    1
1    1
2    1
3    0
Name: and, dtype: int64

In [4]:
def bow_top_n(corpus, n):
    """
    Build a bag-of-words count matrix restricted to the ``n`` most
    frequently occurring words in the corpus.

    Every row of the returned dataframe represents one document; each
    cell holds the number of times the column's word occurs in that
    document (a count, not just a presence/absence flag).

    :param corpus: input text corpus (iterable of document strings)
    :param n: maximum number of vocabulary terms to keep, passed to
        ``CountVectorizer`` as ``max_features``
    :return: dataframe of count vectors with at most ``n`` columns
    """
    bag_of_words_model_small = CountVectorizer(max_features=n)

    # toarray() yields a plain ndarray; todense() would yield the legacy
    # np.matrix type, which NumPy discourages for new code.
    count_matrix = bag_of_words_model_small.fit_transform(corpus).toarray()

    # vocabulary_ maps term -> column index, so ordering the terms by
    # that index labels every column with the term it actually counts.
    vocabulary = bag_of_words_model_small.vocabulary_
    column_terms = sorted(vocabulary, key=vocabulary.get)

    return pd.DataFrame(count_matrix, columns=column_terms)

# Same corpus, but keep only the 10 most frequent vocabulary terms.
df_2 = bow_top_n(corpus, n=10)
df_2.head()

Unnamed: 0,an,and,are,arts,brained,data,graduates,is,right,science
0,1,1,0,1,0,1,0,1,0,2
1,0,1,2,1,2,0,2,0,1,1
2,0,1,0,1,0,0,0,0,0,1
3,0,0,0,0,0,1,0,1,0,1
