In [None]:
## Note: passing 'binary=True' to CountVectorizer produces a One Hot (presence/absence) Encoding.
##       With 'binary=False' (the default), CountVectorizer produces Bag of Words term counts instead.

#### Example 01

In [2]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents; the third one combines the wording of the first two.
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet',
])

# Default CountVectorizer: tokens are lowercased and raw term counts are kept.
count = CountVectorizer()
bag = count.fit_transform(docs)

print(count.vocabulary_)  # mapping: token -> column index in the matrix
print(bag.toarray())      # dense document-term count matrix (one row per document)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


### Example 02

In [8]:
import pandas as pd

# Labelled toy corpus: (review text, sentiment label) pairs.
train = [
    ("Thanks for an excellent excellent report", "pos"),
    ("Your service is very quick and fast and", "pos"),
    ("I am pleased with your service", "pos"),
    ("I did not know i was diabetic until you gave me this report", "neg"),
    ("Service - Little slow, probably because too many people.", "neg"),
    ("The place is not easy to locate", "neg"),
    ("The place is very easy to locate", "pos"),
    ("Not satisfied will take a second opinion", "neg"),
    ("No human contact everything is so robotic here", "neg"),
]

# One row per review, with the text and its label in separate columns.
df = pd.DataFrame(data=train, columns=["review", "sentiment"])

df.head()

Unnamed: 0,review,sentiment
0,Thanks for an excellent excellent report,pos
1,Your service is very quick and fast and,pos
2,I am pleased with your service,pos
3,I did not know i was diabetic until you gave m...,neg
4,"Service - Little slow, probably because too ma...",neg


In [9]:
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def preprocessData(review):
    """Normalise one review: tokenize, lowercase, and re-join with spaces.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The lowercased tokens produced by ``word_tokenize`` joined by single
        spaces, so punctuation split off by the tokenizer ends up
        space-separated (e.g. ``"slow,"`` becomes ``"slow ,"``).
    """
    tokens = word_tokenize(review)
    return ' '.join(token.lower() for token in tokens)

In [10]:
# Add a 'cleaned_review' column holding the tokenized, lowercased version of each review.
df['cleaned_review'] = df['review'].map(preprocessData)

In [11]:
# Preview the first rows, now including the 'cleaned_review' column.
df.head()

Unnamed: 0,review,sentiment,cleaned_review
0,Thanks for an excellent excellent report,pos,thanks for an excellent excellent report
1,Your service is very quick and fast and,pos,your service is very quick and fast and
2,I am pleased with your service,pos,i am pleased with your service
3,I did not know i was diabetic until you gave m...,neg,i did not know i was diabetic until you gave m...
4,"Service - Little slow, probably because too ma...",neg,"service - little slow , probably because too m..."


In [12]:
# Bag of Words (term counts) using scikit-learn's CountVectorizer.
# NOTE: binary=False keeps raw counts, so a repeated word (e.g. "excellent") scores 2;
#       pass binary=True instead to get a One Hot (0/1 presence) encoding.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=False, lowercase=True)

# Sparse document-term matrix: one row per review, one column per vocabulary term.
vectors = vectorizer.fit_transform(df['cleaned_review'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
dff = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())

dff.head(5)

Unnamed: 0,am,an,and,because,contact,diabetic,did,easy,everything,excellent,...,this,to,too,until,very,was,will,with,you,your
0,0,1,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,1,1,0,0,0,...,1,0,0,1,0,1,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [15]:
dff.head(1).values  # 'excellent' appears twice in the first review, so its count is 2

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0]], dtype=int64)