## Count Vectorizer
* Text encoding technique

### 1. Fit and Transform separately

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

text_data = ['Why do I have to learn everything from scratch Why?',
            'There are already builtin functions to do it.',
            "Why can't we simply use that?"]
vec = CountVectorizer()
vec = vec.fit(text_data)

# Transformed Data
data_transformed = vec.transform(text_data)

print(vec.vocabulary_)
# Stopwords are already removed and words are converted into lowercase
df = DataFrame(data_transformed.toarray(),columns=vec.vocabulary_)
df

{'why': 18, 'do': 4, 'have': 8, 'to': 15, 'learn': 10, 'everything': 5, 'from': 6, 'scratch': 11, 'there': 14, 'are': 1, 'already': 0, 'builtin': 2, 'functions': 7, 'it': 9, 'can': 3, 'we': 17, 'simply': 12, 'use': 16, 'that': 13}


Unnamed: 0,why,do,have,to,learn,everything,from,scratch,there,are,already,builtin,functions,it,can,we,simply,use,that
0,0,0,0,0,1,1,1,0,1,0,1,1,0,0,0,1,0,0,2
1,1,1,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,1


### 2. Fit and Transform combined

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

text_data = ['Why do I have to learn everything from scratch Why?',
            'There are already builtin functions to do it.',
            "Why can't we simply use that?",]
vec = CountVectorizer()

data_transformed = vec.fit_transform(text_data)
# Stopwords are already removed and words are converted into lowercase
df = DataFrame(data_transformed.toarray(),columns=vec.vocabulary_)
df

Unnamed: 0,why,do,have,to,learn,everything,from,scratch,there,are,already,builtin,functions,it,can,we,simply,use,that
0,0,0,0,0,1,1,1,0,1,0,1,1,0,0,0,1,0,0,2
1,1,1,1,0,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,1


In [10]:
vec.transform(['There are already builtin functions to do it.']).toarray()

array([[1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0]])

### 3. Dealing with Stop Words

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

text_data = ['Why do I have to learn everything from scratch Why?',
            'There are already builtin functions to do it.',
            "Why can't we simply use that?"]
vec = CountVectorizer(stop_words='english',lowercase=True)
vec = vec.fit(text_data)

# Transformed Data
data_transformed = vec.transform(text_data)

print(vec.vocabulary_)
# Stopwords are already removed and words are converted into lowercase
df = DataFrame(data_transformed.toarray(),columns=vec.vocabulary_)
df

{'learn': 2, 'scratch': 3, 'builtin': 0, 'functions': 1, 'simply': 4, 'use': 5}


Unnamed: 0,learn,scratch,builtin,functions,simply,use
0,0,0,1,1,0,0
1,1,1,0,0,0,0
2,0,0,0,0,1,1


### 4. Dealing with N-Gram

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

text_data = ['Why do I have to learn everything from scratch Why?',
            'There are already builtin functions to do it.',
            "Why can't we simply use that?"]

# Rather than taking one word take pairs at a time
# Take all one word + pair of two words
vec = CountVectorizer(ngram_range=(1,2))
vec = vec.fit(text_data)

# Transformed Data
data_transformed = vec.transform(text_data)
# Stopwords are already removed and words are converted into lowercase
df = DataFrame(data_transformed.toarray(),columns=vec.vocabulary_)
df

Unnamed: 0,why,do,have,to,learn,everything,from,scratch,why do,do have,have to,to learn,learn everything,everything from,from scratch,scratch why,there,are,already,builtin,functions,it,there are,are already,already builtin,builtin functions,functions to,to do,do it,can,we,simply,use,that,why can,can we,we simply,simply use,use that
0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,1,0,0,1,1,0,1,1,1,1,0,0,0,0,0,1,0,1,0,0,0,0,2,0,1
1,1,1,1,1,1,1,0,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,1,1,1,1,0


### 5. Count Vectorizer with Data Preprocessing

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

text_data = ['Why do I have to learn everything from scratch Why?',
            'There are already builtin functions to do it.',
            "Why can't we simply use that?"]

# Rather than taking one word take pairs at a time
# Take all one word + pair of two words
vec = CountVectorizer(ngram_range=(2,2),stop_words='english',lowercase=True)
vec = vec.fit(text_data)

# Transformed Data
data_transformed = vec.transform(text_data)
# Stopwords are already removed and words are converted into lowercase
df = DataFrame(data_transformed.toarray(),columns=vec.vocabulary_)
df

Unnamed: 0,learn scratch,builtin functions,simply use
0,0,1,0
1,1,0,0
2,0,0,1
