In [1]:
import collections
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold

In [3]:
from sklearn.datasets import fetch_20newsgroups
train_raw_df = fetch_20newsgroups(subset='train', data_home='./scikit_learn_data')

# The modelling cells further down refer to the documents and labels as
# x_train / y_train; define them here so the notebook runs from a fresh kernel.
# (Despite its name, train_raw_df is a scikit-learn Bunch: .data holds the raw
# post texts and .target the integer newsgroup labels.)
x_train = train_raw_df.data
y_train = train_raw_df.target

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


## Count Occurrence

Count how many times each word occurs. The idea behind this approach is that keywords and other important signals tend to occur again and again, so the number of occurrences represents the importance of a word: the higher the frequency, the more important the word.

In [19]:
doc = "In the-state-of-art of the NLP field, Embedding is the \
success way to resolve text related problem and outperform \
Bag of Words ( BoW ). Indeed, BoW introduced limitations \
large feature dimension, sparse representation etc."

# Initialize a CountVectorizer object
count_vec = CountVectorizer()

# Fit on the single document and transform it into a bag of words (sparse matrix)
count_occurs = count_vec.fit_transform([doc])

# Create a table with the number of occurrences of each word.
# `vocabulary_` maps every term to its column index in the matrix, so ordering
# the terms by that index lines them up with the counts. This avoids
# `get_feature_names()`, which was removed in scikit-learn 1.2.
count_occur_df = pd.DataFrame(
    {
        'Word': sorted(count_vec.vocabulary_, key=count_vec.vocabulary_.get),
        'Count': count_occurs.toarray()[0],
    },
    columns=['Word', 'Count'],
)
count_occur_df = count_occur_df.sort_values('Count', ascending=False)
count_occur_df.head()

Unnamed: 0,Word,Count
16,of,3
26,the,3
3,bow,2
0,and,1
28,way,1


## Normalized Count Occurrence

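Raw counts grow with document length, so longer documents get larger feature values. Normalizing each document's count vector (here to unit L2 norm) can be applied to avoid this bias in the model.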

In [20]:
doc = "In the-state-of-art of the NLP field, Embedding is the \
success way to resolve text related problem and outperform \
Bag of Words ( BoW ). Indeed, BoW introduced limitations \
large feature dimension, sparse representation etc."

# Initialize a TfidfVectorizer object; with use_idf=False it only produces
# term frequencies scaled to unit L2 norm (no inverse-document-frequency weight)
norm_count_vec = TfidfVectorizer(use_idf=False, norm='l2')

# Fit on the single document and transform it into a bag of words (sparse matrix)
norm_count_occurs = norm_count_vec.fit_transform([doc])

# Create a table with the normalized occurrence of each word.
# `vocabulary_` maps every term to its column index in the matrix, so ordering
# the terms by that index lines them up with the values. This avoids
# `get_feature_names()`, which was removed in scikit-learn 1.2.
norm_count_occur_df = pd.DataFrame(
    {
        'Word': sorted(norm_count_vec.vocabulary_, key=norm_count_vec.vocabulary_.get),
        'Count': norm_count_occurs.toarray()[0],
    },
    columns=['Word', 'Count'],
)
norm_count_occur_df = norm_count_occur_df.sort_values('Count', ascending=False)
norm_count_occur_df.head()

Unnamed: 0,Word,Count
16,of,0.428571
26,the,0.428571
3,bow,0.285714
0,and,0.142857
28,way,0.142857


## TF-IDF

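TF-IDF takes another approach: it assumes that a high frequency alone may not provide much information gain. In other words, rare words may contribute more weight to the model. 
The importance of a word increases with its frequency within a document (i.e. a training record) and decreases with the number of documents in the corpus that contain it. Note that the IDF part only has an effect when the vectorizer is fitted on more than one document.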

In [21]:
doc = "In the-state-of-art of the NLP field, Embedding is the \
success way to resolve text related problem and outperform \
Bag of Words ( BoW ). Indeed, BoW introduced limitations \
large feature dimension, sparse representation etc."

# Initialize a TfidfVectorizer object
tfidf_vec = TfidfVectorizer()

# Fit on the single document and transform it into a bag of words (sparse matrix).
# NOTE: the corpus here is one document, so every term appears in the same
# number of documents and gets the same IDF weight; the values below are
# therefore identical to the L2-normalized counts. Fit on several documents
# to see IDF down-weight common terms.
tfidf_count_occurs = tfidf_vec.fit_transform([doc])

# Create a table with the TF-IDF weight of each word.
# `vocabulary_` maps every term to its column index in the matrix, so ordering
# the terms by that index lines them up with the values. This avoids
# `get_feature_names()`, which was removed in scikit-learn 1.2.
tfidf_count_occur_df = pd.DataFrame(
    {
        'Word': sorted(tfidf_vec.vocabulary_, key=tfidf_vec.vocabulary_.get),
        'Count': tfidf_count_occurs.toarray()[0],
    },
    columns=['Word', 'Count'],
)
tfidf_count_occur_df = tfidf_count_occur_df.sort_values('Count', ascending=False)
tfidf_count_occur_df.head()

Unnamed: 0,Word,Count
16,of,0.428571
26,the,0.428571
3,bow,0.285714
0,and,0.142857
28,way,0.142857


In [22]:
# list of irrelevant words
stop_words = ['a', 'an', 'the']

# cleaning one element of the corpus
def cleaning(text):
    """Lower-case ``text`` and drop the tokens listed in ``stop_words``.

    Parameters
    ----------
    text : str
        One raw document.

    Returns
    -------
    str
        The remaining lower-cased tokens joined by single spaces.
    """
    # Split on any run of whitespace (spaces, tabs, newlines). Splitting on ' '
    # alone leaves tokens such as "foo\nthe" glued together, so stop words next
    # to a line break would never be removed.
    tokens = text.lower().split()
    # Remove stop words and rebuild the text
    return ' '.join(w for w in tokens if w not in stop_words)

# clean every document of the corpus
def preprocess_x(x):
    """Return a new list holding ``cleaning(text)`` for each text in ``x``, in order."""
    return list(map(cleaning, x))

# build a bag-of-words vectorizer followed by a logistic-regression classifier
def build_model(mode):
    """Create an unfitted text-classification pipeline.

    Parameters
    ----------
    mode : str
        Which vectorizer to use:
        'count' - raw term counts (CountVectorizer),
        'tf'    - L2-normalized term frequencies (TfidfVectorizer without IDF),
        'tfidf' - TF-IDF weights (TfidfVectorizer with its defaults).

    Returns
    -------
    Pipeline
        Steps 'vect' (the chosen vectorizer) and 'clf' (LogisticRegression).

    Raises
    ------
    ValueError
        If ``mode`` is not one of the three supported values.
    """
    if mode == 'count':
        vect = CountVectorizer()
    elif mode == 'tf':
        vect = TfidfVectorizer(use_idf=False, norm='l2')
    elif mode == 'tfidf':
        vect = TfidfVectorizer()
    else:
        # List every accepted value ('tf' used to be missing from the message)
        raise ValueError("Mode should be one of 'count', 'tf' or 'tfidf', got %r" % (mode,))

    return Pipeline([
        ('vect', vect),
        ('clf' , LogisticRegression(solver='newton-cg',n_jobs=-1))
    ])

def pipeline(x, y, mode, random_state=42):
    """Clean the texts, then report 5-fold cross-validated accuracy for ``mode``.

    Parameters
    ----------
    x : iterable of str
        Raw documents.
    y : array-like
        Class label of each document.
    mode : str
        Vectorizer choice passed to ``build_model``.
    random_state : int or None, default 42
        Seed for the shuffled K-fold split. A fixed seed makes the printed
        scores reproducible and evaluates every ``mode`` on the same folds;
        pass None for a different random split on each call.

    Returns
    -------
    Pipeline
        The pipeline built for ``mode``. cross_val_score fits clones of the
        estimator, so the returned object itself has not been fitted; call
        ``.fit`` on it before predicting.
    """
    processed_x = preprocess_x(x)

    model_pipeline = build_model(mode)
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)

    scores = cross_val_score(model_pipeline, processed_x, y, cv=cv, scoring='accuracy')
    # Mean accuracy over the folds, with two standard deviations as the spread
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

    return model_pipeline

In [23]:
x = preprocess_x(x_train)
y = y_train

model_pipeline = build_model(mode='count')
model_pipeline.fit(x, y)

# The fitted vectorizer's `vocabulary_` maps each term to its feature index,
# so its length is the vocabulary size. This avoids `get_feature_names()`,
# which was removed in scikit-learn 1.2.
print('Number of Vocabulary: %d' % len(model_pipeline.named_steps['vect'].vocabulary_))

Number of Vocabulary: 130107


In [10]:
# Cross-validate the same classifier with each vectorizer in turn.
# After the loop, model_pipeline holds the pipeline of the last mode ('tfidf').
vectorizer_runs = (
    ('Using Count Vectorizer------', 'count'),
    ('Using TF Vectorizer------', 'tf'),
    ('Using TF-IDF Vectorizer------', 'tfidf'),
)
for banner, mode in vectorizer_runs:
    print(banner)
    model_pipeline = pipeline(x_train, y_train, mode=mode)

Using Count Vectorizer------
Accuracy: 0.8911 (+/- 0.0091)
Using TF Vectorizer------
Accuracy: 0.8035 (+/- 0.0115)
Using TF-IDF Vectorizer------
Accuracy: 0.8915 (+/- 0.0084)
