In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold

## Dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the training split of 20 Newsgroups (cached under ./scikit_learn_data).
# Despite its name, train_raw_df is a scikit-learn Bunch, not a DataFrame.
train_raw_df = fetch_20newsgroups(data_home='./scikit_learn_data', subset='train')

# Raw posts and their labels, in matching order
x_train, y_train = train_raw_df.data, train_raw_df.target

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Now we write the required functions to process our dataset.

In [3]:
# regular expressions, used below to strip non-letter characters from the text
import re

# download NLTK's stop-word lists (very common words that carry little meaning)
# NOTE(review): download_dir is an absolute, system-wide path; it may need write
# permission on other machines -- consider a user- or project-level directory.
import nltk
nltk.download('stopwords', download_dir='/usr/local/share/nltk_data')
from nltk.corpus import stopwords

# Porter stemmer: reduces each word to its stem (e.g. "university" -> "univers")
from nltk.stem.porter import PorterStemmer

# English stop words, loaded from NLTK once and reused by every call to cleaning()
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading them only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


# cleaning one sentence of the dataset
def cleaning(sentence):
    """Turn one raw document into a list of lower-cased, stemmed tokens.

    Steps:
      1. every character that is not an ASCII letter is replaced by a space;
      2. the text is lower-cased and split on whitespace;
      3. English stop words are dropped;
      4. each remaining word is reduced to its Porter stem.

    Parameters
    ----------
    sentence : str
        Raw text of one document.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order (empty for empty input).
    """
    # replace all characters except letters with ' ', then lower-case
    text = re.sub('[^a-zA-Z]', ' ', sentence).lower()

    # Look the stop words up once per call (and load them once overall); the
    # previous version rebuilt the whole set for every single token.
    stop_words = _english_stop_words()
    ps = PorterStemmer()

    # filter first, then stem, so stop words are matched in their surface form
    return [ps.stem(word) for word in text.split() if word not in stop_words]

# create a list of lists containing words from each sentence
def preprocess_wv(x):
    """Tokenise every document in ``x`` for Word2Vec.

    Applies ``cleaning`` to each document and returns the resulting token
    lists in input order (one inner list per document).
    """
    return list(map(cleaning, x))

# create a corpus of cleaned sentences for BoW models
def preprocess_bow(x):
    """Clean every document in ``x`` and return each one as a single string.

    Each document is passed through ``cleaning`` and its tokens are re-joined
    with single spaces, which is the input form the BoW vectorizers expect.
    """
    cleaned_docs = []
    for sentence in x:
        tokens = cleaning(sentence)
        cleaned_docs.append(' '.join(tokens))
    return cleaned_docs

# build BoW models from the corpus
def build_model(mode):
    """Build an unfitted bag-of-words text-classification pipeline.

    Parameters
    ----------
    mode : str
        Which vectorizer to put in front of the classifier:
        'count'  -- raw term counts (``CountVectorizer()``);
        'tf'     -- L2-normalised term frequencies
                    (``TfidfVectorizer(use_idf=False, norm='l2')``);
        'tfidf'  -- TF-IDF weights (``TfidfVectorizer()`` with defaults).

    Returns
    -------
    Pipeline
        Step 'vect' (the chosen vectorizer) followed by step 'clf'
        (``LogisticRegression(solver='newton-cg', n_jobs=-1)``).

    Raises
    ------
    ValueError
        If ``mode`` is not one of the three values above.
    """
    # choose which type of model to use
    if mode == 'count':
        vect = CountVectorizer()
    elif mode == 'tf':
        vect = TfidfVectorizer(use_idf=False, norm='l2')
    elif mode == 'tfidf':
        # previously missing: 'tfidf' fell through to the ValueError below
        vect = TfidfVectorizer()
    else:
        raise ValueError("Mode should be one of 'count', 'tf' or 'tfidf', got %r" % (mode,))

    return Pipeline([
        ('vect', vect),
        ('clf', LogisticRegression(solver='newton-cg', n_jobs=-1)),
    ])

# process our dataset
def pipeline(x, y, mode, n_splits=5, random_state=42):
    """Cross-validate a bag-of-words classifier on raw documents.

    The documents are cleaned with ``preprocess_bow``, a pipeline is built with
    ``build_model(mode)``, and its accuracy is estimated with shuffled K-fold
    cross-validation. The mean accuracy and twice its standard deviation are
    printed.

    Parameters
    ----------
    x : iterable of str
        Raw documents.
    y : array-like
        One label per document.
    mode : str
        Vectorizer mode, forwarded to ``build_model``.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default 42
        Seed for the fold shuffling. A fixed seed makes the reported accuracy
        reproducible between runs; pass None for a different split each time.

    Returns
    -------
    Pipeline
        The pipeline object that was evaluated. Note that cross_val_score fits
        clones of the estimator, so the returned pipeline itself is NOT fitted;
        call ``.fit`` on it before predicting.
    """
    processed_x = preprocess_bow(x)

    model_pipeline = build_model(mode)
    # shuffle=True without a seed gave different folds (and scores) on every run
    cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    scores = cross_val_score(model_pipeline, processed_x, y, cv=cv, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))

    return model_pipeline

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Training the Word2Vec model

Let's build the vocabulary and train the Word2Vec model.

In [4]:
# Tokenise the corpus: one list of cleaned tokens per post
documents = preprocess_wv(x_train)

# Create the model WITHOUT handing it the corpus. Passing `documents` to the
# constructor would already build the vocabulary and run a full training pass,
# so the explicit train() call below would train the model a second time.
model = word2vec.Word2Vec(size=100, window=10, min_count=2, workers=4)
model.build_vocab(documents)

# Single, explicit training run over the corpus
model.train(documents, total_examples=len(documents), epochs=10)

(19806945, 21334320)

## Let's look at some output

This is the vocabulary of our dataset:

In [5]:
# Preview only the first 20 entries -- the full vocabulary is far too long to display
dict(list(model.wv.vocab.items())[:20])

{'lerxst': <gensim.models.keyedvectors.Vocab at 0x1a235342b0>,
 'wam': <gensim.models.keyedvectors.Vocab at 0x1a23534748>,
 'umd': <gensim.models.keyedvectors.Vocab at 0x1a2354b0f0>,
 'edu': <gensim.models.keyedvectors.Vocab at 0x1a2354bcf8>,
 'thing': <gensim.models.keyedvectors.Vocab at 0x1a2354b6d8>,
 'subject': <gensim.models.keyedvectors.Vocab at 0x1a2354bac8>,
 'car': <gensim.models.keyedvectors.Vocab at 0x1a2354b278>,
 'nntp': <gensim.models.keyedvectors.Vocab at 0x1a2354b978>,
 'post': <gensim.models.keyedvectors.Vocab at 0x1a2354bc50>,
 'host': <gensim.models.keyedvectors.Vocab at 0x1a23547da0>,
 'rac': <gensim.models.keyedvectors.Vocab at 0x1a23547be0>,
 'organ': <gensim.models.keyedvectors.Vocab at 0x1a235470b8>,
 'univers': <gensim.models.keyedvectors.Vocab at 0x1a235478d0>,
 'maryland': <gensim.models.keyedvectors.Vocab at 0x1a235474e0>,
 'colleg': <gensim.models.keyedvectors.Vocab at 0x1a23547f28>,
 'park': <gensim.models.keyedvectors.Vocab at 0x1a23547320>,
 'line': <gen

This is the vector for the word "car":

In [6]:
# Look up the learned embedding vector for the (stemmed) token "car"
model.wv['car']

array([ 0.48168394,  0.58812076,  3.3706393 , -0.1423689 ,  3.3050134 ,
        3.076569  , -4.221543  ,  1.4831884 ,  2.164517  ,  2.5697005 ,
        0.6336334 ,  0.5374686 ,  3.9173477 ,  1.2986215 ,  2.155824  ,
       -0.28841922,  2.8315086 , -1.0542144 , -2.8753872 , -3.1601791 ,
       -2.944351  ,  3.3646429 ,  0.70514286,  0.6503262 , -1.0394558 ,
        1.6471945 , -0.27621242,  0.21327908,  3.52527   , -0.5307024 ,
       -2.804661  ,  0.9072769 , -3.1328914 , -1.7292366 ,  0.36830965,
        2.1848297 , -1.6726061 , -1.0553693 , -2.1336393 ,  0.1097566 ,
        2.8939824 , -0.26097524,  4.0498776 , -0.3719951 ,  0.4365983 ,
       -1.3117374 , -2.969182  , -2.1095781 ,  0.10295936,  3.6426418 ,
        2.1645617 , -0.7936139 ,  0.31238562,  0.9756818 ,  3.1398711 ,
       -8.522598  , -1.5727097 , -0.9947048 , -3.379533  , -2.6543257 ,
       -6.1201386 ,  1.8999187 ,  1.7206688 , -1.831763  , -0.10841425,
       -1.6270516 ,  0.8015438 ,  2.4489594 ,  3.5294902 ,  3.22

Now let's try to look up words similar to the word "car"

In [7]:
# The five vocabulary entries closest to "car", as (word, similarity) pairs
model.wv.similar_by_word('car', topn=5)

  if np.issubdtype(vec.dtype, np.int):


[('sedan', 0.6838402152061462),
 ('driven', 0.673595130443573),
 ('porsch', 0.6728496551513672),
 ('wagon', 0.6658891439437866),
 ('toyota', 0.65086829662323)]

### Similarity between two words in the vocabulary

We can also use the Word2Vec model to return the similarity between two words that are present in the vocabulary.

In [8]:
# Two related words: expect a fairly high similarity
model.wv.similarity(w1='car', w2='bike')

  if np.issubdtype(vec.dtype, np.int):


0.52340454

In [9]:
# Sanity check: a word compared with itself
model.wv.similarity(w1='car', w2='car')

  if np.issubdtype(vec.dtype, np.int):


1.0

In [10]:
# Two unrelated words: expect a low similarity
model.wv.similarity(w1='car', w2='hard')

  if np.issubdtype(vec.dtype, np.int):


0.14325282

## Bag of Words
Now let's have a look at the BoW model. In some situations, using BoW may be better than word embeddings, for example:
1. Building a baseline model. With scikit-learn, you need just a few lines of code to build a model.
2. Your dataset is small and its context is domain-specific, meaning that you cannot find corresponding vectors in pre-trained word embedding models (GloVe, fastText, etc.).

Below are some simple ways to build BoW models:

## Count Occurrences
This approach counts word occurrences. The reasoning behind it is that keywords or important signals occur repeatedly, so the number of occurrences can represent the importance of a word. For example:

In [11]:
# Example document used to illustrate the vectorizers in the following cells.
# The trailing backslashes continue the string literal, so `doc` is one line of text.
doc = "In the-state-of-art of the NLP field, Embedding is the \
success way to resolve text related problem and outperform \
Bag of Words ( BoW ). Indeed, BoW introduced limitations \
large feature dimension, sparse representation etc."

In [12]:
# Initialize a CountVectorizer object
count_vec = CountVectorizer()

# Learn the vocabulary of the example document and encode the document as a
# sparse matrix of raw counts (one row, one column per vocabulary word)
count_occurs = count_vec.fit_transform([doc])

# One row per vocabulary word, paired with its count in the document
count_occur_df = pd.DataFrame({
    'Word': count_vec.get_feature_names(),
    'Count': count_occurs.toarray().tolist()[0],
})
count_occur_df = count_occur_df.sort_values('Count', ascending=False)
count_occur_df.head(10)

Unnamed: 0,Word,Count
16,of,3
26,the,3
3,bow,2
0,and,1
28,way,1
27,to,1
25,text,1
24,success,1
23,state,1
22,sparse,1


## Normalized Count Occurrences
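Raw counts grow with document length, which can bias a model towards longer documents. Normalizing each document's count vector (here to unit L2 norm) avoids this bias. For example: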

In [13]:
# Initialize a TfidfVectorizer object with IDF switched off, so it produces
# plain term counts scaled to unit L2 norm
norm_count_vec = TfidfVectorizer(use_idf=False, norm='l2')

# Learn the vocabulary of the example document and encode the document as a
# sparse matrix of normalized counts
norm_count_occurs = norm_count_vec.fit_transform([doc])

# One row per vocabulary word, paired with its normalized count
norm_count_occur_df = pd.DataFrame({
    'Word': norm_count_vec.get_feature_names(),
    'Count': norm_count_occurs.toarray().tolist()[0],
})
norm_count_occur_df = norm_count_occur_df.sort_values('Count', ascending=False)
norm_count_occur_df.head(10)

Unnamed: 0,Word,Count
16,of,0.428571
26,the,0.428571
3,bow,0.285714
0,and,0.142857
28,way,0.142857
27,to,0.142857
25,text,0.142857
24,success,0.142857
23,state,0.142857
22,sparse,0.142857


## TF-IDF
TF-IDF takes another approach: it assumes that a high frequency alone may not provide much information gain. In other words, rare words contribute more weight to the model.
A word's importance increases with the number of times it occurs within the same document (i.e. training record). On the other hand, it decreases the more often the word occurs across the corpus (i.e. in other training records).

In [14]:
# Initialize a TfidfVectorizer object with its default settings (IDF enabled)
tfidf_vec = TfidfVectorizer()

# Learn the vocabulary of the example document and encode the document as a
# sparse matrix of TF-IDF weights
tfidf_count_occurs = tfidf_vec.fit_transform([doc])

# One row per vocabulary word, paired with its TF-IDF weight.
# With a single document every term gets the same IDF, so after normalization
# these weights match the normalized counts of the previous section.
tfidf_count_occur_df = pd.DataFrame({
    'Word': tfidf_vec.get_feature_names(),
    'Count': tfidf_count_occurs.toarray().tolist()[0],
})
tfidf_count_occur_df = tfidf_count_occur_df.sort_values('Count', ascending=False)
tfidf_count_occur_df.head(10)

Unnamed: 0,Word,Count
16,of,0.428571
26,the,0.428571
3,bow,0.285714
0,and,0.142857
28,way,0.142857
27,to,0.142857
25,text,0.142857
24,success,0.142857
23,state,0.142857
22,sparse,0.142857


Now let's evaluate the three BoW models:

In [None]:
# Cross-validate one pipeline per vectorizer mode; each run prints its accuracy.
# model_pipeline ends up holding the pipeline from the last mode evaluated.
for header, bow_mode in [
    ('Using Count Vectorizer:', 'count'),
    ('\nUsing TF Vectorizer:', 'tf'),
    ('\nUsing TF-IDF Vectorizer------', 'tfidf'),
]:
    print(header)
    model_pipeline = pipeline(x_train, y_train, mode=bow_mode)

Using Count Vectorizer:
Accuracy: 0.8945 (+/- 0.0185)

Using TF Vectorizer:
