In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from gensim.models import word2vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold

## Dataset

In [2]:
from sklearn.datasets import fetch_20newsgroups
train_raw_df = fetch_20newsgroups(subset='train', data_home='./scikit_learn_data')
x_train = train_raw_df.data
y_train = train_raw_df.target

Now we write the required functions to process our dataset.

In [3]:
# library for cleaning texts
import re

# download the list of all non-significant/irrelevant words
import nltk
nltk.download('stopwords', download_dir='/usr/local/share/nltk_data')
from nltk.corpus import stopwords

# library for stemming (filter out the base form of the words)
from nltk.stem.porter import PorterStemmer

# cleaning one sentence of the dataset
def cleaning(sentence):
    # replace all characters except letters in dataset with ' '
    tokens = re.sub('[^a-zA-Z]', ' ', sentence)

    # change all letters to lower case
    tokens = tokens.lower()

    # split the sentence into a list of words
    tokens = tokens.split()
    
    # remove irrelevant words
    ps = PorterStemmer()
    tokens = [ps.stem(word) for word in tokens if not word in set(stopwords.words('english'))]
    
    # return the sentence as a list of words
    return tokens

# create a list of lists containing words from each sentence
def preprocess_wv(x):
    corpus = [cleaning(sentence) for sentence in x]   
    return corpus

# create a corpus of cleaned sentences for BoW models
def preprocess_bow(x):
    corpus = [' '.join(cleaning(sentence)) for sentence in x]   
    return corpus

# build BoW models from the corpus
def build_model(mode):
    # choose which type of model to use
    vect = None
    if mode == 'count':
        vect = CountVectorizer()
    elif mode == 'tf':
        vect = TfidfVectorizer(use_idf=False, norm='l2')
    else:
        raise ValueError('Mode should be either count or tfidf')
    
    return Pipeline([
        ('vect', vect),
        ('clf' , LogisticRegression(solver='newton-cg',n_jobs=-1))
    ])

# process our dataset
def pipeline(x, y, mode):
    processed_x = preprocess_bow(x)
    
    model_pipeline = build_model(mode)
    cv = KFold(n_splits=5, shuffle=True)
    
    scores = cross_val_score(model_pipeline, processed_x, y, cv=cv, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f)" % (scores.mean(), scores.std() * 2))
    
    return model_pipeline

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Training the Word2Vec model

Let's build the vocabulary and train the Word2Vec model.

In [4]:
documents = preprocess_wv(x_train)
model = word2vec.Word2Vec(documents, size=100, window=10, min_count=2, workers=4)
model.train(documents, total_examples=len(documents), epochs=10)

(19807064, 21334320)

## Let's look at some output

This is the vocabulary of our dataset:

In [5]:
model.wv.vocab

{'lerxst': <gensim.models.keyedvectors.Vocab at 0x1a2ce967f0>,
 'wam': <gensim.models.keyedvectors.Vocab at 0x1a2ce965c0>,
 'umd': <gensim.models.keyedvectors.Vocab at 0x1a2cead9b0>,
 'edu': <gensim.models.keyedvectors.Vocab at 0x1a2cead358>,
 'thing': <gensim.models.keyedvectors.Vocab at 0x1a2cead4e0>,
 'subject': <gensim.models.keyedvectors.Vocab at 0x1a2cead5c0>,
 'car': <gensim.models.keyedvectors.Vocab at 0x1a2cead128>,
 'nntp': <gensim.models.keyedvectors.Vocab at 0x1a2ceadd30>,
 'post': <gensim.models.keyedvectors.Vocab at 0x1a2ceadc18>,
 'host': <gensim.models.keyedvectors.Vocab at 0x1a2cea89b0>,
 'rac': <gensim.models.keyedvectors.Vocab at 0x1a2cea8048>,
 'organ': <gensim.models.keyedvectors.Vocab at 0x1a2cea8278>,
 'univers': <gensim.models.keyedvectors.Vocab at 0x1a2cec01d0>,
 'maryland': <gensim.models.keyedvectors.Vocab at 0x1a2cec0f98>,
 'colleg': <gensim.models.keyedvectors.Vocab at 0x1a2cec0ef0>,
 'park': <gensim.models.keyedvectors.Vocab at 0x1a2cec0fd0>,
 'line': <gen

This is the vector for the word "car":

In [6]:
model.wv['car']

array([ 3.0149181e+00,  3.4787848e+00,  1.3744807e+00,  1.4684216e+00,
       -2.5903833e+00, -3.1519880e+00, -4.1504889e+00,  2.2451546e+00,
       -1.5356789e+00,  1.0744878e+00, -2.0762601e+00, -1.9990988e+00,
        7.8651035e-01, -2.2692094e+00,  1.4802520e+00,  1.3099872e+00,
       -4.4425554e+00, -3.0555847e-01, -3.1820889e+00, -5.2851143e+00,
       -2.9989252e+00, -6.5653855e-01,  4.4279890e+00,  1.5355009e-01,
       -5.7501669e+00, -1.9123240e+00, -1.9906388e-01, -1.4370259e+00,
        3.2023165e-01,  1.5755341e+00, -1.3583846e+00, -1.0737414e+00,
        8.2900530e-01, -4.9709005e+00,  3.3249788e+00,  2.2614090e+00,
        4.0890727e-06, -1.9892337e-02, -1.5423027e+00,  2.3841318e-01,
       -1.0772984e+00,  2.5773036e+00,  1.9911669e+00,  2.6654141e+00,
       -1.0312412e+00, -4.1696601e+00,  7.0407271e-01,  2.6111569e+00,
       -3.8363812e+00, -4.5188814e-01,  1.2628269e-01, -2.8677020e+00,
        4.6998715e+00,  2.1673179e-01, -1.2232567e+00, -1.3001294e+00,
      

Now let's try to look up words similar to the word "car"

In [7]:
model.wv.similar_by_word('car', topn = 5)

  if np.issubdtype(vec.dtype, np.int):


[('nissan', 0.6621789932250977),
 ('toyota', 0.6598957180976868),
 ('driven', 0.6593411564826965),
 ('wagon', 0.6563270688056946),
 ('dealership', 0.6523199081420898)]

### Similarity between two words in the vocabulary

We can also use the Word2Vec model to return the similarity between two words that are present in the vocabulary.

In [8]:
model.wv.similarity(w1="car",w2="bike")

  if np.issubdtype(vec.dtype, np.int):


0.5462671

In [9]:
model.wv.similarity(w1="car",w2="car")

  if np.issubdtype(vec.dtype, np.int):


1.0

In [10]:
model.wv.similarity(w1="car",w2="hard")

  if np.issubdtype(vec.dtype, np.int):


0.09504748

## Bag of Words
Now let's have a look at the BoW model. In some situations, using BoW may be better than Word Embedding, for example:
1. Building an baseline model. By using scikit-learn, you need just a few lines of code to build model.
2. If your dataset is small and context is domain specific. Context is very domain specific means that you cannot find corresponding Vector from pre-trained word embedding models (GloVe, fastText etc).

Below are some simple ways to build BoW models:

## Count Occurrences
Counting word occurrences. The reason behind using this approach is that keywords or important signals occur repeatedly. So the number of occurrences can represent the importance of word. For example:

In [11]:
# example sentence
doc = "In the-state-of-art of the NLP field, Embedding is the \
success way to resolve text related problem and outperform \
Bag of Words ( BoW ). Indeed, BoW introduced limitations \
large feature dimension, sparse representation etc."

In [12]:
# Initialize a CountVectorizer object
count_vec = CountVectorizer()

# Transforms the data into a bag of words (sparse matrix)
count_occurs = count_vec.fit_transform([doc])

# Create a table to count occurrences of each word
count_occur_df = pd.DataFrame((count, word) for word, count in 
                              zip(count_occurs.toarray().tolist()[0], count_vec.get_feature_names()))
count_occur_df.columns = ['Word', 'Count']
count_occur_df.sort_values('Count', ascending=False, inplace=True)
count_occur_df.head(10)

Unnamed: 0,Word,Count
16,of,3
26,the,3
3,bow,2
0,and,1
28,way,1
27,to,1
25,text,1
24,success,1
23,state,1
22,sparse,1


## Normalized Count Occurrences
Normalization can be applied to avoid model bias. For example:

In [13]:
# Initialize a TfidfVectorizer object
norm_count_vec = TfidfVectorizer(use_idf=False, norm='l2')

# Transforms the data into a bag of words (sparse matrix)
norm_count_occurs = norm_count_vec.fit_transform([doc])

# Create a table to count occurrences of each word
norm_count_occur_df = pd.DataFrame((count, word) for word, count in 
                                   zip(norm_count_occurs.toarray().tolist()[0], norm_count_vec.get_feature_names()))
norm_count_occur_df.columns = ['Word', 'Count']
norm_count_occur_df.sort_values('Count', ascending=False, inplace=True)
norm_count_occur_df.head(10)

Unnamed: 0,Word,Count
16,of,0.428571
26,the,0.428571
3,bow,0.285714
0,and,0.142857
28,way,0.142857
27,to,0.142857
25,text,0.142857
24,success,0.142857
23,state,0.142857
22,sparse,0.142857


Now let's evaluate the two BoW models:

In [14]:
print('Using Count Vectorizer:')
model_pipeline = pipeline(x_train, y_train, mode='count')

print('\nUsing TF Vectorizer:')
model_pipeline = pipeline(x_train, y_train, mode='tf')

Using Count Vectorizer:
Accuracy: 0.8951 (+/- 0.0113)

Using TF Vectorizer:
Accuracy: 0.8564 (+/- 0.0161)
