# Sentiment analysis 

# Introduction
Analyze & classify sentiment of text data, articles into positive or negative

# Objective
Sentiment analysis notebooks dives in very depth of various concepts, methods related to text analysis and understand the meaning of it semantically and/or syntactly. They are classified in the following five based notebooks based on different methods & tools used to analyze & classify text.

1. Sentiment Analysis with Text Blob, Word Cloud, Count Vectorizer, N-Gram
2. Sentiment Analysis using Doc2Vec, N-Gram & Phrase Modelling
3. Sentiment Analysis with Chi2 Square & PCA Dimension Reduction
4. Sentiment Analysis with Keras & Tensorflow
5. Sentiment Analysis with Keras & Tensorflow using Doc2Vec, Pretrained GloVe

# Cinco
## 5. Sentiment Analysis with Text Blob, Word Cloud, Count Vectorizer, N-Gram, Zif2 law, Lexical Analysis

In [1]:
# Basic import

import re
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from gensim.models.word2vec import Word2Vec
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence, TaggedDocument
from gensim.models.phrases import Phrases, Phraser

import gensim.downloader as api

In [3]:
from textblob import TextBlob
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import scale

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from datetime import datetime
import pickle

import multiprocessing

In [4]:
# Read TF dataframe

df = pd.read_hdf('./data/redstone.hdf')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1600000 entries, 0 to 1599999
Data columns (total 3 columns):
sentiment        1600000 non-null int64
text             1600000 non-null object
pre_clean_len    1600000 non-null int64
dtypes: int64(2), object(1)
memory usage: 48.8+ MB


Unnamed: 0,sentiment,text,pre_clean_len
0,0,awww that bummer you shoulda got david carr of...,115
1,0,is upset that he can not update his facebook b...,111
2,0,dived many times for the ball managed to save ...,89
3,0,my whole body feels itchy and like its on fire,47
4,0,no it not behaving at all mad why am here beca...,111


In [5]:
# Santitizing dataframe

df.dropna(inplace=True)
df.reset_index(drop=True,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 3 columns):
sentiment        1600000 non-null int64
text             1600000 non-null object
pre_clean_len    1600000 non-null int64
dtypes: int64(2), object(1)
memory usage: 36.6+ MB


In [6]:
from sklearn import utils
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression

train = df.text
label = df.sentiment
SEED = 21

# Splitting data into train, test & validation sets
x_train, x_val_test, y_train, y_val_test = train_test_split(train, label, test_size=.02, random_state=SEED)

x_val, x_test, y_val, y_test = train_test_split(x_val_test, y_val_test, test_size=.5, random_state=SEED)



In [7]:
# Quantifying the positive & negative sentiments in the dataset

from collections import Counter

counter = Counter(y_train)
print('Train set entries.')
for key in counter:
    if key == 0:
        print('{:.2f}% Negative Entries'.format( (counter[key]/len(y_train))*100 ))
    elif key == 1:
        print('{:.2f}% Positive Entries'.format( (counter[key]/len(y_train))*100 ))
        
counter = Counter(y_val)
print('\nValidation set entries.')
for key in counter:
    if key == 0:
        print('{:.2f}% Negative Entries'.format( (counter[key]/len(y_val))*100 ))
    elif key == 1:
        print('{:.2f}% Positive Entries'.format( (counter[key]/len(y_val))*100 ))

counter = Counter(y_test)
print('\nTest set entries.')
for key in counter:
    if key == 0:
        print('{:.2f}% Negative Entries'.format( (counter[key]/len(y_test))*100 ))
    elif key == 1:
        print('{:.2f}% Positive Entries'.format( (counter[key]/len(y_test))*100 ))

Train set entries.
50.00% Negative Entries
50.00% Positive Entries

Validation set entries.
50.01% Negative Entries
49.99% Positive Entries

Test set entries.
50.21% Negative Entries
49.79% Positive Entries


In [8]:
# Calculate accuracy & summmary of different set of features

def accuracy_features(pipeline, x_train, y_train, x_test, y_test):
    
    counter = Counter(y_test)

    if (counter[0] / (len(y_test)*1.)) > 0.5:
        baseline_accuracy = counter[0] / (len(y_test)*1.)
    else:
        baseline_accuracy = 1. - (counter[0] / (len(y_test)*1.))
   
    # Timer starts
    timer = datetime.now()
    
    model = pipeline.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    elapsed_time = datetime.now() - timer
    # Timer stops

    accuracy = accuracy_score(y_test, y_pred)
    
    
    print('Baseline accuracy: {:.2f}%'.format(baseline_accuracy*100))
    print('Accuracy score: {:.2f}%'.format(accuracy*100))
    
    if(accuracy > baseline_accuracy):
        print('\nModel accuracy:{:.2f}% - Baseline accuracy:{:.2f}%: Increase of {:.2f}%'.format(accuracy*100, baseline_accuracy*100, (accuracy-baseline_accuracy)*100))
    else:
        print('Model accuracy:{:.2f}% - Baseline accuracy:{:.2f}%: Decrease of {:.2f}%'.format(accuracy*100, baseline_accuracy*100, (accuracy-baseline_accuracy)*100))
    
    print('Overall Train and Prediction time: {:.2f}s'.format(elapsed_time.total_seconds()))
    print('-'*89)
          
    return accuracy, elapsed_time

In [9]:
# Feature extraction, Iteratively

# Count Vectorizer, Logistic Regression model
cvec = CountVectorizer()
lr = LogisticRegression()

n_features = np.arange(10000, 100001, 10000)

def classical_feature_extraction(vectorizer=cvec, n_features=n_features, stop_words=None, ngram_range=(1, 1), classifier=lr):
    result = []
    print(classifier)

    for features in n_features:
        # Setting vectorizer parameters & creating pipeline
        cvec.set_params(stop_words=stop_words, max_features=features, ngram_range=ngram_range)
        pipeline = Pipeline([
            ('vectorizer', cvec),
            ('classifier', classifier)
        ])
        print("\nValidation result for {} features".format(features))
        
        # Calculate accuracy & summmary
        features_accuracy, features_time = accuracy_features(pipeline, x_train, y_train, x_val, y_val)
        result.append((features, features_accuracy, features_time))
        
    return result

In [10]:
# Vectorize train, validation sets using two models

def vectorize_concate(model1, model2, corpus, size):
    # Numpy zeros initialization
    vectors = np.zeros((len(corpus), size))
    
    for idx, count in zip(corpus.index, range(len(corpus.index))):
        prefix = 'all_' + str(idx)
        # Appending document vectors
        vectors[count] = np.append(model1.docvecs[prefix], model2.docvecs[prefix])

    return vectors

In [9]:
# Load the dbow_ug_model and delete temporary training data

dbow_ug_model = Doc2Vec.load('./data/dbow_ug_model.doc2vec')
dbow_ug_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [12]:
# Load the dmm_tg_model and delete temporary training data

dmm_tg_model = Doc2Vec.load('./data/dmm_tg_model.doc2vec')
dmm_tg_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [13]:
# Vectorize & concate document vectors of train, validation sets obtained from Distributed Bag Of Words & Distributed Memory Mean 

train_vecs_dbow_dmm = vectorize_concate(dbow_ug_model, dmm_tg_model, x_train, 200)
val_vecs_dbow_dmm = vectorize_concate(dbow_ug_model, dmm_tg_model, x_val, 200)

In [14]:
# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmm, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
clf.score(val_vecs_dbow_dmm, y_val)

0.754

### Artificial Neural Networks

After experimenting with Logistic Regression, it would be interesting to evaluate the result of neural network classifier. Logistic regression can be thought as a basic neural network with no hidden layer and just one output node.

![title](./images/lr_nn.png)

In [39]:
# Basic Keras Import

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint, EarlyStopping

from keras.models import load_model

import numpy as np

# Fix the seed
seed = 21

Using TensorFlow backend.


In [13]:
# Parameters

batch_size = 16

In [20]:
%%time

np.random.seed(seed)

# Create Model

model = Sequential()
model.add(Dense(64, input_dim=200, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile Model

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model

model.fit(train_vecs_dbow_dmm, y_train, validation_data=(val_vecs_dbow_dmm, y_val), batch_size=batch_size,
                    epochs=5, verbose=2)

Train on 1568000 samples, validate on 16000 samples
Epoch 1/5
 - 782s - loss: 0.4813 - acc: 0.7738 - val_loss: 0.4725 - val_acc: 0.7788
Epoch 2/5
 - 625s - loss: 0.4664 - acc: 0.7815 - val_loss: 0.4736 - val_acc: 0.7771
Epoch 3/5
 - 174s - loss: 0.4618 - acc: 0.7839 - val_loss: 0.4680 - val_acc: 0.7801
Epoch 4/5
 - 177s - loss: 0.4593 - acc: 0.7853 - val_loss: 0.4663 - val_acc: 0.7794
Epoch 5/5
 - 177s - loss: 0.4577 - acc: 0.7859 - val_loss: 0.4672 - val_acc: 0.7792
CPU times: user 1h 1min 14s, sys: 6min 49s, total: 1h 8min 4s
Wall time: 32min 16s


One can tune the hyperparameters using different combinations of Input, Hidden & Output layers, no of epochs etc.

#### Implementing Checkpoint & Early stopping

In [27]:
%%time

# Setting checkpoint & early stopping
checkpoint_path = './checkpoint/model_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5'
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

early_stop = EarlyStopping(monitor='val_acc', patience=5, mode='max') 
callbacks_list = [checkpoint, early_stop]

np.random.seed(seed)

# Create Model

model = Sequential()
model.add(Dense(64, input_dim=200, activation='relu'))
# model.add(Dense(256, activation='relu'))
# model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile Model

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model

model.fit(train_vecs_dbow_dmm, y_train, validation_data=(val_vecs_dbow_dmm, y_val), batch_size=batch_size,
                    epochs=3, verbose=2, callbacks=callbacks_list)

Train on 1568000 samples, validate on 16000 samples
Epoch 1/3
 - 178s - loss: 0.4811 - acc: 0.7738 - val_loss: 0.4714 - val_acc: 0.7778

Epoch 00001: val_acc improved from -inf to 0.77781, saving model to ./checkpoint/model_best_weights.01-0.7778.hdf5
Epoch 2/3
 - 179s - loss: 0.4663 - acc: 0.7817 - val_loss: 0.4722 - val_acc: 0.7789

Epoch 00002: val_acc improved from 0.77781 to 0.77887, saving model to ./checkpoint/model_best_weights.02-0.7789.hdf5
Epoch 3/3
 - 181s - loss: 0.4616 - acc: 0.7840 - val_loss: 0.4673 - val_acc: 0.7815

Epoch 00003: val_acc improved from 0.77887 to 0.78150, saving model to ./checkpoint/model_best_weights.03-0.7815.hdf5
CPU times: user 16min 44s, sys: 1min 27s, total: 18min 11s
Wall time: 8min 58s


In [26]:
pwd

'/home/indiano/PycharmProjects/TwitterAI/redstone'

In [28]:
# Evaluation gives the result from the last trained model 

model.evaluate(x=val_vecs_dbow_dmm, y=y_val)



[0.4673392976522446, 0.7815]

In [30]:
# Loading the best model

best_model = load_model('./checkpoint/model_best_weights.03-0.7815.hdf5')

In [31]:
# Evaluation gives the result from the best saved model 

best_model.evaluate(x=val_vecs_dbow_dmm, y=y_val)



[0.4673392976522446, 0.7815]

# Word2Vec

Due to different vocabularies 0f N-Grams, i will not use concatenated word vectors extracted from Doc2Vec model. Hence I will load N-Grams containing the same vocabulary i.e. Unigram DBOW & Unigram DMM and create concatenated word vectors.

In [14]:
# Load the dmm_ug_model and delete temporary training data

dmm_ug_model = Doc2Vec.load('./data/dmm_ug_model.doc2vec')
dmm_ug_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

### Word vectors extracted from Doc2Vec models (Average/Sum)¶

### Average

One idea is to sum up all the word vectors of a tweet found in Doc2Vec model vocabulary while counting the number of words having word vectors. Finally, we can divide the sum by the count to get averaged vector.

In [50]:
# Extract average words vectors for train, validation sets using two Doc2Vec models

def avg_un_dbow_dmm_wordvec(tweet, size):
    # Numpy zeros initialization
    vector = np.zeros(size).reshape((1, size))
    
    counter = 0
    for word in tweet.split():
        try:
            vector += np.append(dbow_ug_model[word], dmm_ug_model[word]).reshape((1, size))
            counter += 1
        except KeyError:
            continue
    
    # Counter not zero, divide by it to get averaged vector
    if counter:
        vector /= counter
    
    return vector

In [16]:
train_avg_vecs_dbow_dmm = np.concatenate([avg_un_dbow_dmm_wordvec(tweet, 200) for tweet in x_train])
val_avg_vecs_dbow_dmm = np.concatenate([avg_un_dbow_dmm_wordvec(tweet, 200) for tweet in x_val])

In [18]:
# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_avg_vecs_dbow_dmm, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
clf.score(val_avg_vecs_dbow_dmm, y_val)

0.7178125

#### Scaling the train & validation sets using Sklearn

In [23]:
train_avg_vecs_dbow_dmm_scaled = scale(train_avg_vecs_dbow_dmm)
val_avg_vecs_dbow_dmm_scaled = scale(val_avg_vecs_dbow_dmm)

In [24]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_avg_vecs_dbow_dmm_scaled, y_train)

CPU times: user 47.4 s, sys: 688 ms, total: 48.1 s
Wall time: 51.8 s


In [25]:
clf.score(val_avg_vecs_dbow_dmm_scaled, y_val)

0.728

### Sum

Another idea is to sum up all the word vectors of a tweet found in Doc2Vec model vocabulary without averaging them. It might distort the word vectors weightage if some tweets only have a few words in the Doc2Vec vocabulary and other have a lot of them.

In [51]:
# Extract average words vectors for train, validation sets using two Doc2Vec models

def sum_un_dbow_dmm_wordvec(tweet, size):
    # Numpy zeros initialization
    vector = np.zeros(size).reshape((1, size))
    
    for word in tweet.split():
        try:
            vector += np.append(dbow_ug_model[word], dmm_ug_model[word]).reshape((1, size))
        except KeyError:
            continue
    
    return vector

In [29]:
train_sum_vecs_dbow_dmm = np.concatenate([sum_un_dbow_dmm_wordvec(tweet, 200) for tweet in x_train])
val_sum_vecs_dbow_dmm = np.concatenate([sum_un_dbow_dmm_wordvec(tweet, 200) for tweet in x_val])

Word iiitttt not found in the vocabulary.
Word appppparantly not found in the vocabulary.
Word helicoper not found in the vocabulary.
Word musyt not found in the vocabulary.
Word featurize not found in the vocabulary.
Word rockwater not found in the vocabulary.
Word prettiful not found in the vocabulary.
Word winan not found in the vocabulary.
Word oraiotato not found in the vocabulary.
Word grafeio not found in the vocabulary.
Word theloume not found in the vocabulary.
Word imeras not found in the vocabulary.
Word parathiro not found in the vocabulary.
Word annaek not found in the vocabulary.
Word shouddd not found in the vocabulary.
Word festivalness not found in the vocabulary.
Word logline not found in the vocabulary.
Word inkn not found in the vocabulary.
Word plecing not found in the vocabulary.
Word kangkung not found in the vocabulary.
Word taliwang not found in the vocabulary.
Word extremeeeee not found in the vocabulary.
Word uihasuidhuiashduihausihduiahsuidhauisdh not found 

In [30]:
# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_sum_vecs_dbow_dmm, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [31]:
clf.score(val_sum_vecs_dbow_dmm, y_val)

0.72775

#### Scaling the train & validation sets using Sklearn

In [32]:
train_sum_vecs_dbow_dmm_scaled = scale(train_sum_vecs_dbow_dmm)
val_sum_vecs_dbow_dmm_scaled = scale(val_sum_vecs_dbow_dmm)

In [34]:
del train_avg_vecs_dbow_dmm, val_avg_vecs_dbow_dmm, train_sum_vecs_dbow_dmm, val_sum_vecs_dbow_dmm, train_avg_vecs_dbow_dmm_scaled, val_avg_vecs_dbow_dmm_scaled

NameError: name 'train_sum_vecs_dbow_dmm' is not defined

In [35]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_sum_vecs_dbow_dmm_scaled, y_train)

CPU times: user 1min 44s, sys: 2.44 s, total: 1min 46s
Wall time: 2min 10s


In [38]:
clf.score(val_sum_vecs_dbow_dmm_scaled, y_val)

0.72775

In [37]:
#### Populate table with Models & it's Accuracy

In [39]:
mydata = [['Averaged Word Vectors extracted from D2V Unigram DBOW + Unigram DMM', '71.78%'], 
          ['Averaged & Scaled Word Vectors extracted from D2V Unigram DBOW + Unigram DMM', '72.80%'],
          ['Summed Word Vectors extracted from D2V Unigram DBOW + Unigram DMM', '72.77%'], 
          ['Summed & Scaled Word Vectors extracted from D2V Unigram DBOW + Unigram DMM', '72.77%'],
          ['Document Vectors from D2V Unigram DBOW + Unigram DMM', '75.51%'],
          ['Document Vectors from D2V Unigram DBOW + Trigram DMM', '75.76%']]

In [40]:
from tabulate import tabulate
from IPython.display import HTML

display(HTML(tabulate(mydata, headers= ['Model', 'Validation Accuracy'], floatfmt='.4f', tablefmt='html')))

Model,Validation Accuracy
Averaged Word Vectors extracted from D2V Unigram DBOW + Unigram DMM,71.78%
Averaged & Scaled Word Vectors extracted from D2V Unigram DBOW + Unigram DMM,72.80%
Summed Word Vectors extracted from D2V Unigram DBOW + Unigram DMM,72.77%
Summed & Scaled Word Vectors extracted from D2V Unigram DBOW + Unigram DMM,72.77%
Document Vectors from D2V Unigram DBOW + Unigram DMM,75.51%
Document Vectors from D2V Unigram DBOW + Trigram DMM,75.76%


With scaling, Logistic Regression fitting took only 2 minutes.

### Word vectors extracted from Doc2Vec models with TFIDF weighting (Average/Sum)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer(min_df=2)
tvec = tvec.fit(x_train)

In [16]:
# Zip Feature names & TFIDF scores

tf_dict = dict(zip(tvec.get_feature_names(), tvec.idf_))
len(tf_dict)

103730

In [25]:
# Five elements of ditionary

import operator

for k, v in sorted(tf_dict.items(), key=operator.itemgetter(1))[:5]:
    print("{}: {}".format(k, v))

to: 2.250638633462157
the: 2.3190274243809395
my: 2.7347349928433418
and: 2.7719162140316103
it: 2.8083640290651157


In [34]:
# Five elements of ditionary

from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

take(5, tf_dict.items())

[('badu', 12.046436292779187),
 ('youuuuuuu', 11.738951593031228),
 ('represent', 10.787975303169182),
 ('braziiiil', 14.16669982897928),
 ('sovereign', 12.62625478803213)]

In [33]:
list(islice(tf_dict.items(), 0, 5))

[('badu', 12.046436292779187),
 ('youuuuuuu', 11.738951593031228),
 ('represent', 10.787975303169182),
 ('braziiiil', 14.16669982897928),
 ('sovereign', 12.62625478803213)]

In [35]:
# TFIDF features in Unigram DBOW model

len(set(dbow_ug_model.wv.vocab.keys()) & set(tvec.get_feature_names()))

103730

In [None]:
%%time

# Common features of DBOW model & TFIDF vectorizer
common_features = set(dbow_ug_model.wv.vocab.keys()).intersection(set(tvec.get_feature_names()))

tf_dict_weighted = {}
# Iterate through common features and make a dictionary of word vectors weighted by TFIDF score
for feature in common_features:
    tf_dict_weighted[feature] = np.append(dbow_ug_model[feature], dmm_ug_model[feature]) * tf_dict[feature]

In [47]:
# Dumping dictionary 

with open('./data/tf_dict_weighted.pkl', 'wb') as file:
    pickle.dump(tf_dict_weighted, file, protocol=pickle.HIGHEST_PROTOCOL)

In [64]:
# Loading dictionary 

with open('./data/tf_dict_weighted.pkl', 'rb') as file:
    tf_dict_weighted = pickle.load(file)

EOFError: Ran out of input

In [58]:
# Five elements of ditionary

from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

take(5, tf_dict_weighted.items())

NameError: name 'tf_dict_custom_weighted' is not defined

In [26]:
# Extract average words vectors for train, validation sets using Doc2Vec models

def wordvec_generali(tweet, size, tf_dict_weighted, aggregation='avg'):
    # Numpy zeros initialization
    vector = np.zeros(size).reshape((1, size))
    
    counter = 0
    for word in tweet.split():
        try:
            vector += tf_dict_weighted[word].reshape((1, size))
            counter += 1
        except KeyError:
            continue
    
    # Counter not zero, divide by it to get averaged vector
    if(aggregation == 'avg') and counter > 0:
        vector /= counter
    
    return vector

#### Scaling the train & validation sets using Sklearn

In [56]:
# Averaging over each word vector in a tweet 

train_avg_vecs_dbow_tf_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_weighted, 'avg') for tweet in x_train]))
val_avg_vecs_dbow_tf_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_weighted, 'avg') for tweet in x_val]))

In [54]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_avg_vecs_dbow_tf_weighted_scaled, y_train)

CPU times: user 1min 12s, sys: 529 ms, total: 1min 13s
Wall time: 1min 13s


In [57]:
clf.score(val_avg_vecs_dbow_tf_weighted_scaled, y_val)

0.654625

In [58]:
del train_avg_vecs_dbow_tf_weighted_scaled, val_avg_vecs_dbow_tf_weighted_scaled

In [59]:
# Summing over each word vector in a tweet 

train_sum_vecs_dbow_tf_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_weighted, 'sum') for tweet in x_train]))
val_sum_vecs_dbow_tf_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_weighted, 'sum') for tweet in x_val]))

In [60]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_sum_vecs_dbow_tf_weighted_scaled, y_train)

CPU times: user 1min 28s, sys: 623 ms, total: 1min 29s
Wall time: 1min 29s


In [62]:
clf.score(val_sum_vecs_dbow_tf_weighted_scaled, y_val)

0.6543125

By weighting word vectors with TFIDF values, the validation accuracy dropped in both case of averaging and summing.

#### Populate table with Models & it's Accuracy

In [64]:
mydata = [['Averaged Word Vectors extracted from D2V Unigram DBOW + Unigram DMM', '71.78%'], 
          ['Averaged & Scaled Word Vectors extracted from D2V Unigram DBOW + Unigram DMM', '72.80%'],
          ['Averaged Word Vectors extracted from D2V Unigram DBOW & TFIDF Weighted', '65.46%'], 
          ['Averaged & Scaled Word Vectors extracted from D2V Unigram DBOW & TFIDF Weighted', '65.43%'],
          ['Summed Word Vectors extracted from D2V Unigram DBOW + Unigram DMM', '72.77%'], 
          ['Summed & Scaled Word Vectors extracted from D2V Unigram DBOW + Unigram DMM', '72.77%'],
          ['Document Vectors from D2V Unigram DBOW + Unigram DMM', '75.51%'],
          ['Document Vectors from D2V Unigram DBOW + Trigram DMM', '75.76%']]

In [65]:
from tabulate import tabulate
from IPython.display import HTML

display(HTML(tabulate(mydata, headers= ['Model', 'Validation Accuracy'], floatfmt='.4f', tablefmt='html')))

Model,Validation Accuracy
Averaged Word Vectors extracted from D2V Unigram DBOW + Unigram DMM,71.78%
Averaged & Scaled Word Vectors extracted from D2V Unigram DBOW + Unigram DMM,72.80%
Averaged Word Vectors extracted from D2V Unigram DBOW & TFIDF Weighted,65.46%
Averaged & Scaled Word Vectors extracted from D2V Unigram DBOW & TFIDF Weighted,65.43%
Summed Word Vectors extracted from D2V Unigram DBOW + Unigram DMM,72.77%
Summed & Scaled Word Vectors extracted from D2V Unigram DBOW + Unigram DMM,72.77%
Document Vectors from D2V Unigram DBOW + Unigram DMM,75.51%
Document Vectors from D2V Unigram DBOW + Trigram DMM,75.76%


### Word vectors extracted from Doc2Vec models with Custom Weighting (Average/Sum)

The harmonic mean rank seems like the same as pos_freq_pct. By calculating the harmonic mean, the impact of small value (in this case, pos_freq_pct) is too aggravated and ended up dominating the mean value. This is again exactly same as just the frequency value rank and doesn't provide a meaningful result.

What we can try next is to get the CDF (Cumulative Distribution Function) value of both pos_rate and pos_freq_pct. CDF can be explained as "distribution function of X, evaluated at x, is the probability that X will take a value less than or equal to x". By calculating CDF value, we can see where the value of either pos_rate or pos_freq_pct lies in the distribution in terms of cumulative manner. In the below result of the code, we can see a word "welcome" with pos_rate_normcdf of 0.995625, and pos_freq_pct_normcdf of 0.999354. This means roughly 99.56% of the tokens will take a pos_rate value less than or equal to 0.91535, and 99.99% will take a pos_freq_pct value less than or equal to 0.001521.

Next, we calculate a harmonic mean of these two CDF values, as we did earlier. By calculating the harmonic mean, we can see that pos_normcdf_hmean metric provides a more meaningful measure of how important a word is within the class.

The custom weighting has been used before to visualize positive & negative tokens and create custom lexicon for the classification task.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer(max_features=10000)
cvec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
from scipy.stats import hmean, norm

def norm_cdf(x):
    return norm.cdf(x, x.mean(), x.std())

In [15]:
# Positive, negative sentiments dataframes
neg_train = x_train[y_train == 0]
pos_train = x_train[y_train == 1]

# Document Matrices
neg_doc_matrix = cvec.transform(neg_train)
pos_doc_matrix = cvec.transform(pos_train)

# Summing tf-idf values 
neg_tf = np.sum(neg_doc_matrix, axis=0)
pos_tf = np.sum(pos_doc_matrix, axis=0)

# Remove single-dimensional entries from the shape of an array.
negative = np.squeeze(np.asarray(neg_tf))
positive = np.squeeze(np.asarray(pos_tf))

# tf-idf values saved in the dataframe
tf_df = pd.DataFrame([negative, positive], columns=cvec.get_feature_names()).transpose()
#tf_df.to_hdf('./data/tfidf.hdf', 'TFIDF')

# Sorting positive, negative sentiments
tf_df.columns = ['negative', 'positive']
tf_df['total'] = tf_df['negative'] + tf_df['positive']

# Metrics from Scattertext
tf_df['pos_rate'] = tf_df['positive'] * 1./tf_df['total']

# Another metric is the frequency a words occurs in the class
tf_df['pos_freq_perc'] = tf_df['positive'] * 1./tf_df['positive'].sum()

# Calculate Cumulative Distribution Function value of both pos_rate and pos_freq_perc.
tf_df['pos_rate_norm_cdf'] = norm_cdf(tf_df['pos_rate'])
tf_df['pos_freq_perc_norm_cdf'] = norm_cdf(tf_df['pos_freq_perc'])

# Calculate hmean of pos_rate_norm_cdf & pos_freq_perc_norm_cdf
tf_df['pos_norm_cdf_hmean'] = hmean([tf_df['pos_rate_norm_cdf'], tf_df['pos_freq_perc_norm_cdf']])

tf_df.sort_values(by='pos_norm_cdf_hmean', ascending=False).iloc[:10]

Unnamed: 0,negative,positive,total,pos_rate,pos_freq_perc,pos_rate_norm_cdf,pos_freq_perc_norm_cdf,pos_norm_cdf_hmean
thanks,5643,33676,39319,0.856482,0.004059,0.988566,1.0,0.99425
thank,2241,15422,17663,0.873125,0.001859,0.991379,0.990694,0.991036
awesome,3744,14176,17920,0.791071,0.001709,0.968526,0.984312,0.976355
happy,6354,20102,26456,0.759828,0.002423,0.951625,0.999058,0.974765
great,8020,24803,32823,0.755659,0.002989,0.948903,0.999945,0.973755
love,16641,46750,63391,0.737486,0.005634,0.935582,1.0,0.966719
hey,4938,13730,18668,0.735483,0.001655,0.933962,0.981253,0.957024
nice,6468,16650,23118,0.720218,0.002007,0.920562,0.994633,0.956165
yay,3107,10299,13406,0.768238,0.001241,0.956765,0.936626,0.946588
haha,9644,21218,30862,0.687512,0.002557,0.88509,0.999495,0.93882


In [19]:
%%time

# Common features of DBOW model & Harmonic mean custom weighting
common_features = set(dbow_ug_model.wv.vocab.keys()).intersection(set(tf_df.index))

tf_dict_custom_weighted = {}
# Iterate through common features and make a dictionary of word vectors weighted by Harmonic mean custom weighting
for feature in common_features:
    tf_dict_custom_weighted[feature] = np.append(dbow_ug_model[feature], dmm_ug_model[feature]) * tf_df.pos_norm_cdf_hmean[feature]

CPU times: user 264 ms, sys: 8.35 ms, total: 273 ms
Wall time: 266 ms


In [24]:
len(dbow_ug_model.wv.vocab.keys())

106259

In [23]:
len(tf_df.index)

10000

In [22]:
# TFIDF features in Unigram DBOW model

len(set(dbow_ug_model.wv.vocab.keys()) & set(tf_df.index))

10000

In [25]:
# Dumping dictionary 

with open('./data/tf_dict_custom_weighted.pkl', 'wb') as file:
    pickle.dump(tf_dict_custom_weighted, file, protocol=pickle.HIGHEST_PROTOCOL)

#### Scaling the train & validation sets using Sklearn

In [40]:
# Averaging over each word vector in a tweet 

train_avg_vecs_dbow_tf_custom_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_custom_weighted, 'avg') for tweet in x_train]))
val_avg_vecs_dbow_tf_custom_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_custom_weighted, 'avg') for tweet in x_val]))

In [41]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_avg_vecs_dbow_tf_custom_weighted_scaled, y_train)

CPU times: user 1min 26s, sys: 648 ms, total: 1min 26s
Wall time: 1min 26s


In [42]:
clf.score(val_avg_vecs_dbow_tf_custom_weighted_scaled, y_val)

0.688625

In [31]:
del train_avg_vecs_dbow_tf_custom_weighted_scaled, val_avg_vecs_dbow_tf_custom_weighted_scaled

In [32]:
# Summing over each word vector in a tweet 

train_sum_vecs_dbow_tf_custom_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_custom_weighted, 'sum') for tweet in x_train]))
val_sum_vecs_dbow_tf_custom_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_custom_weighted, 'sum') for tweet in x_val]))

In [37]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_sum_vecs_dbow_tf_custom_weighted_scaled, y_train)

CPU times: user 1min 30s, sys: 564 ms, total: 1min 30s
Wall time: 1min 30s


In [38]:
clf.score(val_sum_vecs_dbow_tf_custom_weighted_scaled, y_val)

0.689375

In [47]:
del tf_dict_custom_weighted, train_avg_vecs_dbow_tf_custom_weighted_scaled, val_avg_vecs_dbow_tf_custom_weighted_scaled

The custom weighting improves validation accuracy a little bit as compared to TFIDF.

### Word vectors extracted from pre-trained GloVe (Average/Sum)

GloVe is another kind of word vector representaiton proposed by Pennington et al. (2014) from the Stanford NLP Group. 
- https://nlp.stanford.edu/pubs/glove.pdf

The difference between Word2Vec and Glove is how the models compute the word vectors. In Word2Vec, the word vectors can be seen as a by-product of shallow neural network, when it tries to predict either centre word given surrounding words or vice versa. 

GloVe word vectors are the object matrix of GloVe model which is calculated by using co-occurrence matrix and dimensionality reduction.

In addition to some pre-trained word vectors, new datasets can also be easily added using their downloader API. If you want to know more about this, please check this blog post by RaRe Technologies.
- https://rare-technologies.com/new-download-api-for-pretrained-nlp-models-and-datasets-in-gensim/

The Stanford NLP Group has made their pre-trained GloVe vectors publicly available and among them some of the GloVe vectors are trained specifically with Tweets. They have four different versions of Tweet vectors each with different dimensions (25, 50, 100, 200) trained on 2 billion Tweets. You can find more detail in their website. 
- https://nlp.stanford.edu/projects/glove/

For this post, I will use 200 dimesion pre-trrained GloVe vectors.

In [48]:
# 200 Dimensions GloVe vectors specifically trained on tweets

glove_twitter = api.load("glove-twitter-200")



#### Scaling the train & validation sets using Sklearn

In [49]:
# Averaging over each word vector in a tweet 

train_avg_vecs_dbow_glove_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, glove_twitter, 'avg') for tweet in x_train]))
val_avg_vecs_dbow_glove_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, glove_twitter, 'avg') for tweet in x_val]))

In [50]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_avg_vecs_dbow_glove_weighted_scaled, y_train)

CPU times: user 1min 12s, sys: 801 ms, total: 1min 13s
Wall time: 1min 19s


In [52]:
clf.score(val_avg_vecs_dbow_glove_weighted_scaled, y_val)

0.7665625

In [54]:
# Summing over each word vector in a tweet 

train_sum_vecs_dbow_glove_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, glove_twitter, 'sum') for tweet in x_train]))
val_sum_vecs_dbow_glove_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, glove_twitter, 'sum') for tweet in x_val]))

In [55]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_sum_vecs_dbow_glove_weighted_scaled, y_train)

CPU times: user 2min, sys: 866 ms, total: 2min 1s
Wall time: 2min 4s


In [57]:
clf.score(val_sum_vecs_dbow_glove_weighted_scaled, y_val)

0.764625

As we can see in the above table, the models with GloVe weighted have more validation accuracy 

### Word vectors extracted from pre-trained Google News Word2Vec (Average/Sum)

With new updated Gensim, I can also load the famous pre-trained Google News word vectors. These word vectors are trained using Word2Vec model on Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. You can find more detail in the Google project archive. 
- https://code.google.com/archive/p/word2vec/

In [14]:
# 300 Dimensions Google News vectors specifically trained on tweets

google_news = api.load("word2vec-google-news-300")

#### Scaling the train & validation sets using Sklearn

In [None]:
# Averaging over each word vector in a tweet 

train_avg_vecs_dbow_google_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 300, google_news, 'avg') for tweet in x_train]))
val_avg_vecs_dbow_google_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 300, google_news, 'avg') for tweet in x_val]))

In [17]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_avg_vecs_dbow_google_weighted_scaled, y_train)

CPU times: user 1min 25s, sys: 2.71 s, total: 1min 28s
Wall time: 2min 48s


In [18]:
clf.score(val_avg_vecs_dbow_google_weighted_scaled, y_val)

0.7510625

In [21]:
# Summing over each word vector in a tweet 

train_sum_vecs_dbow_google_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 300, google_news, 'sum') for tweet in x_train]))
val_sum_vecs_dbow_google_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 300, google_news, 'sum') for tweet in x_val]))

In [22]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_sum_vecs_dbow_google_weighted_scaled, y_train)

CPU times: user 1min 26s, sys: 1.85 s, total: 1min 28s
Wall time: 1min 31s


In [23]:
clf.score(val_sum_vecs_dbow_google_weighted_scaled, y_val)

0.7510625

### Word vectors separately trained (Average/Sum)

Previously, we have extracted word vectors from Doc2Vec models but what if we train a separate Word2Vec model on our tweets. The Doc2Vec models gave quite good representational word vectors in each document. Would the word vectors be more semantically correct if training a pure Word2Vec? 

Word2Vec parameters are similar to Doc2Vec.

In [14]:
# Labelling tweets using genism phrase library for unsupervised learning

def label_tweets_unigram(tweets, label):
    result = []
    prefix = label
    
    # Split tweets & attach label with index
    for index, tweet in zip(tweets.index, tweets):
        result.append(TaggedDocument(tweet.split(), [prefix + '_%s' % index]))
    
    return result

In [15]:
word_vec_train = label_tweets_unigram(df.text , 'all')
len(word_vec_train)

1600000

In [16]:
# All cores of CPU

cores = multiprocessing.cpu_count()

### CBOW (Continous Bag Of Words)

Within a corpus, CBOW model predicts the current word from a window of surrounding context words. It learns weights which act as word vector representations.

In [17]:
# Initializing Distributed Bag Of Words parameters & building word vocabulary

cbow_ug_model = Word2Vec(sg=0, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
cbow_ug_model.build_vocab([w_v.words for w_v in tqdm(word_vec_train)])

100%|██████████| 1600000/1600000 [00:00<00:00, 3092327.85it/s]


One caveat of the way this algorithm is that, since the learning rate decrease over the course of iterating over the data, labels which are only seen in a single LabeledSentence during training will only be trained with a fixed learning rate. This frequently produces less than optimal results.

Below iteration implement explicit multiple-pass, alpha-reduction approach with added shuffling.

In [18]:
%%time

# Multiple epochs iterating over labels more than once with decreasing learning rate

for epoch in range(30):
    
    # Shuffling word_vec_train & reducing aplha over multiple passes
    cbow_ug_model.train(utils.shuffle([w_v.words for w_v in tqdm(word_vec_train)]), total_examples=len(word_vec_train), epochs=1)
    cbow_ug_model.alpha -= 0.002
    cbow_ug_model.min_alpha = cbow_ug_model.alpha

100%|██████████| 1600000/1600000 [00:00<00:00, 3144211.90it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3304987.61it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 2943482.25it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3365052.11it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3308210.23it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3258977.47it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3172446.94it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3293670.41it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3231124.09it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3337945.69it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3274986.60it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3316333.68it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3232887.66it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3256074.32it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3262618.44it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3367395.

CPU times: user 18min 3s, sys: 4.34 s, total: 18min 7s
Wall time: 5min 57s


In [21]:
# Dumping dictionary 

with open('./data/cbow_ug_model.pkl', 'wb') as file:
    pickle.dump(cbow_ug_model, file, protocol=pickle.HIGHEST_PROTOCOL)

#### Scaling the train & validation sets using Sklearn

In [36]:
# Averaging over each word vector in a tweet 

train_avg_wordvecs_cbow_scaled = scale(np.concatenate([wordvec_generali(tweet, 100, cbow_ug_model, 'avg') for tweet in x_train]))
val_avg_wordvecs_cbow_scaled = scale(np.concatenate([wordvec_generali(tweet, 100, cbow_ug_model, 'avg') for tweet in x_val]))

  # Remove the CWD from sys.path while we load stuff.


In [37]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_avg_wordvecs_cbow_scaled, y_train)

CPU times: user 35.1 s, sys: 316 ms, total: 35.4 s
Wall time: 35.4 s


In [38]:
clf.score(val_avg_wordvecs_cbow_scaled, y_val)

0.759875

### Skip Gram

Within a corpus, Skip-gram model predicts surrounding context words given the current word. It also learns weights which act as word vector representations.

In [19]:
# Initializing Skip Gram parameters & building word vocabulary

skip_gram_ug_model = Word2Vec(sg=1, size=100, negative=5, window=2, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
skip_gram_ug_model.build_vocab([w_v.words for w_v in tqdm(word_vec_train)])

100%|██████████| 1600000/1600000 [00:00<00:00, 3022703.95it/s]


One caveat of the way this algorithm is that, since the learning rate decrease over the course of iterating over the data, labels which are only seen in a single LabeledSentence during training will only be trained with a fixed learning rate. This frequently produces less than optimal results.

Below iteration implement explicit multiple-pass, alpha-reduction approach with added shuffling.

In [20]:
%%time

# Multiple epochs iterating over labels more than once with decreasing learning rate

for epoch in range(30):
    
    # Shuffling word_vec_train & reducing aplha over multiple passes
    skip_gram_ug_model.train(utils.shuffle([w_v.words for w_v in tqdm(word_vec_train)]), total_examples=len(word_vec_train), epochs=1)
    skip_gram_ug_model.alpha -= 0.002
    skip_gram_ug_model.min_alpha = skip_gram_ug_model.alpha

100%|██████████| 1600000/1600000 [00:00<00:00, 3166041.12it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3216242.61it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3156405.03it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3248467.90it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3150988.18it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3230422.61it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3395779.97it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 2935146.38it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3123366.16it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3174276.15it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3233725.76it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3131823.19it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3246723.42it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3222734.23it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3135903.31it/s]
100%|██████████| 1600000/1600000 [00:00<00:00, 3144627.

CPU times: user 38min 31s, sys: 5.62 s, total: 38min 36s
Wall time: 7min 40s


In [22]:
# Dumping dictionary 

with open('./data/skip_gram_ug_model.pkl', 'wb') as file:
    pickle.dump(skip_gram_ug_model, file, protocol=pickle.HIGHEST_PROTOCOL)

#### Scaling the train & validation sets using Sklearn

In [44]:
# Averaging over each word vector in a tweet 

train_avg_wordvecs_sgram_scaled = scale(np.concatenate([wordvec_generali(tweet, 100, skip_gram_ug_model, 'avg') for tweet in x_train]))
val_avg_wordvecs_sgram_scaled = scale(np.concatenate([wordvec_generali(tweet, 100, skip_gram_ug_model, 'avg') for tweet in x_val]))

  # Remove the CWD from sys.path while we load stuff.


In [45]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_avg_wordvecs_sgram_scaled, y_train)

CPU times: user 16.6 s, sys: 341 ms, total: 17 s
Wall time: 20.9 s


In [46]:
clf.score(val_avg_wordvecs_sgram_scaled, y_val)

0.75975

### Concatenating CBOW & Skip Gram word vectors (Average/Sum)

In [48]:
# Extract average words vectors for train, validation sets using CBOW, Skip Gram Word2Vec models

def wordvec_generali_concate(model1, model2, corpus, size, aggregation='avg'):
    # Numpy zeros initialization
    vector = np.zeros(size).reshape((1, size))
    
    counter = 0
    for word in tweet.split():
        try:
            vector += np.append(model1[word], model2[word]).reshape((1, size))
            counter += 1
        except KeyError:
            continue
    
    # Counter not zero, divide by it to get averaged vector
    if(aggregation == 'avg') and counter > 0:
        vector /= counter
    
    return vector

#### Scaling the train & validation sets using Sklearn

In [44]:
# Averaging over each word vector in a tweet 

train_avg_wordvecs_cbow_sgram_scaled = scale(np.concatenate([wordvec_generali_concate(cbow_ug_model, skip_gram_ug_model, tweet, 200, 'avg') for tweet in x_train]))
val_avg_wordvecs_cbow_sgram_scaled = scale(np.concatenate([wordvec_generali_concate(cbow_ug_model, skip_gram_ug_model, tweet, 200, 'avg') for tweet in x_val]))

  # Remove the CWD from sys.path while we load stuff.


In [45]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_avg_wordvecs_cbow_sgram_scaled, y_train)

CPU times: user 16.6 s, sys: 341 ms, total: 17 s
Wall time: 20.9 s


In [46]:
clf.score(val_avg_wordvecs_cbow_sgram_scaled, y_val)

0.75975

In [44]:
# Summing over each word vector in a tweet 

train_sum_wordvecs_cbow_sgram_scaled = scale(np.concatenate([wordvec_generali_concate(cbow_ug_model, skip_gram_ug_model, tweet, 200, 'sum') for tweet in x_train]))
val_sum_wordvecs_cbow_sgram_scaled = scale(np.concatenate([wordvec_generali_concate(cbow_ug_model, skip_gram_ug_model, tweet, 200, 'sum') for tweet in x_val]))

  # Remove the CWD from sys.path while we load stuff.


In [45]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_sum_wordvecs_cbow_sgram_scaled, y_train)

CPU times: user 16.6 s, sys: 341 ms, total: 17 s
Wall time: 20.9 s


In [46]:
clf.score(val_sum_wordvecs_cbow_sgram_scaled, y_val)

0.75975

### Word vectors separately trained with Custom Weighting (Average/Sum)

The harmonic mean rank seems like the same as pos_freq_pct. By calculating the harmonic mean, the impact of small value (in this case, pos_freq_pct) is too aggravated and ended up dominating the mean value. This is again exactly same as just the frequency value rank and doesn't provide a meaningful result.

What we can try next is to get the CDF (Cumulative Distribution Function) value of both pos_rate and pos_freq_pct. CDF can be explained as "distribution function of X, evaluated at x, is the probability that X will take a value less than or equal to x". By calculating CDF value, we can see where the value of either pos_rate or pos_freq_pct lies in the distribution in terms of cumulative manner. In the below result of the code, we can see a word "welcome" with pos_rate_normcdf of 0.995625, and pos_freq_pct_normcdf of 0.999354. This means roughly 99.56% of the tokens will take a pos_rate value less than or equal to 0.91535, and 99.99% will take a pos_freq_pct value less than or equal to 0.001521.

Next, we calculate a harmonic mean of these two CDF values, as we did earlier. By calculating the harmonic mean, we can see that pos_normcdf_hmean metric provides a more meaningful measure of how important a word is within the class.

The custom weighting has been used before to visualize positive & negative tokens and create custom lexicon for the classification task.

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cvec = CountVectorizer(max_features=10000)
cvec.fit(x_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=10000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [11]:
from scipy.stats import hmean, norm

def norm_cdf(x):
    return norm.cdf(x, x.mean(), x.std())

In [12]:
# Positive, negative sentiments dataframes
neg_train = x_train[y_train == 0]
pos_train = x_train[y_train == 1]

# Document Matrices
neg_doc_matrix = cvec.transform(neg_train)
pos_doc_matrix = cvec.transform(pos_train)

# Summing tf-idf values 
neg_tf = np.sum(neg_doc_matrix, axis=0)
pos_tf = np.sum(pos_doc_matrix, axis=0)

# Remove single-dimensional entries from the shape of an array.
negative = np.squeeze(np.asarray(neg_tf))
positive = np.squeeze(np.asarray(pos_tf))

# tf-idf values saved in the dataframe
tf_df = pd.DataFrame([negative, positive], columns=cvec.get_feature_names()).transpose()
#tf_df.to_hdf('./data/tfidf.hdf', 'TFIDF')

# Sorting positive, negative sentiments
tf_df.columns = ['negative', 'positive']
tf_df['total'] = tf_df['negative'] + tf_df['positive']

# Metrics from Scattertext
tf_df['pos_rate'] = tf_df['positive'] * 1./tf_df['total']

# Another metric is the frequency a words occurs in the class
tf_df['pos_freq_perc'] = tf_df['positive'] * 1./tf_df['positive'].sum()

# Calculate Cumulative Distribution Function value of both pos_rate and pos_freq_perc.
tf_df['pos_rate_norm_cdf'] = norm_cdf(tf_df['pos_rate'])
tf_df['pos_freq_perc_norm_cdf'] = norm_cdf(tf_df['pos_freq_perc'])

# Calculate hmean of pos_rate_norm_cdf & pos_freq_perc_norm_cdf
tf_df['pos_norm_cdf_hmean'] = hmean([tf_df['pos_rate_norm_cdf'], tf_df['pos_freq_perc_norm_cdf']])

tf_df.sort_values(by='pos_norm_cdf_hmean', ascending=False).iloc[:10]

Unnamed: 0,negative,positive,total,pos_rate,pos_freq_perc,pos_rate_norm_cdf,pos_freq_perc_norm_cdf,pos_norm_cdf_hmean
thanks,5643,33676,39319,0.856482,0.004059,0.988566,1.0,0.99425
thank,2241,15422,17663,0.873125,0.001859,0.991379,0.990694,0.991036
awesome,3744,14176,17920,0.791071,0.001709,0.968526,0.984312,0.976355
happy,6354,20102,26456,0.759828,0.002423,0.951625,0.999058,0.974765
great,8020,24803,32823,0.755659,0.002989,0.948903,0.999945,0.973755
love,16641,46750,63391,0.737486,0.005634,0.935582,1.0,0.966719
hey,4938,13730,18668,0.735483,0.001655,0.933962,0.981253,0.957024
nice,6468,16650,23118,0.720218,0.002007,0.920562,0.994633,0.956165
yay,3107,10299,13406,0.768238,0.001241,0.956765,0.936626,0.946588
haha,9644,21218,30862,0.687512,0.002557,0.88509,0.999495,0.93882


In [23]:
%%time

# Common features of DBOW model & Harmonic mean custom weighting
common_features = set(cbow_ug_model.wv.vocab.keys()).intersection(set(tf_df.index))

tf_dict_custom_weighted = {}
# Iterate through common features and make a dictionary of word vectors weighted by Harmonic mean custom weighting
for feature in common_features:
    tf_dict_custom_weighted[feature] = np.append(cbow_ug_model[feature], skip_gram_ug_model[feature]) * tf_df.pos_norm_cdf_hmean[feature]

  


CPU times: user 383 ms, sys: 12.1 ms, total: 395 ms
Wall time: 372 ms


In [24]:
len(cbow_ug_model.wv.vocab.keys())

106259

#### Scaling the train & validation sets using Sklearn

In [37]:
# Averaging over each word vector in a tweet 

train_avg_vecs_cbow_sgram_tf_custom_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_custom_weighted, 'avg') for tweet in x_train]))
val_avg_vecs_cbow_sgram_tf_custom_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_custom_weighted, 'avg') for tweet in x_val]))

In [28]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_avg_vecs_cbow_sgram_tf_custom_weighted_scaled, y_train)

CPU times: user 2min 32s, sys: 1.09 s, total: 2min 33s
Wall time: 2min 33s


In [29]:
clf.score(val_avg_vecs_cbow_sgram_tf_custom_weighted_scaled, y_val)

0.77475

#### Scaling the train & validation sets using Sklearn

In [31]:
# Summing over each word vector in a tweet 

train_sum_vecs_cbow_sgram_tf_custom_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_custom_weighted, 'sum') for tweet in x_train]))
val_sum_vecs_cbow_sgram_tf_custom_weighted_scaled = scale(np.concatenate([wordvec_generali(tweet, 200, tf_dict_custom_weighted, 'sum') for tweet in x_val]))

In [32]:
%%time

# Train a Logistic Regression model

clf = LogisticRegression()
clf.fit(train_sum_vecs_cbow_sgram_tf_custom_weighted_scaled, y_train)

CPU times: user 2min 49s, sys: 1.14 s, total: 2min 50s
Wall time: 2min 50s


In [33]:
clf.score(val_sum_vecs_cbow_sgram_tf_custom_weighted_scaled, y_val)

0.72375

In [None]:
del tf_dict_custom_weighted

#### Populate table with Models & it's Accuracy

In [34]:
mydata = [['Word Vectors extracted from D2V Unigram DBOW + Unigram DMM', '200', 'NA', '71.78%', '72.77%'],
          [],
          ['Scaled Word Vectors extracted from D2V Unigram DBOW + Unigram DMM', '200', 'NA', '72.80%', '72.77%'],
          [],
          ['Word Vectors extracted from D2V Unigram DBOW', '200', 'TFIDF', '65.46%', '65.43%'], 
          [],
          ['Word Vectors extracted from D2V Unigram DBOW', '200', 'Custom Harmonic Mean', '68.86%', '68.93%'],
          [],
          ['Word Vectors extracted from pre-trained GloVe (tweets) W2V', '200', 'NA', '76.65%', '76.46%'],
          [],
          ['Word Vectors extracted from pre-trained Google News W2V', '300', 'NA', '75.10%', '75.10%'], 
          [],
          ['Word Vectors extracted from W2V Unigram CBOW + Unigram Skip Gram', '200', 'NA', '75.97%', '75.97%'],
          [],
          ['Word Vectors extracted from W2V Unigram CBOW + Unigram Skip Gram', '200', 'Custom Harmonic Mean', '77.47%', '72.37%']
         ]

In [35]:
from tabulate import tabulate
from IPython.display import HTML

display(HTML(tabulate(mydata, headers= ['Model', 'Word Vector Dimension', 'Weighting', 'Validation Accuracy Average W2V', 'Validation Accuracy Sum W2V'], floatfmt='.4f', tablefmt='html')))

Model,Word Vector Dimension,Weighting,Validation Accuracy Average W2V,Validation Accuracy Sum W2V
Word Vectors extracted from D2V Unigram DBOW + Unigram DMM,200.0,,71.78%,72.77%
,,,,
Scaled Word Vectors extracted from D2V Unigram DBOW + Unigram DMM,200.0,,72.80%,72.77%
,,,,
Word Vectors extracted from D2V Unigram DBOW,200.0,TFIDF,65.46%,65.43%
,,,,
Word Vectors extracted from D2V Unigram DBOW,200.0,Custom Harmonic Mean,68.86%,68.93%
,,,,
Word Vectors extracted from pre-trained GloVe (tweets) W2V,200.0,,76.65%,76.46%
,,,,


### Neural Network with the best performing Word Vector Model

The best performing Word2Vec trained with Logistic Regression model will be fed to a neural network to further experiment/enhance validation accuracy. 

#### Keras neural network with Early Stopping & Checkpoint

In [40]:
%%time

# Setting checkpoint & early stopping
checkpoint_path = './checkpoint/bestw2v_model_best_weights.{epoch:02d}-{val_acc:.4f}.hdf5'
checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

early_stop = EarlyStopping(monitor='val_acc', patience=5, mode='max') 
callbacks_list = [checkpoint, early_stop]

np.random.seed(seed)
batch_size=32

# Create Model

model = Sequential()
model.add(Dense(128, input_dim=200, activation='relu'))
# model.add(Dense(256, activation='relu'))
# model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile Model

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model

model.fit(train_avg_vecs_cbow_sgram_tf_custom_weighted_scaled, y_train, validation_data=(val_avg_vecs_cbow_sgram_tf_custom_weighted_scaled, y_val), batch_size=batch_size,
                    epochs=5, verbose=2, callbacks=callbacks_list)

Train on 1568000 samples, validate on 16000 samples
Epoch 1/5
 - 97s - loss: 0.4616 - acc: 0.7833 - val_loss: 0.4565 - val_acc: 0.7867

Epoch 00001: val_acc improved from -inf to 0.78669, saving model to ./checkpoint/bestw2v_model_best_weights.01-0.7867.hdf5
Epoch 2/5
 - 87s - loss: 0.4502 - acc: 0.7888 - val_loss: 0.4576 - val_acc: 0.7843

Epoch 00002: val_acc did not improve from 0.78669
Epoch 3/5
 - 87s - loss: 0.4465 - acc: 0.7908 - val_loss: 0.4501 - val_acc: 0.7904

Epoch 00003: val_acc improved from 0.78669 to 0.79038, saving model to ./checkpoint/bestw2v_model_best_weights.03-0.7904.hdf5
Epoch 4/5
 - 86s - loss: 0.4441 - acc: 0.7922 - val_loss: 0.4462 - val_acc: 0.7933

Epoch 00004: val_acc improved from 0.79038 to 0.79325, saving model to ./checkpoint/bestw2v_model_best_weights.04-0.7933.hdf5
Epoch 5/5
 - 87s - loss: 0.4423 - acc: 0.7933 - val_loss: 0.4457 - val_acc: 0.7939

Epoch 00005: val_acc improved from 0.79325 to 0.79394, saving model to ./checkpoint/bestw2v_model_best_

In [42]:
# Load the best Model

loaded_model = load_model('./checkpoint/bestw2v_model_best_weights.05-0.7939.hdf5')

In [45]:
%%time

# Evaluate the best Model

loaded_model.evaluate(x=val_avg_vecs_cbow_sgram_tf_custom_weighted_scaled, y=y_val)

CPU times: user 766 ms, sys: 73.6 ms, total: 840 ms
Wall time: 452 ms


[0.44570713776350024, 0.7939375]

#### A properly trained Word2Vec with appropriate weights might even outperform Doc2Vec in classification.