<a href="https://colab.research.google.com/github/OmarMeriwani/CE807-Sentiment-analysis/blob/master/Word2Vec_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This document provides the code for the Word2Vec classifier.

In [0]:
import numpy as np
from string import punctuation
from os import listdir
import pandas as pd
from numpy import zeros
from numpy import asarray
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from gensim.models import KeyedVectors
from gensim.test.utils import datapath


Load files and clean sentences: remove punctuation, keep only the words whose lowercase form is in the vocabulary, and return the cleaned sentences.

In [0]:
def load_doc(filename):
    """Read a text file and return its whole content as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete content of the file.
    """
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as file:
        return file.read()

def doc_to_clean_lines(doc, vocab):
    """Strip punctuation from a document and keep only in-vocabulary words.

    Args:
        doc: Text to clean; it may span several lines.
        vocab: Container of lowercase words. A token is kept when its
            lowercase form is found in it.

    Returns:
        The kept tokens of all lines, in their original order and letter
        case, joined by single spaces. An empty string when nothing is kept.
    """
    # Build the translation table once instead of once per line.
    table = str.maketrans('', '', punctuation)
    kept = []
    for line in doc.splitlines():
        stripped = (token.translate(table) for token in line.split())
        # Accumulate across lines: assigning the result anew on every
        # iteration would keep only the last line of a multi-line document.
        kept.extend(word for word in stripped if word.lower() in vocab)
    return ' '.join(kept)


Load [pre-created](https://github.com/OmarMeriwani/CE807-Sentiment-analysis/blob/master/Build_Word2Vec_Vocabulary.ipynb) vocabulary file

In [0]:
vocab_filename = 'vocabulary.txt'
# Lower-case every vocabulary word and keep the result as a set: the
# vocabulary is queried once per token while cleaning sentences, and a set
# gives constant-time membership tests and no duplicates after lower-casing.
vocab = {word.lower() for word in load_doc(vocab_filename).split()}


A function that reads the training dataset, extracts the values from it, and cleans the sentences.

In [0]:
def readfile(filename, mode='sentence'):
    """Read a tab-separated dataset and return cleaned [text, label] pairs.

    Columns are addressed by position: column 1 is the grouping key,
    column 2 the text and column 3 the integer label.
    NOTE(review): presumably column 1 is a sentence id and column 3 the
    review polarity -- confirm against the layout of the CSV file.

    Args:
        filename: Path of the tab-separated file; the first row is the header.
        mode: 'sentence' keeps only the first row of every run of consecutive
            rows sharing the same column-1 value; any other value keeps
            every row.

    Returns:
        A list of [cleaned_text, label] lists, where the text was filtered
        by doc_to_clean_lines against the module-level ``vocab``.
    """
    df = pd.read_csv(filename, header=0, sep='\t')
    data = []
    prev = ''
    # Plain tuples give true positional access; indexing a label-indexed row
    # Series with integers (df.loc[i][1]) is deprecated in recent pandas.
    for row in df.itertuples(index=False, name=None):
        if mode == 'sentence':
            group = str(row[1])
            if group == prev:
                # Same group as the previous row: not the first of its run.
                continue
            prev = group
        reviewPolarity = int(row[3])
        sentence = doc_to_clean_lines(row[2], vocab)
        data.append([sentence, reviewPolarity])
    return data


Load embeddings from non-binary (plain-text) files.

In [0]:
def load_embedding(filename):
    """Load word vectors from a whitespace-separated text file.

    Every non-blank line is read as a word followed by the components of
    its vector.

    Args:
        filename: Path of the text embedding file.

    Returns:
        A dict mapping each word to a float32 numpy array of its components.
    """
    embedding = dict()
    # Stream the file line by line instead of loading it whole; the context
    # manager closes it even if a line fails to parse.
    with open(filename, 'r') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # A blank line has no word to use as key; skip it.
                continue
            embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

A function that gets the vector for each word.

In [0]:
def get_weight_matrix2(embedding, vocab, embedding_dim=300):
    """Build a weight matrix holding one embedding vector per vocabulary word.

    Args:
        embedding: Object exposing ``get_vector(word)``.
            NOTE(review): expected to raise KeyError for unknown words, as
            gensim's KeyedVectors does -- confirm for other embedding types.
        vocab: Sized iterable of (word, index) pairs.
        embedding_dim: Number of components of every vector (default 300).

    Returns:
        A numpy array of shape (len(vocab) + 1, embedding_dim). Row ``index``
        holds the vector of the matching word; rows of words missing from
        the embedding, and row 0, stay all zeros.
    """
    # One extra row because row 0 is never assigned to a word here.
    vocab_size = len(vocab) + 1
    weight_matrix = zeros((vocab_size, embedding_dim))
    for word, i in vocab:
        try:
            vector = embedding.get_vector(word)
        except KeyError:
            # Word unknown to the embedding: leave its row at zero. Only the
            # lookup failure is caught so that real errors still surface.
            continue
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix


Split the dataset into training and testing datasets according to a given percentage.

In [0]:
def split(docs, percentage):
    """Cut a sequence in two at a given fraction of its length.

    Args:
        docs: Sliceable sequence to divide.
        percentage: Fraction of the items that goes into the first part.

    Returns:
        A (training, test) tuple: the leading int(len(docs) * percentage)
        items, followed by all the remaining ones.
    """
    cut = int(len(docs) * percentage)
    return docs[:cut], docs[cut:]
# Load the cleaned [sentence, polarity] pairs and split them 70/30 in file
# order (no shuffling).
data = np.array(readfile('train.csv'))
print(data.shape)
traindata, testdata = split(data,0.7)
print(testdata.shape)
print(traindata.shape)
train_docs = traindata[:,0]
test_docs = testdata[:,0]
# np.array() over mixed [str, int] rows produces a string array, so the
# label column must be cast back to integers before it is used as a target.
y_train = traindata[:,1].astype(int)
y_test = testdata[:,1].astype(int)


Train a Word2Vec model on the tokenized training documents.

In [0]:
# Only KeyedVectors is imported at the top of the notebook, so bring
# Word2Vec into scope here.
from gensim.models import Word2Vec

# Word2Vec expects every sentence as a list of tokens; the cleaned training
# documents are space-joined strings, so split them back into tokens.
training_tokens = [doc.split() for doc in train_docs]

w2v = Word2Vec(size=100, min_count=10)
w2v.build_vocab(sentences=training_tokens)
# Report the corpus size counted by build_vocab(); the vocabulary size is
# not a measure of how much text is being trained on.
w2v.train(sentences=training_tokens, total_examples=w2v.corpus_count, epochs=10)

Encode the documents using the Keras tokenizer, which helps prioritize words according to their number of occurrences in the documents.

In [0]:
# The tokenizer is fitted on the training documents only; the test documents
# are encoded later with this same fitted tokenizer.
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(train_docs)
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)


Ensure that all sequences in the resulting encoded array have the same length, then apply the same encoding and padding steps to the test data.

In [0]:
# pad sequences to the length of the longest training document
max_length = max(len(s.split()) for s in train_docs)
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels; the data array holds strings, so cast to integers
ytrain = traindata[:,1].astype(int)
# encode and pad the test documents with the tokenizer fitted on training data
test_docs = testdata[:,0]
encoded_docs = tokenizer.texts_to_sequences(test_docs)
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
ytest = testdata[:,1].astype(int)
# one more than the number of indexed words, to leave room for index 0
vocab_size = len(tokenizer.word_index) + 1


The code below shows two versions of the embeddings: the pre-trained one based on Google News (active) and the one that has been created in this project (commented out).

In [0]:
#raw_embedding = load_embedding('embedding2.txt')
# Pass the file path directly: gensim's datapath() helper resolves names
# inside gensim's own bundled test-data directory and is not meant for
# user files.
wv_from_bin = KeyedVectors.load_word2vec_format('G:/Data/GN/GoogleNews-vectors-negative300.bin', binary=True)
#embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
embedding_vectors = get_weight_matrix2(wv_from_bin, tokenizer.word_index.items())

Keras model for predicting the result (commented code shows the use of Logistic regression which has given similar results)

In [0]:
# Frozen embedding layer initialised with the pre-computed word vectors.
embedding_layer = Embedding(vocab_size, 300, weights=[embedding_vectors], input_length=max_length, trainable=False)
model = Sequential()
model.add(embedding_layer)
# No input_dim here: this layer is not the first one, so its input shape
# comes from the embedding layer (300 features per token); the former
# input_dim=200 was ignored and contradicted that shape.
model.add(Dense(128, activation='relu'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)
#model = LogisticRegression(C=0.2, dual=True)
#model.fit(Xtrain, ytrain)

# evaluate
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
#result = model.score(Xtest,ytest)
print('Test Accuracy: %f' % (acc*100))
#print(result)