Let's try now to use some pretrained embedding to see if we get better accuracy.

In [1]:
import numpy as np
import tensorflow as tf
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import gensim.models.word2vec as w2v
import matplotlib.pyplot as plt

We clean the data as before

In [2]:
from data_helpers import load_data_and_labels
from tflearn.data_utils import VocabularyProcessor

In [3]:
sentence_len=60
vocab_proc = VocabularyProcessor(sentence_len)

In [4]:
X_sentences,y=load_data_and_labels('./data/rt/rt-polarity.pos','./data/rt/rt-polarity.neg')
X = np.array(list(vocab_proc.fit_transform(X_sentences)))

In [5]:
vocab_size = len(vocab_proc.vocabulary_)

# The second model

For our second model we use a pre-trained embedding. There are many options for this, let's use the word2vec model we train before.

In [6]:
books=[open('./data/dorian.txt','r'),open('./data/earnest.txt','r'),
       open('./data/essays.txt','r'),open('./data/ghost.txt','r'),
       open('./data/happy_prince.txt','r'),open('./data/house_pomegranates.txt','r'),
       open('./data/ideal_husband.txt','r'),open('./data/intentions.txt','r'),
       open('./data/lady_windermere.txt','r'),open('./data/profundis.txt','r'),
       open('./data/salome.txt','r'),open('./data/soul_of_man.txt','r'),
       open('./data/woman_of_no_importance.txt','r'),open('./data/rt/rt-polarity.pos','r',encoding = "ISO-8859-1"),
       open('./data/rt/rt-polarity.neg','r',encoding = "ISO-8859-1")]
corpus = " ".join([book.read() for book in books])
raw_sentences = sent_tokenize(corpus)
sentences=[]
for sentence in raw_sentences:
    sentences+=[word_tokenize(sentence)]
emb_model=w2v.Word2Vec(sentences)

let's first get the vocabualry that we have in the movie reviews

In [7]:
vocab_list=list(vocab_proc.reverse([[i] for i in range(vocab_size)]))

and we use the embedding matrix from the word2vec, but we need to build it first

In [8]:
emb_matrix=np.array([emb_model[word] if word in emb_model else np.random.random(100) for word in vocab_list])

In [9]:
#Global hyper-parameters
emb_dim=100
hidden_dim=50
num_classes=2

we create the placeholders to hold the data.

In [10]:
input_x = tf.placeholder(tf.int32, shape=[None, sentence_len], name="input_x")
input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
#dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")

Next, we create the variables we are going to need.

In [11]:
with tf.name_scope("embedding"):
    W = tf.Variable(emb_matrix,name="W",dtype=tf.float32)
    embedded_chars = tf.nn.embedding_lookup(W, input_x)

We can put all the this vectors together into a large vector, for this we use the reshape method

In [12]:
with tf.name_scope("reshape"):
    emb_vec= tf.reshape(embedded_chars,shape=[-1,sentence_len*emb_dim])

we can now go over the hidden dimension, but first we need a variable for this

In [13]:
with tf.name_scope("hidden"):
    W_h= tf.Variable(tf.random_uniform([sentence_len*emb_dim, hidden_dim], -1.0, 1.0),name="w_hidden")
    b_h= tf.Variable(tf.zeros([hidden_dim],name="b_hidden"))
    hidden_output= tf.nn.relu(tf.matmul(emb_vec,W_h)+b_h)

finally, the output layer

In [14]:
with tf.name_scope("output_layer"):
    W_o= tf.Variable(tf.random_uniform([hidden_dim,2], -1.0, 1.0),name="w_o")
    b_o= tf.Variable(tf.zeros([2],name="b_o"))
    score = tf.nn.relu(tf.matmul(hidden_output,W_o)+b_o)
    predictions = tf.argmax(score, 1, name="predictions")

note that we didn't put the softmax layer here.

In [15]:
with tf.name_scope("loss"):
    losses=tf.nn.softmax_cross_entropy_with_logits(labels=input_y, logits=score)
    loss=tf.reduce_mean(losses)

In [16]:
with tf.name_scope("accuracy"):
    correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

We are almost ready to start the session, we need the training operation

In [17]:
global_step = tf.Variable(0, name="global_step", trainable=False)
optimizer=tf.train.AdamOptimizer(1e-4).minimize(loss)
loss_summary = tf.summary.scalar("loss", loss)
acc_summary = tf.summary.scalar("accuracy", accuracy)
summary_op=tf.summary.merge([loss_summary,acc_summary])

Running the session

In [18]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())
train_summary_writer = tf.summary.FileWriter('./summaries2/', sess.graph)

In [19]:
for i in range(100):
    acc,loss_,_=sess.run([accuracy,loss,optimizer],feed_dict={input_x:X,input_y:y})
    step,summaries = sess.run([global_step,summary_op],feed_dict={input_x:X,input_y:y})
    train_summary_writer.add_summary(summaries, i)
    print("This is step: %d, acc=%.2f, loss=%.2f"%(i,acc,loss_),end='\r')

This is step: 99, acc=0.50, loss=0.71

and after initiating tensorboard by using 

tensorboard --logdir="./summaries"

we can navigate to http://127.0.1.1:6006/ to see what we get.

# Homework

- Create a deeper model and compare its perfomance.