In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nltk
from collections import Counter, deque
import random
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from matplotlib import pylab
from scipy.sparse import lil_matrix


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
def read_data(filename):
    """Load a text file and return its contents as a list of word tokens.

    The whole file is read into memory, lower-cased, and split into tokens
    with ``nltk.word_tokenize``.

    Args:
        filename: path of the text file to read.

    Returns:
        A list containing the lower-cased tokens in file order.
    """
    with open(filename, 'r') as handle:
        raw_text = handle.read()

    # Lower-case before tokenising so the vocabulary is case-insensitive.
    tokens = nltk.word_tokenize(raw_text.lower())
    return list(tokens)

# Tokenise the whole corpus into a flat list of lower-cased word tokens.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run elsewhere until it is pointed at a local copy of the corpus file.
words=read_data('/home/prayag/DL_projects/9781788478311_Code/NaturalLanguageProcessingwithTensorFlow_Code/ch3/wikipedia2text-extracted.txt')


In [2]:
# Vocabulary: one 'UNK' slot (id 0) plus the (vocabulary_size-1) most frequent tokens.
# Ids follow frequency order, so smaller ids are more frequent words.
vocabulary_size = 50001
count=[('UNK', 0)]
count.extend(Counter(words).most_common(vocabulary_size-1))

# word -> id and id -> word lookup tables.
vocab2int={word: idx for idx, (word, _) in enumerate(count)}
int2vocab={idx: word for word, idx in vocab2int.items()}

# Encode the corpus. Out-of-vocabulary tokens must map to the id of 'UNK';
# the previous fallback of 50000 is the id of the least frequent in-vocabulary
# word, which silently merged every rare token into that real word.
unk_id=vocab2int['UNK']
data_ints=[vocab2int.get(val, unk_id) for val in words]

# Record how many tokens were actually replaced by 'UNK' (was a hard-coded placeholder).
count[0]=('UNK', sum(1 for idx in data_ints if idx==unk_id))

NameError: name 'words' is not defined

In [2]:
# Cursor into data_ints; advanced (with wrap-around) by every generate_batch call.
data_index=0
def generate_batch(batch_size, window_size):
    """Build one batch of (center word, context word, weight) samples.

    Reads the module-level ``data_ints`` sequence starting at the module-level
    ``data_index`` cursor, wrapping around at the end, and advances the cursor.
    For each center word, every word within ``window_size`` positions on either
    side yields one sample.

    Args:
        batch_size: number of sample slots in the returned arrays. Only
            ``batch_size // (2*window_size)`` center words are processed, so
            any remainder slots are left as zeros with weight 0.
        window_size: number of context words taken on each side of the center.

    Returns:
        batch: int32 array of shape (batch_size,) with center word ids.
        labels: int32 array of shape (batch_size, 1) with context word ids.
        weights: float32 array of shape (batch_size,) holding 1/distance
            between the center and the context word.
    """
    global data_index
    # Zero-initialised so that slots which are never written are well defined
    # (np.ndarray would hand back uninitialised memory).
    batch=np.zeros(shape=(batch_size), dtype=np.int32)
    labels=np.zeros(shape=(batch_size,1), dtype=np.int32)
    weights=np.zeros(shape=(batch_size), dtype=np.float32)
    span=2*window_size+1
    num_samples=2*window_size
    buffer=deque(maxlen=span)

    # Prime the sliding window with the first `span` words.
    for _ in range(span):
        buffer.append(data_ints[data_index])
        data_index=(data_index+1)%len(data_ints)

    # Every position in the window except the center itself.
    context_positions=list(range(window_size)) + list(range(window_size+1, span))

    for i in range(batch_size//num_samples):
        for k, j in enumerate(context_positions):
            slot=i*num_samples+k
            batch[slot]=buffer[window_size]
            labels[slot, 0]=buffer[j]
            # Closer context words contribute more: weight = 1/distance.
            weights[slot]=abs(1.0/(j-window_size))

        # Slide the window one word to the right (deque drops the oldest word).
        buffer.append(data_ints[data_index])
        data_index=(data_index+1)%len(data_ints)

    return batch, labels, weights

In [18]:
dataset_size=len(data_ints)
# Sparse (vocabulary_size x vocabulary_size) co-occurrence matrix.
# Stored as float32 because the accumulated per-pair weights are multiplied by 1.0
# and may be fractional; an int32 matrix would truncate each contribution.
cooc_mat=lil_matrix((vocabulary_size, vocabulary_size), dtype=np.float32)

def generate_cooc(batch_size, window_size):
    """Accumulate weighted co-occurrence counts into the module-level ``cooc_mat``.

    Calls ``generate_batch(batch_size, window_size)`` ``dataset_size // batch_size``
    times and, for each returned (center, context, weight) triple, adds the
    weight to ``cooc_mat[center, context]``.

    Args:
        batch_size: batch size forwarded to ``generate_batch``; also determines
            the number of iterations.
        window_size: context window size forwarded to ``generate_batch``.

    Side effects:
        Mutates ``cooc_mat`` in place, advances the ``data_index`` cursor used by
        ``generate_batch``, and prints progress every 100000 iterations.
    """
    num_iterations=dataset_size//batch_size
    print("running %d iterartion to calcualte cooc matrix"%(num_iterations))

    for i in range(num_iterations):
        if i>0 and i%100000==0:
            print ("%d iteration"%(i))

        # Forward the arguments instead of the previously hard-coded (8, 4).
        context, target, weights = generate_batch(batch_size, window_size)
        target = target.reshape(-1)

        for c,t,w in zip(context,target,weights):
            cooc_mat[c,t]+=1.0*w

generate_cooc(8,4)

running 1451701 iterartion to calcualte cooc matrix
100000 iteration
200000 iteration
300000 iteration
400000 iteration
500000 iteration
600000 iteration
700000 iteration
800000 iteration
900000 iteration
1000000 iteration
1100000 iteration
1200000 iteration
1300000 iteration
1400000 iteration


In [3]:
# Training hyper-parameters.
batch_size, embed_size = 128, 128
window_size = 4

# Validation probes: valid_size random ids drawn from [0, valid_window) followed by
# valid_size random ids drawn from [1000, 1000+valid_window).
valid_size, valid_window = 16, 50
valid_sample = np.append(
    np.array(random.sample(range(valid_window), valid_size)),
    random.sample(range(1000, 1000+valid_window), valid_size),
    axis=0,
)

In [4]:
# Graph inputs: one word id per sample for both sides of each training pair,
# plus the fixed set of validation word ids.
train_dataset = tf.placeholder(dtype=tf.int32, shape=(batch_size,))
train_labels = tf.placeholder(dtype=tf.int32, shape=(batch_size,))
valid_dataset = tf.constant(value=valid_sample, dtype=tf.int32)


In [6]:
# Two embedding tables and two bias vectors, one pair for the word fed through
# train_dataset ("in") and one for the word fed through train_labels ("out").
# The bias bounds were passed as (0.1, 0.01), i.e. minval > maxval; they are
# ordered here so the call matches the [minval, maxval) contract over the same interval.
in_embeddings=tf.Variable(tf.random_uniform([vocabulary_size, embed_size], -1.0, 1.0))
in_bias_embeddings=tf.Variable(tf.random_uniform([vocabulary_size], 0.01, 0.1, dtype=tf.float32))
out_embeddings=tf.Variable(tf.random_uniform([vocabulary_size, embed_size], -1.0, 1.0))
out_bias_embeddings=tf.Variable(tf.random_uniform([vocabulary_size], 0.01, 0.1, dtype=tf.float32))

# Per-sample lookups: embeddings are (batch_size, embed_size), biases are (batch_size,).
embed_in=tf.nn.embedding_lookup(in_embeddings, train_dataset)
embed_out=tf.nn.embedding_lookup(out_embeddings, train_labels)
embed_bias_in=tf.nn.embedding_lookup(in_bias_embeddings, train_dataset)
embed_bias_out=tf.nn.embedding_lookup(out_bias_embeddings, train_labels)

# Per-sample weighting term f(x_ij) and raw co-occurrence value x_ij, fed at train time.
weight_x=tf.placeholder(shape=[batch_size], dtype=tf.float32)
x_ij=tf.placeholder(shape=[batch_size], dtype=tf.float32)

# Dot product of each (in, out) embedding pair: sum over the embedding axis (axis=1)
# to get one value per sample. axis=0 summed over the batch instead and only passed
# shape checks because batch_size == embed_size.
pair_dot=tf.reduce_sum(embed_in*embed_out, axis=1)

# Weighted squared error against log(1 + x_ij); the +1 keeps the log finite for x_ij == 0.
loss=tf.reduce_mean(weight_x*(pair_dot+embed_bias_in+embed_bias_out-tf.log(1+x_ij))**2 )
optm=tf.train.AdagradOptimizer(1.0).minimize(loss)



In [10]:
# Inspect the static shape of the looked-up "out" embeddings: one row of
# out_embeddings per entry of train_labels.
embed_out.shape

TensorShape([Dimension(128), Dimension(128)])

In [8]:
#cosine similarity
# Final word vectors are the average of the two learned embedding tables.
embeddings=(in_embeddings+out_embeddings)/2
# L2 norm of every row: sqrt of the SUM of squares. reduce_mean gave the RMS,
# which left rows with length sqrt(embed_size) instead of 1.
# keepdims replaces the deprecated keep_dims argument.
norm=tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keepdims=True))
normalized_embeddings=embeddings/norm #a/|a| vocab_size X embed_size
valid_embeddings=tf.nn.embedding_lookup(normalized_embeddings, valid_dataset) #len(valid_sample) X embed_size
# Dot products of unit vectors = cosine similarity of each validation word to every word.
similarity=tf.matmul(valid_embeddings, tf.transpose(normalized_embeddings)) #len(valid_sample) X vocab_size

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [9]:
# Number of training steps (one batch per step).
epochs=100001
# Mean loss of each completed 2000-step reporting window.
glove_loss=[]
avg_loss=0
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range (epochs):
        batch_data, batch_labels, _ = generate_batch(batch_size, window_size)
        # generate_batch returns labels as (batch_size, 1); the train_labels
        # placeholder is rank 1, so flatten before feeding.
        batch_labels=batch_labels.reshape(-1)

        batch_weight=[] #for calculating weighting term in paper, f(xij)
        batch_xij=[] #for calculating cooccurance count of each pair of words in training batch
        for c,t in zip(batch_data, batch_labels):
            pair_count=cooc_mat[c,t] # single sparse lookup per pair
            point_weight = (pair_count/100)**0.75 if pair_count<100 else 1.0
            batch_weight.append(point_weight)
            batch_xij.append(pair_count)

        batch_weight=np.clip(batch_weight, -100, 1)
        batch_xij=np.asarray(batch_xij)

        # feed_dict is a keyword argument ("feed_dict:" was a syntax error).
        _, l=sess.run([optm, loss], feed_dict={train_dataset:batch_data, train_labels:batch_labels,
                                               weight_x:batch_weight, x_ij:batch_xij})

        avg_loss+=l
        # Report once per full window of 2000 steps so the value is a true mean;
        # %f keeps the fractional part that %d discarded.
        if (epoch+1)%2000==0:
            avg_loss/=2000
            print("avg loss at %d is %f"%(epoch+1, avg_loss))
            glove_loss.append(avg_loss)
            avg_loss=0

        if epoch%10000==0:
            #calculate close words for valid words
            # TODO: not implemented yet.
            pass
            
        
            

TensorShape([Dimension(32), Dimension(50001)])