In [1]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, Embedding, Input, Add, Dot, Reshape, Flatten
from keras.optimizers import Adam
from keras.preprocessing.sequence import skipgrams
from keras.models import Model, load_model

import tarfile
from urllib.request import urlretrieve
import os
import nltk
from scipy.sparse import csr_matrix, lil_matrix
import numpy as np
from scipy.sparse import save_npz, load_npz
from keras import backend as K
import random
import matplotlib.pyplot as plt
from keras.utils import plot_model
import pandas as pd
%matplotlib inline

In [2]:
# Base URL the dataset archive is fetched from; maybe_download appends the file name to it.
url = 'http://www.cs.cmu.edu/~ark/personas/data/'

def maybe_download(filename):
  """Download `filename` into the local "datasets" directory unless it is already there.

  Args:
    filename: Name of the remote file. It is appended to the module-level
      `url` and is also used as the file name inside "datasets".

  Returns:
    The bare `filename` exactly as passed in (never a path), so that callers
    can join it with "datasets" themselves.
  """
  # makedirs(exist_ok=True) is a no-op when the directory already exists.
  os.makedirs("datasets", exist_ok=True)
  local_path = os.path.join("datasets", filename)

  if not os.path.exists(local_path):
    print('Downloading file...')
    # urlretrieve returns the local *path*. It must not overwrite `filename`:
    # callers prepend "datasets" again, which would yield
    # "datasets/datasets/<name>" after a fresh download.
    urlretrieve(url + filename, local_path)
  else:
    print('File exists ...')

  return filename

def extract_and_verify(filename, expected_bytes):
  """Check the size of datasets/<filename> and unpack it into "datasets".

  The size check runs before extraction so that a truncated or wrong download
  is rejected without writing anything to disk.

  Args:
    filename: Name of a gzip-compressed tar archive inside "datasets".
    expected_bytes: Exact size in bytes the archive must have.

  Returns:
    `filename`, unchanged.

  Raises:
    Exception: If the archive size differs from `expected_bytes`.
  """
  archive_path = os.path.join("datasets", filename)

  actual_bytes = os.stat(archive_path).st_size
  if actual_bytes != expected_bytes:
    print(actual_bytes)
    raise Exception(
      'Failed to verify ' + archive_path + '. Can you get to it with a browser?')

  print("Extracting the file")
  # The context manager closes the archive even if extraction fails part-way.
  with tarfile.open(archive_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
      # The archive comes from the network: where the interpreter supports
      # extraction filters, refuse members that would escape "datasets".
      tar.extractall("datasets", filter="data")
    else:
      tar.extractall("datasets")

  print('Found and verified %s' % archive_path)
  return filename

# Fetch the archive only if it is not already in "datasets", then unpack it and
# check that its size matches the expected number of bytes.
filename = maybe_download('MovieSummaries.tar.gz')
extract_and_verify(filename, 48002242)

File exists ...
Extracting the file
Found and verified datasets\MovieSummaries.tar.gz


'MovieSummaries.tar.gz'

In [3]:
def read_data(filename, n_lines):
    """Read a UTF-8 text file and return one tokenized, re-joined string per line.

    Each line is split with nltk.word_tokenize; the first token (the movie ID)
    is dropped and the remaining tokens are joined with single spaces.

    Args:
        filename: Path of the text file to read.
        n_lines: Stop after this many lines. A falsy value (None or 0) reads
            the whole file.

    Returns:
        A list of strings, one per line read.
    """
    documents = []
    with open(filename, 'r', encoding='utf-8') as handle:
        for line_no, line in enumerate(handle, start=1):
            tokens = nltk.word_tokenize(line)
            # Skip the leading movie-ID token.
            documents.append(' '.join(tokens[1:]))
            if n_lines and line_no == n_lines:
                break
    return documents

# Load only the first 10000 lines of the plot summaries file (one document per line).
docs = read_data(os.path.join("datasets", "MovieSummaries", 'plot_summaries.txt'), 10000)
print("Read in {} documents".format(len(docs)))

Read in 10000 documents


In [4]:
# v_size is passed to the Tokenizer as num_words and is reused below as the
# co-occurrence matrix dimension and the embedding table size.
# Out-of-vocabulary words are mapped to the 'UNK' token.
v_size = 3000
tokenizer = Tokenizer(num_words=v_size, oov_token='UNK')
tokenizer.fit_on_texts(docs)

In [5]:
# True rebuilds the co-occurrence matrix and saves it to cooc_mat.npz;
# False reuses the copy previously saved in cooc_mat.npz.
generate_cooc = False
def generate_cooc_matrix(text, tokenizer, window_size, n_vocab, use_weighting=True):
    """Build a word/context co-occurrence matrix from a collection of texts.

    Every token that has `window_size` tokens on both sides is treated as a
    target word; the tokens inside its window (excluding the target position
    itself) are its context words. Tokens closer than `window_size` to either
    end of a sequence are never used as targets.

    Args:
        text: Iterable of documents, passed to `tokenizer.texts_to_sequences`.
        tokenizer: Object providing `texts_to_sequences`, returning one list of
            integer word ids (each < `n_vocab`) per document.
        window_size: Number of context positions on each side of the target.
        n_vocab: Number of rows/columns of the returned matrix.
        use_weighting: If True each co-occurrence contributes 1/distance,
            otherwise it contributes 1.

    Returns:
        A float32 `lil_matrix` of shape (n_vocab, n_vocab) whose entry [i, j]
        is the accumulated (weighted) count of word j in the window of word i.
    """
    sequences = tokenizer.texts_to_sequences(text)

    # Context offsets relative to the target position (the target itself is skipped).
    offsets = [d for d in range(-window_size, window_size + 1) if d != 0]
    # Pending (row, col, value) triplets are folded into the matrix once this
    # many have been collected, to bound peak memory on large corpora.
    flush_threshold = 2000000

    def _triplets_to_csr(rows, cols, vals):
        # csr_matrix sums entries that share (row, col). This is what makes a
        # word repeated inside one window count every time; fancy-indexed
        # `mat[i, idx] += v` applies only one update per duplicated index.
        return csr_matrix(
            (np.concatenate(vals), (np.concatenate(rows), np.concatenate(cols))),
            shape=(n_vocab, n_vocab), dtype=np.float32
        )

    cooc_mat = csr_matrix((n_vocab, n_vocab), dtype=np.float32)
    rows, cols, vals = [], [], []
    n_pending = 0

    for sequence in sequences:
        seq = np.asarray(sequence, dtype=np.int64)
        n_targets = len(seq) - 2 * window_size
        if n_targets <= 0:
            # Sequence too short to contain a full window.
            continue

        targets = seq[window_size:window_size + n_targets]
        for d in offsets:
            rows.append(targets)
            # Context word found d positions away from each target.
            cols.append(seq[window_size + d:window_size + d + n_targets])
            weight = 1.0 / abs(d) if use_weighting else 1.0
            vals.append(np.full(n_targets, weight, dtype=np.float32))

        n_pending += n_targets * len(offsets)
        if n_pending >= flush_threshold:
            cooc_mat = cooc_mat + _triplets_to_csr(rows, cols, vals)
            rows, cols, vals = [], [], []
            n_pending = 0

    if rows:
        cooc_mat = cooc_mat + _triplets_to_csr(rows, cols, vals)

    # Callers receive a lil_matrix, as before.
    return cooc_mat.tolil()

# Rebuild the matrix when explicitly requested, and also when no cached copy
# exists yet, so that a fresh "Restart & Run All" does not fail on a missing file.
cooc_path = 'cooc_mat.npz'
if generate_cooc or not os.path.exists(cooc_path):
    cooc_mat = generate_cooc_matrix(docs, tokenizer, 4, v_size, True)
    save_npz(cooc_path, cooc_mat.tocsr())
else:
    cooc_mat = load_npz(cooc_path).tolil()
    print('Cooc matrix of type {} was loaded from disk'.format(type(cooc_mat).__name__))

Cooc matrix of type lil_matrix was loaded from disk


In [6]:
def create_glove_model(v_size, emb_dim=96, learning_rate=0.0001, x_max=100.0, alpha=0.75):
    """Build and compile a GloVe-style model over pairs of word ids.

    The model takes two word ids (target i, context j), looks up one embedding
    vector and one scalar bias for each, and predicts
    dot(emb_i, emb_j) + b_i + b_j.

    Args:
        v_size: Vocabulary size (number of rows of every embedding table).
        emb_dim: Dimensionality of the word vectors.
        learning_rate: Learning rate of the Adam optimizer.
        x_max: Co-occurrence count at which the loss weight saturates at 1.
        alpha: Exponent of the loss weighting function.

    Returns:
        A compiled Keras `Model` with inputs [w_i, w_j] and a single output.
        Targets passed to it must be 1 + X_ij (co-occurrence count plus one).
    """

    w_i = Input(shape=(1,))
    w_j = Input(shape=(1,))

    # NOTE: the creation order of the Embedding layers determines their
    # auto-generated names ('embedding', 'embedding_1', ...), and
    # save_embeddings selects the word-vector tables by those names. Keep the
    # two word-vector tables first and the bias tables after them.
    # `input_length` is omitted: the sequence length is already fixed by the
    # Input shape, and the argument is deprecated in recent Keras releases.
    emb_i = Flatten()(Embedding(v_size, emb_dim)(w_i))
    emb_j = Flatten()(Embedding(v_size, emb_dim)(w_j))

    ij_dot = Dot(axes=-1)([emb_i, emb_j])

    # Per-word scalar biases, stored as 1-dimensional embeddings.
    b_i = Flatten()(Embedding(v_size, 1)(w_i))
    b_j = Flatten()(Embedding(v_size, 1)(w_j))

    pred = Add()([ij_dot, b_i, b_j])

    def glove_loss(y_true, y_pred):
        """Weighted squared error between the prediction and log(1 + X_ij)."""
        # y_true holds 1 + X_ij, so the raw count is recovered for the weight
        # while the log stays finite for pairs that never co-occur.
        x_ij = y_true - 1.0
        # GloVe weighting f(x) = min(1, (x / x_max) ** alpha): without the cap
        # very frequent pairs receive unbounded weight and dominate the loss.
        weight = K.minimum(K.pow(x_ij / x_max, alpha), 1.0)
        return K.sum(weight * K.square(y_pred - K.log(y_true)))

    model = Model(inputs=[w_i, w_j], outputs=pred)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(loss=glove_loss, optimizer=Adam(learning_rate=learning_rate))
    return model

In [7]:
# Clear the Keras session before (re)building the model, then print its layers.
# NOTE(review): presumably this also resets the auto-generated layer-name
# counters, which save_embeddings relies on ('embedding', 'embedding_1') — confirm.
K.clear_session()
model = create_glove_model(v_size)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 96)        288000      ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 96)        288000      ['input_2[0][0]']                
                                                                                              

  super().__init__(name, **kwargs)


In [8]:
# Reload the co-occurrence matrix in the format load_npz returns it; it is
# indexed with (row, column) id arrays inside the loop below.
cooc_mat = load_npz(os.path.join('cooc_mat.npz'))
batch_size = 128
copy_docs = list(docs)
# Reverse lookup: word id -> word.
index2word = {idx: word for word, idx in tokenizer.word_index.items()}

# Each epoch
for ep in range(3):

    random.shuffle(copy_docs)
    losses = []

    # Each document (i.e. movie plot)
    for doc in copy_docs:

        seq = tokenizer.texts_to_sequences([doc])[0]

        # Positive (target, context) pairs only: negative_samples=0.0, so no
        # negative pairs are generated and every label is 1.
        wpairs, labels = skipgrams(
            sequence=seq,
            vocabulary_size=v_size,
            negative_samples=0.0,
            shuffle=True,
        )

        # Nothing to train on for this document.
        if len(wpairs) == 0:
            continue

        # Split the pairs into two column vectors of word ids.
        sg_in, sg_out = (np.array(ids).reshape(-1, 1) for ids in zip(*wpairs))

        # Regression target: co-occurrence count of each pair, plus one.
        pair_counts = cooc_mat[sg_in[:, 0], sg_out[:, 0]]
        x_ij = np.array(pair_counts).reshape(-1, 1) + 1

        assert np.all(np.array(labels)==1)
        assert x_ij.shape[0] == sg_in.shape[0], 'X_ij {} shape does not sg_in {}'.format(x_ij.shape, sg_in.shape)

        # One pass over this document's pairs, then record the loss on the same pairs.
        pair_inputs = [sg_in, sg_out]
        model.fit(pair_inputs, x_ij, batch_size=batch_size, epochs=1, verbose=0)
        l = model.evaluate(pair_inputs, x_ij, batch_size=batch_size, verbose=0)
        losses.append(l)

    print('Loss in epoch {}: {}'.format(ep, np.mean(losses)))

KeyboardInterrupt: 

In [None]:
def save_embeddings(model,save_dir, tok, v_size):
    """Write the learned word vectors to <save_dir>/embeddings_w.csv.

    The weights of the layers named 'embedding' and 'embedding_1' are summed
    element-wise, and each row is written as: word, followed by its vector
    components. The CSV has no header row and no index column.

    Args:
        model: Model whose `layers` expose `name` and `get_weights()`.
        save_dir: Directory to write into; created if it does not exist.
        tok: Tokenizer-like object with an `index_word` dict (id -> word).
        v_size: Number of rows in the embedding tables.

    Raises:
        ValueError: If the model has no layer named 'embedding' or 'embedding_1'.
    """

    # Id 0 is absent from tok.index_word, so it is labelled manually.
    # Ids the tokenizer never assigned (vocabulary smaller than v_size) get a
    # placeholder label instead of raising KeyError.
    word_list = ["RESERVED"] + [
        tok.index_word.get(w_i, "UNUSED_{}".format(w_i)) for w_i in range(1, v_size)
    ]

    emb_sum = None
    for layer in model.layers:
        if layer.name in ('embedding', 'embedding_1'):
            weights = layer.get_weights()[0]
            emb_sum = weights.copy() if emb_sum is None else emb_sum + weights

    if emb_sum is None:
        raise ValueError(
            "No layer named 'embedding' or 'embedding_1' found in the model"
        )

    emb_w_df = pd.DataFrame(emb_sum)
    emb_w_df.insert(0, "word", word_list)

    os.makedirs(save_dir, exist_ok=True)
    emb_w_df.to_csv(
        os.path.join(save_dir, 'embeddings_w.csv'), index=False, header=None
    )
    
# Write the summed word vectors to datasets/embeddings_w.csv, then save the
# whole model to glove_model.h5.
save_embeddings(model, 'datasets', tokenizer, v_size)
model.save('glove_model.h5')