### CBOW Algorithm

In [23]:
import random
import tensorflow as tf
import os 
import zipfile
import numpy as np

In [24]:
from urllib.request import urlretrieve


def download_data(url, data_dir):
    """Download a file if not present, and make it sure it's the right size"""

    os.makedirs(data_dir, exist_ok=True)
    
    file_path = os.path.join(data_dir, 'bbc-fulltext.zip')

    if not os.path.exists('file_path'):
        print('Downloading file...')
        filename, _ = urlretrieve(url, file_path)
    else:
        print('File already exists')

    extract_path = os.path.join(data_dir, 'bbc')
    if not os.path.exists(extract_path):

        with zipfile.ZipFile(
            os.path.join(data_dir,'bbc-fulltext.zip'),
            'r'
        ) as zipf:
            zipf.extractall(data_dir)
    
    else:
        print('bbc-fulltext,zip has already been extracted')


In [25]:
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'
download_data(url, 'data')

Downloading file...
bbc-fulltext,zip has already been extracted


In [26]:
def read_data(data_dir):
    news_stories = []
    print('Reading files')
    for root, dirs, files in os.walk(data_dir):
        for fi, f in enumerate(files):
            if 'README' in f:
                continue
            print('.'*fi, f, end='\r')
            with open(os.path.join(root, f), encoding='latin-1') as f:
                story = []
                for row in f:
                    story.append(row.strip())
                story = ''.join(story)
                news_stories.append(story)
    print(f"\nDetected {len(news_stories)} stories")
    return news_stories

In [27]:
news_stories = read_data(os.path.join('data','bbc'))

print(f"{sum([len(story.split(' ')) for story in news_stories])} words in the total news set")
print('Example words (starts):', news_stories[0][:50])
print('Example words (end):', news_stories[-1][:-50])

Reading files
................................................................................................................................................................................................................................................................................................................................................................................................................ 284.txt........ 284.txt...................................................................................... 284.txt
Detected 2225 stories
843863 words in the total news set
Example words (starts): Musicians to tackle US red tapeMusicians' groups a
Example words (end): Hacker threat to Apple's iTunesUsers of Apple's music jukebox iTunes need to update the software to avoid a potential security threat.Hackers can build malicious playlist files which could crash the program and let them seize control of the computer by inserting Trojan code. A new version of iTunes is now availab

In [28]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(
    num_words = None,
    filters = '!"#$%&()*+,-/:;<=>?@[\\]^_{|}~\t\n',
    lower = True,
    split = ' '
)

In [29]:
tokenizer.fit_on_texts(news_stories)

In [30]:
n_vocab = len(tokenizer.word_index.items()) + 1

In [31]:
news_sequences = tokenizer.texts_to_sequences(news_stories)

In [32]:

def cbow_grams(sequence, vocabulary_size, window_size = 4, negative_samples = 1., shuffle = True, categorical = False, sampling_table = None, seed = None):

    targets, contexts, labels = [], [], []

    for i, wi in enumerate(sequence):


        if not wi or i < window_size or i + 1 > len(sequence) - window_size:
            continue
        if sampling_table is not None:
            if sampling_table[wi] < random.random():
                continue
        
        window_start = max(0, i - window_size)
        window_end = min(len(sequence), i + window_size + 1)

        context_words = [ wj for j, wj in enumerate(sequence[window_start:window_end])  if j+window_start != i]
        target_word = wi

        context_classes = tf.expand_dims(tf.constant(context_words, dtype = 'int64'), 0)

        negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
            true_classes= context_classes,
            num_true= window_size * 2,
            num_sampled = negative_samples,
            unique = True,
            range_max = vocabulary_size,
            name = 'negative_sampling'
        )

        #build context and label vectors
        negative_targets = negative_sampling_candidates.numpy().tolist()

        target = [target_word] + negative_targets
        label = [1] + [0]*negative_samples

        #Append each element from the training example to global list
        targets.extend(target)
        contexts.extend([context_words]*(negative_samples+1))
        labels.extend(label)

    couples = list(zip(targets, contexts))

    seed = random.randint(0, 10e6)
    random.seed(seed)
    random.shuffle(couples)
    random.seed(seed)
    random.shuffle(labels)

    return couples, labels

window_size = 1 # How many words to consider left and right

inputs, labels = cbow_grams(
    tokenizer.texts_to_sequences(["i am going to the store"])[0],
    vocabulary_size = len(tokenizer.word_index.items()) + 1,
    window_size= window_size, negative_samples=4, shuffle = False,
    categorical = False,
    sampling_table= None,
    seed = None
)

window_size = 1 # How many words to consider left and right.

inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    tokenizer.texts_to_sequences([news_stories[0][:150]])[0], 
    vocabulary_size=len(tokenizer.word_index.items())+1, 
    window_size=window_size, negative_samples=4, shuffle=False,
    categorical=False, sampling_table=None, seed=None
)

i = 0
for inp, lbl in zip(inputs, labels):
    i += 1
    print(f"Input: {inp} ({[tokenizer.index_word[wi] for wi in inp]}) / Label: {lbl}")


Input: [2826, 2] (['musicians', 'to']) / Label: 1
Input: [2, 2826] (['to', 'musicians']) / Label: 1
Input: [2, 1309] (['to', 'tackle']) / Label: 1
Input: [1309, 2] (['tackle', 'to']) / Label: 1
Input: [1309, 48] (['tackle', 'us']) / Label: 1
Input: [48, 1309] (['us', 'tackle']) / Label: 1
Input: [48, 1235] (['us', 'red']) / Label: 1
Input: [1235, 48] (['red', 'us']) / Label: 1
Input: [1235, 24644] (['red', "tapemusicians'"]) / Label: 1
Input: [24644, 1235] (["tapemusicians'", 'red']) / Label: 1
Input: [24644, 880] (["tapemusicians'", 'groups']) / Label: 1
Input: [880, 24644] (['groups', "tapemusicians'"]) / Label: 1
Input: [880, 23] (['groups', 'are']) / Label: 1
Input: [23, 880] (['are', 'groups']) / Label: 1
Input: [23, 2] (['are', 'to']) / Label: 1
Input: [2, 23] (['to', 'are']) / Label: 1
Input: [2, 1309] (['to', 'tackle']) / Label: 1
Input: [1309, 2] (['tackle', 'to']) / Label: 1
Input: [1309, 48] (['tackle', 'us']) / Label: 1
Input: [48, 1309] (['us', 'tackle']) / Label: 1
Input:

In [33]:
window_size = 1 # How many words to consider left and right.

inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    tokenizer.texts_to_sequences([news_stories[0][:150]])[0], 
    vocabulary_size=len(tokenizer.word_index.items())+1, 
    window_size=window_size, negative_samples=4, shuffle=False,
    categorical=False, sampling_table=None, seed=None
)

i = 0
for inp, lbl in zip(inputs, labels):
    i += 1
    print(f"Input: {inp} ({[tokenizer.index_word[wi] for wi in inp]}) / Label: {lbl}")

Input: [2826, 2] (['musicians', 'to']) / Label: 1
Input: [2, 2826] (['to', 'musicians']) / Label: 1
Input: [2, 1309] (['to', 'tackle']) / Label: 1
Input: [1309, 2] (['tackle', 'to']) / Label: 1
Input: [1309, 48] (['tackle', 'us']) / Label: 1
Input: [48, 1309] (['us', 'tackle']) / Label: 1
Input: [48, 1235] (['us', 'red']) / Label: 1
Input: [1235, 48] (['red', 'us']) / Label: 1
Input: [1235, 24644] (['red', "tapemusicians'"]) / Label: 1
Input: [24644, 1235] (["tapemusicians'", 'red']) / Label: 1
Input: [24644, 880] (["tapemusicians'", 'groups']) / Label: 1
Input: [880, 24644] (['groups', "tapemusicians'"]) / Label: 1
Input: [880, 23] (['groups', 'are']) / Label: 1
Input: [23, 880] (['are', 'groups']) / Label: 1
Input: [23, 2] (['are', 'to']) / Label: 1
Input: [2, 23] (['to', 'are']) / Label: 1
Input: [2, 1309] (['to', 'tackle']) / Label: 1
Input: [1309, 2] (['tackle', 'to']) / Label: 1
Input: [1309, 48] (['tackle', 'us']) / Label: 1
Input: [48, 1309] (['us', 'tackle']) / Label: 1
Input:

### Implementing the model

In [34]:
batch_size = 4096 # Data points in a single batch

embedding_size = 128 # Dimension of the embedding vector

window_size = 1 # Using window size of 1 on either side of the target word
epochs = 5 
negative_samples = 4 # Number of negative samples generated per example

#Picking a random validation set to sample nearest neighbors 
valid_size = 16 # Random set of words to evaluate similarity on.
# We sample valid datapoints randomly from a large window without always being deterministic
valid_window = 250

# When selecting valid examples, we select some of the most frequent words as well as
# some moderately rare words as well
np.random.seed(54321)
random.seed(54321)

valid_term_ids = np.array(random.sample(range(valid_window), valid_size))
valid_term_ids = np.append(
    valid_term_ids, random.sample(range(1000, 1000+valid_window), valid_size),
    axis=0
)

In [35]:
valid_term_ids

array([ 125,  200,   17,   62,   43,  141,  135,  100,   44,  234,   82,
        131,  209,   72,  175,   30, 1007, 1194, 1073, 1114, 1006, 1019,
       1169, 1029, 1132, 1154, 1049, 1199, 1180, 1053, 1008, 1113])

In [36]:
import tensorflow.keras.backend as K

K.clear_session()


# Inputs; target input layer will have the final shape [None]
# context will have [None, 2xwindow_size] shape
input_1 = tf.keras.layers.Input(shape=())
input_2 = tf.keras.layers.Input(shape=(window_size*2,))

# Target and context embedding layers
target_embedding_layer = tf.keras.layers.Embedding(
    input_dim=n_vocab, output_dim=embedding_size, name='target_embedding'
)

context_embedding_layer = tf.keras.layers.Embedding(
    input_dim=n_vocab, output_dim=embedding_size, name='context_embedding'
)

# Outputs of the target and context embedding lookups
context_out = context_embedding_layer(input_2)
target_out = target_embedding_layer(input_1)

# Taking the mean over the all the context words to produce [None, embedding_size]
mean_context_out = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=1))(context_out)

# Computing the dot product between the two 
out = tf.keras.layers.Dot(axes=-1)([context_out, target_out])

cbow_model = tf.keras.models.Model(inputs=[input_1, input_2], outputs=out, name='cbow_model')

cbow_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), 
    optimizer='adam'
)

cbow_model.summary()

Model: "cbow_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 2)]          0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 context_embedding (Embedding)  (None, 2, 128)       6039936     ['input_2[0][0]']                
                                                                                                  
 target_embedding (Embedding)   (None, 128)          6039936     ['input_1[0][0]']                
                                                                                         

In [37]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(
        n_vocab, sampling_factor = 1e-05
    )


In [38]:
def cbow_data_generator(sequences, window_size, batch_size, negative_samples):
    
    rand_sequence_ids = np.arange(len(sequences))                    
    np.random.shuffle(rand_sequence_ids)

    for si in rand_sequence_ids:
        inputs, labels = cbow_grams(
            sequences[si], 
            vocabulary_size=n_vocab, 
            window_size=window_size, 
            negative_samples=negative_samples, 
            shuffle=True,
            sampling_table=sampling_table,
            seed=None
        )
        
        inputs_context, inputs_target, labels = np.array([inp[1] for inp in inputs]), np.array([inp[0] for inp in inputs]), np.array(labels).reshape(-1,1)
        
        assert inputs_context.shape[0] == inputs_target.shape[0]
        assert inputs_context.shape[0] == labels.shape[0]
        
        #print(inputs_context.shape, inputs_target.shape, labels.shape)
        for eg_id_start in range(0, inputs_context.shape[0], batch_size):            
            
            yield (
                inputs_target[eg_id_start: min(eg_id_start+batch_size, inputs_target.shape[0])], 
                inputs_context[eg_id_start: min(eg_id_start+batch_size, inputs_context.shape[0]),:]
            ), labels[eg_id_start: min(eg_id_start+batch_size, labels.shape[0])]

In [42]:
#training the model and evaluating the model

class ValidationCallback(tf.keras.callbacks.Callback):

    def __init__(self, valid_term_ids, model_with_embeddings, tokenizer):

        self.valid_term_ids = valid_term_ids
        self.model_with_embeddings = model_with_embeddings
        self.tokenizer = tokenizer

        super().__init__()

    def on_epoch_end(self, epoch, logs = None):
        """Validation logic"""

        #Using context embeddings to get the most similar 
        # Other strategies include: using target embeddings, mean embeddings after avaraging context/target

        embedding_weights = self.model_with_embeddings.get_layer("context_embedding").get_weights()[0]
        normalized_embeddings = embedding_weights / np.sqrt(np.sum(embedding_weights**2, axis = 1, keepdims=True))

        # Get the embeddings corresping to valid_term_ids 
        valid_embeddings = normalized_embeddings[self.valid_term_ids,:]

        # Compute the similarity between valid_term_ids and all the embeddings
        # V x d (d x D) => V x D
        top_k = 5 # Top k items will be displayed
        similarity = np.dot(valid_embeddings, normalized_embeddings.T)

        # Invert similarity matrix to negative
        # Ignore the first one because that would be the same word as the probe word
        similarity_top_k = np.argsort(-similarity, axis=1)[:, 1: top_k+1]

        #Print the output
        for i, term_id in enumerate(valid_term_ids):

            similar_word_str = ','.join([self.tokenizer.index_word[j] for j in similarity_top_k[i, :] if j > 1])
            print(f"{self.tokenizer.index_word[term_id]}): {similar_word_str}")

        print('\n')


In [43]:
cbow_validation_callback = ValidationCallback(valid_term_ids, cbow_model, tokenizer)

for ei in range(epochs):
    print(f"Epoch: {ei+1}/{epochs} started")
    news_cbow_gen = cbow_data_generator(news_sequences, window_size, batch_size, negative_samples)
    cbow_model.fit(
        news_cbow_gen, 
        epochs=1, 
        callbacks=cbow_validation_callback,         
    )


Epoch: 1/5 started
   2227/Unknown - 46s 20ms/step - loss: 0.4886labour): conservative,basic,banned,midfield,germany's
based): meanwhile,documentary,chicago,capital,dan
has): had,have,felt,david,scale
you): format,association.,limited,daniel,actually
also): democratic,honour,salary,wooden,deeply
between): boyd,gmt,anderson,powers,prison
go): won't,profit,criminal,alicia,hollywood
film): micro,king,french,smaller,opening
who): pressure,whereas,stuart,respect,legend
another): near,department,easily,prior,federation
make): memory,totally,honour,death,statement
since): safety,euro,indeed,darren,nicolas
me): anderson,kevin,injury,stadium,event
so): absolutely,roy,believed,park,germany
added): won't,king,gareth,expectations,andy
mr): japan,charles,total,parent,star
figure): veteran,anderson,hollywood,estimated,web
roddick): physical,management,systems,television,johnson
income): survey,including,reforms,gadget,budget
wins): basis,carolina,machine,ronaldo,opera
attempt): services.,physical,an