In [58]:
import os
import zipfile

In [59]:
from urllib.request import urlretrieve


def download_data(url, data_dir):
    """Download a file if not present, and make it sure it's the right size"""

    os.makedirs(data_dir, exist_ok=True)
    
    file_path = os.path.join(data_dir, 'bbc-fulltext.zip')

    if not os.path.exists('file_path'):
        print('Downloading file...')
        filename, _ = urlretrieve(url, file_path)
    else:
        print('File already exists')

    extract_path = os.path.join(data_dir, 'bbc')
    if not os.path.exists(extract_path):

        with zipfile.ZipFile(
            os.path.join(data_dir,'bbc-fulltext.zip'),
            'r'
        ) as zipf:
            zipf.extractall(data_dir)
    
    else:
        print('bbc-fulltext,zip has already been extracted')


In [60]:
url = 'http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip'
download_data(url, 'data')

Downloading file...
bbc-fulltext,zip has already been extracted


In [61]:
def read_data(data_dir):
    news_stories = []
    print('Reading files')
    for root, dirs, files in os.walk(data_dir):
        for fi, f in enumerate(files):
            if 'README' in f:
                continue
            print('.'*fi, f, end='\r')
            with open(os.path.join(root, f), encoding='latin-1') as f:
                story = []
                for row in f:
                    story.append(row.strip())
                story = ''.join(story)
                news_stories.append(story)
    print(f"\nDetected {len(news_stories)} stories")
    return news_stories

In [62]:
news_stories = read_data(os.path.join('data','bbc'))

print(f"{sum([len(story.split(' ')) for story in news_stories])} words in the total news set")
print('Example words (starts):', news_stories[0][:50])
print('Example words (end):', news_stories[-1][:-50])


Reading files
................................................................................................................................................................................................................................................................................................................................................................................................................ 284.txt........ 284.txt...................................................................................... 284.txt
Detected 2225 stories
843863 words in the total news set
Example words (starts): Musicians to tackle US red tapeMusicians' groups a
Example words (end): Hacker threat to Apple's iTunesUsers of Apple's music jukebox iTunes need to update the software to avoid a potential security threat.Hackers can build malicious playlist files which could crash the program and let them seize control of the computer by inserting Trojan code. A new version of iTunes is now availab

In [63]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(
    num_words = None,
    filters = '!"#$%&()*+,-/:;<=>?@[\\]^_{|}~\t\n',
    lower = True,
    split = ' '
)


In [64]:
tokenizer.fit_on_texts(news_stories)

word_index is a dictionary that maps each words to a unique IDs

In [65]:
n_vocab = len(tokenizer.word_index.items()) + 1
print(f"Vocabulary size: {n_vocab}")

print("\nWords at the top")
print('\t', dict(list(tokenizer.word_index.items())[:10]))
print("\nWords at the bottom")
print('\t', dict(list(tokenizer.word_index.items())[-10:]))


Vocabulary size: 47187

Words at the top
	 {'the': 1, 'to': 2, 'of': 3, 'and': 4, 'a': 5, 'in': 6, 'for': 7, 'is': 8, 'that': 9, 'on': 10}

Words at the bottom
	 {'sp.': 47177, 'gameboy.': 47178, 'itunesusers': 47179, 'threat.hackers': 47180, 'inserting': 47181, 'solves': 47182, 'idefence': 47183, '4.7.1.': 47184, '.pls': 47185, '.m3u': 47186}


index_word is a dictionary that maps each unique word ID to the corresponding word

In [91]:
n_vocab_index_word = len(tokenizer.index_word.items()) + 1
print(f"Vocabulary size: {n_vocab_index_word}")

print("\nWords at the top")
print('\t', dict(list(tokenizer.index_word.items())[:10]))
print("\nWords at the bottom")
print('\t', dict(list(tokenizer.index_word.items())[-10:]))


Vocabulary size: 47187

Words at the top
	 {1: 'the', 2: 'to', 3: 'of', 4: 'and', 5: 'a', 6: 'in', 7: 'for', 8: 'is', 9: 'that', 10: 'on'}

Words at the bottom
	 {47177: 'sp.', 47178: 'gameboy.', 47179: 'itunesusers', 47180: 'threat.hackers', 47181: 'inserting', 47182: 'solves', 47183: 'idefence', 47184: '4.7.1.', 47185: '.pls', 47186: '.m3u'}


In [92]:
tokenizer_2 = Tokenizer(
    num_words = 15000,
    filters = '!"#$%&()*+,-/:;<=>?@[\\]^_{|}~\t\n',
    lower = True,
    split = ' ',
    oov_token ='', #OOV = out of vocabulary: in this case the last 15000 words are out of vocabulary because they are more rare then the first 15000th
)

In [93]:
tokenizer_2.fit_on_texts(news_stories)

In [94]:
print(f'Original: {news_stories[0][:100]}')


Original: Musicians to tackle US red tapeMusicians' groups are to tackle US visa regulations which are blamed 


In [95]:
print(f"Sequence IDs: {tokenizer_2.texts_to_sequences([news_stories[0][:100]])[0]}")

Sequence IDs: [2827, 3, 1310, 49, 1236, 1, 881, 24, 3, 1310, 49, 2756, 3851, 35, 24, 2142]


In [96]:
news_sequences = tokenizer.texts_to_sequences(news_stories)

Generating skip-gram

In [97]:
sample_word_ids = news_sequences[0][:5]
sample_phrase = ' '.join([tokenizer.index_word[wid] for wid in sample_word_ids])
print(f"Sample phrase: {sample_phrase}")
print(f"Sample word IDs: {sample_word_ids}\n")

Sample phrase: musicians to tackle us red
Sample word IDs: [2826, 2, 1309, 48, 1235]



In [98]:
window_size = 1 # How many words to consider left and right.

In [99]:
import tensorflow as tf
inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    sequence=sample_word_ids,
    vocabulary_size=n_vocab,
    window_size=window_size,
    negative_samples=1.0,
    shuffle = False,
    categorical = False,
    sampling_table = None,
    seed = None
)

In [103]:
print("Sample skip-grams")

for inp, lbl in zip(inputs,labels):
    print(f"\tInput: {inp}({[tokenizer_2.index_word[wi] for wi in inp]}) /  Label: {lbl}")
         

Sample skip-grams
	Input: [2826, 2](['musicians', 'to']) /  Label: 1
	Input: [2, 2826](['to', 'musicians']) /  Label: 1
	Input: [2, 1309](['to', 'tackle']) /  Label: 1
	Input: [1309, 2](['tackle', 'to']) /  Label: 1
	Input: [1309, 48](['tackle', 'us']) /  Label: 1
	Input: [48, 1309](['us', 'tackle']) /  Label: 1
	Input: [48, 1235](['us', 'red']) /  Label: 1
	Input: [1235, 48](['red', 'us']) /  Label: 1


Generating positive candidates

In [132]:
import numpy as np

inputs, labels = tf.keras.preprocessing.sequence.skipgrams(
    sequence=sample_word_ids,
    vocabulary_size=n_vocab,
    window_size=window_size,
    negative_samples=0,
    shuffle = False,
)

inputs, labels = np.array(inputs), np.array(labels)

In [133]:
print("Sample skip grams")


for inp, lbl in zip(inputs,labels):
    print(f"Input: {inp} ({[tokenizer_2.index_word[wi] for wi in inp]})  / Label:{lbl}")

Sample skip grams
Input: [2826    2] (['ces', 'the'])  / Label:1
Input: [   2 2826] (['the', 'ces'])  / Label:1
Input: [   2 1309] (['the', 'improve'])  / Label:1
Input: [1309    2] (['improve', 'the'])  / Label:1
Input: [1309   48] (['improve', 'about'])  / Label:1
Input: [  48 1309] (['about', 'improve'])  / Label:1
Input: [  48 1235] (['about', 'premiership'])  / Label:1
Input: [1235   48] (['premiership', 'about'])  / Label:1


Generating negative candidates

In [136]:
negative_sampling_candidates, true_excepted_count, sampled_expected_count = tf.random.log_uniform_candidate_sampler(
    true_classes = inputs[:1,1:], #[b, 1] sized tensor
    num_true = 1, #number of true words per example
    num_sampled = 10,
    unique = True,
    range_max = n_vocab,
    name = 'negative_sampling'
    )


In [142]:
print(f'Positive sample: {inputs[:1,1:]} ')
print(f"Negative sample:{negative_sampling_candidates}")
print(f"true_expected_count: {true_excepted_count}")
print(f"sampled_expected_count: {sampled_expected_count}")

Positive sample: [[2]] 
Negative sample:[ 5529 22459 26630    88  2870 38049    67 35905   104   117]
true_expected_count: [[0.26731545]]
sampled_expected_count: [1.6801454e-04 4.1370600e-05 3.4891178e-05 1.0382281e-02 3.2359548e-04
 2.4420295e-05 1.3565268e-02 2.5878446e-05 8.8076908e-03 7.8414343e-03]


Data generator function that generates batches of data for our model

In [145]:
import random

# Must see again

In [167]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(
        n_vocab, sampling_factor = 1e-05
    )


In [257]:
def skip_gram_data_generator(sequences, window_size, batch_size, negative_samples, vocab_size, seed=None):
    #data shuffling
    rand_sequence_ids = np.arange(len(sequences))
    np.random.shuffle(rand_sequence_ids)
    #generating positive skip grams
    for si in rand_sequence_ids:
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequences[si],
            vocabulary_size= vocab_size,
            window_size = window_size,
            negative_samples= 0.0,
            shuffle= False,
            sampling_table= sampling_table, #using subsampling technique
            seed = seed
        )

   
    targets, contexts, labels = [], [], []

    for target_word, context_word in positive_skip_grams:
        context_class = tf.expand_dims(tf.constant([context_word],dtype = 'int64'), 1)

        negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
            true_classes = context_class,
            num_true = 1,
            num_sampled = negative_samples,
            unique = True,
            range_max = vocab_size,
            name = 'negative_sampling'
        )

    # Build context and label vectors (for one taget word)
    context = tf.concat(
        [tf.constant([context_word], dtype = 'int64'), negative_sampling_candidates],
        axis=0)
    label = tf.constant([1] + [0]*negative_samples, dtype = 'int64')

    #Append each element from the training example to global lists
    targets.extend([target_word]*(negative_samples + 1))
    contexts.append(context)
    labels.append(label)

    contexts, targets, labels = np.concatenate(contexts), np.array(targets), np.concatenate(labels)
    #If seed is not provided generate a random number
    if not seed:
        seed = random.randint(0, 10e6)

    np.random.seed(seed)
    np.random.shuffle(contexts)
    np.random.seed(seed)
    np.random.shuffle(targets)
    np.random.seed(seed)
    np.random.shuffle(labels)

    #geraration batches of data
    for eg_id_start in range(0, contexts.shape[0], batch_size):
        yield(
            targets[eg_id_start: min(eg_id_start + batch_size, inputs.shape[0])],
            contexts[eg_id_start: min(eg_id_start + batch_size, inputs.shape[0])]
            ),labels[eg_id_start: min(eg_id_start + batch_size, inputs.shape[0])]




Implemeting the skip-gram architecture

In [258]:
batch_size =4096 #data points in a single batch

embedding_size = 128 # Dimension of the embedding vector

window_size = 1 # Using a window size of 1 on either side of target word
negative_samples = 4 # Number of negative samples generated per example

epochs = 5 # Number of epochs to train for

# We pick a random validation set to sample nearest neighbors
valid_size = 16 # Random set of words to evaluate similarity on
valid_window = 250

np.random.seed(54321)
random.seed(54321)

valid_terms_ids = np.array(random.sample(range(valid_window), valid_size))
valid_terms_ids = np.append(
    valid_terms_ids, random.sample(range(1000, 1000+valid_window), valid_size),
    axis = 0
)

Defining model using  Keras Functional API

In [259]:
#clearing any current running session, to make sure there aren't any other models occupying the hardware
import tensorflow.keras.backend as K
K.clear_session() 

In [260]:
# Inputs - skipgrams() function outputs target, context in that order 
input_1 = tf.keras.layers.Input(shape=(), name='target') # shape is defined has undefined dimension
input_2 = tf.keras.layers.Input(shape=(), name = 'context') # shape is defined has undefined dimension


In [261]:
# Two embeddings layers are used one for the context and one for the target
target_embedding_layer = tf.keras.layers.Embedding(
    input_dim = n_vocab,
    output_dim = embedding_size,
    name = 'target_embedding'
)
context_embedding_layer = tf.keras.layers.Embedding(
    input_dim = n_vocab,
    output_dim = embedding_size,
    name = 'context_embedding'
)

In [262]:
#lookup outputs of the embedding layers
target_out = target_embedding_layer(input_1)
context_out = context_embedding_layer(input_2)

In [263]:
#computing the dot product between the two
out = tf.keras.layers.Dot(axes = -1)([context_out, target_out])

In [264]:
#Defining the model
skip_gram_model = tf.keras.models.Model(inputs = [input_1,input_2], outputs = out, name='skip_gram_model')


In [265]:
#compiling the model
skip_gram_model.compile(loss = tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer='adam', metrics = ['accuracy'])

In [266]:
skip_gram_model.summary()

Model: "skip_gram_model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 context (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 target (InputLayer)            [(None,)]            0           []                               
                                                                                                  
 context_embedding (Embedding)  (None, 128)          6039936     ['context[0][0]']                
                                                                                                  
 target_embedding (Embedding)   (None, 128)          6039936     ['target[0][0]']                 
                                                                                    

In [267]:
#training the model and evaluating the model

class ValidationCallback(tf.keras.callbacks.Callback):

    def __init__(self, valid_term_ids, model_with_embeddings, tokenizer):

        self.valid_term_ids = valid_terms_ids
        self.model_with_embeddings = model_with_embeddings
        self.tokenizer = tokenizer

        super().__init__()

    def on_epoch_end(self, epoch, logs = None):
        """Validation logic"""

        #Using context embeddings to get the most similar 
        # Other strategies include: using target embeddings, mean embeddings after avaraging context/target

        embedding_weights = self.model_with_embeddings.get_layer("context_embedding").get_weights()[0]
        normalized_embeddings = embedding_weights / np.sqrt(np.sum(embedding_weights**2, axis = 1, keepdims=True))

        # Get the embeddings corresping to valid_term_ids 
        valid_embeddings = normalized_embeddings[self.valid_term_ids,:]

        # Compute the similarity between valid_term_ids and all the embeddings
        # V x d (d x D) => V x D
        top_k = 5 # Top k items will be displayed
        similarity = np.dot(valid_embeddings, normalized_embeddings.T)

        # Invert similarity matrix to negative
        # Ignore the first one because that would be the same word as the probe word
        similarity_top_k = np.argsort(-similarity, axis=1)[:, 1: top_k+1]

        #Print the output
        for i, term_id in enumerate(valid_terms_ids):

            similar_word_str = ','.join([self.tokenizer.index_word[j] for j in similarity_top_k[i, :] if j > 1])
            print(f"{self.tokenizer.index_word[term_id]}): {similar_word_str}")

        print('\n')


In [270]:
skipgram_validation_callback = ValidationCallback(valid_terms_ids, skip_gram_model, tokenizer)

for ei in range(epochs):

    print(f"Epoch: {ei+1}/{epochs} started")

    news_skip_gram_gen = skip_gram_data_generator(
        news_sequences, window_size, batch_size, negative_samples, n_vocab
    )

    skip_gram_model.fit(
        news_skip_gram_gen, epochs = 1,
        callbacks = skipgram_validation_callback,
    )

Epoch: 1/5 started
      1/Unknown - 0s 21ms/step - loss: 0.6917 - accuracy: 0.8000labour): tass,stages.,adsmusic,filmhollywood,charge.in
based): corruption.,144,dreamed,serene,provinces
has): 193,1938.,glory.,two,vima
you): nevo,fedorova,kitchen.,idols',offputting
also): dripped,â£24.8m,playability,parliament.however,robbie's
between): vouch,keegan.,together.any,phoned,hornsey
go): 1921,davies,1985's,trusted.,khoo
film): buy.,activities,existence.,â£28.7m,msps.the
who): scots,thayer,5.11,nottinghamshire,cruel
another): predominately,lapsed,426m,underestimating,decorating
make): fonda,extortionists,sailed,warwick's,hardcourts
since): turkcell's,low.the,printing,liverpoolbournemouth,entrenches
me): dealings,migrated,2004.however,thanou.,spend.
so): remotes,fold.the,found,monteiro,'victory'
added): â£280.2m,higher.,woman.,juan,nestle's
mr): francewales,volleyed,surge,limbo,yoxall
figure): achieve.,kse,counts,lea,fis.
roddick): 271st,convener,wounded,eurozone,scrum'.
income): safeguarded.