In [1]:
!pip install keras
!pip install tensorflow
!pip install plot_keras_history
!pip install seaborn

Collecting plot_keras_history
  Downloading plot_keras_history-1.1.30.tar.gz (8.6 kB)
Collecting sanitize_ml_labels>=1.0.28
  Downloading sanitize_ml_labels-1.0.29.tar.gz (7.4 kB)
Collecting compress_json
  Downloading compress_json-1.0.4.tar.gz (4.7 kB)
Building wheels for collected packages: plot-keras-history, sanitize-ml-labels, compress-json
  Building wheel for plot-keras-history (setup.py) ... [?25l[?25hdone
  Created wheel for plot-keras-history: filename=plot_keras_history-1.1.30-py3-none-any.whl size=8794 sha256=f100a5b943915da2434b11e99a58ebb7250a9c66352532d5744e1ab8302402a6
  Stored in directory: /root/.cache/pip/wheels/b0/60/47/8c5aa37c06be5e97879ec467bc2e6a30b315d95f662c63a503
  Building wheel for sanitize-ml-labels (setup.py) ... [?25l[?25hdone
  Created wheel for sanitize-ml-labels: filename=sanitize_ml_labels-1.0.29-py3-none-any.whl size=7878 sha256=361f42e1548cdf97ab7d96e44dbb5aee9a6f74f8b7403c9e6b422d0d3608018d
  Stored in directory: /root/.cache/pip/wheels/c2/f5

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

from keras.utils import np_utils
from keras.preprocessing import sequence

import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, Reshape
from keras.layers import Input
from keras.models import Model
from keras.layers import dot
from tensorflow.keras.activations import relu
from nltk import word_tokenize, sent_tokenize
from gensim.corpora.dictionary import Dictionary
import numpy as np

from keras.preprocessing.sequence import skipgrams
import gensim


In [3]:
# using nltk tokenizer.  
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
#Data Preparation 

AlotOftext = """Language users never choose words randomly, and language is essentially
non-random. Statistical hypothesis testing uses a null hypothesis, which
posits randomness. Hence, when we look at linguistic phenomena in corpora, 
the null hypothesis will never be true. Moreover, where there is enough
data, we shall (almost) always be able to establish that it is not true. In
corpus studies, we frequently do have enough data, so the fact that a relation 
between two phenomena is demonstrably non-random, does not support the inference 
that it is not arbitrary. We present experimental evidence
of how arbitrary associations between word frequencies and corpora are
systematically non-random. We review literature in which hypothesis testing 
has been used, and show how it has often led to unhelpful or misleading results.""".lower()



#Tokenize text
tokenized_text = [word_tokenize(sent) for sent in sent_tokenize(AlotOftext)]

#Create Vocab as a Dictionary
vocab = Dictionary(tokenized_text)
print(dict(vocab.items()))

print(vocab.token2id['corpora'])
print(vocab[2])
sent0 = tokenized_text[0]
print(vocab.doc2idx(sent0))

vocab.add_documents([['PAD']])
dict(vocab.items())
print(vocab.token2id['PAD'])

corpusByWordID = list()
for sent in  tokenized_text:
    corpusByWordID.append(vocab.doc2idx(sent))

vocab_size = len(vocab)
embed_size = 100
hidden_dim=100
window_size = 2 # context window size

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(vocab.items())[:10])

{0: ',', 1: '.', 2: 'and', 3: 'choose', 4: 'essentially', 5: 'is', 6: 'language', 7: 'never', 8: 'non-random', 9: 'randomly', 10: 'users', 11: 'words', 12: 'a', 13: 'hypothesis', 14: 'null', 15: 'posits', 16: 'randomness', 17: 'statistical', 18: 'testing', 19: 'uses', 20: 'which', 21: 'at', 22: 'be', 23: 'corpora', 24: 'hence', 25: 'in', 26: 'linguistic', 27: 'look', 28: 'phenomena', 29: 'the', 30: 'true', 31: 'we', 32: 'when', 33: 'will', 34: '(', 35: ')', 36: 'able', 37: 'almost', 38: 'always', 39: 'data', 40: 'enough', 41: 'establish', 42: 'it', 43: 'moreover', 44: 'not', 45: 'shall', 46: 'that', 47: 'there', 48: 'to', 49: 'where', 50: 'arbitrary', 51: 'between', 52: 'corpus', 53: 'demonstrably', 54: 'do', 55: 'does', 56: 'fact', 57: 'frequently', 58: 'have', 59: 'inference', 60: 'relation', 61: 'so', 62: 'studies', 63: 'support', 64: 'two', 65: 'are', 66: 'associations', 67: 'evidence', 68: 'experimental', 69: 'frequencies', 70: 'how', 71: 'of', 72: 'present', 73: 'systematically',

In [5]:
# Create CBOW Training data
def generate_cbow_context_word_pairs(corpusByID, window_size, vocab_size):
    context_length = window_size*2
    X=[]
    Y=[]
    for sent in corpusByID:
        sentence_length = len(sent)
        for index, word in enumerate(sent):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([sent[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)
            if start<0:
                x = sequence.pad_sequences(context_words, maxlen=context_length,padding='pre',value=vocab.token2id['PAD'])
                y = np_utils.to_categorical(label_word, vocab_size)
                X.append(x)
                Y.append(y)
                continue
            if end>=sentence_length:
                x = sequence.pad_sequences(context_words, maxlen=context_length,padding='post',value=vocab.token2id['PAD'])
                y = np_utils.to_categorical(label_word, vocab_size)
                X.append(x)
                Y.append(y)
                continue
            else:
                X.append(sequence.pad_sequences(context_words, maxlen=context_length))
                y = np_utils.to_categorical(label_word, vocab_size)
                Y.append(y)
                continue
           
    return X,Y
            
# Test this out for some samples


X,Y = generate_cbow_context_word_pairs(corpusByWordID, window_size, vocab_size) 
   
for x, y in zip(X,Y):
    print('Context (X):', [vocab[w] for w in x[0]], '-> Target (Y):', vocab[np.argwhere(y[0])[0][0]])


Context (X): ['PAD', 'PAD', 'users', 'never'] -> Target (Y): language
Context (X): ['PAD', 'language', 'never', 'choose'] -> Target (Y): users
Context (X): ['language', 'users', 'choose', 'words'] -> Target (Y): never
Context (X): ['users', 'never', 'words', 'randomly'] -> Target (Y): choose
Context (X): ['never', 'choose', 'randomly', ','] -> Target (Y): words
Context (X): ['choose', 'words', ',', 'and'] -> Target (Y): randomly
Context (X): ['words', 'randomly', 'and', 'language'] -> Target (Y): ,
Context (X): ['randomly', ',', 'language', 'is'] -> Target (Y): and
Context (X): [',', 'and', 'is', 'essentially'] -> Target (Y): language
Context (X): ['and', 'language', 'essentially', 'non-random'] -> Target (Y): is
Context (X): ['language', 'is', 'non-random', '.'] -> Target (Y): essentially
Context (X): ['is', 'essentially', '.', 'PAD'] -> Target (Y): non-random
Context (X): ['essentially', 'non-random', 'PAD', 'PAD'] -> Target (Y): .
Context (X): ['PAD', 'PAD', 'hypothesis', 'testing']

In [6]:
cbow = Sequential()

cbow.add(Embedding(input_dim=88, output_dim=100, input_length=4)) #N:100 V:23
cbow.add(Lambda(lambda x: relu(K.mean(x, axis=1)), output_shape=(1,100)))

cbow.add(Dense(88, activation='sigmoid'))
cbow.compile(loss='categorical_crossentropy', optimizer='sgd')
cbow.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            8800      
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 88)                8888      
                                                                 
Total params: 17,688
Trainable params: 17,688
Non-trainable params: 0
_________________________________________________________________


In [7]:
#Train the model

for epoch in range(100):
    loss = 0.
    for x, y in zip(X,Y):
        loss += cbow.train_on_batch(x, y)
    print(epoch, loss)

0 644.529953956604
1 642.6628098487854
2 640.8561697006226
3 639.1088786125183
4 637.4187512397766
5 635.7886962890625
6 634.2181217670441
7 632.70383644104
8 631.2472808361053
9 629.8468196392059
10 628.5019631385803
11 627.2107443809509
12 625.9719932079315
13 624.7825667858124
14 623.640787601471
15 622.5446374416351
16 621.490226984024
17 620.4775929450989
18 619.5040559768677
19 618.5655689239502
20 617.6607971191406
21 616.7875819206238
22 615.941009759903
23 615.1200044155121
24 614.3197145462036
25 613.5392379760742
26 612.7749700546265
27 612.0258460044861
28 611.2883203029633
29 610.5604948997498
30 609.8407635688782
31 609.126074552536
32 608.4147922992706
33 607.7050426006317
34 606.9944343566895
35 606.2834105491638
36 605.5687637329102
37 604.849258184433
38 604.1248989105225
39 603.3927493095398
40 602.6533381938934
41 601.9049561023712
42 601.1480689048767
43 600.3802502155304
44 599.6040115356445
45 598.8185274600983
46 598.0249547958374
47 597.2244116067886
48 596.417

In [8]:
## Save the wordvectors
f = open('/content/drive/My Drive/TPML/day1/Cbow_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = cbow.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [9]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/My Drive/TPML/day1/Cbow_vectors.txt', binary=False)

w2v.most_similar(positive=['language'])

[('words', 0.46368470788002014),
 ('randomly', 0.3626438081264496),
 ('non-random', 0.3214108943939209),
 ('which', 0.2801418602466583),
 ('the', 0.2698863744735718),
 ('.', 0.2316035032272339),
 ('hypothesis', 0.2199399769306183),
 ('choose', 0.21757295727729797),
 ('users', 0.20526540279388428),
 ('posits', 0.2004464864730835)]

In [10]:
#Create Skipgram Training data 

# generate skip-grams with both positive and negative examples
skip_grams = [skipgrams(sent, vocabulary_size=vocab_size, window_size=2) for sent in corpusByWordID]

# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        vocab[pairs[i][0]], pairs[i][0],           
        vocab[pairs[i][1]], pairs[i][1], 
        labels[i]))

(users (10), evidence (67)) -> 0
(randomly (9), there (47)) -> 0
(essentially (4), is (5)) -> 1
(choose (3), linguistic (26)) -> 0
(randomly (9), choose (3)) -> 1
(language (6), frequently (57)) -> 0
(users (10), never (7)) -> 1
(words (11), never (7)) -> 1
(words (11), randomly (9)) -> 1
(language (6), word (74)) -> 0


In [32]:
#define the skip-gram model

input_word = Input((1,))
input_context_word = Input((1,))

word_embedding    = Embedding(input_dim=88, output_dim=100,input_length=1,name='word_embedding')
context_embedding = Embedding(input_dim=88, output_dim=100,input_length=1,name='conotext_embedding')

word_embedding = word_embedding(input_word)
word_embedding_layer = Reshape((100, 1))(word_embedding)

context_embedding = context_embedding(input_context_word)
context_embedding_layer = Reshape((100, 1))(context_embedding)

# now perform the dot product operation  
dot_product = dot([word_embedding_layer, context_embedding_layer], axes=1)
dot_product = Reshape((1,))(dot_product)

# add the sigmoid output layer
outputLayer = Dense(1, activation='sigmoid')(dot_product)

model = Model(inputs=[input_word, input_context_word], outputs=outputLayer)
model.compile(loss='binary_crossentropy', optimizer='adam')

# view model summary
print(model.summary())

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_13 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_14 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 word_embedding (Embedding)     (None, 1, 100)       8800        ['input_13[0][0]']               
                                                                                                  
 conotext_embedding (Embedding)  (None, 1, 100)      8800        ['input_14[0][0]']               
                                                                                            

In [44]:
#train the model

for epoch in range(1, 100):
    loss = 0
    for i, elem in enumerate(skip_grams):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        loss += model.train_on_batch(X,Y)  

    print('Epoch:', epoch, 'Loss:', loss)

Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 1 Loss: 0.5551827773451805
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 2 Loss: 0.5512690395116806
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 3 Loss: 0.5475252717733383
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 4 Loss: 0.5439421236515045
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 5 Loss: 0.5405109599232674
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 6 Loss: 0.5372236967086792
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 7 Loss: 0.534072682261467
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 8 Loss: 0.5310508050024509
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 9 Loss: 0.5281513631343842
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 10 Loss: 0.5253680236637592
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 11 Loss: 0.5226949714124203
Processed 0 (skip_fi

In [45]:
#get the embeding matrix
weights = model.get_weights()
## Save the wordvectors
f = open('/content/drive/My Drive/TPML/day1/skipgram_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [46]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/My Drive/TPML/day1/skipgram_vectors.txt', binary=False)
w2v.most_similar(positive=['the'])


[('look', 0.3385516405105591),
 ('a', 0.32431089878082275),
 ('uses', 0.3201105296611786),
 ('posits', 0.2946311831474304),
 ('testing', 0.28061509132385254),
 ('relation', 0.27493759989738464),
 ('it', 0.26850587129592896),
 ('which', 0.26102298498153687),
 ('does', 0.2528742551803589),
 ('review', 0.2403210699558258)]

In [None]:
#Excerise: 
#modeify the skipegram_model to share the same embeding layer between word and context
#Discussion: which is better? Why?  

In [36]:
#modeify the skipegram_share_model
input_s_word = Input((1,))
input_s_context_word = Input((1,))

share_embedding   = Embedding(input_dim=88, output_dim=100,input_length=1,name='share_embedding')

word_embedding = share_embedding(input_s_word)
word_embedding_layer = Reshape((100, 1))(word_embedding)

context_embedding = share_embedding(input_s_context_word)
context_embedding_layer = Reshape((100, 1))(context_embedding)

# now perform the dot product operation  
dot_product = dot([word_embedding_layer, context_embedding_layer], axes=1)
dot_product = Reshape((1,))(dot_product)

# add the sigmoid output layer
outputLayer = Dense(1, activation='sigmoid')(dot_product)

share_model = Model(inputs=[input_s_word, input_s_context_word], outputs=outputLayer)
share_model.compile(loss='binary_crossentropy', optimizer='adam')

# view model summary
print(share_model.summary())

Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_15 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 input_16 (InputLayer)          [(None, 1)]          0           []                               
                                                                                                  
 share_embedding (Embedding)    (None, 1, 100)       8800        ['input_15[0][0]',               
                                                                  'input_16[0][0]']               
                                                                                                  
 reshape_15 (Reshape)           (None, 100, 1)       0           ['share_embedding[0][0]']  

In [47]:
#train the model

for epoch in range(1, 100):
    loss = 0
    for i, elem in enumerate(skip_grams):
        pair_first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        pair_second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [pair_first_elem, pair_second_elem]
        Y = labels
        if i % 10000 == 0:
            print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
        loss += share_model.train_on_batch(X,Y)  

    print('Epoch:', epoch, 'Loss:', loss)

Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 1 Loss: 0.793669693171978
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 2 Loss: 0.7903274074196815
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 3 Loss: 0.787090927362442
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 4 Loss: 0.783955529332161
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 5 Loss: 0.7809169143438339
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 6 Loss: 0.7779709696769714
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 7 Loss: 0.7751136943697929
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 8 Loss: 0.7723414823412895
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 9 Loss: 0.7696507349610329
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 10 Loss: 0.7670380920171738
Processed 0 (skip_first, skip_second, relevance) pairs
Epoch: 11 Loss: 0.7645004242658615
Processed 0 (skip_firs

In [48]:
#get the embeding matrix
weights = share_model.get_weights()
## Save the wordvectors
f = open('/content/drive/My Drive/TPML/day1/skipgram_vectors.txt' ,'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = share_model.get_weights()[0]
for key in vocab:
    str_vec = ' '.join(map(str, list(vectors[key, :])))
    f.write('{} {}\n'.format(vocab[key], str_vec))
f.close()

In [49]:
## Load the vectors back and validate
w2v = gensim.models.KeyedVectors.load_word2vec_format('/content/drive/My Drive/TPML/day1/skipgram_vectors.txt', binary=False)
w2v.most_similar(positive=['the'])


[('fact', 0.6873430013656616),
 ('so', 0.6473961472511292),
 ('support', 0.5860694050788879),
 ('null', 0.562027096748352),
 ('corpora', 0.5151245594024658),
 ('do', 0.3742136061191559),
 ('not', 0.3738800287246704),
 ('have', 0.3358570337295532),
 ('that', 0.303596556186676),
 ('frequently', 0.30346593260765076)]


#Discussion: which is better? Why? 
The skipegarm_model shared the same embedding layer is better.
Because compared with the result of the separated one,it has higher similarity rank.