# **43177 STUTI KUMAR**

# **A5 - CBOW**

In [None]:
#import libraries
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import gensim

In [None]:
#read data, keeping only the lines that contain at least two spaces
#(i.e. three or more space-separated pieces). The context manager closes
#the file as soon as the lines have been collected.
#NOTE: `data` is only the (now closed) file handle; the corpus itself is
#held in `corona_data`.
with open('coronaData.txt','r') as data:
    corona_data = [text for text in data if text.count(' ') >= 2]

#tokenize every word in the dataset and fit data to the tokenizer;
#afterwards each sentence is a list of integer word indices
vectorize = Tokenizer()
vectorize.fit_on_texts(corona_data)
corona_data = vectorize.texts_to_sequences(corona_data)

#vocabulary size: one slot per distinct word, plus one because the word
#indices in vectorize.word_index start at 1 and index 0 is left for padding
word_count = len(vectorize.word_index) + 1
#total_vocab sizes the embedding table, the one-hot targets and the softmax
#layer, so it has to be the vocabulary size rather than the number of
#tokens in the corpus
total_vocab = word_count
#number of context words taken on each side of the target word
window_size = 2

In [None]:
# generator that walks every word of every sentence and creates the pairs of
# contextual words (network input) and target word (network output)
def cbow_model(data, window_size, total_vocab):
    """Yield one CBOW training pair per word in ``data``.

    Args:
        data: iterable of sentences, each a sequence of integer word indices.
        window_size: number of neighbouring words taken on each side of the
            target word.
        total_vocab: number of classes used to one-hot encode the target;
            must be greater than the largest word index in ``data``.

    Yields:
        ``(contextual, final_target)`` where ``contextual`` is the padded
        context window with shape ``(1, 2 * window_size)`` and
        ``final_target`` is the one-hot encoded target word with shape
        ``(1, total_vocab)``.
    """
    total_length = window_size * 2
    for text in data:
        text_len = len(text)
        for idx, word in enumerate(text):
            # Clamp the window to the sentence so words near either end
            # simply get a shorter context.
            begin = max(idx - window_size, 0)
            end = min(idx + window_size + 1, text_len)
            context_word = [text[i] for i in range(begin, end) if i != idx]
            # pad_sequences takes the target length as `maxlen`; shorter
            # contexts are padded (with 0 by default) up to that length so
            # every batch has the same width.
            contextual = sequence.pad_sequences([context_word], maxlen=total_length)
            final_target = np_utils.to_categorical([word], total_vocab)
            yield contextual, final_target

In [None]:
#build the neural network model: look up an embedding for every context
#word, average the embeddings over the window, then predict the target word
#with a softmax over the vocabulary
model = Sequential()
model.add(Embedding(input_dim=total_vocab, output_dim=100, input_length=window_size*2))
model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,)))
model.add(Dense(total_vocab, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

#train for 10 passes over the corpus, one (context, target) pair per batch,
#and print the loss accumulated over each pass
for i in range(10):
    cost = 0
    #feed the tokenized sentences (corona_data), not the file handle `data`,
    #and train on the pair the generator yields in this iteration
    for x, y in cbow_model(corona_data, window_size, total_vocab):
        cost += model.train_on_batch(x, y)
    print(i, cost)

0 0
1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0


In [None]:
#create a file that contains all the vectors
dimensions=100
vect_file = open('vectors.txt' ,'w')
#word2vec text header: "<number of vectors> <dimensions>". One vector line
#is written per entry of vectorize.word_index, so that is the count the
#header has to declare for the file to be loaded back consistently.
vect_file.write('{} {}\n'.format(len(vectorize.word_index),dimensions))

8

In [None]:
#take the first weight array of the trained model (the Embedding layer is
#the first layer added to it) and write one line per vocabulary word:
#the word followed by the components of its row, separated by spaces
weights = model.get_weights()[0]
for word, row in vectorize.word_index.items():
    components = [str(value) for value in weights[row, :]]
    vect_file.write('{} {}\n'.format(word, ' '.join(components)))
vect_file.close()

In [None]:
#load the text-format vectors written above into gensim and list the words
#whose vectors are most similar to the one for 'virus'
cbow_output = gensim.models.KeyedVectors.load_word2vec_format(
    fname='vectors.txt',
    binary=False,
)
cbow_output.most_similar(positive=['virus'])

[('further', 0.22572550177574158),
 ('specific', 0.19405266642570496),
 ('is', 0.18120348453521729),
 ('shed', 0.17946389317512512),
 ('illness', 0.16364046931266785),
 ('successive', 0.15235838294029236),
 ('does', 0.144303098320961),
 ('symptomatic', 0.14363688230514526),
 ('symptoms', 0.1398129016160965),
 ('difficult', 0.134918212890625)]