# Import Section

In [1]:
from google.colab import drive
drive.mount('/content/drive')
path = '/content/drive/My Drive/covid_project/graph_files'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import csv

Instructions for updating:
non-resource variables are not supported in the long term


# Loading data

In [3]:
import csv
tweet_list = []
with open('/content/drive/My Drive/covid_project/graph_files/tweets_rows_deleted.csv', 'r', encoding='utf-8') as inp:
  for row in csv.reader(inp):
    tweet_list.append(row[1])
print(tweet_list)



In [4]:
corpus = []
for elem in tweet_list:
  l = elem.split()
  for el in l:
    corpus.append(el)
print(corpus)



# Building dataset

In [5]:
vocabulary_size = 50000

def build_dataset(words):
  count = [('UNK', 0)]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
  dictionary = dict()
  for word, _ in count:
    dictionary[word] = len(dictionary)
  data = list()
  
  for word in words:
    
    index = dictionary[word]
    data.append(index)
  
  
  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(corpus)
data_size = len(reverse_dictionary.keys())
print('data: ',data)
print('Count: ', count)
print('dictionary: ', dictionary)
print('Reverse: ', reverse_dictionary)

data:  [87, 140, 1899, 4888, 6502, 603, 1090, 115, 58, 75, 795, 3091, 2189, 4889, 6503, 1107, 751, 8171, 8172, 582, 255, 48, 2909, 882, 3341, 2289, 4890, 242, 963, 464, 342, 328, 322, 1344, 6504, 7, 8173, 4891, 5573, 207, 368, 69, 2742, 936, 58, 604, 502, 243, 964, 8174, 2, 267, 1438, 8175, 2080, 11, 61, 494, 8176, 8175, 2080, 468, 268, 31, 4, 314, 831, 544, 17, 1, 79, 3628, 1132, 1744, 2, 1108, 147, 96, 61, 1900, 107, 2290, 369, 146, 4892, 5574, 184, 2291, 3092, 6505, 21, 32, 1992, 4, 761, 4365, 442, 24, 6, 1390, 139, 3093, 285, 809, 752, 116, 519, 178, 15, 14, 1745, 8177, 11438, 80, 74, 11439, 12, 469, 978, 11440, 3094, 22, 2743, 1901, 34, 14, 841, 1045, 200, 1046, 2429, 732, 2744, 2745, 1161, 8178, 267, 1545, 883, 11441, 81, 16, 733, 34, 306, 122, 108, 11442, 1, 119, 2292, 406, 11443, 88, 656, 2430, 53, 753, 6506, 407, 4366, 1345, 6, 170, 3629, 1672, 545, 1020, 166, 361, 520, 1439, 414, 1068, 118, 1020, 1021, 534, 4893, 69, 1746, 242, 11444, 11445, 4367, 425, 415, 226, 3342, 1248, 1

In [6]:
del corpus  # Hint to reduce memory.
print('Most common words ', count[1:6])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])

Most common words  [('amp', 2140), ('the', 2111), ('people', 1838), ('this', 1500), ('cases', 1314)]
Sample data [87, 140, 1899, 4888, 6502, 603, 1090, 115, 58, 75] ['read', 'thread', 'judge', 'strangers', 'navigating', 'moment', 'photo', 'chinese', 'medical', 'workers']


# Building and training model

In [7]:
BATCH_SIZE = 1000
EPOCHS = 10
EMBEDDING_SIZE = 100  # Dimension of the embedding vector.
SKIP_WINDOW = 2       # How many words to consider left and right.
NUM_SKIPS = 2         # How many times to reuse an input to generate a label.

WINDOW_SIZE = 10
VALID_SIZE = 16     # Random set of words to evaluate similarity on.
VALID_WINDOW = 100  # Only pick dev samples in the head of the distribution.
#valid_examples = np.random.choice(VALID_WINDOW, VALID_SIZE, replace=False)
NUM_SAMPLED = 64    # Number of negative examples to sample.
EXP_WORDS = WINDOW_SIZE *2
ITERATIONS = 10

In [8]:
from collections import Counter
import random

threshold = 1e-5
word_counts = Counter(data)
total_count = len(data)
freqs = {word: number/total_count for word, number in word_counts.items()}
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
train_words = [word for word in data if random.random() < (1 - p_drop[word])]

In [9]:
# Get a list of words in a window around an index.
def get_target(words, idx, window_size=5):
    
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R
    target_words = set(words[start:idx] + words[idx+1:stop+1])
    
    return list(target_words)

In [10]:
# Create a generator of word batches as a tuple (inputs, targets) 
def get_batches(words, batch_size, window_size=5):

    
    n_batches = len(words)//batch_size
    
    # only full batches
    words = words[:n_batches*batch_size]
    
    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx:idx+batch_size]
        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target(batch, ii, window_size)
            y.extend(batch_y)
            x.extend([batch_x]*len(batch_y))
        yield x, y

In [11]:
graph = tf.Graph()

with graph.as_default():

  # input tensors
  with tf.name_scope('inputs'):
    train_inputs = tf.placeholder(tf.int32, shape=[None], name='train_input') #BATCH_SIZE
    train_labels = tf.placeholder(tf.int32, shape=[None, None], name='train_labels') #BATCH_SIZE, EXP_WORDS
  
  
  # variable tensors
  with tf.name_scope('variables'):
    word_embeddings = tf.Variable(tf.random_uniform([data_size, EMBEDDING_SIZE], -1.0, 1.0), name='word_embeddings')
    context_embeddings = tf.Variable(tf.truncated_normal([data_size, EMBEDDING_SIZE],stddev=1.0 / math.sqrt(EMBEDDING_SIZE)), name='context_embeddings')
    out_biases = tf.Variable(tf.zeros([data_size]))
  
  lookup = tf.nn.embedding_lookup(word_embeddings, train_inputs) # forward inputs to the hidden layer

  # loss
  with tf.name_scope('cost'):
      softmax = tf.nn.sampled_softmax_loss(
        weights=context_embeddings,
        biases=out_biases,
        labels=train_labels,
        inputs=lookup,
        num_classes=data_size,
        num_sampled=NUM_SAMPLED,
        num_true=1
      )
      cost = tf.reduce_sum(softmax, name='reduce_sum')

  # track the batch loss in the summary
  tf.summary.scalar('cost', cost)
  # optimizer
  with tf.name_scope('optimizer'):
      optimizer = tf.train.AdamOptimizer().minimize(cost)

  with tf.name_scope('validation'):
    # pick 8 samples from (0,100) and (1000,1100) each ranges. lower id implies more frequent 
    valid_examples = np.array(random.sample(range(VALID_WINDOW), VALID_SIZE//2))
    valid_examples = np.append(valid_examples, 
                               random.sample(range(1000,1000+VALID_WINDOW), VALID_SIZE//2))

    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    
    # We use the cosine distance:
    norm = tf.sqrt(tf.reduce_sum(tf.square(word_embeddings), 1, keepdims=True))
    normalized_embedding = word_embeddings / norm
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    similarity = tf.matmul(valid_embedding, tf.transpose(normalized_embedding))


  merged = tf.summary.merge_all() # merged summaries
  init = tf.global_variables_initializer() # variables initializer
  saver = tf.train.Saver() # network saver
print('>> graph created') 

>> graph created


In [12]:
import time
with tf.Session(graph=graph) as sess:
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())
    for _ in range(1,ITERATIONS+1):
      for e in range(1, EPOCHS+1):
          batches = get_batches(train_words, BATCH_SIZE, WINDOW_SIZE)
          start = time.time()
          for x, y in batches:
              
              feed = {train_inputs: x,
                      train_labels: np.array(y)[:, None]}
              train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
              
              loss += train_loss
              
              if iteration % 100 == 0: 
                  end = time.time()
                  print("Epoch {}/{}".format(e, EPOCHS),
                        "Iteration: {}".format(iteration),
                        "Avg. Training loss: {:.4f}".format(loss/100),
                        "{:.4f} sec/batch".format((end-start)/100))
                  loss = 0
                  start = time.time()
              
              if iteration % 1000 == 0:
                  sim = similarity.eval()
                  for i in range(VALID_SIZE):
                      valid_word = reverse_dictionary[valid_examples[i]]
                      top_k = 8 # number of nearest neighbors
                      nearest = (-sim[i, :]).argsort()[1:top_k+1]
                      log = 'Nearest to %s:' % valid_word
                      for k in range(top_k):
                          close_word = reverse_dictionary[nearest[k]]
                          log = '%s %s,' % (log, close_word)
                      print(log)
              
              iteration += 1
      file = os.path.join(path, '/checkpoints/text8.ckpt')
      save_path = saver.save(sess, file)
      embed_mat = sess.run(normalized_embedding)

Epoch 2/10 Iteration: 100 Avg. Training loss: 46079.6236 0.0326 sec/batch
Epoch 4/10 Iteration: 200 Avg. Training loss: 46089.2365 0.0091 sec/batch
Epoch 5/10 Iteration: 300 Avg. Training loss: 44869.3209 0.0405 sec/batch
Epoch 7/10 Iteration: 400 Avg. Training loss: 44478.6054 0.0184 sec/batch
Epoch 8/10 Iteration: 500 Avg. Training loss: 43869.2628 0.0487 sec/batch
Epoch 10/10 Iteration: 600 Avg. Training loss: 42641.2556 0.0267 sec/batch
Epoch 2/10 Iteration: 700 Avg. Training loss: 42552.0182 0.0059 sec/batch
Epoch 3/10 Iteration: 800 Avg. Training loss: 40944.9400 0.0375 sec/batch
Epoch 5/10 Iteration: 900 Avg. Training loss: 40668.4878 0.0154 sec/batch
Epoch 6/10 Iteration: 1000 Avg. Training loss: 40650.9233 0.0460 sec/batch
Nearest to work: recommend, squirrel, nashville, merry, saw, bonehead, budh, discredited,
Nearest to protect: kwong, thinned, monetised, zoonotic, rickshaws, qantas, generosity, wanted,
Nearest to like: probinsyano, sooooo, museums, continental, humanity, th