In [1]:
import sys,random,math
from collections import Counter
import numpy as np

In [2]:
np.random.seed(1)
random.seed(1)

In [3]:
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x:(x.split(" ")),raw_reviews))
wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1
vocab = list(set(map(lambda x:x[0],wordcnt.most_common())))

word2index = {}
for i,word in enumerate(vocab):
    word2index[word]=i

concatenated = list()
input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

### Filling in the blank

In [4]:
def similar(target='beautiful'):
  target_index = word2index[target]

  scores = Counter()
  for word,index in word2index.items():
    raw_difference = weights_0_1[index] - (weights_0_1[target_index])
    squared_difference = raw_difference * raw_difference
    scores[word] = -math.sqrt(sum(squared_difference))
  return scores.most_common(10)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

In [5]:
alpha, iterations = (0.05, 2)
hidden_size,window,negative = (50,2,5)

weights_0_1 = (np.random.rand(len(vocab),hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab),hidden_size)*0

layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

for rev_i,review in enumerate(input_dataset * iterations):
  for target_i in range(len(review)):
        
    # since it's really expensive to predict every vocabulary
    # we're only going to predict a random subset
    target_samples = [review[target_i]]+list(concatenated\
    [(np.random.rand(negative)*len(concatenated)).astype('int').tolist()])

    left_context = review[max(0,target_i-window):target_i]
    right_context = review[target_i+1:min(len(review),target_i+window)]

    layer_1 = np.mean(weights_0_1[left_context+right_context],axis=0)
    layer_2 = sigmoid(layer_1.dot(weights_1_2[target_samples].T))
    layer_2_delta = layer_2 - layer_2_target
    layer_1_delta = layer_2_delta.dot(weights_1_2[target_samples])

    weights_0_1[left_context+right_context] -= layer_1_delta * alpha
    weights_1_2[target_samples] -= np.outer(layer_2_delta,layer_1)*alpha

  if(rev_i % 250 == 0):
    sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)) + "   " + str(similar('terrible')))
  sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)))
print(similar('terrible'))

Progress:0.99998 [('terrible', -0.0), ('horrible', -2.834826518548311), ('brilliant', -3.0212907361878765), ('phenomenal', -3.5700826478698477), ('superb', -3.652054757348279), ('pathetic', -3.717197670179342), ('marvelous', -3.7482885020897885), ('mediocre', -3.7501919235088566), ('masterful', -3.8372760152403775), ('miserable', -3.95204044576222)]5)])]63)][('terrible', -0.0), ('horrible', -2.796762697351021), ('brilliant', -3.2612000357516893), ('phenomenal', -3.6732169141111135), ('pathetic', -3.713043523268326), ('bad', -3.820526517756504), ('mediocre', -3.838973802913548), ('marvelous', -3.851360501195549), ('superb', -3.8941396374371764), ('masterful', -3.9700464473721357)]


### See how network finds analogys

In [8]:
def analogy(positive=['terrible','good'],negative=['bad']):
    
    norms = np.sum(weights_0_1 * weights_0_1,axis=1)
    norms.resize(norms.shape[0],1)
    
    normed_weights = weights_0_1 * norms
    
    query_vect = np.zeros(len(weights_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]
    for word in negative:
        query_vect -= normed_weights[word2index[word]]
    
    scores = Counter()
    for word,index in word2index.items():
        raw_difference = weights_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)[1:]

In [9]:
analogy(['terrible','good'],['bad'])

[('superb', -222.55410259404772),
 ('terrific', -222.93166050573805),
 ('decent', -222.95729531984014),
 ('fine', -222.988410621835),
 ('great', -223.35586302972584),
 ('worth', -223.37884242503583),
 ('nice', -223.38542143972032),
 ('brilliant', -223.39597869833207),
 ('perfect', -223.4656110159031)]

In [10]:
analogy(['elizabeth','he'],['she'])

[('christopher', -191.2302892734723),
 ('john', -191.7417074080458),
 ('william', -191.8072119302868),
 ('david', -191.83178663836733),
 ('simon', -191.85329794918522),
 ('mr', -191.99448664624535),
 ('fred', -192.0169070881519),
 ('this', -192.03602427546232),
 ('bruce', -192.04545969278882)]