In [23]:
import numpy as np
from scipy.spatial import distance
import sys
#print sys.executable
#print sys.path
import collections

In [24]:
# Seed keywords to look up in the trained embedding model; each one is
# queried for its nearest neighbours further down.
candidate_words = [
    'curiosity', 'learning', 'root_cause', 'encourage', 'encouraging',
    'questioning', 'question_asked',
    'afraid', 'daily', 'daily_basis', 'conversation', 'conversational',
    'open', 'communication', 'positive', 'positive_feedback',
    'positive_response',
    'environment', 'inviting', 'participation', 'confidence',
    'confidencebuilding', 'orientation',
    'expectation', 'equally', 'valuable', 'mistake', 'examples',
    'share_common', 'share_similar', 'sharealike', 'fear',
    'admit', 'admitting', 'accountability', 'insecurity', 'speak',
    'provider', 'anxiety', 'gain', 'culture',
    'retaliation', 'role_model', 'demonstrated', 'leader', 'guilt',
    'guilty', 'leadership', 'support', 'rounding',
    'visible', 'relatable', 'personal', 'relationship', 'policy',
    'demonstrate',
]

# Text file holding the word embeddings that the loader cell reads.
filename = './server/data/wiki_4gram_50d.txt'

In [25]:

# simple Least Recently Used Cache
# inspired by https://www.kunxi.org/blog/2014/05/lru-cache-in-python/
class LRUCache:
  """Small least-recently-used cache built on an OrderedDict.

  Entries are kept in usage order: the front of the OrderedDict is the least
  recently used entry and the back is the most recently used one. Reading or
  writing a key moves it to the back; when the cache is full, the front
  entry is evicted to make room.

  A missing key yields ``None`` from ``cache[key]`` instead of raising
  ``KeyError``; use ``key in cache`` to tell a miss from a stored ``None``.
  """

  def __init__(self, capacity):
    """Create an empty cache.

    capacity -- maximum number of entries to keep. A capacity of zero or
                less disables caching: assignments are silently dropped.
    """
    self._cache = collections.OrderedDict()
    self._capacity = capacity

  def __setitem__(self, key, value):
    """Store ``value`` under ``key`` as the most recently used entry."""
    if self._capacity <= 0:
      # Nothing may be stored. Without this guard the eviction branch below
      # would call popitem() on an empty OrderedDict and raise KeyError.
      return
    if key in self._cache:
      # Remove first so the re-insert below lands at the back.
      self._cache.pop(key)
    elif len(self._cache) >= self._capacity:
      # Full: evict the least recently used entry (front of the dict).
      self._cache.popitem(last=False)
    self._cache[key] = value

  def __getitem__(self, key):
    """Return the value for ``key`` and mark it as recently used.

    Returns ``None`` when ``key`` is not cached.
    """
    if key in self._cache:
      value = self._cache.pop(key)
      # put in the back
      self._cache[key] = value
      return value
    else:
      return None

  def __contains__(self, key):
    """Report whether ``key`` is cached, without changing the usage order."""
    return key in self._cache

In [32]:
# Load the embedding text file into three parallel structures:
#   vocabulary -- list of words, in file order
#   dictionary -- word -> row index into `embeddings`
#   embeddings -- float32 array with one row per word
header_read = False
skip_head=True  # when True, the first line of the file is treated as a header and skipped
#dist_type = 'cosine' #'euclidean'
vocabulary = []
dictionary = {}
embeddings = None
cache_capacity=10000  # max number of per-word distance vectors kept in the LRU cache

with open(filename, 'r') as filehandler:
  numbers = []
  for line in filehandler:
    if skip_head and not header_read:
      header_read = True
      continue
    # Strip the line ending and trailing spaces before splitting, so a
    # trailing space cannot produce a token that float() rejects.
    split = line.rstrip(' \r\n').split(' ')
    if len(split) < 2:
      # Blank line or a word with no vector: skipping it keeps `numbers`
      # rectangular and keeps '' / '\n' out of the vocabulary.
      continue
    dictionary[split[0]] = len(vocabulary)
    vocabulary.append(split[0])
    numbers.append([float(x) for x in split[1:]])

embeddings = np.array(numbers, dtype=np.float32)
_cache = LRUCache(cache_capacity)

def get_embedding_for_a_word(word):
    """Return the embedding of a single word.

    Delegates to get_embedding_for_words with a one-element list, so the
    result is whatever that function returns for ``[word]``.
    """
    single_word_batch = [word]
    return get_embedding_for_words(single_word_batch)

def get_word(index):
    """Return the vocabulary word stored at ``index``.

    Returns None when ``index`` falls outside ``0 .. len(vocabulary) - 1``.
    Negative indices are treated as out of range rather than wrapping
    around to the end of the list (or raising IndexError), so every
    out-of-range index behaves the same way.
    """
    if 0 <= index < len(vocabulary):
      return vocabulary[index]
    return None

def get_embedding_for_words(words):
    """Return the mean embedding of the given words.

    Words missing from ``dictionary`` are ignored. If none of the words is
    known, a zero vector with one entry per embedding column is returned.
    """
    known_rows = []
    for token in words:
        if token in dictionary:
            known_rows.append(dictionary[token])

    if not known_rows:
        # No known word at all: fall back to an all-zero vector.
        return np.zeros(embeddings.shape[1],)

    # Average the selected rows column by column.
    return np.mean(embeddings[known_rows, :], axis=0)

def compute_all_distances_from_a_word(word, dist_type):
    """Return the distance from ``word`` to every row of ``embeddings``.

    word      -- the query word; its embedding comes from
                 get_embedding_for_a_word.
    dist_type -- metric passed straight through to
                 scipy.spatial.distance.cdist (e.g. 'cosine', 'euclidean').

    Returns a 1-D array with one distance per row of ``embeddings``.

    Results are memoised in the module-level ``_cache``. The cache key holds
    both the word and the metric, so asking for the same word under another
    metric can never return distances computed for the first metric.
    """
    cache_key = (word, dist_type)
    if cache_key in _cache:
      return _cache[cache_key]

    # cdist expects 2-D inputs: make the query a single-row matrix.
    embedding = get_embedding_for_a_word(word)
    embedding = embedding.reshape(1, embeddings.shape[1])
    dists = distance.cdist(embedding, embeddings, dist_type)[0,:]
    _cache[cache_key] = dists
    return dists

def recommend_words_by_avg_dist( words, how_many=11, dist_type='euclidean' ):
    """Return the vocabulary words closest, on average, to ``words``.

    words     -- iterable of query words.
    how_many  -- number of words to return (the result is the first
                 ``how_many`` entries of the ascending-distance ranking).
    dist_type -- metric name forwarded to
                 compute_all_distances_from_a_word.

    Returns a list of vocabulary words ordered from smallest to largest
    average distance. An empty ``words`` yields an empty list instead of a
    ranking derived from the mean of nothing.
    """
    words = list(words)
    if not words:
      return []

    # One distance vector per query word. Going word by word lets each
    # vector be served from the per-word distance cache.
    per_word_dists = [compute_all_distances_from_a_word(word, dist_type)
                      for word in words]

    # average distances
    avg_dists = np.mean(np.array(per_word_dists), axis=0)

    sort_indices = avg_dists.argsort()
    return [vocabulary[index] for index in sort_indices[:how_many]]

# word similarity by cosine metric 

In [33]:
# Similarity by cosine distance (dist_type='cosine' is passed explicitly).
# For every candidate word, print the word followed by its nearest
# neighbours. The first entry of the ranking is dropped with [1:];
# presumably it is the query word itself at distance 0.
for word in candidate_words:
    print word, recommend_words_by_avg_dist([word], dist_type='cosine')[1:]
    print '\n'

curiosity ['imagination', 'glimpse', 'imaginative', 'fascination', 'nature', 'look', 'curious', 'playful', 'natural_world', 'figment']


learning ['learning_environment', 'learner', 'student_learning', 'help_student', 'reading_writing', 'tutoring', 'experiential', 'problem_solving', 'literacy', 'individualized']


root_cause ['unanticipated', 'problem', 'intractable', 'avoidable', 'unintended', 'vulnerability', 'underlying_cause', 'reduce_risk', 'solution_problem', 'mitigate']


encourage ['encouraging', 'promote', 'encourages', 'aim', 'educate', 'stated_purpose', 'promoting', 'main_goal', 'enhance', 'motivate']


encouraging ['encourage', 'promote', 'promoting', 'encourages', 'encouraged', 'aim', 'bring', 'stated_purpose', 'stated_aim', 'fostering']


questioning ['misunderstanding', 'ignoring', 'question', 'dishonesty', 'culpability', 'argument', 'contempt', 'explaining', 'comment', 'dissent']


question_asked ['rebuttal', 'answer_question', 'question', 'answering_question', 'answer'

demonstrate ['demonstrating', 'demonstrated', 'importantly', 'demonstrates', 'order_achieve', 'evaluate', 'practicality', 'discern', 'suitability', 'ingenuity']




# word similarity by euclidean metric

In [34]:
# Similarity by Euclidean distance (the default dist_type of
# recommend_words_by_avg_dist is used).
# Start from an empty cache so that no distance vectors cached by an
# earlier cell are reused here.
_cache = LRUCache(cache_capacity)
# Print each candidate word followed by its nearest neighbours. The first
# entry of the ranking is dropped with [1:]; presumably it is the query
# word itself at distance 0.
for word in candidate_words:
    print word, recommend_words_by_avg_dist([word])[1:]
    print '\n'

curiosity ['imagination', 'glimpse', 'fascination', 'incredibly', 'fascinated', 'lifelike', 'imaginative', 'illuminate', 'curious', 'mundane']


learning ['learning_environment', 'learner', 'help_student', 'tutoring', 'reading_writing', 'pedagogy', 'literacy', 'problem_solving', 'teaching', 'educational']


root_cause ['avoidable', 'intractable', 'rectify', 'unanticipated', 'cause_effect', 'alleviating', 'worstcase', 'worsen', 'problem_caused', 'inadequacy']


encourage ['encouraging', 'encourages', 'promote', 'aim', 'educate', 'promoting', 'stimulate', 'encouraged', 'enhance', 'bring']


encouraging ['encourage', 'encouraged', 'promoting', 'encourages', 'bring', 'promote', 'empowering', 'facilitating', 'bringing', 'fostering']


questioning ['misunderstanding', 'ignoring', 'explaining', 'contempt', 'criticize', 'insulting', 'dissent', 'excuse', 'sexist', 'discredit']


question_asked ['questioner', 'emailed', 'contradicting', 'censoring', 'appropriateness', 'rationally', 'pointless', 

relationship ['romantic_relationship', 'motivation', 'interpersonal', 'marital', 'sexual_relationship', 'societal', 'personal_life', 'identity', 'morally', 'love_triangle']


policy ['government_policy', 'economic_policy', 'agenda', 'strategy', 'reform', 'economic', 'implementing', 'government', 'governance', 'policy_issue']


demonstrate ['demonstrating', 'demonstrates', 'importantly', 'order_achieve', 'demonstrated', 'suitability', 'achieves', 'better_understand', 'order_create', 'familiarity']


