In [70]:
import pandas as pd
import nltk
import operator

In [None]:
# 0.7-W2, 0.8-W3, 0.9-W4, WORD_1, 0.9-W5, 0.8-W6, 0.7-W7
# 0.7-W2, 0.8-W3, 0.9-W4, WORD_2, 0.9-W5, 0.8-W6, 0.7-W7
# WORD_1 == WORD_2, translate WORD_1 -> WORD_2 based on CBOW embedding

In [7]:
emotions = {}
emotions["neutral"] = "I just got a new car."
emotions["joy"] = "I love my awesome new car!"
emotions["disgust"] = "I just got a crappy new car."
emotions["guilt"] = "I spent all my money on an expensive new car."
emotions["fear"] = "I hope I do not crash my expensive new car."
emotions["anger"] = "I hate my bad new car."
emotions["shame"] = "I am embarassed by my clunky new car."
emotions["sadness"] = "I wish I could have gotten a better new car."
# awesome -> crappy -> expensive -> clunky -> better

In [23]:
embedding = {}
tokenized_sentences = {}
language = set()
for emotion, sentence in emotions.items():
    tokenized_sentence = nltk.tokenize.word_tokenize(sentence)
    language.update(tokenized_sentence)
    tokenized_sentences[emotion] = tokenized_sentence

embedding = {word: i for i, word in enumerate(language)}
embedded_sentences = {emotion: [embedding[word] for word in sentence] for emotion, sentence in tokenized_sentences.items()}
embedded_sentences

{'neutral': [6, 20, 5, 32, 7, 18, 29],
 'joy': [6, 19, 25, 21, 7, 18, 27],
 'disgust': [6, 20, 5, 32, 1, 7, 18, 29],
 'guilt': [6, 28, 12, 25, 31, 13, 4, 24, 7, 18, 29],
 'fear': [6, 14, 6, 15, 17, 26, 25, 24, 7, 18, 29],
 'anger': [6, 23, 25, 8, 7, 18, 29],
 'shame': [6, 9, 22, 11, 25, 0, 7, 18, 29],
 'sadness': [6, 16, 6, 3, 10, 2, 32, 30, 7, 18, 29]}

In [67]:
word_occurences = {}
for embedded in [embedding for _, embedding in embedded_sentences.items()]:
    for word in embedded:
        word_occurences[word] = word_occurences[word] + 1 if word in word_occurences else 1

In [35]:
def get_language_freq():
    return {embedding[word]: 0 for word in language}

In [68]:
context_window = {}
window_size = 3
for embedded in [embedding for _, embedding in embedded_sentences.items()]:
    for main_i, main_word in enumerate(embedded):
        context_window[main_word] = context_window[main_word] if main_word in context_window else {"pre": {}, "post": {}}
        for compare_i, compare_word in enumerate(embedded[main_i - window_size if main_i > window_size else 0:main_i + window_size + 1]):
            if main_word == compare_word:
                continue
            placement = "pre" if compare_i < main_i else "post"
            distance = abs(window_size - compare_i)
            if distance not in context_window[main_word][placement]:
                context_window[main_word][placement][distance] = get_language_freq()
            context_window[main_word][placement][distance][compare_word] += float(1/word_occurences[compare_word])

In [77]:
context_window[embedding["awesome"]]

{'pre': {3: {0: 0,
   1: 0,
   2: 0,
   3: 0,
   4: 0,
   5: 0,
   6: 0.1,
   7: 0,
   8: 0,
   9: 0,
   10: 0,
   11: 0,
   12: 0,
   13: 0,
   14: 0,
   15: 0,
   16: 0,
   17: 0,
   18: 0,
   19: 0,
   20: 0,
   21: 0,
   22: 0,
   23: 0,
   24: 0,
   25: 0,
   26: 0,
   27: 0,
   28: 0,
   29: 0,
   30: 0,
   31: 0,
   32: 0},
  2: {0: 0,
   1: 0,
   2: 0,
   3: 0,
   4: 0,
   5: 0,
   6: 0,
   7: 0,
   8: 0,
   9: 0,
   10: 0,
   11: 0,
   12: 0,
   13: 0,
   14: 0,
   15: 0,
   16: 0,
   17: 0,
   18: 0,
   19: 1.0,
   20: 0,
   21: 0,
   22: 0,
   23: 0,
   24: 0,
   25: 0,
   26: 0,
   27: 0,
   28: 0,
   29: 0,
   30: 0,
   31: 0,
   32: 0},
  1: {0: 0,
   1: 0,
   2: 0,
   3: 0,
   4: 0,
   5: 0,
   6: 0,
   7: 0,
   8: 0,
   9: 0,
   10: 0,
   11: 0,
   12: 0,
   13: 0,
   14: 0,
   15: 0,
   16: 0,
   17: 0,
   18: 0,
   19: 0,
   20: 0,
   21: 0,
   22: 0,
   23: 0,
   24: 0,
   25: 0.2,
   26: 0,
   27: 0,
   28: 0,
   29: 0,
   30: 0,
   31: 0,
   32: 0}},
 'post': {1: {

In [167]:
def show_contexts(context_window, embedding):
    context = []
    for placement_name, placement in context_window[embedding].items():
        for _, ordering in placement.items():
            
            context.append([freq for freq in sorted(ordering.items(), key=operator.itemgetter(1)) if freq[1] > 0])
        if placement_name == "pre":
            context.append([(embedding, 1.0)])
    return context

In [80]:
reverse_embedding = {val:key for key,val in embedding.items()}

In [154]:
def convert_contexts_to_words(contexts, complete_length, start=""):
    sentences = []
    if not contexts and len(start.split(" ")) == complete_length:
        return [start]
    for i, neighbor in enumerate(contexts):
        for j, possible_neighbor in enumerate(neighbor):
            sentences += convert_contexts_to_words(contexts[i+1:], complete_length, f"{start+' ' if start else start}{reverse_embedding[possible_neighbor[0]]}")
    return sentences

In [162]:
def convert_context_to_sentences(contexts):
    return convert_contexts_to_words(contexts, len(contexts), "")

In [156]:
show_contexts(context_window, embedding["expensive"])

[[(29, 0.2857142857142857), (17, 1.0), (31, 1.0)],
 [(18, 0.25), (13, 1.0), (26, 1.0)],
 [(25, 0.2), (7, 0.25), (4, 1.0)],
 [(24, 1.0)]]

In [166]:
embedding["."]

29

In [168]:
convert_context_to_sentences(show_contexts(context_window, embedding["expensive"]), )

pre
post


['. car my expensive',
 '. car new expensive',
 '. car an expensive',
 '. on my expensive',
 '. on new expensive',
 '. on an expensive',
 '. crash my expensive',
 '. crash new expensive',
 '. crash an expensive',
 'not car my expensive',
 'not car new expensive',
 'not car an expensive',
 'not on my expensive',
 'not on new expensive',
 'not on an expensive',
 'not crash my expensive',
 'not crash new expensive',
 'not crash an expensive',
 'money car my expensive',
 'money car new expensive',
 'money car an expensive',
 'money on my expensive',
 'money on new expensive',
 'money on an expensive',
 'money crash my expensive',
 'money crash new expensive',
 'money crash an expensive']