In [211]:
# coding: utf-8

from question1 import *
import json
import sys
import re
import time

In [219]:
arg1 = 'a'
arg2 = 'data/vocabulary.txt'
arg3 = 'data/word_contexts.txt'

'''
(b) function cosine_similarity to calculate similarity between 2 vectors
input: vector1
input: vector2
output: cosine similarity between vector1 and vector2 as a real number
'''
def cosine_similarity(vector1, vector2):
    
    # Make sure inputs are full.
    vector1 = sparse2full(vector1)
    vector2 = sparse2full(vector2)
    
    def dot_prod(v1, v2):
        return sum([x * y for x, y in zip(v1, v2)])

    num = dot_prod(vector1, vector2)
    denom = dot_prod(vector1, vector1)**0.5 * dot_prod(vector2, vector2)**0.5
    
    return num / float(denom)

'''
(d) function tf_idf to turn existing frequency-based vector model into tf-idf-based vector model
input: freqVectors, a list of frequency-based vectors
output: tfIdfVectors, a list of tf-idf-based vectors
'''
def tf_idf(freqVectors):
    
    tfIdfVectors = []
    N = len(freqVectors)
    
    # Convert to sparse vectors.
    freqVectors = [full2sparse(fv) for fv in freqVectors]
    
    # Define function that calculates TF-IDF for a single term.
    def tf_idf_word(tf, N, df):
        return (1 + math.log(tf, 2)) * (1 + math.log(N / float(df), 2))

    # Generate dictionary df (document frequency of term). Do this by iterating over documents.
    df_dict = {}
    for fv in freqVectors:
        for term, _ in fv:  # don't need frequency
            df_dict[term] = df_dict.get(term, 0) + 1
        
    for fv in freqVectors:
        tfIdfVectors.append([(term, tf_idf_word(tf, N, df_dict[term])) for term, tf in fv])

    return tfIdfVectors

sparse1 = [(0,1), (2,1), (4,2)]
sparse2 = [(0,1), (1,2), (4,1)]
sparse3 = []
full1 = [1, 0, 1, 0, 2]
full2 = [1, 2, 0, 0, 1]
full3 = [0, 0, 0, 0, 0]

def is_sparse(vector):
    return not vector or isinstance(vector[0], tuple)

def sparse2full(sparse, length=5000):
    sparse = list(sparse)  # in case it's a numpy array?
    if not is_sparse(sparse):
        return sparse
    if length is None:
        length = max([tup[0] for tup in sparse]) + 1
    full = [0] * length
    for tup in sparse:
        full[tup[0]] = tup[1]
    return full

def full2sparse(full):
    if is_sparse(full):
        return full
    sparse = [(c, n) for c, n in enumerate(full) if n > 0]
    return sparse

print is_sparse(sparse1)
print is_sparse(sparse3)
print is_sparse(full1)

print sparse2full(sparse1, 5)
print sparse2full(full1, 5)
print sparse2full(sparse2, 5)
print sparse2full(sparse3, 5)

print full2sparse(full1)
print full2sparse(sparse1)
print full2sparse(full2)
print full2sparse(full3)

'''
(a) function load_corpus to read a corpus from disk
input: vocabFile containing vocabulary
input: contextFile containing word contexts
output: id2word mapping word IDs to words
output: word2id mapping words to word IDs
output: vectors for the corpus, as a list of sparse vectors
'''
def load_corpus(vocabFile, contextFile):
    id2word = {}
    word2id = {}
    vectors = []

    with open(vocabFile, 'r') as f:
        vocab = f.read().splitlines()
    
    with open(contextFile, 'r') as f:
        context = f.read().splitlines()
    
    for big_string in context:
        tups = []
        if big_string[0] != '0':  # words without context words have a string that starts with '0'
            cns = big_string.split(' ')[1:]
            for cn in cns:
                c, n = cn.split(':')
                tups.append((int(c), int(n)))
        vectors.append(tups)
    
    for ID, word in enumerate(vocab):
        id2word[ID] = word
        word2id[word] = ID

    return id2word, word2id, vectors

True
True
False
[1, 0, 1, 0, 2]
[1, 0, 1, 0, 2]
[1, 2, 0, 0, 1]
[0, 0, 0, 0, 0]
[(0, 1), (2, 1), (4, 2)]
[(0, 1), (2, 1), (4, 2)]
[(0, 1), (1, 2), (4, 1)]
[]


In [3]:
# Fix paths
import os
print os.getcwd()  # '/Users/sipola/Google Drive/education/coursework/graduate/edinburgh/nlu/nlu-assignment1'
os.chdir('/Users/sipola/Google Drive/education/coursework/graduate/edinburgh/nlu/nlu-assignment1')
print os.getcwd()

/Users/sipola/Google Drive/education/coursework/graduate/edinburgh/nlu/nlu-assignment1
/Users/sipola/Google Drive/education/coursework/graduate/edinburgh/nlu/nlu-assignment1


In [4]:
def equalize_full_lens(v1, v2):  
    
    diff = len(v2) - len(v1)
    if diff > 0:
        v1.extend([0] * diff)
    else:
        v2.extend([0] * -diff)
        
    return v1, v2

print equalize_full_lens([1, 2, 3], [4, 5, 6])
print equalize_full_lens([1, 2, 3], [4, 5, 6, 7])
print equalize_full_lens([1, 2, 3, 10], [4, 5, 6])

([1, 2, 3], [4, 5, 6])
([1, 2, 3, 0], [4, 5, 6, 7])
([1, 2, 3, 10], [4, 5, 6, 0])


In [5]:
with open('data/test.txt', 'r') as f:
    sents_raw = f.read().splitlines()
sents = [json.loads(s) for s in sents_raw]

In [6]:
print 'sents:\n\t{}'.format(sents[0])
print '=' * 25
print 'target_position:\n\t{}'.format(sents[0]['target_position'])
print 'id:\n\t{}'.format(sents[0]['id'])
print '=' * 25
print 'sentence:\n\t{}'.format(sents[0]['sentence'])
print '=' * 25
for w in sents[0]['sentence'].split(' '):
    print w.lower()

sents:
	{u'target_position': u'29', u'target_word': u'side.n', u'id': u'301', u'sentence': u'on.i Sunday.n at.i Craven.n Cottage.n ,.x Jose.n Mourinho.n and.c his.d all.r star.v exhibit.v all.r of.i the.x above.a symptom.n and.c they.d be.v make.v to.x pay.v the.x price.n by.i a.x Fulham.n side.n that.x have.v in.i previous.a week.n wake.v up.x after.i match.n with.i their.d head.n kick.v in.i ..x'}
target_position:
	29
id:
	301
sentence:
	on.i Sunday.n at.i Craven.n Cottage.n ,.x Jose.n Mourinho.n and.c his.d all.r star.v exhibit.v all.r of.i the.x above.a symptom.n and.c they.d be.v make.v to.x pay.v the.x price.n by.i a.x Fulham.n side.n that.x have.v in.i previous.a week.n wake.v up.x after.i match.n with.i their.d head.n kick.v in.i ..x
on.i
sunday.n
at.i
craven.n
cottage.n
,.x
jose.n
mourinho.n
and.c
his.d
all.r
star.v
exhibit.v
all.r
of.i
the.x
above.a
symptom.n
and.c
they.d
be.v
make.v
to.x
pay.v
the.x
price.n
by.i
a.x
fulham.n
side.n
that.x
have.v
in.i
previous.a
week.n
wake.v

In [7]:
'''
helper class to load a thesaurus from disk
input: thesaurusFile, file on disk containing a thesaurus of substitution words for targets
output: the thesaurus, as a mapping from target words to lists of substitution words
'''
def load_thesaurus(thesaurusFile):
    thesaurus = {}
    with open(thesaurusFile) as inFile:
        for line in inFile.readlines():
            word, subs = line.strip().split("\t")
            thesaurus[word] = subs.split(" ")
    return thesaurus

In [22]:
'''
(a) function addition for adding 2 vectors
input: vector1
input: vector2
output: addVector, the resulting vector when adding vector1 and vector2
'''
def addition(vector1, vector2):
    
    # Make sure inputs are full.
    vector1 = sparse2full(vector1, None)
    vector2 = sparse2full(vector2, None)
    vector1, vector2 = equalize_full_lens(vector1, vector2)
    
    added = [x + y for x, y in zip(vector1, vector2)]
    
    return full2sparse(added)

In [23]:
'''
(a) function multiplication for multiplying 2 vectors
input: vector1
input: vector2
output: mulVector, the resulting vector when multiplying vector1 and vector2
'''
def multiplication(vector1, vector2):
    
    # Make sure inputs are full.
    vector1 = sparse2full(vector1, None)
    vector2 = sparse2full(vector2, None)
    vector1, vector2 = equalize_full_lens(vector1, vector2)
    
    multiplied = [x * y for x, y in zip(vector1, vector2)]
    
    return full2sparse(multiplied)

In [24]:
print("(a): vector addition and multiplication")
v1, v2, v3 , v4 = [(0,1), (2,1), (4,2)], [(0,1), (1,2), (4,1)], [1, 0, 1, 0, 2], [1, 2, 0, 0, 1]
try:
    if not set(addition(v1, v2)) == set([(0, 2), (2, 1), (4, 3), (1, 2)]):
        print("\tError: sparse addition returned wrong result")
    else:
        print("\tPass: sparse addition")
except Exception as e:
    print("\tError: exception raised in sparse addition")
    print(e)
try:
    if not set(multiplication(v1, v2)) == set([(0,1), (4,2)]):
        print("\tError: sparse multiplication returned wrong result")
    else:
        print("\tPass: sparse multiplication")
except Exception as e:
    print("\tError: exception raised in sparse multiplication")
    print(e)
try:
    addition(v3,v4)
    print("\tPass: full addition")
except Exception as e:
    print("\tError: exception raised in full addition")
    print(e)
try:
    multiplication(v3,v4)
    print("\tPass: full multiplication")
except Exception as e:
    print("\tError: exception raised in full addition")
    print(e)

(a): vector addition and multiplication
	Pass: sparse addition
	Pass: sparse multiplication
	Pass: full addition
	Pass: full multiplication


In [51]:
model_lda = gensim.models.ldamodel.LdaModel.load('run/lda_model')
# model_lda.get_term_topics(0)
w_vect = frequencyVectors[word2id["house.n"]]
tmp = model_lda.get_document_topics(w_vect)
print tmp
print '=' * 25
print [tup[1] for tup in tmp if tup[0] == 1008][0]

[(18, 0.022655528236718809), (29, 0.013911389361581955), (39, 0.059216902975083245), (43, 0.027167515547374756), (53, 0.013895648531032241), (59, 0.042801687098991341), (63, 0.52349675012988228), (75, 0.026278672983501009), (84, 0.016912568969295674), (87, 0.079649205504056811), (93, 0.013221198967828706), (97, 0.023422562524206623)]


IndexError: list index out of range

In [92]:
'''
(d) function prob_z_given_w to get probability of LDA topic z, given target word w
input: ldaModel
input: topicID as an integer
input: wordVector in frequency space
output: probability of the topic with topicID in the ldaModel, given the wordVector
'''
def prob_z_given_w(ldaModel, topicID, wordVector):
    topic_probs = ldaModel.get_document_topics(wordVector, minimum_probability=0.)
    try:
        prob_topic = [tup[1] for tup in topic_probs if tup[0]==topicID][0]
    except IndexError:
        prob_topic = 0.
    return prob_topic

In [93]:
'''
(d) function prob_w_given_z to get probability of target word w, given LDA topic z
input: ldaModel
input: targetWord as a string
input: topicID as an integer
output: probability of the targetWord, given the topic with topicID in the ldaModel
'''
def prob_w_given_z(ldaModel, targetWord, topicID):
    words = ldaModel.show_topic(topicID, 20000)  # 20000 gives all
    try:
        word_prob = [tup[1] for tup in words if gensim.utils.any2unicode(tup[0])==targetWord][0]
    except IndexError:
        word_prob = 0.
    return word_prob

In [243]:
print prob_z_given_w(ldaModel, houseTopic, vectors[word2id["house.n"]])
print prob_w_given_z(ldaModel, "house.n", houseTopic)

0.0228965307727
0.00195658399623


In [78]:
print("(d): calculating P(Z|w) and P(w|Z)")
print("\tloading corpus")
id2word,word2id,vectors=load_corpus(arg2, arg3)
print("\tloading LDA model")
ldaModel = gensim.models.ldamodel.LdaModel.load("run/lda_model")
houseTopic = ldaModel[vectors[word2id["house.n"]]][0][0]
try:
    if prob_z_given_w(ldaModel, houseTopic, vectors[word2id["house.n"]]) > 0.0:
        print("\tPass: P(Z|w)")
    else:
        print("\tFail: P(Z|w)")
except Exception as e:
    print("\tError: exception during P(Z|w)")
    print(e)
try:
    if prob_w_given_z(ldaModel, "house.n", houseTopic) > 0.0:
        print("\tPass: P(w|Z)")
    else:
        print("\tFail: P(w|Z)")
except Exception as e:
    print("\tError: exception during P(w|Z)")
    print(e)

(d): calculating P(Z|w) and P(w|Z)
	loading corpus
	loading LDA model
	Pass: P(Z|w)
	Pass: P(w|Z)


In [180]:
print load_thesaurus('data/test_thesaurus.txt')['about.r']
print load_thesaurus('data/test_thesaurus.txt')['account.n']
print [gensim.utils.any2unicode(w) for w in load_thesaurus('data/test_thesaurus.txt')['acquire.v']]

['somewhat.r', 'round.r', 'approximately.r', 'roughly.r', 'around.r', 'of.r', 'nearly.r']
['access.n', 'explanation.n', 'subscription.n', 'balance.n', 'facility.n', 'description.n', 'asset.n', 'consideration.n', 'narrative.n', 'report.n', 'fund.n', 'statement.n', 'finance.n']
[u'purchase.v', u'gather.v', u'gain.v', u'secure.v', u'achieve.v', u'obtain.v', u'get.v', u'receive.v', u'learn.v', u'buy.v', u'collect.v', u'find.v']


In [241]:
# '''
# (f) get the best substitution word in a given sentence, according to a given model (tf-idf, word2vec, LDA) and type (addition, multiplication, lda)
# input: jsonSentence, a string in json format
# input: thesaurus, mapping from target words to candidate substitution words
# input: word2id, mapping from vocabulary words to word IDs
# input: model, a vector space, Word2Vec or LDA model
# input: frequency vectors, original frequency vectors (for querying LDA model)
# input: csType, a string indicating the method of calculating context sensitive vectors: "addition", "multiplication", or "lda"
# output: the best substitution word for the jsonSentence in the given model, using the given csType
# '''
# def best_substitute(jsonSentence, thesaurus, word2id, model, frequencyVectors, csType, is_debug=False, vocab_unicode=None):
    
#     window = 5
    
#     target_word = jsonSentence['target_word'].lower()
#     target_position = int(jsonSentence['target_position'])
#     sentence = [w.lower() for w in jsonSentence['sentence'].split(' ')]
#     words = thesaurus[target_word]
    
#     prob_z_given_w_dict = {}
#     prob_w_given_z_dict = {}
    
#     # (b) use addition to get context sensitive vectors
#     if csType == "addition":
#         def context_sensitive(v1, v2, context=None, target_word=None):
#             return addition(v1, v2)

#     # (c) use multiplication to get context sensitive vectors
#     elif csType == "multiplication":
#         def context_sensitive(v1, v2, context=None, target_word=None):
#             return multiplication(v1, v2)
    
#     # (d) use LDA to get context sensitive vectors
#     elif csType == "lda":
#         topicIds = [lst[0] for lst in model.show_topics(-1)]  # -1 gives all topics
#         def context_sensitive(t, c, context, target_word):
#             cs_vector = []
#             for topicId in topicIds:
#                 try:
#                     prob_z_given_w_value = prob_z_given_w_dict[(topicId, target_word)]
#                     # print '        skipping prob_z_given_w calculation...'
#                 except KeyError:
#                     # print '        calculating prob_z_given_w value...'
#                     # t0 = time.time()
#                     prob_z_given_w_value = prob_z_given_w(model, topicId, frequencyVectors[word2id[target_word]])
#                     prob_z_given_w_dict[(topicId, target_word)] = prob_z_given_w_value
#                     # print '            time: {}'.format(time.time() - t0)
#                 try:
#                     prob_w_given_z_value = prob_w_given_z_dict[(topicId, context)]
#                     # print '        skipping prob_w_given_z calculation...'
#                 except KeyError:
#                     # t0 = time.time()
#                     # print '        calculating prob_w_given_z value...'
#                     prob_w_given_z_value = prob_w_given_z(model, context, topicId)
#                     prob_w_given_z_dict[(topicId, context)] = prob_w_given_z_value
#                     # print '            time: {}'.format(time.time() - t0)
#                 cs_vector.append(prob_z_given_w_value * prob_w_given_z_value)
#             return cs_vector
    
#     if vocab_unicode is None:
#         vocab_unicode = [gensim.utils.any2unicode(w) for w in word2id.keys()]
    
#     contexts = []
#     for i in range(target_position - window, target_position + window + 1):
#         if i != target_position and i >= 0 and i < len(sentence) and sentence[i] in vocab_unicode:
#             contexts.append(sentence[i])
#     if not contexts:
#         return None  # fail to predict if no context words
    
#     def get_vector(model, target_word):
#         if csType == 'lda':
#             vector = frequencyVectors[word2id[target_word]]  # no need for model
#         else:
#             if type(model) == gensim.models.word2vec.Word2Vec:
#                 vector = list(model[target_word])
#             else:
#                 vector = model[word2id[target_word]]
#         return vector
    
#     best_sub = None
#     best_score = 0.
#     t = get_vector(model, target_word)
#     for word in words:
#         # print 'word: {}'.format(word)
#         w = get_vector(model, word)
#         score = 0.
#         for context in contexts:
#             # print 'context: {}'.format(context)
#             # print '    getting vector...'
#             try:
#                 c = get_vector(model, context)
#             except KeyError:  # e.g., u'continually.r': word not in vocab
#                 continue
#             if not c:  # context word has no vector
#                 continue
#             # print '    calculating context sensitivity...'
#             tc = context_sensitive(t, c, context, target_word)
#             if not tc:  # sometimes multiplication returns []; then ignore context word
#                 continue
#             score += cosine_similarity(w, tc)
#         if score > best_score:
#             best_score = score
#             best_sub = word
    
#     return best_sub

In [244]:
def best_substitute(jsonSentence, thesaurus, word2id, model, frequencyVectors, csType, vocab_unicode=None):
    
    window = 5
    
    target_word = jsonSentence['target_word'].lower()
    target_position = int(jsonSentence['target_position'])
    sentence = [w.lower() for w in jsonSentence['sentence'].split(' ')]
    words = thesaurus[target_word]
    
    if vocab_unicode is None:
        vocab_unicode = [gensim.utils.any2unicode(w) for w in word2id.keys()]

    # Dicts are necessary for a reasonable run time given how this had been coded.
    # Otherwise the prob_z_given_w and prob_w_given_z calculations each take
    # 0.02-0.15 seconds, and they must be performed *many* times (200 sentences
    # * ~5 thesaurus words * <=10 context words * 100 topics = ~1 million).
    prob_z_given_w_dict = {}
    prob_w_given_z_dict = {}
    
    # (b) use addition to get context sensitive vectors
    if csType == "addition":
        def context_sensitive(v1, v2, context=None, target_word=None):
            return addition(v1, v2)

    # (c) use multiplication to get context sensitive vectors
    elif csType == "multiplication":
        def context_sensitive(v1, v2, context=None, target_word=None):
            return multiplication(v1, v2)
    
    # (d) use LDA to get context sensitive vectors
    elif csType == "lda":
        topicIds = [lst[0] for lst in model.show_topics(-1)]  # -1 gives all topics
        def context_sensitive(t, c, context, target_word):
            cs_vector = []
            for topicId in topicIds:
                # Get prob_z_given_w.
                try:
                    prob_z_given_w_value = prob_z_given_w_dict[(topicId, target_word)]
                except KeyError:
                    prob_z_given_w_value = prob_z_given_w(model, topicId, frequencyVectors[word2id[target_word]])
                    prob_z_given_w_dict[(topicId, target_word)] = prob_z_given_w_value
                # Get prob_w_given_z.
                try:
                    prob_w_given_z_value = prob_w_given_z_dict[(topicId, context)]
                except KeyError:
                    prob_w_given_z_value = prob_w_given_z(model, context, topicId)
                    prob_w_given_z_dict[(topicId, context)] = prob_w_given_z_value
                # Add their product to vector.
                cs_vector.append(prob_z_given_w_value * prob_w_given_z_value)
            return cs_vector
    
    contexts = []
    for i in range(target_position - window, target_position + window + 1):
        if i != target_position and i >= 0 and i < len(sentence) and sentence[i] in vocab_unicode:
            contexts.append(sentence[i])
    if not contexts:
        return None  # fail to predict if no context words
    
    def get_vector(model, target_word):
        if csType == 'lda':
            vector = frequencyVectors[word2id[target_word]]  # no need for model
        else:
            if type(model) == gensim.models.word2vec.Word2Vec:
                vector = list(model[target_word])
            else:
                vector = model[word2id[target_word]]
        return vector
    
    best_word = None
    best_score = 0.
    t = get_vector(model, target_word)
    for word in words:
        w = get_vector(model, word)
        score = 0.
        for context in contexts:
            try:
                c = get_vector(model, context)
            except KeyError:  # e.g., u'continually.r': word not in vocab
                continue
            if not c:  # context word has no vector
                continue
            tc = context_sensitive(t, c, context, target_word)
            if not tc:  # sometimes multiplication returns []; then ignore context word
                continue
            score += cosine_similarity(w, tc)
        if score > best_score:
            best_score = score
            best_word = word
    
    return best_word

In [14]:
id2word, word2id, frequencyVectors = load_corpus('data/vocabulary.txt', 'data/word_contexts.txt')
# jsonSentence = sents[3]
thesaurus = load_thesaurus('data/test_thesaurus.txt')
# csType = 'addition'

In [None]:
model = tf_idf(frequencyVectors)
print best_substitute(jsonSentence, thesaurus, word2id, model, frequencyVectors, csType, is_debug=True)

In [None]:
model = gensim.models.word2vec.Word2Vec.load('run/word2vec_model')
print best_substitute(jsonSentence, thesaurus, word2id, model, frequencyVectors, csType, is_debug=True)

In [242]:
vocab_unicode = [gensim.utils.any2unicode(w) for w in word2id.keys()]
def write_file(model, csType, filename, vocab_unicode):
    print '=' * 25
    print filename
    print '=' * 25
    if os.path.exists(filename):
        os.remove(filename)
    with open(filename, 'w') as f:
        for jsonSentence in sents:
            target_word = jsonSentence['target_word']
            ID = jsonSentence['id']
            best_sub = best_substitute(jsonSentence, thesaurus, word2id, model, frequencyVectors, csType, vocab_unicode=vocab_unicode)
            if best_sub is None:
                best_sub = ''
            output = '{} {} :: {}'.format(target_word, ID, re.sub('\..*', '', best_sub))
            print output
            f.write(output + '\n')

# Write file for LDA.
write_file(model_lda, 'lda', 'run/{}.txt'.format('output_lda'), vocab_unicode)

# Write files for everything else.
models = [tf_idf(frequencyVectors), gensim.models.word2vec.Word2Vec.load('run/word2vec_model')]
model_strs = ['tf-idf', 'word2vec']
csTypes = ['addition', 'multiplication']
for model, model_str in zip(models, model_strs):
    for csType in csTypes:
        filename = 'run/{}_{}.txt'.format(model_str, csType)
        write_file(model, csType, filename, vocab_unicode)

run/output_lda.txt
side.n 301 :: position
side.n 302 :: position
side.n 303 :: position
side.n 304 :: position
side.n 305 :: position
side.n 306 :: position
side.n 307 :: 
side.n 308 :: position
side.n 309 :: position
side.n 310 :: shore
tell.v 311 :: order
tell.v 312 :: explain
tell.v 313 :: explain
tell.v 314 :: explain
tell.v 315 :: explain
tell.v 316 :: explain
tell.v 317 :: explain
tell.v 318 :: notify
tell.v 319 :: explain
tell.v 320 :: explain
terrible.a 321 :: negative
terrible.a 322 :: negative
terrible.a 323 :: negative
terrible.a 324 :: negative
terrible.a 325 :: bad
terrible.a 326 :: bad
terrible.a 327 :: negative
terrible.a 328 :: negative
terrible.a 329 :: bad
terrible.a 330 :: negative
think.v 331 :: contemplate
think.v 332 :: contemplate
think.v 333 :: contemplate
think.v 334 :: contemplate
think.v 335 :: contemplate
think.v 336 :: contemplate
think.v 337 :: contemplate
think.v 338 :: contemplate
think.v 339 :: contemplate
think.v 340 :: contemplate
thus.r 341 :: so
thu

In [246]:
write_file(gensim.models.word2vec.Word2Vec.load('run/word2vec_model'), 'multiplication', 'testtest.txt', vocab_unicode)

testtest.txt
side.n 301 :: 
side.n 302 :: perspective
side.n 303 :: position
side.n 304 :: view
side.n 305 :: responsibility
side.n 306 :: part
side.n 307 :: 
side.n 308 :: view
side.n 309 :: view
side.n 310 :: part
tell.v 311 :: assure
tell.v 312 :: assure
tell.v 313 :: assure
tell.v 314 :: assure
tell.v 315 :: assure
tell.v 316 :: assure
tell.v 317 :: assure
tell.v 318 :: assure
tell.v 319 :: assure
tell.v 320 :: assure
terrible.a 321 :: severe
terrible.a 322 :: negative
terrible.a 323 :: negative
terrible.a 324 :: negative
terrible.a 325 :: severe
terrible.a 326 :: dreadful
terrible.a 327 :: dreadful
terrible.a 328 :: negative
terrible.a 329 :: frightening
terrible.a 330 :: negative
think.v 331 :: feel
think.v 332 :: feel
think.v 333 :: guess
think.v 334 :: guess
think.v 335 :: guess
think.v 336 :: expect
think.v 337 :: suppose
think.v 338 :: guess
think.v 339 :: feel
think.v 340 :: guess
thus.r 341 :: hence
thus.r 342 :: hence
thus.r 343 :: hence
thus.r 344 :: hence
thus.r 345 :: c

In [None]:
if __name__ == "__main__":
    import sys

    part = sys.argv[1]

    # this can give you an indication whether part a (vector addition and multiplication) works.
    if part == "a":
        print("(a): vector addition and multiplication")
        v1, v2, v3 , v4 = [(0,1), (2,1), (4,2)], [(0,1), (1,2), (4,1)], [1, 0, 1, 0, 2], [1, 2, 0, 0, 1]
        try:
            if not set(addition(v1, v2)) == set([(0, 2), (2, 1), (4, 3), (1, 2)]):
                print("\tError: sparse addition returned wrong result")
            else:
                print("\tPass: sparse addition")
        except Exception as e:
            print("\tError: exception raised in sparse addition")
            print(e)
        try:
            if not set(multiplication(v1, v2)) == set([(0,1), (4,2)]):
                print("\tError: sparse multiplication returned wrong result")
            else:
                print("\tPass: sparse multiplication")
        except Exception as e:
            print("\tError: exception raised in sparse multiplication")
            print(e)
        try:
            addition(v3,v4)
            print("\tPass: full addition")
        except Exception as e:
            print("\tError: exception raised in full addition")
            print(e)
        try:
            multiplication(v3,v4)
            print("\tPass: full multiplication")
        except Exception as e:
            print("\tError: exception raised in full addition")
            print(e)

    # you may complete this to get answers for part b (best substitution words with tf-idf and word2vec, using addition)
    if part == "b":
        print("(b) using addition to calculate best substitution words")
        # your code here

    # you may complete this to get answers for part c (best substitution words with tf-idf and word2vec, using multiplication)
    if part == "c":
        print("(c) using multiplication to calculate best substitution words")

    # this can give you an indication whether your part d1 (P(Z|w) and P(w|Z)) works
    if part == "d":
        print("(d): calculating P(Z|w) and P(w|Z)")
        print("\tloading corpus")
        id2word,word2id,vectors=load_corpus(sys.argv[2], sys.argv[3])
        print("\tloading LDA model")
        ldaModel = gensim.models.ldamodel.LdaModel.load("lda.model")
        houseTopic = ldaModel[vectors[word2id["house.n"]]][0][0]
        try:
            if prob_z_given_w(ldaModel, houseTopic, vectors[word2id["house.n"]]) > 0.0:
                print("\tPass: P(Z|w)")
            else:
                print("\tFail: P(Z|w)")
        except Exception as e:
            print("\tError: exception during P(Z|w)")
            print(e)
        try:
            if prob_w_given_z(ldaModel, "house.n", houseTopic) > 0.0:
                print("\tPass: P(w|Z)")
            else:
                print("\tFail: P(w|Z)")
        except Exception as e:
            print("\tError: exception during P(w|Z)")
            print(e)

    # you may complete this to get answers for part d2 (best substitution words with LDA)
    if part == "e":
        print("(e): using LDA to calculate best substitution words")
        # your code here