In [1]:
import os
import sys
import numpy as np
from scipy.sparse import csr_matrix
from modified_rescal import als
from triple_reader import triple_reader
from question_reader import question_reader
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# NOTE: word2vec is required -- it is used to find the entity-entity or entity-relation pair that is closest
# from gensim import utils
from gensim.models import word2vec

numQuestions  4
[[u'In lines 2-8, the author of Passage 1 mentions activities that suggest dolphins', u'are unusually sensitive to their environment', u'do not generally thrive in captivity', u'have a unique type of intelligence', u'are uncommonly playful animals', u'have skills usually associated with humans'], [u'The author of Passage 2 would most likely respond to the last sentence of Passage 1 by', u'suggesting that intelligence in animals is virtually impossible to measure', u'observing that intelligence does not mean the same thing for every species', u'questioning the objectivity of the studies already conducted', u'noting that dolphin activities do not require a high level of intelligence', u'arguing that little is actually known about dolphin social behavior'], [u'The two passages differ in their views of dolphin intelligence in that Passage 1 states that dolphins', u'share a sophisticated culture; while Passage 2 contends that dolphin intelligence is roughly equal to human in

In [None]:
# Load the word2vec model stored in binary format in 'word2vec.bin'
word_model = word2vec.Word2Vec.load_word2vec_format(
    'word2vec.bin',
    binary=True
)

In [None]:
# Parse the triples file, then keep the tensor and the entity / relation maps
# that triple_reader exposes as attributes.
text_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/"
    "Senior Thesis Code/ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/1/Triples/1-long2.txt")

tr = triple_reader(text_file)
print(tr.tripleList)
tensor, enMap, relMap = tr.tensor, tr.enMap, tr.relMap

In [None]:
# Number of slices in the tensor and the shape of the first slice
print(len(tensor))
print(tensor[0].shape)
# wrap every slice in a scipy csr_matrix before handing it to als
csrTensor = map(csr_matrix, tensor)

A, R, f, itr, exectimes = als(csrTensor, 20)
# rebuild one dense slice per entry of R as A * R[k] * A^T
X_tilde = [A.dot(R[k].dot(A.T)) for k in range(len(R))]

# dump the factorisation results, then compare shapes
print(A)
print(R)
print(f)
print(itr)
print(exectimes)
print(X_tilde)
print(A.shape)
print(np.array(X_tilde).shape)
print(np.array(tensor).shape)

In [None]:
# Find the word in entity that's most similar to given word
# (topsim starts as None, meaning "no entity has been scored yet")
def findSimEn(word):
    """Return the key of ``enMap`` that is most similar to ``word``.

    Integer keys of ``enMap`` are skipped (the map appears to hold both
    name -> index and index -> name entries -- confirm against
    triple_reader). Every other key is scored with
    ``word_model.similarity(key, word)``; keys for which that call raises
    ``KeyError`` are ignored.

    Args:
        word: the word to compare every entity name against.

    Returns:
        list: ``[topEn, topsim]`` -- the best-scoring key and its score, or
        ``[None, None]`` when no key could be scored. On ties the first key
        encountered is kept.
    """
    topsim = None
    topEn = None
    for en in enMap:
        if type(en) == int:
            continue
        try:
            sim = word_model.similarity(en, word)
        except KeyError:
            # similarity lookup failed (presumably a word that is missing
            # from the model vocabulary) -- this key cannot compete
            continue
        # Explicit None test instead of relying on Python 2 ordering None
        # below every number (the same comparison is a TypeError on Python 3).
        if topsim is None or sim > topsim:
            topsim = sim
            topEn = en
    return [topEn, topsim]

def findSimRel(word):
    """Return the key of ``relMap`` that is most similar to ``word``.

    Integer keys of ``relMap`` are skipped (the map appears to hold both
    name -> index and index -> name entries -- confirm against
    triple_reader). Every other key is scored with
    ``word_model.similarity(key, word)``; keys for which that call raises
    ``KeyError`` are ignored.

    Args:
        word: the word to compare every relation name against.

    Returns:
        list: ``[topRel, topsim]`` -- the best-scoring key and its score, or
        ``[None, None]`` when no key could be scored. On ties the first key
        encountered is kept.
    """
    topsim = None
    topRel = None
    for rel in relMap:
        if type(rel) == int:
            continue
        try:
            sim = word_model.similarity(rel, word)
        except KeyError:
            # similarity lookup failed (presumably a word that is missing
            # from the model vocabulary) -- this key cannot compete
            continue
        # Explicit None test instead of relying on Python 2 ordering None
        # below every number (the same comparison is a TypeError on Python 3).
        if topsim is None or sim > topsim:
            topsim = sim
            topRel = rel
    return [topRel, topsim]

# quick manual check of both lookups
print(findSimEn('critic'))
print(findSimRel('go'))

# Return top similarity, sim score, and isEn boolean
def findTopEnOrRel(word):
    """Pick whichever of the best entity / best relation matches ``word`` best.

    Calls ``findSimEn(word)`` and ``findSimRel(word)`` and compares the two
    scores they return.

    Args:
        word: the word to match.

    Returns:
        tuple: ``(match, sim, isEn, word)``. ``isEn`` is True when the entity
        match is returned and False when the relation match is returned. The
        entity wins only when its score is strictly greater; a missing (None)
        score loses to any numeric score, and when both are missing the
        relation result ``(None, None, False, word)`` is returned.
    """
    [topEn, topEnsim] = findSimEn(word)
    [topRel, topRelsim] = findSimRel(word)
    # Spell out the None handling rather than leaning on Python 2's implicit
    # "None is smaller than any number" ordering.
    entityWins = topEnsim is not None and (topRelsim is None or topEnsim > topRelsim)
    if entityWins:
        return (topEn, topEnsim, True, word)
    return (topRel, topRelsim, False, word)
    
# Return top 2 sims for an array of words
def findTopEnOrRelArr(wordArr):
    """Return the two best entity/relation matches for a list of words.

    Each word is resolved with ``findTopEnOrRel`` and the results are ordered
    by descending similarity; results without a score (sim is None) go last.

    The first element of the result is the overall best match. The second is
    chosen so that the pair never consists of two relations:
      * if the best match is a relation, the second is the next-best *entity*;
      * otherwise the second is simply the runner-up.
    When no suitable second match exists the placeholder ``[0, 0, True]`` is
    used.

    Args:
        wordArr: list of words to match.

    Returns:
        list: ``[top1, top2]``. For an empty ``wordArr`` ``top1`` is
        ``(None, None, False, None)`` so that callers testing
        ``top1[0] is None`` can skip the input.
    """
    topArrs = [findTopEnOrRel(word) for word in wordArr]
    if not topArrs:
        # nothing to rank -- report "no match" instead of raising IndexError
        return [(None, None, False, None), [0, 0, True]]

    # float('inf') pushes unscored results to the end of the ascending sort
    sortedTop = sorted(topArrs, key=lambda x: -x[1] if x[1] is not None else float('inf'))
    top1 = sortedTop[0]
    top2 = None
    if not top1[2]:
        # if the top is a relation, we have to pick an entity
        for cur in sortedTop[1:]:
            if cur[2]:
                top2 = cur
                break
    elif len(sortedTop) > 1:
        top2 = sortedTop[1]
    if top2 is None:
        top2 = [0, 0, True]
    return [top1, top2]

# quick manual check on a handful of sample words
print(findTopEnOrRelArr(["critic", "went", "hi", "boat", "paint"]))

In [None]:
# for each question choice, find the entity/relation that is most-matched in question 
# then we find the choice that matches the result the most

# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere
q_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/Senior Thesis Code/"
"ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/1/1-long2-q.txt")
qr = question_reader(q_file)
questions = qr.questionCombos
print(qr.numQuestions)
print(len(questions))

# Built once so the loops below do not call stopwords.words() for every word
english_stopwords = set(stopwords.words('english'))
# answer letter for each choice position
choice_labels = "ABCDE"


def _first_argmax(scores):
    """Return the index of the first largest entry of ``scores`` (None if empty)."""
    best_index = None
    best_score = None
    for index, score in enumerate(scores):
        if best_score is None or score > best_score:
            best_score = score
            best_index = index
    return best_index


for question in questions:
    # question[0] is the question text, question[1:] are the answer choices
    question_words = question[0].split(" ")
    print(question_words)
    # remove stopwords
    question_words = [word for word in question_words if word not in english_stopwords]
    if not question_words:
        # only stopwords (or nothing) left: there is nothing to match
        continue
    [top1, top2] = findTopEnOrRelArr(question_words)
    print([top1, top2])

    # if word in question doesn't match, move on
    if top1[0] is None:
        continue

    wordMatch = None
    if top1[2] and top2[2]:
        # Both matches are entities: look the pair up in every slice of
        # X_tilde and take the relation with the highest score.
        top1EnInt = enMap[top1[0]]
        top2EnInt = enMap[top2[0]]
        max_rel = _first_argmax(
            [X_tilde[rel][top1EnInt][top2EnInt] for rel in range(len(X_tilde))])
        wordMatch = relMap[max_rel]
    elif top1[2] != top2[2]:
        # One entity and one relation (in either order): scan that
        # relation's row for the entity and take the best-scoring e_2.
        if top1[2]:
            en_match, rel_match = top1, top2
        else:
            en_match, rel_match = top2, top1
        enInt = enMap[en_match[0]]
        relInt = relMap[rel_match[0]]
        max_en = _first_argmax(X_tilde[relInt][enInt])
        wordMatch = enMap[max_en]
    else:
        raise Exception("shouldn't get here!")

    print(wordMatch)
    # for each choice, iterate through the words in the choice, find the choice with highest similarity
    choices = question[1:]
    max_sim = None
    max_choice = None
    for i, choice in enumerate(choices):
        # strip stopwords from the *choice* words before scoring them
        choice_words = [word for word in choice.split(" ") if word not in english_stopwords]
        for word in choice_words:
            try:
                sim = word_model.similarity(word, wordMatch)
            except KeyError:
                # similarity lookup failed for this pair; score it as 0
                sim = 0
            if max_sim is None or sim > max_sim:
                max_sim = sim
                max_choice = i
    # translate the winning position into its letter; positions beyond "E"
    # (and None, when no choice word was scored) are printed unchanged
    if max_choice is not None and max_choice < len(choice_labels):
        max_choice = choice_labels[max_choice]
    print(max_choice)