In [1]:
import os
import sys
import numpy as np
from scipy.sparse import csr_matrix
from modified_rescal import als
from triple_reader import triple_reader
from question_reader_utf8 import question_reader
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# MUST USE WORD2VEC, FIND EITHER ENTITY-ENTITY OR ENTITY-RELATION that are closest
# from gensim import utils
from gensim.models import word2vec

# initialize word2vec model
word_model = word2vec.Word2Vec.load_word2vec_format('word2vec.bin', binary=True)

numQuestions  2
[[u'The example in lines 4-8 primarily suggests that', u"Balzac's work was not especially popular among female readers", u'Balzac could not write convincingly about financial matters', u"Balzac's insights into character were not evident in his everyday life", u'people who knew Balzac personally could not respect him as an artist', u'readers had unreasonable expectation of Balzac the man'], [u"The author mentions Balzac's experience as a schoolboy in order to", u'explain why Balzac was unable to conduct his financial affairs properly', u"point out a possible source of Balzac's powerful imagination", u"exonerate the boarding school for Balzac's lackluster performance", u'foster the impression that Balzac was an unruly student', u"depict the conditions of boarding school life during Balzac's youth"]]


In [3]:
# read in file as tensors
text_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/"
    "Senior Thesis Code/ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/2/Triples/2-small1-1.txt")
    
tr = triple_reader(text_file)
print tr.tripleList
tensor = tr.tensor
enMap = tr.enMap
relMap = tr.relMap

[['Balzac' 'wise' 'money']
 ['Balzac' 'was' 'wise']
 ['Balzac' 'be' 'wise']
 ['source' 'was' 'sensitivity']
 ['It' 'was' 'seems']
 ['life' 'prison' 'doors']
 ['life' 'is' 'prison']
 ['he' 'discovered' 'doors']
 ['he' 'is' 'discovered']
 ['fiction' 'sprang' 'doors']
 ['fiction' 'is' 'sprang']
 ['imagination' 'open' 'doors']
 ['imagination' 'is' 'open']]


In [None]:
print len(tensor)
print tensor[0].shape
# convert them to scipy csr_matrix
csrTensor = map(lambda x: csr_matrix(x), tensor)

A, R, f, itr, exectimes = als(csrTensor, 20)
X_tilde = []
for k in range(len(R)):
    X_tilde.append(A.dot(R[k].dot(A.T)))

print A
print R
print f
print itr
print exectimes
print X_tilde
print A.shape
print np.array(X_tilde).shape
print np.array(tensor).shape

In [4]:
# Find the word in entity that's most similar to given word
# default topsim to lowest possible python int
def findSimEn(word):
    topsim = None
    topEn = None
    for en in enMap:
        if type(en) == int:
            continue
        try:
            sim = word_model.similarity(en, word)
        except KeyError as e:
            sim = None
        if sim > topsim:
            topsim = sim
            topEn = en
    return [topEn, topsim]

def findSimRel(word):
    topsim = None
    topRel = None
    for rel in relMap:
        if type(rel) == int:
            continue
        try:
            sim = word_model.similarity(rel, word)
        except KeyError as e:
            sim = None
        if sim > topsim:
            topsim = sim
            topRel = rel
    return [topRel, topsim]

print findSimEn('critic')
print findSimRel('go')

# Return top similarity, sim score, and isEn boolean
def findTopEnOrRel(word):
#     try:
    [topEn, topEnsim] = findSimEn(word)
    [topRel, topRelsim] = findSimRel(word)
    if topEnsim > topRelsim:
        return (topEn, topEnsim, True, word)
    else:
        return (topRel, topRelsim, False, word)
        
    # if can't find similarity, ignore it
#     except KeyError as e:
#         return [None, 0, False, word]
    
# Return top 2 sims for an array of words
def findTopEnOrRelArr(wordArr):
    topArrs = []
    for word in wordArr:
        top = findTopEnOrRel(word)
#         print top
        topArrs.append(top)
#     print topArrs
#     print topArrs
    sortedTop = sorted(topArrs, key=lambda x: -x[1] if x[1] is not None else sys.maxint)
#     print sortedTop
    top1 = sortedTop[0]
    top2 = None
    # if the top is a relation, we have to pick an entity
    if top1[2] == False:
        curIndex = 1
        while curIndex < len(sortedTop):
            cur = sortedTop[curIndex]
            if cur[2] == False:
                curIndex += 1
                continue
            else:
                top2 = cur
                break
    else:
        top2 = sortedTop[1]
    if top2 == None:
        top2 = [0, 0, True]
    return [top1, top2]

print findTopEnOrRelArr(["critic", "went", "hi", "boat", "paint"])

['source', 0.2538225182774137]
['be', 0.32847276428761218]
[('was', 0.47963733397180919, False, 'went'), ('source', 0.2538225182774137, True, 'critic')]


In [24]:
# for each question choice, find the entity/relation that is most-matched in question 
# then we find the choice that matches the result the most

q_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/Senior Thesis Code/"
"ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/1/1-long2-q.txt")
qr = question_reader(q_file)
questions = qr.questionCombos
print qr.numQuestions
print len(questions)
for question in questions:
    question_words = question[0].split(" ")
    print question_words
    # remove stopwords
    question_words = [word for word in question_words if word not in stopwords.words('english')]
#     print question_words
#     print str(question_words)
    [top1, top2] = findTopEnOrRelArr(question_words)
    print [top1, top2]
    
    # if word in question doesn't match, move on
    if top1[0] == None:
        continue
    
    wordMatch = None
    # If both entities go through X_tilde, find which relation has highest score
    if top1[2] == True and top2[2] == True:
        top1EnInt = enMap[top1[0]]
        top2EnInt = enMap[top2[0]]
        max_rel = None
        max_score = None
        for rel in xrange(len(X_tilde)):
            cur_score = X_tilde[rel][top1EnInt][top2EnInt]
            if cur_score > max_score:
                max_score = cur_score
                max_rel = rel
        #Now find which choice matches most with relWord
        relWord = relMap[max_rel]
        wordMatch = relWord
        
    # If entity and relation, find which e_2 has highest score
    elif top1[2] == True and top2[2] == False:
        top1EnInt = enMap[top1[0]]
        top2RelInt = relMap[top2[0]]
        max_en = None
        max_score = None
        print [top1, top2]
        for en in xrange(len(X_tilde[top2RelInt][top1EnInt])):
            cur_score = X_tilde[top2RelInt][top1EnInt][en]
            if cur_score > max_score:
                max_score = cur_score
                max_en = en
        enWord = enMap[max_en]
        wordMatch = enWord
    elif top1[2] == False and top2[2] == True:
        top2EnInt = enMap[top2[0]]
        top1RelInt = relMap[top1[0]]
        max_en = None
        max_score = None
        for en in xrange(len(X_tilde[top1RelInt][top2EnInt])):
            cur_score = X_tilde[top1RelInt][top2EnInt][en]
            if cur_score > max_score:
                max_score = cur_score
                max_en = en
        enWord = enMap[max_en]
        wordMatch = enWord
    else:
        raise Exception("shouldn't get here!")
    
    print wordMatch
    # for each choice, iterate through the words in the choice, find the choice with highest similarity
    choices = question[1:]
    max_sim = None
    max_choice = None
    for i in xrange(len(choices)):
        choice = choices[i]
        choice_words = choice.split(" ")
        question_words = [word for word in question_words if word not in stopwords.words('english')]
        for word in choice_words:
            try:
                sim = word_model.similarity(word, wordMatch)
            except KeyError as e:
                sim = 0
            if sim > max_sim:
                max_sim = sim
                max_choice = i
    if max_choice == 0:
        max_choice = "A"
    if max_choice == 1:
        max_choice = "B"
    if max_choice == 2:
        max_choice = "C"
    if max_choice == 3:
        max_choice = "D"
    if max_choice == 4:
        max_choice = "E"
    print max_choice

UnicodeDecodeError: 'utf8' codec can't decode byte 0xff in position 0: invalid start byte

In [43]:
a_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/Senior Thesis Code/"
"ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/2/2-long1-a.txt")
az = open(a_file, "r")
answers = []
for line in az:
    answers.append(line.strip())
print answers

# read in file as tensors
text_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/"
    "Senior Thesis Code/ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/2/Triples/2-long1.txt")
    
tr = triple_reader(text_file)
# print tr.tripleList
tensor = tr.tensor
enMap = tr.enMap
relMap = tr.relMap

print len(tensor)
print tensor[0].shape
# convert them to scipy csr_matrix

# do for 10 iterations and pick best model
numIterations = 50
bestAnswers = []
mostCorrect = -1

# define the dimension
if len(tensor[0][0]) < 20:
    d = len(tensor[0][0]) - 1
else:
    d = 20
print "d: ",d
for i in xrange(numIterations):
    csrTensor = map(lambda x: csr_matrix(x), tensor)

    A, R, f, itr, exectimes = als(csrTensor, d)
    X_tilde = []
    for k in range(len(R)):
        X_tilde.append(A.dot(R[k].dot(A.T)))

    # print A
    # print R
    # print f
    # print itr
    # print exectimes
    # print X_tilde
    # print A.shape
    # print np.array(X_tilde).shape
    # print np.array(tensor).shape

    # for each question choice, find the entity/relation that is most-matched in question 
    # then we find the choice that matches the result the most

    our_answers = []

    q_file = ("/Users/SaahilM/Documents/Princeton/Academics/Thesis/Senior Thesis Code/"
    "ModifiedEntityGraph/prod/MCTest/production/MCTest/OCR_text/2/2-long1-q.txt")
    qr = question_reader(q_file)
    questions = qr.questionCombos
#     print questions
    # print qr.numQuestions
    # print len(questions)
    for i in xrange(len(questions)):
        question = questions[i]
        question_words = question[0].split(" ")
#         print question_words
        # remove stopwords
        question_words = [word for word in question_words if word not in stopwords.words('english')]
    #     print question_words
    #     print str(question_words)
        [top1, top2] = findTopEnOrRelArr(question_words)
#         print [top1, top2]

        # if word in question doesn't match, move on
        if top1[0] == None:
            continue

        wordMatch = None
        # If both entities go through X_tilde, find which relation has highest score
        if top1[2] == True and top2[2] == True:
            top1EnInt = enMap[top1[0]]
            top2EnInt = enMap[top2[0]]
            max_rel = None
            max_score = None
            for rel in xrange(len(X_tilde)):
                cur_score = X_tilde[rel][top1EnInt][top2EnInt]
                if cur_score > max_score:
                    max_score = cur_score
                    max_rel = rel
            #Now find which choice matches most with relWord
            relWord = relMap[max_rel]
            wordMatch = relWord

        # If entity and relation, find which e_2 has highest score
        elif top1[2] == True and top2[2] == False:
            top1EnInt = enMap[top1[0]]
            top2RelInt = relMap[top2[0]]
            max_en = None
            max_score = None
            for en in xrange(len(X_tilde[top2RelInt][top1EnInt])):
                cur_score = X_tilde[top2RelInt][top1EnInt][en]
                if cur_score > max_score:
                    max_score = cur_score
                    max_en = en
            enWord = enMap[max_en]
            wordMatch = enWord
        elif top1[2] == False and top2[2] == True:
            top2EnInt = enMap[top2[0]]
            top1RelInt = relMap[top1[0]]
            max_en = None
            max_score = None
            for en in xrange(len(X_tilde[top1RelInt][top2EnInt])):
                cur_score = X_tilde[top1RelInt][top2EnInt][en]
                if cur_score > max_score:
                    max_score = cur_score
                    max_en = en
            enWord = enMap[max_en]
            wordMatch = enWord
        else:
            raise Exception("shouldn't get here!")

#         print wordMatch
        # for each choice, iterate through the words in the choice, find the choice with highest similarity
        choices = question[1:]
        max_sim = None
        max_choice = None
        for i in xrange(len(choices)):
            choice = choices[i]
            choice_words = choice.split(" ")
            question_words = [word for word in question_words if word not in stopwords.words('english')]
            for word in choice_words:
#                 print word
                try:
                    sim = word_model.similarity(word, wordMatch)
#                     print 'NOTERROR'
                except KeyError as e:
                    sim = 0
#                     print 'KEYERROR'
                if sim > max_sim:
                    max_sim = sim
                    max_choice = i
        if max_choice == 0:
            max_choice = "A"
        if max_choice == 1:
            max_choice = "B"
        if max_choice == 2:
            max_choice = "C"
        if max_choice == 3:
            max_choice = "D"
        if max_choice == 4:
            max_choice = "E"
        our_answers.append(max_choice)
#         print max_choice
#     print our_answers
    numCorrect = 0
    for i in xrange(len(our_answers)):
        if answers[i] == our_answers[i]:
            numCorrect += 1
#     print numCorrect
    if numCorrect > mostCorrect:
        mostCorrect = numCorrect
        bestAnswers = our_answers
print bestAnswers
print mostCorrect

['A', 'C', 'E', 'D', 'D', 'D', 'E', 'C', 'A', 'B', 'B', 'A']
49
(83, 83)
d:  20
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
numQuestions  12
['C', 'A', 'D', 'D', 'B', 'E', 'E', 'A', 'B', 'A', 'B', 'B']
3
