In [1]:
__author__ = 'MK'

import nltk
import csv
import numpy as np
import pandas as pd
import time

from nltk.corpus import wordnet

def getdef(word):
    '''
    Build the "definition string" of a single word from WordNet.

    :param word: input is a word in a string format
    :return: the word itself followed (space separated) by the definition of its
    first WordNet synset. When WordNet has no synset for the word, only the word
    is returned.
    '''
    # Only the first synset is used; the slice keeps this safe for unknown words.
    first_synsets = wordnet.synsets(word)[:1]
    glosses = [synset.definition() for synset in first_synsets]

    # The word itself always goes into the definition.
    return ' '.join([word] + glosses)


def doc_to_def(doc):
    ''' 
    :param doc: input is a document in a string format. Ex: "I have an apple" 
    :return: returns a list of definitions. The length of the list is the number of words in a string.   
    '''
    definition = []
    tokens = nltk.word_tokenize(doc)

    length = len(tokens)    # for printing out the progress
    print 'doc_to_def progress:  0 / ', length
    progress = 0

    for element in tokens:
        temp = getdef(element)
        definition.append(temp)

        progress = progress + 1
        print 'doc_to_def progress: ', progress, ' / ', length

    return definition


def list_to_def(list):
    ''' 
    :param list: input is a document in a list format. Ex: "['I', 'have', 'an', 'apple', '.']"
    :return: returns a list of definitions. The length of the list is the number of words in a list.  
    '''
    length = len(list)  # for printing out the progress
    print 'list_to_def progress:  0 / ', length
    progress = 0

    definition = []

    for element in list:
        temp = getdef(element)
        definition.append(temp)

        progress = progress + 1
        print 'list_to_def progress: ', progress, ' / ', length

    return definition


def def_to_beta(definition,wordspace):
    ''' 
    :param definition: input is a list of definitions.  
    :param wordspace: list of words. This is the wordspace. Note that if the wordspace does not contain the words in the input (definition), it will give an error. 
    Use get_wordspace( [input1, input2,..] ) as your wordspace. 
    :return: returns a pandas series of a beta of one document.
    '''

    deflength = len(definition)
    print 'progress:  0 / ', deflength  # for printing out the progress
    progress = 0

    beta = pd.Series(np.zeros(len(wordspace)), index = wordspace)

    for element in definition:
        token = nltk.word_tokenize(element)
        for word in token:
            beta[word] = beta[word]  + 1

        progress = progress + 1
        print 'progress: ', progress, ' / ', deflength

    return beta

def run_beta( text, wordspace ):
    ''' 
    Compute the normalised beta (word-count vector) of one document.

    :param text:  One document. it could be a "string" or a "list of strings"
    :param wordspace: list of words. Must contain all the words in the definition of the input words. 
    Use get_wordspace( [input1, input2,..] ) as your wordspace. 
    :return: returns a pandas series of a beta of one document, scaled to unit
    Euclidean length.
    '''
    # Accept every string type: on Python 2 a `unicode` document would fail a
    # `type(text) == str` check and be iterated character by character instead.
    try:
        string_types = basestring
    except NameError:       # Python 3 has no basestring
        string_types = str

    if isinstance(text, string_types):
        definitions = doc_to_def(text)      # raw string: tokenize first
    else:
        definitions = list_to_def(text)     # already a list of words

    beta = def_to_beta(definitions, wordspace)

    # NOTE: a document without any token gives an all-zero vector whose norm is 0,
    # so the division then produces NaN entries.
    beta = beta / np.linalg.norm(beta)

    return beta

def beta_to_matrix(keybeta, searchbeta):
    ''' 
    Pairwise Euclidean distances between search documents and key documents.

    :param keybeta: a list of beta values of the key documents. 
    :param searchbeta: a list of the beta values of the search documents. 
    :return: numpy array of shape (len(searchbeta), len(keybeta)); entry [i, j]
    is the Euclidean distance between searchbeta[i] and keybeta[j].
    '''
    distances = np.zeros((len(searchbeta), len(keybeta)))

    for row, search_vec in enumerate(searchbeta):
        for col, key_vec in enumerate(keybeta):
            distances[row, col] = np.linalg.norm(key_vec - search_vec)

    return distances

def run_wsregression( keydoc, searchdoc, wordspace):
    '''
    Compute the distance matrix between key documents and search documents.

    :param keydoc: a list of key documents (documents are in list or string format)
    :param searchdoc: a list of search documents.
    :param wordspace: list of words that you want to use as wordspace. Use get_wordspace( keydoc + searchdoc )
    :return: matrix in pandas DataFrame format. Rows are labelled 'SearchDoc1',
    'SearchDoc2', ... and columns 'KeyDoc1', 'KeyDoc2', ...; each cell is the
    Euclidean distance between the two normalised betas.
    '''
    keybeta = [run_beta(text, wordspace) for text in keydoc]
    searchbeta = [run_beta(text, wordspace) for text in searchdoc]

    distances = beta_to_matrix(keybeta, searchbeta)

    columnnames = ['KeyDoc' + str(i + 1) for i in range(distances.shape[1])]
    rownames = ['SearchDoc' + str(j + 1) for j in range(distances.shape[0])]

    # Pass the labels as flat lists. Wrapping them in another list (e.g.
    # `[columnnames]`) makes newer pandas build a one-level MultiIndex, so that
    # idxmin() would return tuples instead of plain 'SearchDocN' strings.
    return pd.DataFrame(distances, index=rownames, columns=columnnames)


def get_wordspace( inputdoclist ):
    '''
    :param inputdoclist: list of documents that are used in the wsregression.
    :return: list of all words that show up in the definition of all words in the documents.
    '''

    wordspace = []
    for text in inputdoclist:

        if type(text) == str:
            text = nltk.word_tokenize(text)

        length = len(text)    # for printing out the progress
        print 'get_wordspace progress:  0 / ', length
        progress = 0

        setindex = 0

        for element in text:
            temp = getdef(element)
            wordspace = wordspace +  nltk.word_tokenize(temp)
            progress = progress + 1
            print 'get_wordspace progress: ', progress, ' / ', length

            if (setindex > 3000):
                wordspace = list(set(wordspace))
                setindex = 0

            setindex = setindex + 1

    return list(set(wordspace))



def time_execution(code):
    '''
    Evaluate an expression and measure how long the evaluation takes.

    :param code: a Python expression as a string (or a code object) passed to eval().
    :return: tuple (result of the expression, elapsed time in seconds).
    '''
    # time.clock() was removed in Python 3.8; keep using it where it exists and
    # fall back to time.perf_counter() otherwise.
    timer = getattr(time, 'clock', None) or time.perf_counter

    start = timer()
    # NOTE(security): eval() executes arbitrary code; only pass trusted expressions.
    result = eval(code)
    run_time = timer() - start
    return result, run_time



# this procedure searches for sublist in the list. 
def _search(forward, source, target, start=0, end=None):
    """Naive search for target in source."""
    source_len = len(source)
    target_len = len(target)
    end = source_len if end is None else min(end, source_len)

    # target is empty, or longer than the searched window, so it can't be found.
    if target_len == 0 or (end - start) < target_len:
        return None

    # Candidate offsets, scanned left-to-right or right-to-left.
    if forward:
        offsets = range(start, end - target_len + 1)
    else:
        offsets = range(end - target_len, start - 1, -1)

    matches = (i for i in offsets if source[i:i + target_len] == target)
    return next(matches, None)

import functools

# Public entry points: the search direction is bound as the first argument.
search = functools.partial(_search, True)
rsearch = functools.partial(_search, False)
search.__name__ = 'search'
rsearch.__name__ = 'rsearch'


# Shared docstring template, filled in once per direction.
_doc = """%(name)s(sequence, subsequence [, start [, end]]) -> int or None

Search a sequence[start:end] for a subsequence starting from the %(dir)s,
returning the offset if it is found, otherwise None.

>>> %(name)s([1, 2, "z", 2, "a", 3, 2, "a"], [2, "a"])
%(value)d

If not given, start and end default to the beginning and end of the sequence.
"""

search.__doc__ = _doc % dict(name='search', value=3, dir='left')
rsearch.__doc__ = _doc % dict(name='rsearch', value=6, dir='right')

# Drop the helpers from the namespace; the partials keep their own reference.
del _doc, _search



In [2]:
# Load the training data (tab-separated file) into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; point it at your own
# copy of training_set.tsv before running.
train = pd.read_table('/Users/MK/GitHub/the_answer_is/data/training_set.tsv', sep = '\t')

In [3]:
import pprint
# Pretty-print the training DataFrame to inspect its columns and rows.
pprint.pprint(train)

          id                                           question correctAnswer  \
0     100001  When athletes begin to exercise, their heart r...             C   
1     100002  Which example describes a learned behavior in ...             C   
2     100003  When two nuclei are combined into one nucleus,...             D   
3     100004  Which is a distinction between an epidemic and...             B   
4     100005  In which way is the orbit of a comet different...             B   
5     100006  A teacher builds a model of a hydrogen atom. A...             B   
6     100007  Which substance should a student apply to the ...             A   
7     100008  What is the main source of energy for the wate...             A   
8     100009  Which has the greatest effect on aiding the mo...             D   
9     100010  Over time, non-volcanic mountains can form due...             C   
10    100011  The human body has an average, normal temperat...             D   
11    100012  Which is the b

In [4]:
question = train.ix[2][1]
print question
answer1 = train.ix[2][3]
answer2 = train.ix[2][4]
answer3 = train.ix[2][5]
answer4 = train.ix[2][6] 
print answer1, answer2, answer3, answer4

When two nuclei are combined into one nucleus, there is a slight change in mass and the release of a large amount of energy. What is this process called?
conversion reaction fission fusion


In [20]:
def remove_stopwords(string):
    '''
    Strip non-letter characters and English stop words from a text.

    :param string: input text as a string.
    :return: the remaining words joined by single spaces.
    '''
    import re
    from nltk.corpus import stopwords

    # Load the stop word list once per call instead of once per word, and use a
    # set for fast membership tests.
    stop_words = set(stopwords.words('english'))

    # Keep letters and spaces only (digits and punctuation are dropped, not
    # replaced by a space).
    letters_only = re.sub(r'[^a-zA-Z ]',r'',string)

    # NOTE: the comparison is case-sensitive, so capitalised words such as
    # "When" or "What" are kept.
    kept = [word for word in letters_only.split() if word not in stop_words]
    return ' '.join(kept)

question = remove_stopwords(question)
answer1 = remove_stopwords(answer1)
answer2 = remove_stopwords(answer2)
answer3 = remove_stopwords(answer3)
answer4 = remove_stopwords(answer4)
print question
print answer1, answer2, answer3, answer4

When two nuclei combined one nucleus slight change mass release large amount energy What process called
conversion reaction fission fusion


In [6]:
# Build the wordspace from the question and the four answers, so that it covers
# every word appearing in their definitions.
wordspace = get_wordspace([question,answer1,answer2,answer3,answer4]) 
print wordspace

get_wordspace progress:  0 /  16
get_wordspace progress:  1  /  16
get_wordspace progress:  2  /  16
get_wordspace progress:  3  /  16
get_wordspace progress:  4  /  16
get_wordspace progress:  5  /  16
get_wordspace progress:  6  /  16
get_wordspace progress:  7  /  16
get_wordspace progress:  8  /  16
get_wordspace progress:  9  /  16
get_wordspace progress:  10  /  16
get_wordspace progress:  11  /  16
get_wordspace progress:  12  /  16
get_wordspace progress:  13  /  16
get_wordspace progress:  14  /  16
get_wordspace progress:  15  /  16
get_wordspace progress:  16  /  16
get_wordspace progress:  0 /  1
get_wordspace progress:  1  /  1
get_wordspace progress:  0 /  1
get_wordspace progress:  1  /  1
get_wordspace progress:  0 /  1
get_wordspace progress:  1  /  1
get_wordspace progress:  0 /  1
get_wordspace progress:  1  /  1
['What', u'less', u'money', u'results', u'course', u'discourteous', u'causes', u'system', u'ergs', u'joules', u'(', u'to', u'production', u'issued', u'trans

In [8]:
# Distance matrix with the question as the only key document (one column) and
# the four answers as search documents (four rows).
result = run_wsregression( [question], [answer1,answer2,answer3,answer4], wordspace)
print result

doc_to_def progress:  0 /  16
doc_to_def progress:  1  /  16
doc_to_def progress:  2  /  16
doc_to_def progress:  3  /  16
doc_to_def progress:  4  /  16
doc_to_def progress:  5  /  16
doc_to_def progress:  6  /  16
doc_to_def progress:  7  /  16
doc_to_def progress:  8  /  16
doc_to_def progress:  9  /  16
doc_to_def progress:  10  /  16
doc_to_def progress:  11  /  16
doc_to_def progress:  12  /  16
doc_to_def progress:  13  /  16
doc_to_def progress:  14  /  16
doc_to_def progress:  15  /  16
doc_to_def progress:  16  /  16
progress:  0 /  16
progress:  1  /  16
progress:  2  /  16
progress:  3  /  16
progress:  4  /  16
progress:  5  /  16
progress:  6  /  16
progress:  7  /  16
progress:  8  /  16
progress:  9  /  16
progress:  10  /  16
progress:  11  /  16
progress:  12  /  16
progress:  13  /  16
progress:  14  /  16
progress:  15  /  16
progress:  16  /  16
doc_to_def progress:  0 /  1
doc_to_def progress:  1  /  1
progress:  0 /  1
progress:  1  /  1
doc_to_def progress:  0 /

In [9]:
print result.idxmin()           # row with the smallest distance: my answer is SearchDoc4, which is D.

KeyDoc1    SearchDoc4
dtype: object


In [None]:
%%capture         
#code for suppressing print in std-out.
myanswer = []     # store my answer in list. 
for i in xrange(len(train)):
    question = remove_stopwords(train.ix[i][1])
    answer1 = remove_stopwords(train.ix[i][3])
    answer2 = remove_stopwords(train.ix[i][4])
    answer3 = remove_stopwords(train.ix[i][5])
    answer4 = remove_stopwords(train.ix[i][6])
    wordspace = get_wordspace([question,answer1,answer2,answer3,answer4]) 
    result = run_wsregression( [question], [answer1,answer2,answer3,answer4], wordspace)
    myanswer.append(result.idxmin()[0])
    

In [None]:
# Attach the predictions to the training frame.
train['myanswer'] = myanswer
# Translate the 'SearchDocN' row labels into the answer letters A-D.
convert_answer = {'SearchDoc1': 'A', 'SearchDoc2': 'B', 'SearchDoc3': 'C', 'SearchDoc4': 'D' }
train['myanswer'] = train['myanswer'].map(convert_answer)

In [None]:
# Display the training frame to inspect it.
train

In [None]:
# Boolean column: True where the predicted letter equals the correct answer.
train['correct'] = (train['correctAnswer'] == train['myanswer'])

In [None]:
print 'the percent correct in 2500 question is '
print train['correct'].sum()  / (len(train) + 0.0)

In [10]:
# Let's use another dictionary, from the PyDictionary package.
# Install the package with: easy_install -U PyDictionary
# The problem with using this package in the competition is that it makes a query to thesaurus.com to get synonyms.
from PyDictionary import PyDictionary
dictionary=PyDictionary()
print dictionary.synonym("fusion")


[u'synthesis', u'blend', u'alloy', u'melting', u'commixture']




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [11]:
# Print the definition string (see getdef) of every synonym of "fusion".
for word in dictionary.synonym("fusion"):
    print getdef(word)

synthesis the process of producing a chemical compound (usually by the union of simpler chemical compounds)
blend an occurrence of thorough mixing
alloy a mixture containing two or more metallic elements or metallic and nonmetallic elements usually fused together or dissolving into each other when molten
melting the process whereby heat changes something from a solid to a liquid
commixture the act of mixing together


In [None]:
synonyms = dictionary.synonym("fsn")
print " ".join(synonyms)

In [14]:
# change the getdef procedure to include synonyms from the thesaurus.com 
def getdef(word):
    '''
    Build the "definition string" of a word from its synonyms and WordNet.

    NOTE: this redefines the getdef from the first cell; every later call
    (including those made by get_wordspace and run_wsregression) uses this version.

    :param word: input is a word in a string format
    :return: returns a space-separated string made of the word itself, its
    synonyms from dictionary.synonym (if any) and the definition of its first
    WordNet synset (if any).
    '''
    parts = [word]            #the word itself goes into the definition.

    synonyms = dictionary.synonym(word)
    if synonyms:
        # Each synonym becomes its own token; previously the first synonym was
        # glued to the word without a separating space.
        parts.extend(synonyms)

    for synset in wordnet.synsets(word)[:1]:
        parts.append(synset.definition())

    return ' '.join(parts)

In [21]:
#see if adding thesaurus would change the answer significantly. 
#and as you can see, it doesnt't.

question = train.ix[2][1]
answer1 = train.ix[2][3]
answer2 = train.ix[2][4]
answer3 = train.ix[2][5]
answer4 = train.ix[2][6] 

question = remove_stopwords(question)
answer1 = remove_stopwords(answer1)
answer2 = remove_stopwords(answer2)
answer3 = remove_stopwords(answer3)
answer4 = remove_stopwords(answer4)

wordspace = get_wordspace([question,answer1,answer2,answer3,answer4])  

result = run_wsregression( [question], [answer1,answer2,answer3,answer4], wordspace)
print result

get_wordspace progress:  0 /  16
get_wordspace progress:  1  /  16
get_wordspace progress:  2  /  16
get_wordspace progress:  3  /  16
get_wordspace progress:  4  /  16
get_wordspace progress:  5  /  16
get_wordspace progress:  6  /  16
get_wordspace progress:  7  /  16
get_wordspace progress:  8  /  16
get_wordspace progress:  9  /  16
get_wordspace progress:  10  /  16
get_wordspace progress:  11  /  16
get_wordspace progress:  12  /  16
get_wordspace progress:  13  /  16
What has no Synonyms in the API
get_wordspace progress:  14  /  16
get_wordspace progress:  15  /  16
get_wordspace progress:  16  /  16
get_wordspace progress:  0 /  1
get_wordspace progress:  1  /  1
get_wordspace progress:  0 /  1
get_wordspace progress:  1  /  1
get_wordspace progress:  0 /  1
get_wordspace progress:  1  /  1
get_wordspace progress:  0 /  1
get_wordspace progress:  1  /  1
doc_to_def progress:  0 /  16
doc_to_def progress:  1  /  16
doc_to_def progress:  2  /  16
doc_to_def progress:  3  /  16
d