In [5]:
import io
import logging
import os
import re

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim import corpora
from nltk.corpus import wordnet
from scipy import spatial

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class MyDocument(object):
    """Iterable over the files of a directory, yielding one token list per file.

    Every file is decoded as UTF-8 (undecodable bytes become U+FFFD),
    lower-cased and split on whitespace.

    :param dirname: directory whose entries are read, in os.listdir() order.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # io.open decodes while reading, so no separate unicode() call is
            # needed and the same code runs on Python 2 and Python 3.
            with io.open(os.path.join(self.dirname, fname),
                         encoding='utf-8', errors='replace') as content_file:
                content = content_file.read()
            # Yield after the file is closed so no handle stays open while
            # the consumer processes the tokens.
            yield content.lower().split()

class MySentences(object):
    """Iterable over the lines of every file in a directory.

    Each line is decoded as UTF-8 (undecodable bytes become U+FFFD),
    lower-cased and split on whitespace, and yielded as one token list.

    :param dirname: directory whose entries are read, in os.listdir() order.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # The with-block closes the file; the original opened it in the
            # for-statement and never closed it.
            # newline='\n' keeps '\n' as the only line terminator.
            with io.open(os.path.join(self.dirname, fname), encoding='utf-8',
                         errors='replace', newline='\n') as sentence_file:
                for line in sentence_file:
                    yield line.lower().split()

def get_dictionary(path):
    """Build a gensim Dictionary from the text files in ``path``.

    :param path: directory of text files; every line is one "document"
                 for the purpose of building the vocabulary.
    :return: corpora.Dictionary with the tokens listed in the module-level
             ``stoplist`` removed and the remaining ids compacted.
    """
    # Use the caller's directory; the original ignored ``path`` and always
    # read a hard-coded location.
    dictionary = corpora.Dictionary(MySentences(path))
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
                if stopword in dictionary.token2id]
    dictionary.filter_tokens(stop_ids)  # remove stop words
    dictionary.compactify()  # remove gaps in id sequence after words that were removed
    return dictionary

def get_document_tuple(path, dictionary):
    """Convert every file in ``path`` to its bag-of-words representation.

    :param path: directory of documents, read through MyDocument.
    :param dictionary: object providing ``doc2bow(tokens)``.
    :return: list with one ``doc2bow`` result per file, in the order
             MyDocument yields the files.
    """
    # Build the list directly; growing a pandas Series element by element
    # and converting it back to a list adds nothing.
    return [dictionary.doc2bow(content) for content in MyDocument(path)]

def transform_tuple_into_vector(document_tuple, dictionary):
    """Expand a sparse (token id, count) list into a dense, L1-normalised Series.

    :param document_tuple: list of pairs; element 0 is the position to fill,
                           element 1 the value stored there.
    :param dictionary: object whose ``token2id`` length fixes the vector size.
    :return: pd.Series of that size; all zeros when ``document_tuple`` is
             the empty list, otherwise scaled so absolute values sum to 1.
    """
    dense = pd.Series(np.zeros(len(dictionary.token2id)))
    # Nothing to fill in: hand back the zero vector without normalising.
    if document_tuple == []:
        return dense
    for pair in document_tuple:
        dense[pair[0]] = pair[1]
    l1_norm = np.linalg.norm(dense, ord=1)
    return dense / l1_norm

def transform_tuples_into_dataframe(document_tuples, dictionary):
    """Turn a list of sparse documents into a DataFrame of dense vectors.

    :param document_tuples: list of bag-of-words lists, one per document.
    :param dictionary: object whose ``token2id`` length fixes the vector size.
    :return: pd.DataFrame in which column ``i`` is the vector that
             transform_tuple_into_vector builds for ``document_tuples[i]``.
             With no documents, a single all-zero column is returned.
    """
    columns = [transform_tuple_into_vector(document_tuple, dictionary)
               for document_tuple in document_tuples]
    if not columns:
        # Same result the original produced for an empty input.
        return pd.DataFrame(pd.Series(np.zeros(len(dictionary.token2id))))
    # Assemble all columns in one step instead of inserting them one by one.
    df_vector = pd.concat(columns, axis=1)
    df_vector.columns = range(len(columns))
    return df_vector

def get_close_documents(string, dataframe, dictionary, topn):
    """Rank the document vectors by cosine distance to a query string.

    :param string: query text, either UTF-8 bytes or already-decoded text.
    :param dataframe: DataFrame whose columns are labelled 0..n-1, one
                      document vector per column.
    :param dictionary: object providing ``doc2bow`` and ``token2id``.
    :param topn: number of closest documents to return.
    :return: pd.Series of the ``topn`` smallest cosine distances, indexed by
             column number.
    """
    # Decode only byte input; the original called unicode() unconditionally,
    # which raises TypeError when the text is already decoded.
    if isinstance(string, bytes):
        string = string.decode('utf-8', 'replace')
    first_vector = transform_tuple_into_vector(
        dictionary.doc2bow(string.lower().split()), dictionary)
    # Collect all distances, then build the Series once.
    distances = [spatial.distance.cosine(first_vector, dataframe[j])
                 for j in range(len(dataframe.columns))]
    return pd.Series(distances, dtype=float).sort_values().head(topn)

def get_document_by_index(path, index):
    """Return the file at position ``index`` of ``os.listdir(path)``.

    :param path: directory to look in.
    :param index: zero-based position in the directory listing.
    :return: ``(filename, content)`` with the content decoded as UTF-8
             (undecodable bytes become U+FFFD), or None when the listing has
             no entry at that position.
    """
    for position, fname in enumerate(os.listdir(path)):
        if position == index:
            # newline='' returns the text without newline translation.
            with io.open(os.path.join(path, fname), encoding='utf-8',
                         errors='replace', newline='') as content_file:
                return fname, content_file.read()
    return None

def print_names_of_close_documents(path, close_documents_list, outputpath):
    """Print the names of the selected documents and save their text.

    :param path: directory the documents are read from.
    :param close_documents_list: object whose ``index`` holds the positions
                                 of the documents in the directory listing.
    :param outputpath: directory in which ``close_documents.txt`` is written
                       (overwritten if it exists).
    """
    # The with-block closes the output file, which the original never did.
    # An explicit UTF-8 encoding lets non-ASCII text be written without
    # depending on the interpreter's default encoding.
    with io.open(outputpath + '/close_documents.txt', 'w', encoding='utf-8') as output_file:
        for i in close_documents_list.index:
            name, text = get_document_by_index(path, i)
            print(name)
            output_file.write(text + u'\n\n\n\n')
    print('text saved as txt')

def get_contents_of_close_documents_by_paragraph(path, close_documents_list):
    """Concatenate the selected documents and split the result into lines.

    :param path: directory the documents are read from.
    :param close_documents_list: object whose ``index`` holds the positions
                                 of the documents in the directory listing.
    :return: list of the lines of all documents; each document is preceded
             by a newline, so the list starts with an empty string.
    """
    pieces = []
    for position in close_documents_list.index:
        name, text = get_document_by_index(path, position)
        pieces.append('\n' + text)
    return ''.join(pieces).split('\n')


def get_distance_between_two_documents(A, B, dictionary):
    """Euclidean distance between the bag-of-words vectors of two texts.

    :param A: first text; a falsy value is replaced by the single token 'the'.
    :param B: second text, treated the same way.
    :param dictionary: object providing ``doc2bow`` and ``token2id``.
    :return: ``np.linalg.norm`` of the difference of the two vectors.
    """
    def to_vector(text):
        # Keep letters and spaces only, lower-case, tokenise.
        if text:
            tokens = re.sub(r'[^a-zA-Z ]', r'', text).lower().split()
        else:
            tokens = ['the']        # just in case the text is empty.
        return transform_tuple_into_vector(dictionary.doc2bow(tokens), dictionary)

    vector_a = to_vector(A)
    vector_b = to_vector(B)
    # Author's note: cosine distance was the first choice, but the vectors
    # are so sparse that it came out 0 most of the time, hence Euclidean.
    return np.linalg.norm(vector_a - vector_b)


# English stop words. get_dictionary() drops these tokens from the vocabulary
# and split_definition() filters them out of definition text.
stoplist = [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours',
             u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', 
             u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', 
             u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', 
             u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', 
             u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', 
             u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', 
             u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', 
             u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', 
             u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', 
             u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', 
             u'should', u'now']


# Drops the words that are not in the training set.
def only_the_words_in_index( list, index ):
    """Return the elements of ``list`` that are contained in ``index``.

    Order and duplicates are kept.
    """
    return [word for word in list if word in index]

def split_paragraph_list(paragraph_list):
    """Split every paragraph on whitespace.

    :param paragraph_list: iterable of strings.
    :return: list with one token list per paragraph (empty for a blank one).
    """
    return [paragraph.split() for paragraph in paragraph_list]


def getdef(word):
    '''
    :param word: input is a word in a string format
    :return: returns a string made of the word itself followed by the
             definitions of up to its first 3 WordNet synsets,
             separated by single spaces.
    '''
    addef = word            #the word itself goes into the definition.

    for synset in wordnet.synsets(word)[:3]:
        try:
            addef = addef + ' ' + synset.definition()
        except Exception:
            # Best effort: skip a synset whose definition cannot be appended.
            # Unlike a bare except, this lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
    return addef

def doc_to_def(doc):
    '''
    :param doc: input is a document in a string format. Ex: "I have an apple"
    :return: returns a list of definitions, one getdef() result per token
             produced by nltk.word_tokenize(doc).
    '''
    return [getdef(token) for token in nltk.word_tokenize(doc)]

def split_definition(definition):
    """Flatten a list of definition strings into one list of content words.

    Each string is lower-cased, stripped of every character that is not an
    ASCII letter or a space, split on whitespace, and filtered against the
    module-level ``stoplist``.

    :param definition: iterable of strings.
    :return: list of the remaining words, in order of appearance.
    """
    split_def = []
    for line in definition:
        cleaned = re.sub(r'[^a-zA-Z ]', r'', line.lower())   #delete all but alphabet
        # An empty or all-stop-word line simply contributes nothing.
        split_def.extend(word for word in cleaned.split() if word not in stoplist)
    return split_def



In [2]:
# Build the vocabulary from the training-question text, then convert every
# file under wiki_path to a bag-of-words vector stored as one column of
# df_wiki_vector.
dictionary_path = '/Users/MK/GitHub/the_answer_is/data/temporary2'  # question text from training data
dictionary = get_dictionary(dictionary_path)
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
wiki_tuple = get_document_tuple( wiki_path, dictionary )
df_wiki_vector = transform_tuples_into_dataframe(wiki_tuple,dictionary) 

In [3]:
# Load the training questions. Note that os.chdir changes the working
# directory for every later cell of the notebook.
path = '/Users/MK/GitHub/the_answer_is/data'
os.chdir(path)
train = pd.read_table('training_set.tsv',sep = '\t')

In [4]:
# Exploratory walk-through of the pipeline on single questions.
# The cell needs the definitions cell to have been run first; the recorded
# output of this cell shows a NameError for split_paragraph_list.
q = train.iloc[4, 1]          # column 1 holds the question text
print(q)
close_documents = get_close_documents(q, df_wiki_vector, dictionary, 5)
print(close_documents)
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
outputpath = '/Users/MK/GitHub/the_answer_is/data'
print_names_of_close_documents(wiki_path, close_documents, outputpath)
merged = get_contents_of_close_documents_by_paragraph(wiki_path, close_documents)

# Train a small word2vec model on the retrieved documents only.
sentences = split_paragraph_list(merged)
model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=2, workers=4)

# Tokenise question i and its four answers (columns 3-6), keeping only the
# words the model knows. Positional .iloc replaces the removed .ix indexer.
i = 3
q = only_the_words_in_index(train.iloc[i, 1].split(), model.index2word)
a1 = only_the_words_in_index(train.iloc[i, 3].split(), model.index2word)
a2 = only_the_words_in_index(train.iloc[i, 4].split(), model.index2word)
a3 = only_the_words_in_index(train.iloc[i, 5].split(), model.index2word)
a4 = only_the_words_in_index(train.iloc[i, 6].split(), model.index2word)

# Placeholder scores: four random numbers, not model similarities.
answer_similarity = np.array([np.random.rand(), np.random.rand(), np.random.rand(), np.random.rand()])
myanswer_distance = pd.DataFrame(np.zeros(4).reshape(1, 4), columns=['A', 'B', 'C', 'D'])    #initialize dataframe to store my answers distance

# .loc adds row i when it is missing, as the removed set_value did.
myanswer_distance.loc[i, 'A'] = answer_similarity[0]    #write down distance for each choice
myanswer_distance.loc[i, 'B'] = answer_similarity[1]
myanswer_distance.loc[i, 'C'] = answer_similarity[2]
myanswer_distance.loc[i, 'D'] = answer_similarity[3]
print(myanswer_distance)
myanswer_distance.idxmin(axis=1)


In which way is the orbit of a comet different from the orbit of Earth?
1391    0.623611
1358    0.623611
1113    0.686828
380     0.686828
1237    0.755327
dtype: float64
orbital_motion.txt_to_unicode_remove_stopwords.txt
nonrenewable_resources_in_life_science.txt_to_unicode_remove_stopwords.txt
light_reactions_of_photosynthesis.txt_to_unicode_remove_stopwords.txt
comets.txt_to_unicode_remove_stopwords.txt
milky_way.txt_to_unicode_remove_stopwords.txt
text saved as txt


NameError: name 'split_paragraph_list' is not defined

In [7]:
# Python 2 only: sets the interpreter's default string encoding to UTF-8.
# NOTE(review): presumably reload(sys) is needed to make setdefaultencoding
# reachable again - confirm. The setting applies to the whole session, not
# just this notebook's functions.
import sys  

reload(sys)  
sys.setdefaultencoding('utf8')
# Author's note: running this disables print output, but it is needed to
# avoid a unicode error.



def _tokens_in_vocabulary(text, vocabulary):
    """Tokenise ``text`` and keep only the words found in ``vocabulary``.

    If no word survives, retry with the WordNet definitions of the words in
    ``text``; if that also yields nothing, fall back to ['distinction'].
    """
    tokens = only_the_words_in_index(text.split(), vocabulary)
    if tokens == []:
        tokens = only_the_words_in_index(split_definition(doc_to_def(text)), vocabulary)
    if tokens == []:
        tokens = ['distinction']          #if all fails, just make it equal to a random word.
    return tokens


def get_my_answer_fetch_w2v(train, dictionary, df_vector, doc_path):
    """Score the four answers of every question with a per-question word2vec model.

    For each row of ``train`` the five documents closest to the question are
    fetched, a Word2Vec model is trained on them, and ``n_similarity``
    between the question words and each answer's words is recorded.

    :param train: DataFrame read positionally: column 1 is the question,
                  columns 3-6 are answers A-D.
    :param dictionary: vocabulary passed to get_close_documents.
    :param df_vector: DataFrame of document vectors, one column per document.
    :param doc_path: directory holding the documents behind ``df_vector``.
    :return: ``(myanswer, myanswer_distance)``. ``myanswer_distance`` has one
             row per question and columns A-D holding the absolute scores;
             ``myanswer`` is the column with the smallest score in each row.
             For an empty ``train`` a single all-zero row is returned.
    """
    choice_columns = ['A', 'B', 'C', 'D']
    rows = []
    for i in range(len(train)):       #loop through all questions
        question = train.iloc[i, 1]
        close_documents = get_close_documents(question, df_vector, dictionary, 5)
        merged = get_contents_of_close_documents_by_paragraph(doc_path, close_documents)

        # we will train w2v model based on the documents close to the question.
        model = gensim.models.Word2Vec(split_paragraph_list(merged),
                                       size=100, window=5, min_count=2, workers=4)
        # A set makes the repeated membership tests cheap.
        vocabulary = set(model.index2word)

        # The original wrote `q == ['distinction']` for the question, a
        # comparison with no effect; the helper assigns the fallback for
        # the question as well as for the answers.
        q = _tokens_in_vocabulary(question, vocabulary)
        answers = [_tokens_in_vocabulary(train.iloc[i, column], vocabulary)
                   for column in (3, 4, 5, 6)]

        try:
            answer_similarity = np.array([model.n_similarity(q, answer) for answer in answers])
        except Exception:
            # Deliberate best effort: a question that cannot be scored gets
            # random values so the run continues.
            print('Error on  %s  and set lengths at random' % i)
            answer_similarity = np.array([np.random.rand(), np.random.rand(),
                                          np.random.rand(), np.random.rand()])

        # abs() because cosine can also give negative values
        scores = [abs(similarity) for similarity in answer_similarity]
        rows.append(scores)
        print(' '.join(str(value) for value in [i] + scores))

    if rows:
        myanswer_distance = pd.DataFrame(rows, columns=choice_columns)
    else:
        # Same shape the original returned when there were no questions.
        myanswer_distance = pd.DataFrame(np.zeros(4).reshape(1, 4), columns=choice_columns)

    # NOTE(review): the stored values come from n_similarity, which looks like
    # a similarity rather than a distance; idxmin would then select the least
    # similar answer. Kept as in the original - confirm whether idxmax was
    # intended.
    myanswer = myanswer_distance.idxmin(axis=1)            #get minimum column name for each row

    return myanswer, myanswer_distance
            
            


In [8]:
# Score every training question; trains one word2vec model per question.
myanswer, my_distance = get_my_answer_fetch_w2v(train, dictionary, df_wiki_vector, wiki_path)

In [105]:
#train = train[0:50]
#train['fetch_doc_ws_train_answer'] = myans

In [9]:
# Show the training questions loaded from training_set.tsv.
train

Unnamed: 0,id,question,correctAnswer,answerA,answerB,answerC,answerD
0,100001,"When athletes begin to exercise, their heart r...",C,at the tissue level,at the organ level,at the system level,at the cellular level
1,100002,Which example describes a learned behavior in ...,C,smelling the air for odors,barking when disturbed,sitting on command,digging in soil
2,100003,"When two nuclei are combined into one nucleus,...",D,conversion,reaction,fission,fusion
3,100004,Which is a distinction between an epidemic and...,B,the symptoms of the disease,the geographical area affected,the species of organisms infected,the season in which the disease spreads
4,100005,In which way is the orbit of a comet different...,B,The orbit of Earth is less circular than the o...,The orbit of a comet is more elliptical than t...,The orbital period of Earth is much longer tha...,The orbital period of a comet is more predicta...
5,100006,A teacher builds a model of a hydrogen atom. A...,B,number of particles,relative mass of particles,types of particles present,charges of particles present
6,100007,Which substance should a student apply to the ...,A,water,vinegar,salt,formaldehyde
7,100008,What is the main source of energy for the wate...,A,the Sun,fossil fuels,clouds,the ocean
8,100009,Which has the greatest effect on aiding the mo...,D,tension,friction,density,gravity
9,100010,"Over time, non-volcanic mountains can form due...",C,oceanic plates colliding with oceanic plates,oceanic plates separating from oceanic plates,continental plates colliding with continental ...,continental plates separating from continental...


In [10]:
# Attach the predicted letter to each question and compare it with the key.
train['fetch_w2v_train_answer'] = myanswer
train['fetch_w2v_train_correct'] = (train['correctAnswer'] == train['fetch_w2v_train_answer'])
# The printed value is the fraction of correct answers (0-1), not a
# percentage, despite the wording of the message.
print 'percent correct is ' , train['fetch_w2v_train_correct'].sum(axis =0) / (len(train) + 0.0)

In [11]:
# Persist the per-choice scores and the annotated training frame as CSV.
my_distance.to_csv('/Users/MK/GitHub/the_answer_is/data/answer/fetch_word2vec_distance.csv', encoding='utf-8')    
train.to_csv('/Users/MK/GitHub/the_answer_is/data/answer/fetch_word2vec.csv', encoding='utf-8')