In [9]:
import io
import logging
import os

import gensim
import numpy as np
import pandas as pd
from gensim import corpora
from scipy import spatial

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class MyDocument(object):
    """Iterable over a directory that yields one token list per file.

    Each entry returned by ``os.listdir(dirname)`` is read completely,
    decoded as UTF-8 (undecodable bytes become U+FFFD), lower-cased and
    split on whitespace.  Documents are yielded in ``os.listdir`` order, so
    the n-th document corresponds to the n-th directory entry.
    """

    def __init__(self, dirname):
        # dirname: path of the directory whose files are the documents.
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # Decode while reading instead of reading bytes and converting
            # afterwards; io.open behaves the same on Python 2 and 3.
            with io.open(path, encoding='utf-8', errors='replace') as content_file:
                content = content_file.read()
            # The file is already closed here, so no handle is held open
            # while the consumer processes the document.
            yield content.lower().split()

class MySentences(object):
    """Iterable over a directory that yields one token list per text line.

    Every entry returned by ``os.listdir(dirname)`` is opened as UTF-8 text
    (undecodable bytes become U+FFFD).  Each line is lower-cased and split
    on whitespace; a blank line yields an empty list.
    """

    def __init__(self, dirname):
        # dirname: path of the directory whose files are read line by line.
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # `with` guarantees the handle is closed even when iteration is
            # abandoned early.  newline='\n' keeps line boundaries on '\n'
            # only, without translating other line endings.
            with io.open(path, encoding='utf-8', errors='replace', newline='\n') as sentence_file:
                for line in sentence_file:
                    yield line.lower().split()

def get_dictionary(path):
    """Build a gensim dictionary from the files in `path`, minus stop words.

    The vocabulary is collected from ``MySentences(path)``.  Tokens listed in
    the module-level ``stoplist`` are then removed and the remaining ids are
    renumbered so that they are contiguous.  Only stop words are filtered
    here; rare words are kept.

    Returns the resulting ``corpora.Dictionary``.
    """
    vocabulary = corpora.Dictionary(MySentences(path))

    # Ids of the stop words that actually occur in the vocabulary.
    stopword_ids = []
    for word in stoplist:
        if word in vocabulary.token2id:
            stopword_ids.append(vocabulary.token2id[word])

    vocabulary.filter_tokens(stopword_ids)  # drop the stop words
    vocabulary.compactify()                 # close the id gaps left by the removal
    return vocabulary

def get_document_tuple(path, dictionary):
    """Convert every document under `path` into its bag-of-words form.

    Parameters
    ----------
    path : directory handed to ``MyDocument``.
    dictionary : object providing ``doc2bow(tokens)``.

    Returns
    -------
    list
        One ``dictionary.doc2bow(...)`` result per document, in the order
        in which ``MyDocument(path)`` yields the documents.
    """
    # A plain list is all the caller needs; no intermediate Series has to be
    # grown one element at a time.
    return [dictionary.doc2bow(tokens) for tokens in MyDocument(path)]

def transform_tuple_into_vector(document_tuple, dictionary):
    """Expand a sparse bag-of-words into a dense, L1-normalised vector.

    Parameters
    ----------
    document_tuple : list of ``(token_id, count)`` pairs (may be empty).
    dictionary : object exposing ``token2id``; its size is the vector length.

    Returns
    -------
    pandas.Series
        Float vector of length ``len(dictionary.token2id)`` indexed
        0..n-1.  Entries sum to 1 in absolute value; an empty bag (or one
        whose counts are all zero) gives an all-zero vector.

    Raises
    ------
    IndexError
        If a token id does not fit into the vector.
    """
    values = np.zeros(len(dictionary.token2id))
    if len(document_tuple) == 0:
        return pd.Series(values)

    # Write all counts in one vectorised assignment instead of one Series
    # item assignment per token; this function is called once per paragraph.
    token_ids, counts = zip(*document_tuple)
    values[list(token_ids)] = counts

    total = np.linalg.norm(values, ord=1)
    if total > 0:  # avoid 0/0 -> NaN for a degenerate bag
        values = values / total
    return pd.Series(values)

def transform_tuples_into_dataframe(document_tuples, dictionary):
    """Build a dense term-by-document matrix from bag-of-words documents.

    Parameters
    ----------
    document_tuples : sequence of bag-of-words lists, one per document.
    dictionary : object exposing ``token2id``; passed on to
        ``transform_tuple_into_vector``.

    Returns
    -------
    pandas.DataFrame
        Column ``i`` is the normalised vector of ``document_tuples[i]``;
        rows are token ids.  When there are no documents the frame consists
        of a single all-zero column labelled 0.
    """
    vectors = [transform_tuple_into_vector(bow, dictionary) for bow in document_tuples]
    if not vectors:
        # No documents: return the single placeholder column of zeros.
        return pd.DataFrame(pd.Series(np.zeros(len(dictionary.token2id))))
    # Assemble all columns at once rather than inserting them one by one;
    # ignore_index labels the columns 0..n-1.
    return pd.concat(vectors, axis=1, ignore_index=True)

def get_close_documents(string, dataframe, dictionary, topn):
    """Find the documents closest to `string` by cosine distance.

    Parameters
    ----------
    string : query text; it is passed through ``string_stemmer`` first.
    dataframe : term-by-document frame whose columns are labelled 0..n-1.
    dictionary : object providing ``doc2bow`` and ``token2id``.
    topn : number of entries to return.

    Returns
    -------
    pandas.Series
        The `topn` smallest cosine distances in ascending order, indexed by
        the column label of the corresponding document.
    """
    query_tokens = string_stemmer(string).split()
    query_vector = transform_tuple_into_vector(dictionary.doc2bow(query_tokens), dictionary)

    # Collect every distance first and build the Series once.
    distances = [
        spatial.distance.cosine(query_vector, dataframe[column])
        for column in range(len(dataframe.columns))
    ]
    return pd.Series(distances, dtype=float).sort_values().head(topn)

def get_document_by_index(path, index):
    """Return the file name and text of the `index`-th entry of a directory.

    Entries are counted in ``os.listdir(path)`` order, starting at 0.  The
    file is decoded as UTF-8; undecodable bytes become U+FFFD and line
    endings are left untouched.

    Returns
    -------
    tuple or None
        ``(fname, content)`` for the matching entry, or ``None`` when no
        entry has that position.
    """
    for position, fname in enumerate(os.listdir(path)):
        if position == index:
            # newline='' disables newline translation so the text comes back
            # exactly as stored.
            with io.open(os.path.join(path, fname), encoding='utf-8',
                         errors='replace', newline='') as content_file:
                return fname, content_file.read()
    return None


def get_contents_of_close_documents_by_paragraph(path, close_documents_list):
    """Concatenate the selected documents and split the text into lines.

    `close_documents_list.index` supplies the positions that are looked up
    with ``get_document_by_index(path, ...)``.  Every document is preceded
    by a newline, so the returned list always starts with an empty string.

    Returns a list with one element per '\\n'-separated line.
    """
    # The leading '' reproduces the newline that precedes the first document.
    pieces = ['']
    for doc_position in close_documents_list.index:
        _fname, body = get_document_by_index(path, doc_position)
        pieces.append(body)
    return '\n'.join(pieces).split('\n')


def get_distance_between_two_documents(A, B, dictionary):
    """Cosine distance between the bag-of-words vectors of two texts.

    Each text is stripped of everything except ASCII letters and spaces,
    lower-cased, split on whitespace, converted with ``dictionary.doc2bow``
    and normalised by ``transform_tuple_into_vector``.  A falsy text (empty
    string or None) is replaced by the single token 'the'.

    Returns whatever ``scipy.spatial.distance.cosine`` gives for the two
    vectors.
    """
    import re

    def to_vector(text):
        # Placeholder token for empty input so doc2bow still gets a list.
        if text:
            tokens = re.sub(r'[^a-zA-Z ]', r'', text).lower().split()
        else:
            tokens = ['the']
        return transform_tuple_into_vector(dictionary.doc2bow(tokens), dictionary)

    vector_a = to_vector(A)
    vector_b = to_vector(B)
    return spatial.distance.cosine(vector_a, vector_b)


def string_stemmer(line):
    """Strip punctuation, drop English stop words and stem what is left.

    Parameters
    ----------
    line : text to process.

    Returns
    -------
    str
        The stemmed words joined by single spaces.  Characters other than
        ASCII letters and spaces are removed before splitting.
    """
    import re
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer

    stemmer = SnowballStemmer("english")
    # Load the stop-word list once; a set gives constant-time membership
    # tests instead of re-reading the list for every word.
    stop_words = set(stopwords.words('english'))

    words = re.sub(r'[^a-zA-Z ]', r'', line).split()
    # Compare case-insensitively so capitalised stop words ("The", "When")
    # are removed as well.
    kept = [word for word in words if word.lower() not in stop_words]
    return ' '.join(stemmer.stem(word) for word in kept)


def _min_paragraph_distance(text, paragraphs, dictionary):
    """Smallest non-NaN distance between `text` and any paragraph (1.0 if none)."""
    valid = []
    for paragraph in paragraphs:
        distance = get_distance_between_two_documents(text, paragraph, dictionary)
        # Skip only undefined distances; a distance of exactly 0 is a perfect
        # match and must be kept.
        if not np.isnan(distance):
            valid.append(distance)
    return min(valid) if valid else 1.0


def get_my_answer_all_distance(train, dictionary, df_vector, wiki_path):
    """Answer every multiple-choice question in `train` and report progress.

    For each row the question (column position 1) is used to fetch the five
    closest documents.  Each answer choice (column positions 3-6, labelled
    A-D) is appended to the question and scored by its smallest distance to
    any paragraph of those documents; the choice with the smallest score is
    selected.  Column position 2 holds the correct answer letter and is used
    only for the running accuracy that is printed.

    Parameters
    ----------
    train : pandas.DataFrame with the column layout described above.
    dictionary : object providing ``doc2bow`` and ``token2id``.
    df_vector : term-by-document frame passed to ``get_close_documents``.
    wiki_path : directory containing the documents.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The per-question scores with columns 'A'..'D', and the chosen answer
        letters.  Both are indexed 0..len(train)-1 and are empty when
        `train` has no rows.
    """
    choice_columns = [3, 4, 5, 6]  # column positions of the answer options
    convert_answer = {3: 'A', 4: 'B', 5: 'C', 6: 'D'}
    correct = 0.0
    answers = []
    distance_rows = []

    for i in range(len(train)):  # loop through all questions
        row = train.iloc[i]
        q = row.iloc[1]

        close_documents = get_close_documents(q, df_vector, dictionary, 5)
        merged = get_contents_of_close_documents_by_paragraph(wiki_path, close_documents)

        # Best (smallest) distance for question + each answer choice.
        four_choices = pd.Series(
            [_min_paragraph_distance(q + ' ' + row.iloc[j], merged, dictionary)
             for j in choice_columns],
            index=choice_columns, dtype=float)

        # idxmin returns the index *label* (3..6); ties go to the first choice.
        my_choice = convert_answer[four_choices.idxmin()]
        answers.append(my_choice)
        distance_rows.append(list(four_choices))

        print('question:  %s' % q)
        print('answer:  %s %s %s %s' % (row.iloc[3], row.iloc[4], row.iloc[5], row.iloc[6]))
        print('correct answer:  %s' % row.iloc[2])
        print('%s %s %s %s %s' % (i, four_choices[3], four_choices[4],
                                  four_choices[5], four_choices[6]))
        print('my answer:  %s' % my_choice)
        if row.iloc[2] == my_choice:
            correct = correct + 1.0
        print('percent correct:  %s' % (correct / (i + 1)))

    # Build the results once instead of enlarging them row by row.
    myanswer_distance = pd.DataFrame(distance_rows, columns=['A', 'B', 'C', 'D'])
    myanswer = pd.Series(answers, dtype=object)
    return myanswer_distance, myanswer



def run_fetch_ws(train_file_path, dictionary_folder_path, wiki_folder_path,
                 output_csv_path='/Users/MK/GitHub/the_answer_is/data/answer/fetch_doc_ws_train.csv'):
    """Run the whole pipeline on a training file and save the answers.

    Steps: build the dictionary, vectorise the wiki documents, answer every
    question of the tab-separated training file, add the answers and a
    correctness flag to the table, print the accuracy and write the table
    to `output_csv_path`.

    Parameters
    ----------
    train_file_path : tab-separated file; must contain a 'correctAnswer'
        column.
    dictionary_folder_path : directory used to build the dictionary.
    wiki_folder_path : directory with the documents to search.
    output_csv_path : destination of the CSV with the added
        'fetch_doc_ws_train_answer' and 'fetch_doc_ws_train_correct'
        columns.

    Returns
    -------
    (distance, answer) exactly as returned by ``get_my_answer_all_distance``.
    """
    dictionary = get_dictionary(dictionary_folder_path)
    wiki_tuple = get_document_tuple(wiki_folder_path, dictionary)
    df_wiki_vector = transform_tuples_into_dataframe(wiki_tuple, dictionary)
    train = pd.read_table(train_file_path, sep='\t')
    distance, answer = get_my_answer_all_distance(train, dictionary, df_wiki_vector, wiki_folder_path)

    train['fetch_doc_ws_train_answer'] = answer
    train['fetch_doc_ws_train_correct'] = (train['correctAnswer'] == train['fetch_doc_ws_train_answer'])
    # Fraction of rows whose answer matches the correct one.
    accuracy = train['fetch_doc_ws_train_correct'].sum(axis=0) / (len(train) + 0.0)
    print('percent correct is  %s' % accuracy)
    train.to_csv(output_csv_path, encoding='utf-8')

    return distance, answer




# English stop words, as unicode strings.  get_dictionary() removes every one
# of these tokens from the dictionary it builds.
stoplist = u"""
i me my myself we our ours ourselves you your yours
yourself yourselves he him his himself she her hers herself
it its itself they them their theirs themselves what which
who whom this that these those am is are was were be
been being have has had having do does did doing a an
the and but if or because as until while of at by for
with about against between into through during before after above
below to from up down in out on off over under again
further then once here there when where why how all any
both each few more most other some such no nor not only
own same so than too very s t can will just don
should now
""".split()



In [10]:
# Input locations for this run.
my_train_file_path = '/Users/MK/GitHub/the_answer_is/data/training_set.tsv'
my_dictionary_folder_path = '/Users/MK/GitHub/the_answer_is/data/temporary2'  # this folder contains one file which is stemmed.
my_wiki_folder_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_stemmed_all_merged'

# Answer the training questions; progress is printed question by question.
distance, answer = run_fetch_ws(
    train_file_path=my_train_file_path,
    dictionary_folder_path=my_dictionary_folder_path,
    wiki_folder_path=my_wiki_folder_path,
)


question:  When athletes begin to exercise, their heart rates and respiration rates increase.  At what level of organization does the human body coordinate these functions?
answer:  at the tissue level at the organ level at the system level at the cellular level
correct answer:  C
0 0.759992319631 0.775493372467 0.75 0.775493372467
my answer:  C
percent correct:  1.0
question:  Which example describes a learned behavior in a dog?
answer:  smelling the air for odors barking when disturbed sitting on command digging in soil
correct answer:  C
1 0.42264973081 0.292893218813 0.292893218813 0.42264973081
my answer:  B
percent correct:  0.5
question:  When two nuclei are combined into one nucleus, there is a slight change in mass and the release of a large amount of energy. What is this process called?
answer:  conversion reaction fission fusion
correct answer:  D
2 0.643487952046 0.626998076704 0.663877217756 0.663877217756
my answer:  B
percent correct:  0.333333333333
question:  Which is 

KeyboardInterrupt: 