In [66]:
import gensim
import logging
import os
import pandas as pd
import numpy as np
from scipy import spatial
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import corpora

class MyDocument(object):
    """Iterable over a directory that yields one token list per file.

    Every entry returned by ``os.listdir(dirname)`` is opened as a file,
    decoded as UTF-8 (undecodable bytes become U+FFFD), lower-cased and split
    on whitespace. Documents are produced in ``os.listdir`` order, so the
    n-th document here corresponds to the n-th entry of that listing.
    """

    def __init__(self, dirname):
        # Directory whose entries are read each time the object is iterated.
        self.dirname = dirname

    def __iter__(self):
        # Function-scope import, like the other local imports in this file.
        import io
        for fname in os.listdir(self.dirname):
            # io.open decodes while reading, which replaces the Python-2-only
            # unicode(...) call and behaves the same on Python 2 and 3.
            with io.open(os.path.join(self.dirname, fname),
                         encoding='utf-8', errors='replace') as content_file:
                content = content_file.read()
            # The file is already closed here; only the tokens leave the loop.
            yield content.lower().split()

class MySentences(object):
    """Iterable over a directory that yields one token list per line.

    Every entry returned by ``os.listdir(dirname)`` is opened as a file and
    read line by line; each line is decoded as UTF-8 (undecodable bytes become
    U+FFFD), lower-cased and split on whitespace.
    """

    def __init__(self, dirname):
        # Directory whose entries are read each time the object is iterated.
        self.dirname = dirname

    def __iter__(self):
        # Function-scope import, like the other local imports in this file.
        import io
        for fname in os.listdir(self.dirname):
            # The context manager closes each file; the previous bare open()
            # inside the for-statement left every handle to the garbage
            # collector. newline='\n' keeps '\n' as the only line terminator.
            with io.open(os.path.join(self.dirname, fname), encoding='utf-8',
                         errors='replace', newline='\n') as sentence_file:
                for line in sentence_file:
                    yield line.lower().split()

def get_dictionary(path):
    """Build a gensim dictionary from the files under `path`, minus stop words.

    The vocabulary is collected line by line through MySentences(path). Every
    token that also appears in the module-level `stoplist` is then dropped and
    the remaining ids are renumbered without gaps. Only stop words are
    filtered here; rare words are kept.

    Returns the filtered, compacted dictionary.
    """
    dictionary = corpora.Dictionary( MySentences(path) )
    token_to_id = dictionary.token2id
    stop_ids = []
    for stopword in stoplist:
        if stopword in token_to_id:
            stop_ids.append(token_to_id[stopword])
    dictionary.filter_tokens(stop_ids)  # drop the stop-word ids
    dictionary.compactify()  # close the id gaps left by the removed words
    return dictionary

def get_document_tuple( path, dictionary ):
    """Convert every document under `path` to its bag-of-words form.

    Args:
        path: directory handed to MyDocument; each file is one document.
        dictionary: object whose ``doc2bow(tokens)`` produces the
            bag-of-words representation of a token list.

    Returns:
        A list with one ``doc2bow`` result per document, in the order
        MyDocument yields them.
    """
    # A plain list comprehension replaces the pandas Series that was grown
    # one element at a time and then converted back to a list.
    return [dictionary.doc2bow(content) for content in MyDocument(path)]

def transform_tuple_into_vector(document_tuple,dictionary):
    """Expand sparse (token id, count) pairs into a dense, L1-normalised vector.

    Args:
        document_tuple: iterable of pairs whose first item is a token id and
            whose second item is its count. May be empty or None.
        dictionary: object whose ``token2id`` mapping fixes the vector length.

    Returns:
        A pandas Series of length ``len(dictionary.token2id)`` indexed
        0..n-1. Entries are the counts divided by the sum of their absolute
        values; the vector is all zeros when there is nothing to count.

    Raises:
        IndexError: if a token id is not smaller than the dictionary size.
    """
    counts = np.zeros(len(dictionary.token2id))
    # `or ()` makes every empty input ([], (), None) take the same path;
    # previously only the literal [] was recognised as empty.
    for pair in (document_tuple or ()):
        counts[pair[0]] = pair[1]
    total = np.abs(counts).sum()   # L1 norm
    if total > 0:
        # Guarded so an empty or all-zero document stays a zero vector
        # instead of turning into NaNs through 0/0.
        counts = counts / total
    return pd.Series(counts)

def transform_tuples_into_dataframe(document_tuples, dictionary):
    """Stack the dense vectors of many documents into one DataFrame.

    Args:
        document_tuples: sequence of sparse (token id, count) lists, one per
            document.
        dictionary: object whose ``token2id`` mapping fixes the number of rows.

    Returns:
        A DataFrame with one row per dictionary token and one column per
        document; column ``i`` is the normalised vector of
        ``document_tuples[i]``. With no documents the frame has the token
        rows and zero columns (previously a spurious all-zero column 0 was
        returned, which looked like a real document).
    """
    columns = [transform_tuple_into_vector(document_tuple, dictionary)
               for document_tuple in document_tuples]
    if not columns:
        return pd.DataFrame(index=range(len(dictionary.token2id)))
    # Assemble the frame in one step instead of inserting columns one by one;
    # `keys` labels the columns 0..n-1, matching the document positions.
    return pd.concat(columns, axis=1, keys=list(range(len(columns))))

def get_close_documents(string, dataframe, dictionary, topn):
    """Rank the document columns of `dataframe` by cosine distance to a query.

    Args:
        string: raw query text; it is passed through string_stemmer first.
        dataframe: frame whose columns, labelled 0..n-1, are document vectors
            (as built by transform_tuples_into_dataframe).
        dictionary: object providing ``doc2bow`` and ``token2id``.
        topn: number of closest documents to keep.

    Returns:
        A Series mapping column label -> cosine distance, sorted ascending
        and truncated to the `topn` smallest values.
    """
    string = string_stemmer(string)
    query_vector = transform_tuple_into_vector(dictionary.doc2bow(string.split()), dictionary )
    # Compute every distance first and wrap them in a Series once, rather
    # than growing a Series one label at a time.
    distances = [spatial.distance.cosine(query_vector, dataframe[j])
                 for j in range(len(dataframe.columns))]
    return pd.Series(distances, dtype=float).sort_values().head(topn)

def get_document_by_index(path,index):
    """Return (file name, decoded content) of the index-th entry under `path`.

    Positions follow the order of ``os.listdir(path)``. The file content is
    decoded as UTF-8 with undecodable bytes replaced. When no entry sits at
    `index`, None is returned instead of a tuple.
    """
    for position, fname in enumerate(os.listdir(path)):
        if position != index:
            continue
        with open(os.path.join(path, fname)) as content_file:
            raw = content_file.read()
            text = unicode(raw, encoding='utf-8', errors='replace')
            return fname, text
    return None

def print_names_of_close_documents(path, close_documents_list, outputpath):
    """Print the file names of the selected documents and save their text.

    Args:
        path: directory the documents are fetched from via
            get_document_by_index.
        close_documents_list: object whose ``.index`` holds the document
            positions (e.g. the Series returned by get_close_documents).
        outputpath: directory in which ``close_documents.txt`` is
            (over)written, each document followed by four newlines.
    """
    # Function-scope import, like the other local imports in this file.
    import io
    # The with-block guarantees the output is flushed and closed, and the
    # explicit UTF-8 encoding lets non-ASCII text be written; before, the
    # handle was never closed and unicode text went through an implicit
    # ASCII encode.
    with io.open(outputpath + '/close_documents.txt', 'w', encoding='utf-8') as out_file:
        for i in close_documents_list.index:
            name, text = get_document_by_index(path,i)
            print(name)
            out_file.write(text + u'\n' + u'\n' + u'\n' + u'\n')
    print('text saved as txt')

def get_contents_of_close_documents_by_paragraph(path, close_documents_list):
    """Concatenate the selected documents and split the result into lines.

    Each position in ``close_documents_list.index`` is fetched with
    get_document_by_index; every document is prefixed with a newline, the
    pieces are joined, and the joined text is split on newlines. Because of
    the leading newline the first element of the result is always an empty
    string.
    """
    pieces = []
    for doc_index in close_documents_list.index:
        _name, body = get_document_by_index(path, doc_index)
        pieces.append('\n' + body)
    return ''.join(pieces).split('\n')


def get_distance_between_two_documents(A, B, dictionary):
    """Euclidean distance between the normalised word vectors of two texts.

    Both texts are stripped of every character that is not an ASCII letter or
    a space, lower-cased, split into tokens, mapped through
    ``dictionary.doc2bow`` and expanded with transform_tuple_into_vector.
    An empty (falsy) text is replaced by the single token 'the'.

    Cosine distance was the first choice, but according to the original
    author the vectors are so sparse that it came out as 0 most of the time,
    hence the Euclidean norm of the difference.
    """
    import re

    def as_vector(text):
        # Placeholder token so an empty text still goes through the pipeline.
        if text:
            tokens = re.sub(r'[^a-zA-Z ]',r'',text).lower().split()
        else:
            tokens = ['the']
        return transform_tuple_into_vector(dictionary.doc2bow(tokens), dictionary)

    vector_a = as_vector(A)
    vector_b = as_vector(B)
    return np.linalg.norm(vector_a - vector_b)


# English stop words (lower-case unicode strings). get_dictionary() looks this
# list up at call time and removes every listed token from the dictionary.
stoplist = [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours',
             u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', 
             u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', 
             u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', 
             u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', 
             u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', u'while', u'of', u'at', u'by', u'for', 
             u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', 
             u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', 
             u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', 
             u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', 
             u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', 
             u'should', u'now']

def string_stemmer(line):
    """Strip non-letters, drop English stop words and stem what is left.

    Args:
        line: text to normalise.

    Returns:
        A single string with the stemmed words joined by one space.

    NOTE(review): the stop-word test runs on the words as written, before
    any lower-casing, so a capitalised stop word (e.g. a sentence-initial
    "Which") is kept -- confirm whether that is intended.
    """
    import re
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    # Fetch the stop-word list once and use a set for membership tests;
    # it used to be re-fetched from the corpus for every single word.
    stop_words = set(stopwords.words('english'))
    words = re.sub(r'[^a-zA-Z ]',r'',line).split()
    stemmed = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(stemmed)

In [57]:
# Build the vocabulary from the training-question text, then show the
# token -> id mapping and the number of tokens it contains.
dictionary_path = '/Users/MK/GitHub/the_answer_is/data/temporary2'  # question text from training data
dictionary = get_dictionary(dictionary_path)
print (dictionary.token2id)
print 'dictionary dimension is', len(dictionary.token2id)

{u'xylem': 0, u'crosspollin': 1, u'freshwat': 2, u'retain': 2935, u'yellow': 4, u'interchang': 5, u'four': 6, u'asian': 7, u'powderi': 8, u'oldest': 9, u'skeleton': 1722, u'whose': 11, u'accur': 12, u'brackish': 13, u'venom': 2906, u'accret': 15, u'bike': 16, u'swap': 17, u'testabl': 18, u'humerus': 19, u'everi': 20, u'paleozo': 21, u'chlorophyl': 22, u'barium': 23, u'void': 24, u'rise': 25, u'ductil': 26, u'pigment': 27, u'microbi': 28, u'jack': 29, u'jacob': 30, u'affect': 31, u'month': 1725, u'vast': 33, u'anemia': 34, u'school': 35, u'parrot': 36, u'wegner': 37, u'cmb': 38, u'factori': 39, u'solid': 2350, u'technolog': 2717, u'transduct': 41, u'venus': 42, u'pesticid': 43, u'chloroplast': 44, u'repres': 2569, u'capsul': 1728, u'speci': 46, u'miller': 47, u'direct': 48, u'horn': 49, u'nail': 50, u'consequ': 51, u'second': 52, u'street': 53, u'chen': 54, u'quill': 55, u'blue': 56, u'settlement': 1731, u'homeostasi': 59, u'altitud': 60, u'lightn': 61, u'near': 62, u'asia': 63, u'petri

In [58]:
# Convert every file under wiki_path to bag-of-words form and inspect one entry.
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_stemmed_all_merged'
wiki_tuple = get_document_tuple( wiki_path, dictionary )
wiki_tuple[8000]

# Each entry of wiki_tuple is a sparse list of (token id, count) pairs. It has to be
# expanded into a dense, dictionary-length vector before distances can be computed.

[(4, 5),
 (6, 2),
 (9, 1),
 (10, 1),
 (11, 5),
 (12, 3),
 (20, 4),
 (25, 5),
 (31, 1),
 (33, 3),
 (42, 1),
 (48, 8),
 (51, 5),
 (52, 3),
 (56, 4),
 (58, 11),
 (62, 4),
 (68, 6),
 (70, 1),
 (73, 1),
 (77, 2),
 (89, 9),
 (93, 20),
 (97, 1),
 (99, 1),
 (105, 1),
 (106, 2),
 (110, 3),
 (111, 17),
 (114, 4),
 (118, 2),
 (119, 6),
 (125, 7),
 (126, 5),
 (129, 6),
 (134, 1),
 (145, 16),
 (152, 1),
 (158, 1),
 (162, 2),
 (165, 2),
 (166, 7),
 (169, 30),
 (178, 2),
 (184, 3),
 (186, 4),
 (189, 3),
 (197, 2),
 (201, 3),
 (202, 7),
 (205, 3),
 (211, 6),
 (221, 1),
 (223, 8),
 (226, 7),
 (227, 2),
 (229, 6),
 (230, 6),
 (231, 1),
 (238, 3),
 (240, 1),
 (242, 13),
 (243, 1),
 (245, 7),
 (250, 1),
 (258, 1),
 (265, 4),
 (271, 31),
 (274, 1),
 (276, 3),
 (278, 3),
 (283, 3),
 (284, 1),
 (288, 2),
 (289, 1),
 (303, 2),
 (305, 2),
 (313, 1),
 (317, 1),
 (323, 4),
 (326, 1),
 (333, 10),
 (342, 3),
 (343, 5),
 (349, 6),
 (350, 1),
 (354, 3),
 (363, 1),
 (369, 1),
 (373, 7),
 (374, 3),
 (378, 2),
 (381, 1

In [30]:
# Vocabulary size; this is also the length of every dense document vector.
print 'dictionary dimension is', len(dictionary.token2id)

dictionary dimension is 3515


In [59]:
# Expand the sparse bag-of-words lists into one dense, normalised column per document.
df_wiki_vector = transform_tuples_into_dataframe(wiki_tuple,dictionary)         


In [61]:
# NOTE: only the last expression of a cell is displayed, so the head(15) result
# is computed and discarded; the output below is column 8000 alone.
df_wiki_vector.head(15)
df_wiki_vector[8000]
# Each column of df_wiki_vector represents one Wikipedia page.

0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.001279
5       0.000000
6       0.000512
7       0.000000
8       0.000000
9       0.000256
10      0.000256
11      0.001279
12      0.000768
13      0.000000
14      0.000000
15      0.000000
16      0.000000
17      0.000000
18      0.000000
19      0.000000
20      0.001024
21      0.000000
22      0.000000
23      0.000000
24      0.000000
25      0.001279
26      0.000000
27      0.000000
28      0.000000
29      0.000000
          ...   
3485    0.000000
3486    0.000000
3487    0.000000
3488    0.040942
3489    0.000000
3490    0.000000
3491    0.000512
3492    0.000000
3493    0.000000
3494    0.000000
3495    0.000000
3496    0.000512
3497    0.000000
3498    0.000000
3499    0.000000
3500    0.000000
3501    0.000000
3502    0.000000
3503    0.000000
3504    0.000512
3505    0.000000
3506    0.000000
3507    0.000256
3508    0.000000
3509    0.000256
3510    0.000000
3511    0.000000
3512    0.0000

In [33]:
# Load the tab-separated training questions and preview the first rows.
# NOTE: pandas and os are already imported in the first cell; they are imported again here.
import pandas as pd
import os
path = '/Users/MK/GitHub/the_answer_is/data'
os.chdir(path)  # changes the working directory; the relative file name below resolves against it
train = pd.read_table('training_set.tsv',sep = '\t')
train.head(20)

Unnamed: 0,id,question,correctAnswer,answerA,answerB,answerC,answerD
0,100001,"When athletes begin to exercise, their heart r...",C,at the tissue level,at the organ level,at the system level,at the cellular level
1,100002,Which example describes a learned behavior in ...,C,smelling the air for odors,barking when disturbed,sitting on command,digging in soil
2,100003,"When two nuclei are combined into one nucleus,...",D,conversion,reaction,fission,fusion
3,100004,Which is a distinction between an epidemic and...,B,the symptoms of the disease,the geographical area affected,the species of organisms infected,the season in which the disease spreads
4,100005,In which way is the orbit of a comet different...,B,The orbit of Earth is less circular than the o...,The orbit of a comet is more elliptical than t...,The orbital period of Earth is much longer tha...,The orbital period of a comet is more predicta...
5,100006,A teacher builds a model of a hydrogen atom. A...,B,number of particles,relative mass of particles,types of particles present,charges of particles present
6,100007,Which substance should a student apply to the ...,A,water,vinegar,salt,formaldehyde
7,100008,What is the main source of energy for the wate...,A,the Sun,fossil fuels,clouds,the ocean
8,100009,Which has the greatest effect on aiding the mo...,D,tension,friction,density,gravity
9,100010,"Over time, non-volcanic mountains can form due...",C,oceanic plates colliding with oceanic plates,oceanic plates separating from oceanic plates,continental plates colliding with continental ...,continental plates separating from continental...


In [70]:
# Take the row labelled 3 and read its second field, which is the question text.
q = train.ix[3][1]
print q

Which is a distinction between an epidemic and a pandemic?


In [71]:
# The 50 Wikipedia columns with the smallest cosine distance to the question
# (index = document position, value = distance).
close_documents = get_close_documents(q, df_wiki_vector, dictionary,50)
print close_documents

2006    0.613161
2415    0.701917
3918    0.715735
7305    0.715735
5131    0.715735
5132    0.715735
4944    0.855138
5059    0.877588
5969    0.884078
7634    0.886772
2814    0.893613
5846    0.893613
5053    0.893613
3646    0.893613
2432    0.904436
642     0.908137
7627    0.909612
5113    0.924551
1684    0.924551
4628    0.929938
6963    0.930870
6359    0.931888
619     0.939477
5860    0.940372
6371    0.940372
6656    0.942646
5345    0.942646
1452    0.942646
3769    0.948486
3764    0.948531
6691    0.949200
6688    0.949200
6685    0.949200
6784    0.949200
2462    0.949200
7571    0.949200
6689    0.949229
4152    0.950515
2455    0.950515
2640    0.950515
6575    0.951088
669     0.951720
1576    0.952778
4941    0.952778
3062    0.952778
7224    0.952778
3061    0.952778
7696    0.952778
1575    0.952778
6512    0.952778
dtype: float64


In [72]:
# Print the file names of the closest documents, save their text to
# <outputpath>/close_documents.txt, and keep the same text as a list of lines.
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_stemmed_all_merged'
outputpath = '/Users/MK/GitHub/the_answer_is/data'
print_names_of_close_documents(wiki_path, close_documents,outputpath)
merged = get_contents_of_close_documents_by_paragraph(wiki_path, close_documents)


distinct.txt_to_unicode_remove_stopwords_and_stem.txt
epidemic.txt_to_unicode_remove_stopwords_and_stem.txt
kingdoms.txt_to_unicode_remove_stopwords_and_stem.txt
thing.txt_to_unicode_remove_stopwords_and_stem.txt
pandemic.txt_to_unicode_remove_stopwords_and_stem.txt
pandemics.txt_to_unicode_remove_stopwords_and_stem.txt
occurrence.txt_to_unicode_remove_stopwords_and_stem.txt
outbreak.txt_to_unicode_remove_stopwords_and_stem.txt
relative_age.txt_to_unicode_remove_stopwords_and_stem.txt
updating.txt_to_unicode_remove_stopwords_and_stem.txt
flu.txt_to_unicode_remove_stopwords_and_stem.txt
reaction.txt_to_unicode_remove_stopwords_and_stem.txt
orion.txt_to_unicode_remove_stopwords_and_stem.txt
influenza.txt_to_unicode_remove_stopwords_and_stem.txt
eras.txt_to_unicode_remove_stopwords_and_stem.txt
biological_hazards_of_radioactivity.txt_to_unicode_remove_stopwords_and_stem.txt
unrecognizable.txt_to_unicode_remove_stopwords_and_stem.txt
paired.txt_to_unicode_remove_stopwords_and_stem.txt
cyst

In [11]:
# Show the list of lines built from the closest documents (this prints the whole list).
print merged

[u'', u'invertebrates animals neither possess develop vertebral columnspinal cord derived notochord includes animals apart subphylum vertebrata familiar examples invertebrates include insects crabs lobsters kin snails clams octopuses kin starfish seaurchins kin worms', u'majority animal species invertebrates one estimate puts figure many invertebrate taxa greater number variety species entire subphylum vertebrata', u'socalled invertebrates chaetognatha hemichordata tunicata cephalochordata closely related vertebrates invertebrates makes term invertebrate almost meaningless taxonomic purposes', u'etymology', u'word invertebrate comes form latin word vertebra means joint general sometimes specifically joint spinal column vertebrate turn jointed aspect vertebra derived concept turning expressed root verto vorto turn coupled prefix meaning without', u'taxonomic significance', u'term invertebrates always precise among nonbiologists since accurately describe taxon way arthropoda vertebrata m

In [73]:
def _min_distance_per_choice(train, i, dictionary, df_vector, path):
    """Smallest paragraph distance for each answer option of question row `i`.

    The question text is read from column position 1 and the four answer
    options from column positions 3-6. The 5 closest documents to the
    question are fetched from `path` and split into lines; each
    "question + option" string is then compared with every line.

    Returns a Series indexed by the column positions 3, 4, 5, 6. An option
    for which no positive distance exists gets ``inf``.
    """
    question = train.iloc[i, 1]
    close_documents = get_close_documents(question, df_vector, dictionary, 5)
    paragraphs = get_contents_of_close_documents_by_paragraph(path, close_documents)
    answer_columns = [3, 4, 5, 6]   # the columns where the answer options lie
    best = []
    for j in answer_columns:
        candidate = question + ' ' + train.iloc[i, j]   # question + one answer choice
        distances = [get_distance_between_two_documents(candidate, paragraph, dictionary)
                     for paragraph in paragraphs]
        # `> 0` drops NaN (the original intent) and also exact zeros.
        positive = [distance for distance in distances if distance > 0]
        # inf instead of min([]) raising ValueError when nothing qualifies.
        best.append(min(positive) if positive else np.inf)
    return pd.Series(best, index=answer_columns)


def get_my_answer(train, dictionary, df_vector, path):
    """Pick, for every question, the option closest to a retrieved paragraph.

    Args:
        train: DataFrame with the question at column position 1 and the
            options A-D at column positions 3-6.
        dictionary: object providing ``doc2bow`` and ``token2id``.
        df_vector: frame of document vectors, one column per document.
        path: directory the close documents are read from. It is now
            actually used; before, the global `wiki_path` was read instead.

    Returns:
        A Series indexed 0..len(train)-1 holding 'A', 'B', 'C' or 'D'.
        Each row number and chosen letter is also printed.
    """
    convert_answer = {3: 'A', 4: 'B', 5: 'C', 6: 'D' }
    answers = []
    for i in range(len(train)):       # loop through all questions
        four_choices = _min_distance_per_choice(train, i, dictionary, df_vector, path)
        # idxmin returns the index *label* (3..6) on every pandas version;
        # argmin returns a position on newer versions.
        letter = convert_answer[ four_choices.idxmin() ]
        answers.append(letter)
        print('%s %s' % (i, letter))
    return pd.Series(answers, dtype=object)


def get_my_answer_all_distance(train, dictionary, df_vector, path):
    """Return the minimum distance of every answer option for every question.

    Takes the same arguments as get_my_answer.

    Returns:
        A DataFrame with columns 'A', 'B', 'C', 'D' and one row per question
        (index 0..len(train)-1); each cell is that option's smallest
        distance. An empty `train` yields an empty frame with those columns.
        Each row number and its four distances are also printed.
    """
    rows = []
    for i in range(len(train)):       # loop through all questions
        four_choices = _min_distance_per_choice(train, i, dictionary, df_vector, path)
        row = [four_choices[3], four_choices[4], four_choices[5], four_choices[6]]
        rows.append(row)
        print('%s %s %s %s %s' % (i, row[0], row[1], row[2], row[3]))
    # Build the frame once from the collected rows instead of enlarging it
    # cell by cell.
    return pd.DataFrame(rows, columns = ['A','B','C','D'])

In [69]:
# Uncomment the lines below if you only want the predicted answers and the training accuracy.
#wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
#myans = get_my_answer(train, dictionary, df_wiki_vector, wiki_path)
#train['fetch_doc_ws_train_answer'] = myans
#train['fetch_doc_ws_train_correct'] = (train['correctAnswer'] == train['fetch_doc_ws_train_answer'])
#print 'percent correct is ' , train['fetch_doc_ws_train_correct'].sum(axis =0) / (len(train) + 0.0)
#train.to_csv('/Users/MK/GitHub/the_answer_is/data/answer/fetch_doc_ws_train.csv', encoding='utf-8')

0 A
1 A
2 A
3 C
4 C
5 B
6 A
7 B
8 B
9 D
10 C
11 A
12 A
13 A
14 A
15 A
16 C
17 B
18 A
19 B
20 C
21 B
22 C
23 B
24 B
25 D
26 B
27 A
28 B
29 A
30 D
31 C
32 D
33 A
34 C
35 A
36 C
37 B
38 A
39 C
40 B
41 B
42 C
43 B
44 A
45 A
46 A
47 B
48 A
49 A
50 D
51 A
52 A
53 B
54 D
55 D
56 C
57 A
58 B
59 C
60 A
61 B
62 A
63 D
64 A
65 A
66 A
67 A
68 A
69 A
70 C
71 B
72 A
73 A
74 A
75 A
76 A
77 D
78 C
79 A
80 A
81 D
82 B
83 D
84 A
85 B
86 A
87 C
88 C
89 A
90 C
91 B
92 D
93 A
94 A
95 B
96 A
97 B
98 A
99 B
100 B
101 A
102 D
103 D
104 A
105 C
106 A
107 B
108 B
109 C
110 B
111 D
112 A
113 A
114 C
115 A
116 D
117 C
118 A
119 B
120 A
121 A
122 D
123 A
124 A
125 C
126 D
127 C
128 B
129 D
130 C
131 B
132 C
133 A
134 C
135 A
136 B
137 D
138 A
139 C
140 C
141 A
142 A
143 D
144 B
145 C
146 A
147 A
148 C
149 D
150 B
151 A
152 B
153 A
154 B
155 D
156 A
157 D
158 A
159 A
160 A
161 C
162 A
163 B
164 A
165 B
166 C
167 D
168 A
169 D
170 C
171 C
172 B
173 A
174 C
175 A
176 D
177 B
178 D
179 A
180 D
181 A
182 A
183 A
184 D


In [27]:
# Compute the per-option minimum distances for every training question and save them as CSV.
# NOTE: wiki_path is re-pointed here to the 'wikipedia_without_stopwords' directory.
wiki_path = '/Users/MK/GitHub/the_answer_is/data/wikipedia_without_stopwords'
myans = get_my_answer_all_distance(train, dictionary, df_wiki_vector, wiki_path)
myans.to_csv('/Users/MK/GitHub/the_answer_is/data/answer/fetch_doc_ws_train_minimum_distance.csv', encoding='utf-8')

0 0.279508497187 0.279508497187 0.279508497187 0.279508497187
1 0.408248290464 0.408248290464 0.408248290464 0.408248290464
2 0.258198889747 0.258198889747 0.258198889747 0.258198889747
3 0.4472135955 0.4472135955 0.408248290464 0.408248290464
4 0.37267799625 0.390022779585 0.342727379489 0.382047844058
5 0.264575131106 0.256436419387 0.256436419387 0.256436419387
6 0.345795680681 0.349540869491 0.349540869491 0.366681885507
7 0.374350648863 0.357118140519 0.374350648863 0.374350648863
8 0.368567275724 0.352323635943 0.352323635943 0.352323635943
9 0.290081385603 0.282206298752 0.284858223105 0.277119910584
10 0.25893090163 0.25893090163 0.255261472546 0.255261472546
11 0.353553390593 0.353553390593 0.353553390593 0.353553390593
12 0.242076711427 0.242076711427 0.248964798866 0.242076711427
13 0.252412185438 0.252412185438 0.263157894737 0.264575131106
14 0.285714285714 0.285714285714 0.297921795862 0.311804782231
15 0.381924948997 0.381924948997 0.381924948997 0.403980197534
16 0.2672