In [14]:
import io
import logging
import os

import gensim
import numpy as np
import pandas as pd
from gensim import corpora
from scipy import spatial

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class MyDocument(object):
    """Iterable over the files of a directory, one token list per file.

    Each file is decoded as UTF-8 (undecodable bytes become U+FFFD),
    lower-cased and split on whitespace.

    Files are visited in ``os.listdir`` order. ``get_document_tuple`` numbers
    documents by this order and ``get_document_by_index`` maps a number back
    to a file using ``os.listdir`` as well, so the order must not be changed
    here without changing it there.
    """

    def __init__(self, dirname):
        # Directory whose entries are all treated as text documents.
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # io.open decodes while reading and behaves the same on Python 2
            # and 3, unlike the Python-2-only ``unicode()`` builtin.
            with io.open(path, encoding='utf-8', errors='replace') as content_file:
                content = content_file.read()
            # The file is already closed here, so no handle stays open while
            # the consumer processes the yielded tokens.
            yield content.lower().split()

class MySentences(object):
    """Iterable over every line of every file in a directory.

    Each line is decoded as UTF-8 (undecodable bytes become U+FFFD),
    lower-cased and split on whitespace; one token list is yielded per line,
    including an empty list for a blank line.
    """

    def __init__(self, dirname):
        # Directory whose entries are all treated as text files.
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # The with-block guarantees the handle is closed once the file is
            # exhausted (or the generator is discarded); a bare open() in the
            # for-header would leave one handle open per file.
            with io.open(path, encoding='utf-8', errors='replace') as sentence_file:
                for line in sentence_file:
                    yield line.lower().split()

def get_dictionary(path):
    """Build a gensim Dictionary from the text files in ``path``.

    The vocabulary is collected from every line yielded by ``MySentences``.
    Tokens listed in the module-level ``stoplist`` are then removed and the
    remaining ids are compacted. Only stop words are removed; no
    frequency-based filtering is applied.
    """
    vocabulary = corpora.Dictionary(MySentences(path))

    # Collect the ids of the stop words that actually occur in the corpus.
    token_to_id = vocabulary.token2id
    stop_ids = []
    for stopword in stoplist:
        if stopword in token_to_id:
            stop_ids.append(token_to_id[stopword])

    vocabulary.filter_tokens(stop_ids)  # drop the stop words
    vocabulary.compactify()  # close the gaps left in the id sequence
    return vocabulary

def get_document_tuple(path, dictionary):
    """Convert every file in ``path`` to its bag-of-words representation.

    Parameters
    ----------
    path : str
        Directory read through ``MyDocument`` (one document per file).
    dictionary : object
        Anything with a ``doc2bow(tokens)`` method, e.g. a gensim Dictionary.

    Returns
    -------
    list
        One ``doc2bow`` result per file, in the order ``MyDocument`` yields
        them; position ``k`` therefore refers to the ``k``-th directory entry.
    """
    # A plain list is all the callers need. Accumulating into a pandas Series
    # with set_value relied on an API that pandas 1.0 removed and re-allocated
    # the Series on every append.
    return [dictionary.doc2bow(content) for content in MyDocument(path)]

def transform_tuple_into_vector(document_tuple, dictionary):
    """Expand a sparse bag-of-words into a dense, L1-normalised vector.

    Parameters
    ----------
    document_tuple : iterable of (token_id, count)
        Sparse representation as produced by ``dictionary.doc2bow``. Token
        ids must be valid positions, i.e. ``0 <= id < len(dictionary.token2id)``.
    dictionary : object
        Only ``dictionary.token2id`` is used, to size the vector.

    Returns
    -------
    pandas.Series
        Float vector of length ``len(dictionary.token2id)`` whose absolute
        values sum to 1. An empty input (of any sequence type) or an input
        whose counts are all zero yields the all-zero vector instead of NaNs.
    """
    dense = np.zeros(len(dictionary.token2id))
    pairs = list(document_tuple)
    if pairs:
        token_ids, counts = zip(*pairs)
        # One indexed assignment instead of one Series write per token.
        dense[list(token_ids)] = counts
        total = np.linalg.norm(dense, ord=1)
        # Guard the division: a zero norm would turn every entry into NaN.
        if total > 0:
            dense = dense / total
    return pd.Series(dense)

def transform_tuples_into_dataframe(document_tuples, dictionary):
    """Stack the dense vectors of many documents into one DataFrame.

    Parameters
    ----------
    document_tuples : iterable
        One sparse bag-of-words (list of ``(token_id, count)``) per document.
    dictionary : object
        Passed through to ``transform_tuple_into_vector``; its ``token2id``
        size fixes the number of rows.

    Returns
    -------
    pandas.DataFrame
        Shape ``(len(dictionary.token2id), number_of_documents)``. Column
        ``k`` is the vector of the ``k``-th document, rows are token ids.
        With no documents the frame has zero columns (rather than a
        placeholder all-zero column that would look like a real document).
    """
    vectors = [transform_tuple_into_vector(doc, dictionary) for doc in document_tuples]
    if not vectors:
        return pd.DataFrame(np.zeros((len(dictionary.token2id), 0)))
    # Assemble the matrix once; inserting one column at a time into an
    # existing DataFrame is far slower for thousands of documents.
    matrix = np.column_stack([np.asarray(vec, dtype=float) for vec in vectors])
    return pd.DataFrame(matrix)

def get_close_documents(string, dataframe, dictionary, topn, path):
    """Return the ``topn`` documents closest to ``string`` by cosine distance.

    Parameters
    ----------
    string : str
        Raw query text; it is passed through ``string_stemmer`` first.
    dataframe : pandas.DataFrame
        One column per document, labelled ``0 .. n-1``, as built by
        ``transform_tuples_into_dataframe``.
    dictionary : object
        Provides ``doc2bow`` and ``token2id`` for vectorising the query.
    topn : int
        Maximum number of documents to return.
    path : str
        Directory the documents were read from; used to look up the file
        name and text of each hit with ``get_document_by_index``.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``content``, ``distance``; indexed by document
        number and ordered by increasing distance. Ties keep the lower
        document number first, and NaN distances sort last.

    Raises
    ------
    TypeError
        If ``get_document_by_index`` returns ``None`` for a hit (the
        directory has fewer entries than ``dataframe`` has columns).
    """
    query = string_stemmer(string)
    query_vector = transform_tuple_into_vector(dictionary.doc2bow(query.split()), dictionary)

    n_documents = len(dataframe.columns)
    distances = pd.Series(
        [spatial.distance.cosine(query_vector, dataframe[j]) for j in range(n_documents)],
        index=range(n_documents),
        dtype=float,
    )
    # Sort once after all distances are known. mergesort is stable, so equal
    # distances are ranked by document number and the result is reproducible.
    nearest = distances.sort_values(kind='mergesort').head(topn)

    names, contents = [], []
    for doc_index in nearest.index:
        name, text = get_document_by_index(path, doc_index)
        names.append(name)
        contents.append(text)

    return pd.DataFrame(
        {'name': names, 'content': contents, 'distance': nearest.values},
        index=nearest.index,
        columns=['name', 'content', 'distance'],
    )


def get_document_by_index(path, index):
    """Return ``(filename, content)`` of the ``index``-th entry of ``path``.

    Entries are counted from 0 in ``os.listdir(path)`` order, the same order
    ``MyDocument`` iterates in. The content is decoded as UTF-8 with
    undecodable bytes replaced by U+FFFD.

    Returns ``None`` when no entry has that position (for example a negative
    or too-large ``index``).
    """
    for position, fname in enumerate(os.listdir(path)):
        if position == index:
            # io.open decodes while reading and works on Python 2 and 3.
            with io.open(os.path.join(path, fname), encoding='utf-8', errors='replace') as content_file:
                return fname, content_file.read()
    return None



def string_stemmer(line):
    """Strip non-letters, drop English stop words and stem the rest.

    Processing steps, in order:

    1. every character other than ``a-z``, ``A-Z`` and the space is deleted
       (not replaced by a space, so ``"a-b"`` becomes ``"ab"``);
    2. the text is split on whitespace;
    3. words found in NLTK's English stop-word list are removed - the
       comparison is case-sensitive and happens before stemming;
    4. each remaining word is stemmed with the English Snowball stemmer.

    Returns the stemmed words joined by single spaces.

    NOTE(review): assuming NLTK's stop-word list is lower-case, capitalised
    stop words such as "The" survive step 3 - confirm whether the input
    should be lower-cased first.
    """
    # Imported here so that nltk is only required when stemming is used.
    import re
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer

    stemmer = SnowballStemmer("english")
    # Load the stop-word list once; evaluating stopwords.words() inside the
    # filter would reload it for every single token.
    english_stopwords = frozenset(stopwords.words('english'))

    words = re.sub(r'[^a-zA-Z ]', r'', line).split()
    return ' '.join(stemmer.stem(word) for word in words if word not in english_stopwords)






# Stop words whose ids get_dictionary() removes from the vocabulary.
# Kept as a list of unicode strings, in this exact order (127 entries).
# NOTE(review): the entries look like NLTK's English stop-word list - confirm
# before replacing this with stopwords.words('english').
stoplist = (
    u"i me my myself we our ours ourselves you your yours "
    u"yourself yourselves he him his himself she her hers herself "
    u"it its itself they them their theirs themselves what which "
    u"who whom this that these those am is are was were be "
    u"been being have has had having do does did doing a an "
    u"the and but if or because as until while of at by for "
    u"with about against between into through during before after above "
    u"below to from up down in out on off over under again "
    u"further then once here there when where why how all any "
    u"both each few more most other some such no nor not only "
    u"own same so than too very s t can will just don "
    u"should now"
).split()



In [2]:
# Build everything needed to look up the closest wiki documents for each question.

# setup
# Single place to point the notebook at the data checkout; every input below
# is resolved relative to it, so only this line is machine-specific.
DATA_DIR = '/Users/MK/GitHub/the_answer_is/data'
wiki_folder_path = os.path.join(DATA_DIR, 'wikipedia_stemmed_all_merged')
train_file_path = os.path.join(DATA_DIR, 'training_set.tsv')
# this folder contains one file which is stemmed.
dictionary_folder_path = os.path.join(DATA_DIR, 'temporary2')

# Vocabulary from the dictionary folder, then one dense column per wiki document.
dictionary = get_dictionary(dictionary_folder_path)
wiki_tuple = get_document_tuple(wiki_folder_path, dictionary)
df_wiki_vector = transform_tuples_into_dataframe(wiki_tuple, dictionary)
train = pd.read_table(train_file_path, sep='\t')



In [15]:
# Row of the training set to query.
i = 3
# .iloc replaces the removed .ix indexer: row i, second column.
# NOTE(review): column 1 is presumably the question text, and this assumes
# `train` still has its default 0..n-1 row index - confirm.
close_document = get_close_documents(train.iloc[i, 1], df_wiki_vector, dictionary, 5, wiki_folder_path)


In [16]:
# Display the nearest wiki documents found for the selected training question.
close_document

Unnamed: 0,name,content,distance
2006,distinct.txt_to_unicode_remove_stopwords_and_s...,two thing distinct two thing mathemat two thin...,0.613161
2415,epidemic.txt_to_unicode_remove_stopwords_and_s...,epidem greek epi upon demo peopl rapid spread ...,0.701917
3918,kingdoms.txt_to_unicode_remove_stopwords_and_s...,pandem greek pan demo peopl epidem infecti dis...,0.715735
5131,pandemic.txt_to_unicode_remove_stopwords_and_s...,pandem greek pan demo peopl epidem infecti dis...,0.715735
5132,pandemics.txt_to_unicode_remove_stopwords_and_...,pandem greek pan demo peopl epidem infecti dis...,0.715735
