In [None]:
import codecs
import heapq

import numpy as np
from nltk.tokenize import sent_tokenize
from sklearn.externals import joblib
from sklearn.metrics.pairwise import cosine_similarity

### Load TFIDF Model

In [None]:
# Deserialize the TF-IDF model that the helper functions in this notebook use
# through its transform() method.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load "tfidf.pkl" if it comes from a trusted source.
tfidf_model = joblib.load("tfidf.pkl")

### Utility Functions

In [None]:
def mean_sentences_vector_tfidf(sentences):
    """Return one TF-IDF vector that represents all of ``sentences``.

    The sentences are vectorized with the module-level ``tfidf_model``. For a
    single sentence its row of the transformed matrix is returned as-is; for
    several sentences the rows are averaged column-wise.

    Args:
        sentences: Non-empty sequence of sentence strings.

    Returns:
        A single-row vector: the first row of ``tfidf_model.transform(...)``
        for one sentence, or a dense float64 array of shape ``(1, n_features)``
        holding the column-wise mean for several sentences.

    Raises:
        ValueError: If ``sentences`` is empty (there is nothing to average).
    """
    if len(sentences) == 0:
        raise ValueError("sentences must contain at least one sentence")

    vec_list = tfidf_model.transform(sentences)

    if len(sentences) > 1:
        # Average with the matrix's own mean(): wrapping a sparse matrix in
        # np.array() yields a 0-d object array, which cannot be reduced along
        # axis 0. np.asarray + reshape turns the result (an np.matrix when the
        # input is sparse -- presumably the case for a TF-IDF model, confirm)
        # into a plain 2-D row.
        column_means = vec_list.mean(axis=0, dtype=np.float64)
        mean_vector = np.asarray(column_means).reshape(1, -1)
    else:
        mean_vector = vec_list[0]

    return mean_vector

In [None]:
def increase_sentence_vector_tfidf(sentences, sentence_dict=None):
    """Add the TF-IDF vector of every sentence to ``sentence_dict``.

    Args:
        sentences: Indexable sequence of sentence strings to vectorize with
            the module-level ``tfidf_model``.
        sentence_dict: Optional dict to update in place, mapping sentence
            text to its vector. A new dict is created when omitted.

    Returns:
        The dict that was updated (``sentence_dict`` itself when one was
        passed in). A sentence that is already a key is overwritten.
    """
    # A ``{}`` default would be created once and shared by every call, so
    # results from unrelated calls would leak into each other.
    if sentence_dict is None:
        sentence_dict = {}

    # Nothing to vectorize; skip calling the model with an empty batch.
    if len(sentences) == 0:
        return sentence_dict

    # Iterating the transformed matrix yields one row per input sentence.
    for sentence, sent_vec in zip(sentences, tfidf_model.transform(sentences)):
        sentence_dict[sentence] = sent_vec

    return sentence_dict

In [None]:
def vectorize_document_list_tfidf(documents, sentence_dict=None):
    """Split each document into sentences and collect their TF-IDF vectors.

    Args:
        documents: Iterable of document strings; each is split with
            ``sent_tokenize``.
        sentence_dict: Optional dict to update in place, mapping sentence
            text to its vector. A new dict is created when omitted.

    Returns:
        The dict that was updated (``sentence_dict`` itself when one was
        passed in).
    """
    # A ``{}`` default would be created once and shared by every call, so
    # sentences from earlier runs would silently stay in later results.
    if sentence_dict is None:
        sentence_dict = {}

    for doc in documents:
        # The helper fills the dict that is passed to it explicitly.
        increase_sentence_vector_tfidf(sent_tokenize(doc), sentence_dict)

    return sentence_dict

In [None]:
def get_most_similar_sentences_tfidf(sentence_dict, sentences, tnum=5):
    """Return the ``tnum`` stored sentences most similar to ``sentences``.

    The query ``sentences`` are reduced to one vector with
    ``mean_sentences_vector_tfidf`` and every vector in ``sentence_dict`` is
    scored against it with ``cosine_similarity``.

    Args:
        sentence_dict: Dict mapping sentence text to its vector.
        sentences: Query sentences to compare against.
        tnum: Maximum number of results to return.

    Returns:
        A list of at most ``tnum`` ``(sentence, similarity)`` tuples ordered
        from most to least similar. ``similarity`` is the value returned by
        ``cosine_similarity`` for that pair, passed through unchanged.
    """
    mean_vector = mean_sentences_vector_tfidf(sentences)

    # items() exists on both Python 2 and 3 dicts; iteritems() is Python 2 only.
    scored = (
        (sent, cosine_similarity(vector, mean_vector))
        for sent, vector in sentence_dict.items()
    )

    # nlargest keeps exactly the best ``tnum`` entries and returns them sorted
    # best-first. A hand-maintained running minimum is easy to get wrong:
    # candidates can be dropped, or one candidate can overwrite several slots.
    # assumes each score is a single-element array (one row vs. one row), so
    # .item() yields a plain number to rank by -- TODO confirm.
    return heapq.nlargest(tnum, scored, key=lambda pair: pair[1].item())

In [None]:
# Read the test documents as UTF-8 text. The ``with`` blocks close each file
# as soon as it has been read, so no handle is left open in the kernel.
doc_list = []

with codecs.open('test_documents/crypto_currency.txt', encoding='utf-8', mode='r') as file_1:
    doc_list.append(file_1.read())

with codecs.open('test_documents/trump_401k.txt', encoding='utf-8', mode='r') as file_2:
    doc_list.append(file_2.read())

In [None]:
# Pass a fresh dict explicitly so that re-running this cell (for example after
# changing doc_list) starts from an empty index instead of adding to whatever
# an earlier run accumulated.
sent_dict = vectorize_document_list_tfidf(doc_list, {})

In [None]:
# Query passage to look up; it is passed as a one-element list.
query_sentences = [
    "“So he just may not realize that he’s speaking to the privileged few.” Only a third of people contribute anything to their retirement accounts, according to a Census study released this year."
]

# Ask for the ten closest sentences from the indexed documents.
sentences = get_most_similar_sentences_tfidf(
    sentence_dict=sent_dict,
    sentences=query_sentences,
    tnum=10
)

In [None]:
# Single-argument print(...) produces the same output on Python 2 and is also
# valid syntax on Python 3, where the bare print statement is a SyntaxError.
print(sentences)
print(len(sentences))