In [None]:
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.externals import joblib
import numpy as np

# Relative paths of the serialized TF-KLD model and PCA model that are loaded below.
tfkld_location = "tfkld.pkl"
pca_location = "pca_tfkld.pkl"

### Load Models

In [None]:
# Load the fitted TF-KLD model. The PCA model loaded below is applied to the
# vectors it outputs (presumably it was trained on them -- confirm against
# the notebook that fitted it).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code, so only load model files from a trusted source.
tfkld_model = joblib.load(tfkld_location)

# Load the PCA model used to reduce the TF-KLD vectors
pca_model = joblib.load(pca_location)

### Utility functions for sentence similarity using TF-KLD vectors

In [None]:
def mean_sentences_vector(sentences):
    """Return the PCA-reduced mean of the TF-KLD vectors of ``sentences``.

    Every sentence is vectorised with ``tfkld_model``, the resulting rows are
    averaged element-wise and the averaged vector is projected with
    ``pca_model``.

    Args:
        sentences: non-empty sequence of sentence strings.

    Returns:
        list: the reduced mean vector as a flat list of floats.

    Raises:
        ValueError: if ``sentences`` is empty (there is nothing to average).
    """
    if len(sentences) == 0:
        raise ValueError("mean_sentences_vector() requires at least one sentence")

    # Each row of the transform output is densified on its own, exactly as the
    # rest of this notebook accesses it (``row.toarray()[0]``).
    dense_rows = [row.toarray()[0] for row in tfkld_model.transform(sentences)]

    # The mean over a single row is that row, so one code path covers both the
    # single-sentence and the multi-sentence case.
    mean_vector = np.mean(np.array(dense_rows), axis=0, dtype=np.float64).tolist()

    return pca_model.transform([mean_vector])[0].tolist()


In [None]:
def vectorize_sentences(sentences):
    """Return one PCA-reduced TF-KLD vector per sentence.

    Args:
        sentences: sequence of sentence strings.

    Returns:
        list: one flat list of floats per row produced by
        ``tfkld_model.transform(sentences)``, in the same order.
    """
    # Each row is densified and pushed through the PCA model individually.
    return [
        pca_model.transform([row.toarray()[0].tolist()])[0].tolist()
        for row in tfkld_model.transform(sentences)
    ]

In [None]:
def increase_sentence_vector(sentences, sentence_dict=None):
    """Add the PCA-reduced TF-KLD vector of each sentence to ``sentence_dict``.

    Args:
        sentences: sequence of sentence strings.
        sentence_dict: optional dict to extend in place. When omitted, a new
            dict is created for this call.

    Returns:
        dict: ``sentence_dict`` (the same object when one was passed in),
        mapping each sentence to the value returned by ``pca_model.transform``
        for that single sentence. The value is stored as returned, i.e. it is
        not flattened with ``[0]``.
    """
    # A ``{}`` default is created once at definition time and would be shared
    # (and keep growing) across every call that omits the argument.
    if sentence_dict is None:
        sentence_dict = {}

    for sentence, sent_vec in zip(sentences, tfkld_model.transform(sentences)):
        sentence_dict[sentence] = pca_model.transform([sent_vec.toarray()[0].tolist()])

    return sentence_dict

In [None]:
# Split every document into sentences and build a {sentence: vector} dict
def vectorize_document_list(documents, sentence_dict=None):
    """Vectorise every sentence of every document.

    Each document is split with ``sent_tokenize`` and its sentences are added
    to the dictionary through ``increase_sentence_vector``.

    Args:
        documents: iterable of document texts.
        sentence_dict: optional dict to extend. When omitted, a new dict is
            created for this call.

    Returns:
        dict: mapping of sentence -> vector covering all ``documents`` (plus
        whatever ``sentence_dict`` already contained).
    """
    # A ``{}`` default is created once at definition time and would be shared
    # across calls, so repeated calls would silently accumulate sentences.
    if sentence_dict is None:
        sentence_dict = {}

    for doc in documents:
        sentence_dict = increase_sentence_vector(sent_tokenize(doc), sentence_dict)

    return sentence_dict

In [None]:
# Get the sentences whose cosine similarity is closest to the passed sentence
def get_most_similar_sentences(sentence_dict, sentence, tnum=5):
    """Return the ``tnum`` sentences most similar to ``sentence``.

    ``sentence`` is vectorised with ``vectorize_sentences`` and compared, by
    cosine similarity, against every vector stored in ``sentence_dict``.

    Args:
        sentence_dict: mapping of sentence -> vector (for example the result
            of ``vectorize_document_list``).
        sentence: the query sentence string.
        tnum: maximum number of results to return.

    Returns:
        list: ``(sentence, similarity)`` tuples ordered from most to least
        similar, at most ``tnum`` long.
    """
    # The query vector does not change inside the loop, so build it once.
    query = np.array(vectorize_sentences([sentence])[0]).reshape(1, -1)

    scored = []
    # ``items()`` works on both Python 2 and Python 3 (``iteritems`` does not).
    for sent, vector in sentence_dict.items():
        similarity = cosine_similarity(np.array(vector).reshape(1, -1), query).tolist()[0][0]
        scored.append((sent, similarity))

    # Rank every candidate and slice. The previous running-minimum bookkeeping
    # started from 0 rather than the real minimum and could drop better
    # matches or duplicate an entry on ties.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # max() keeps a negative tnum from turning into a "drop the last k" slice.
    return scored[:max(tnum, 0)]
    
    

In [None]:
def get_most_similar_sentences_to_vector(sentence_dict, mean_vector, tnum=5):
    """Return the ``tnum`` sentences whose vectors are most similar to ``mean_vector``.

    Args:
        sentence_dict: mapping of sentence -> vector (for example the result
            of ``vectorize_document_list``).
        mean_vector: the query vector (for example the result of
            ``mean_sentences_vector``); it is reshaped to a single row.
        tnum: maximum number of results to return.

    Returns:
        list: ``(sentence, similarity)`` tuples ordered from most to least
        similar (cosine similarity), at most ``tnum`` long.
    """
    # The query vector does not change inside the loop, so build it once.
    query = np.array(mean_vector).reshape(1, -1)

    scored = []
    # ``items()`` works on both Python 2 and Python 3 (``iteritems`` does not).
    for sent, vector in sentence_dict.items():
        similarity = cosine_similarity(np.array(vector).reshape(1, -1), query).tolist()[0][0]
        scored.append((sent, similarity))

    # Rank every candidate and slice. The previous running-minimum bookkeeping
    # started from 0 rather than the real minimum and could drop better
    # matches or duplicate an entry on ties.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # max() keeps a negative tnum from turning into a "drop the last k" slice.
    return scored[:max(tnum, 0)]

In [None]:
def get_distances_between_sentences(sentences):
    """Compute the cosine similarity of every unordered pair of sentences.

    Args:
        sentences: sequence of sentence strings.

    Returns:
        list: one dict per pair ``(i, j)`` with ``i < j``, holding the keys
        ``"sent1"``, ``"sent2"`` and ``"distance"``. Note that ``"distance"``
        holds the cosine *similarity* of the two PCA-reduced TF-KLD vectors
        (higher means more alike), not a distance.
    """
    # Fewer than two sentences means no pairs; skip the models entirely.
    if len(sentences) < 2:
        return []

    # Vectorise each sentence once up front instead of re-running both models
    # for both members of every pair.
    reduced = [
        pca_model.transform(
            [tfkld_model.transform([sentence])[0].toarray()[0].tolist()]
        )
        for sentence in sentences
    ]

    sentences_distances = []
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            similarity = cosine_similarity(reduced[i], reduced[j])
            sentences_distances.append({
                "sent1": sentences[i],
                "sent2": sentences[j],
                # Index the single entry explicitly: calling float() on a
                # non-0-d array is deprecated in recent NumPy releases.
                "distance": float(similarity[0][0]),
            })

    return sentences_distances

### Test Utility Functions

In [None]:
# Read the two sample documents used to exercise the helpers above.
doc_list = []

# ``with`` closes each file handle as soon as it has been read; the original
# handles were left open for the lifetime of the kernel.
with open('test_documents/crypto_currency.txt','r') as file_1:
    doc_list.append(file_1.read())

with open('test_documents/trump_401k.txt','r') as file_2:
    doc_list.append(file_2.read())

In [None]:
# Pass a fresh dict explicitly so that re-running this cell rebuilds the
# sentence vectors from scratch instead of accumulating into a dictionary
# that is kept alive between calls.
sent_dict = vectorize_document_list(doc_list, {})

vectorize_sentences(["Hello what is your name?", "I like cheese.", "what do you think of me?"])

In [None]:
# Look up the 10 stored sentences most similar to the query text and show the
# best match as a (sentence, similarity) tuple.
sentences = get_most_similar_sentences(
    sent_dict,
    "“So he just may not realize that he’s speaking to the privileged few.” Only a third of people contribute anything to their retirement accounts, according to a Census study released this year.",
    10
)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(sentences[0])

In [None]:
# Display the PCA-reduced mean vector computed for a one-element input list.
mean_sentences_vector(
        ["“So he just may not realize that he’s speaking to the privileged few.” Only a third of people contribute anything to their retirement accounts, according to a Census study released this year."]
    )

In [None]:
# Same lookup as above, but starting from a precomputed mean vector rather
# than a raw sentence; uses the default number of results.
new_sentences = get_most_similar_sentences_to_vector(
    sent_dict,
    mean_sentences_vector(
        ["“So he just may not realize that he’s speaking to the privileged few.” Only a third of people contribute anything to their retirement accounts, according to a Census study released this year."]
    )
)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(new_sentences[0])

In [None]:
# Pairwise comparison of two short sentences.
sentences = get_distances_between_sentences(["hello world.", "I like applesauce."])
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(sentences)