In [1]:
import platform
from os import listdir
from os.path import isfile, join, basename
import re

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

In [2]:
# Relative path of the directory whose entries stub_data_paragraphs() lists and reads.
STUB_DATA_PATH = 'data/stub_data'

In [3]:
# Guard: only macOS ('Darwin') and Linux are accepted. On any other platform,
# stop immediately with an explicit, catchable error.
if platform.system() not in ('Darwin', 'Linux'):
    raise RuntimeError("I wasn't designed to run on this operating system.")

def simple_clean(doc):
    """Strip every character that is not an ASCII letter, whitespace or a period.

    Args:
        doc: The text to clean.

    Returns:
        ``doc`` with digits, punctuation other than ``.``, and non-ASCII
        letters removed. Whitespace (including newlines and tabs) is kept.
    """
    # Raw string so that ``\s`` reaches the regex engine as written instead of
    # being treated as an (invalid) string-literal escape sequence.
    return re.sub(r'[^a-zA-Z\s.]', '', doc)
    
def stub_data_paragraphs():
    """Read and clean every regular file found directly in ``STUB_DATA_PATH``.

    Returns:
        dict mapping each file name (not the full path) to the file's text
        after it has been passed through ``simple_clean``. Entries are
        inserted in sorted file-name order.

    Raises:
        OSError: if ``STUB_DATA_PATH`` cannot be listed or a file cannot be read.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """
    stub_paras = {}
    # listdir() returns entries in arbitrary order; sorting keeps the order of
    # the resulting dict the same from one run or machine to the next.
    for filename in sorted(listdir(STUB_DATA_PATH)):
        path = join(STUB_DATA_PATH, filename)
        # Skip sub-directories and other non-regular entries, which open()
        # cannot read as text.
        if not isfile(path):
            continue
        # Explicit encoding so the result does not depend on the locale.
        with open(path, 'r', encoding='utf-8') as handle:
            stub_paras[filename] = simple_clean(handle.read())
    return stub_paras


In [4]:
# Load the cleaned stub documents, keyed by file name.
paras = stub_data_paragraphs()

# Two parallel lists: articles[i] is the text stored under names[i].
articles = list(paras.values())
names = list(paras)

# Fit a bag-of-words vocabulary on the articles and build the count matrix
# (one row per article, in the same order as `names`).
vectorizer = CountVectorizer()
documents_by_terms = vectorizer.fit_transform(articles)

In [5]:
def vocabulary_as_strings(vectorizer):
    """Return the keys of the vectorizer's ``vocabulary_`` mapping.

    Args:
        vectorizer: An object exposing a ``vocabulary_`` mapping.

    Returns:
        The keys view of ``vectorizer.vocabulary_``.
    """
    vocabulary = vectorizer.vocabulary_
    return vocabulary.keys()

In [6]:
def unidimensionalize(two_dim_vect):
    """Return the element at index 0 of ``two_dim_vect`` as a new NumPy array.

    Args:
        two_dim_vect: An indexable container; only its first element is used.

    Returns:
        ``numpy.ndarray`` built from ``two_dim_vect[0]``.
    """
    first_row = two_dim_vect[0]
    return np.array(first_row)

In [7]:
def predict(vectorizer, documents_by_terms, names, query):
    """Order document names by their cosine similarity to ``query``.

    Args:
        vectorizer: Object whose ``transform`` turns a list of strings into
            vectors comparable with the rows of ``documents_by_terms``
            (presumably the vectorizer that produced that matrix; verify
            against the caller).
        documents_by_terms: Matrix with one row per document.
        names: Sequence indexed by row position of ``documents_by_terms``.
        query: Free-text query; it is passed through ``simple_clean`` first.

    Returns:
        list of entries from ``names`` in ascending order of similarity, so
        the best match is the LAST element.
    """
    cleaned_query = simple_clean(query)
    query_vector = vectorizer.transform([cleaned_query])

    # One row per document, one column for the single query.
    similarity_matrix = cosine_similarity(documents_by_terms, query_vector)
    scores = similarity_matrix[:, 0]

    # argsort is ascending: lowest-similarity documents come first.
    ranked_names = []
    for position in np.argsort(scores):
        ranked_names.append(names[position])
    return ranked_names


In [8]:
predict(vectorizer, documents_by_terms, names, 'philosopher')

['Muhammad',
 'Muhammad_Ali',
 'Lord_Byron',
 'Henry_VIII_of_England',
 'Andrew_Ng',
 'Richard_Nixon',
 'Ludwig_Wittgenstein',
 'Han_Fei']