In [46]:
import os
import numpy as np

def get_all_txt(path):
    """Collect the paths of all non-empty ``.txt`` files below ``path``.

    The directory tree is traversed with ``os.walk``; a file is kept when its
    name ends with ``.txt`` and its size on disk is greater than zero bytes.

    Args:
        path: Root directory of the traversal.

    Returns:
        A list of file paths, each built by joining the directory reported by
        ``os.walk`` with the file name.
    """
    found = []
    for root, _dirs, names in os.walk(path):
        for name in names:
            if not name.endswith(".txt"):
                continue
            full_path = os.path.join(root, name)
            # Zero-byte files are left out of the result.
            if os.path.getsize(full_path) > 0:
                found.append(full_path)
    return found
            

def article_reader(file_name):
    """Yield the words of a text file one at a time.

    Each line is processed independently: tabs are turned into spaces, every
    character that is neither alphanumeric nor a space is dropped (without
    inserting a separator, so ``foo-bar`` becomes ``foobar``), and the result
    is split on single spaces. Empty fragments are skipped.

    Args:
        file_name: Path of the file to read (opened in text mode).

    Yields:
        Non-empty word strings in the order they appear in the file.
    """
    with open(file_name, 'r') as file:
        for raw_line in file.readlines():
            spaced = raw_line.replace("\t", " ")
            # Keep only letters/digits and the space used as word separator.
            kept = [ch for ch in spaced if ch == ' ' or ch.isalnum()]
            yield from (token for token in "".join(kept).split(" ") if token)

                    
def compute_bag_of_words(list_of_files):
    """Build the set of distinct words found across the given files.

    Args:
        list_of_files: Iterable of file paths; each one is read with
            ``article_reader``.

    Returns:
        A ``set`` containing every word yielded for any of the files.
    """
    vocabulary = set()
    for path in list_of_files:
        # Add every word of this file; duplicates collapse in the set.
        vocabulary.update(article_reader(path))
    return vocabulary
    
    
def create_mappings(bag_of_words):
    """Assign a consecutive integer index to every word.

    Indices follow the iteration order of ``bag_of_words``.

    Args:
        bag_of_words: Iterable of words.

    Returns:
        A tuple ``(word_to_index, index_to_word)`` where ``index_to_word`` is
        the list of words in iteration order and ``word_to_index`` maps each
        word back to its position in that list.
    """
    index_to_word = list(bag_of_words)
    word_to_index = {term: position for position, term in enumerate(index_to_word)}
    return word_to_index, index_to_word


def compute_terms_matrix(list_of_files, word_to_index):
    """Build the document-by-term count matrix.

    Args:
        list_of_files: Sequence of file paths; row ``i`` of the result
            corresponds to ``list_of_files[i]``.
        word_to_index: Mapping from word to column index. Every word read
            from the files must be a key, otherwise ``KeyError`` is raised.

    Returns:
        A float array of shape ``(len(list_of_files), len(word_to_index))``
        where entry ``[i, j]`` is how many times the word with index ``j``
        was yielded by ``article_reader`` for file ``i``.
    """
    matrix_shape = (len(list_of_files), len(word_to_index))
    counts = np.zeros(shape=matrix_shape)
    for row, path in enumerate(list_of_files):
        for token in article_reader(path):
            column = word_to_index[token]
            counts[row, column] += 1
    return counts

    
def words_frequency(words, word_to_index):
    """Count how often each vocabulary word occurs in ``words``.

    Args:
        words: Iterable of word strings (for example the key words of a
            query).
        word_to_index: Mapping from word to its position in the result
            vector.

    Returns:
        A 1-D float array of length ``len(word_to_index)`` whose entry ``i``
        is the number of occurrences of the word mapped to index ``i``.
        Words that are not keys of ``word_to_index`` are ignored.
    """
    result = np.zeros(len(word_to_index))
    for word in words:
        # Out-of-vocabulary words have no slot in the vector; skipping them
        # lets a query contain unseen words instead of raising KeyError.
        if word in word_to_index:
            result[word_to_index[word]] += 1
    return result
    
    
def idf(term_matrix, word_to_index):
    """Compute an inverse-document-frequency weight for every term column.

    The weight of a term is ``N / df`` where ``N`` is the number of rows
    (documents) of ``term_matrix`` and ``df`` is the number of rows in which
    the term's column is non-zero. This is the plain ratio; no logarithm is
    applied.

    Args:
        term_matrix: 2-D array with one row per document and one column per
            term.
        word_to_index: Unused; kept so existing callers do not break.

    Returns:
        A 1-D float array with one weight per column. Terms that occur in no
        document get weight ``0.0``.
    """
    documents = term_matrix.shape[0]
    words = term_matrix.shape[1]
    document_frequency = np.count_nonzero(term_matrix, axis=0)
    # Divide only where the term actually occurs; the remaining slots keep the
    # 0.0 from ``out``. Otherwise an all-zero column would produce inf, which
    # turns into NaN as soon as it is multiplied with a zero count.
    return np.divide(
        documents,
        document_frequency,
        out=np.zeros(words),
        where=document_frequency > 0,
    )


def idf_matrix_format(matrix, idf_m):
    """Scale every column of ``matrix`` by the matching entry of ``idf_m``.

    Args:
        matrix: 2-D array with one column per term.
        idf_m: 1-D array with one weight per term.

    Returns:
        A new array, the element-wise product of ``matrix`` and ``idf_m``
        broadcast along the rows.
    """
    # Add a leading axis so the weights broadcast across all rows.
    column_weights = idf_m[None, :]
    return matrix * column_weights


def calculate_probability(key_words, term_matrix, word_to_index, idf_m):
    """Score every document against a query using cosine similarity.

    The query vector is the IDF-weighted word count of ``key_words``; each
    score is the dot product of that vector with a row of ``term_matrix``
    divided by the product of their Euclidean norms.

    Args:
        key_words: Iterable of query words.
        term_matrix: 2-D array with one row per document and one column per
            term.
        word_to_index: Mapping from word to column index.
        idf_m: 1-D array of per-term weights.

    Returns:
        A 1-D float array with one score per document. A score is ``0.0``
        when the query vector or the document row has zero norm.
    """
    q_vec = idf_m * words_frequency(key_words, word_to_index)
    q_norm = np.linalg.norm(q_vec)
    d_norms = np.linalg.norm(term_matrix, axis=1)
    dot_products = q_vec @ term_matrix.T
    denominators = q_norm * d_norms
    # Divide only where the denominator is positive so that an empty query or
    # an all-zero document row scores 0.0 instead of NaN.
    return np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products, dtype=float),
        where=denominators > 0,
    )

def calculate_probability_normed(key_words, term_matrix, word_to_index, idf_m):
    """Score every document against a query using pre-normalised vectors.

    The IDF-weighted query vector and every row of ``term_matrix`` are scaled
    to unit Euclidean length, then the scores are their dot products.

    Args:
        key_words: Iterable of query words.
        term_matrix: 2-D array with one row per document and one column per
            term.
        word_to_index: Mapping from word to column index.
        idf_m: 1-D array of per-term weights.

    Returns:
        A 1-D float array with one score per document. A score is ``0.0``
        when the query vector or the document row has zero norm.
    """
    q_vec = idf_m * words_frequency(key_words, word_to_index)
    q_norm = np.linalg.norm(q_vec)

    # A zero-norm query is already the zero vector; dividing it would only
    # produce NaN, so it is left as is.
    if q_norm > 0:
        q_vec = q_vec / q_norm

    d_norms = np.linalg.norm(term_matrix, axis=1)
    # All-zero rows stay all-zero when divided by 1 instead of by 0.
    safe_norms = d_norms.copy()
    safe_norms[safe_norms == 0] = 1.0
    term_normed = term_matrix / safe_norms[:, np.newaxis]
    return q_vec @ term_normed.T

In [47]:
# Gather every non-empty .txt file below the "wikiarticles" directory and
# build the vocabulary (set of distinct words) over all of them.
list_of_files = get_all_txt("wikiarticles")
bag_of_words = compute_bag_of_words(list_of_files)

# word_map: word -> column index; index_map: column index -> word.
word_map, index_map = create_mappings(bag_of_words)

# Raw counts: one row per file (same order as list_of_files), one column per word.
term_matrix = compute_terms_matrix(list_of_files, word_map)

# Per-word weight: number of documents divided by number of documents containing the word.
idf_m = idf(term_matrix, word_map)

# Scale each column of the count matrix by its word weight.
term_matrix_formatted = idf_matrix_format(term_matrix, idf_m)

# Score every document against the query words; one score per file.
probs = calculate_probability_normed(["island", "ocean"], term_matrix_formatted, word_map, idf_m)

# Print the path of the file with the highest score.
a = np.argmax(probs)
print(list_of_files[a])

wikiarticles/USS_Geronimo_(ATA-207).txt
