In [1]:
import os
import numpy as np

def get_all_txt(path):
    list_of_files = []
    for r, _, files in os.walk(path):
        for f in files:
            if f.endswith(".txt") and os.path.getsize(os.path.join(r,f)) > 0: 
                list_of_files.append(os.path.join(r,f))
    return list_of_files
            

def article_reader(file_name):
    with open(file_name, 'r') as file:
        for line in file.readlines():
            for word in line.split(" "):
                if word:
                    yield word


class SearchEngine:
    def __init__(self, filenames):
        self.word_map = None
        self.index_map = None
        self.bag_of_words = None
        self.terms_matrix = None
        self.idf_m = None
        
        self.filenames = filenames
        self.files_count = len(filenames)
        
        self.compute_bag_of_words()
        self.create_mappings()
        self.words_count = len(self.index_map)
    
    
    def compute_bag_of_words(self):
        self.bag_of_words = set()
        for file in self.filenames:
            for word in article_reader(file):
                self.bag_of_words.add(word)


    def create_mappings(self):
        self.word_map = dict()
        self.index_map = []
        for (i, word) in enumerate(self.bag_of_words):
            self.index_map.append(word)
            self.word_map[word] = i


    def compute_terms_matrix(self):
        self.terms_matrix = np.zeros(shape=(self.files_count, self.words_count))
        for i, file_name in enumerate(self.filenames):
            for word in article_reader(file_name):
                self.terms_matrix[i, self.word_map[word]] += 1


    def words_frequency(self, words):
        result = np.zeros(self.words_count)
        for word in words:
            result[self.word_map[word]] += 1
        return result


    def idf(self):
        self.idf_m = np.repeat(self.files_count, self.words_count) / np.count_nonzero(self.terms_matrix, axis = 0)


    def idf_matrix_format(self):
        self.terms_matrix = self.terms_matrix * self.idf_m[np.newaxis,:]


    def calculate_probability(self, key_words):
        q_vec = self.idf_m * self.words_frequency(key_words)
        q_norm = np.linalg.norm(q_vec)
        d_norms = np.linalg.norm(self.terms_matrix, axis=1)
        return q_vec @ self.terms_matrix.T / (q_norm * d_norms)


    def calculate_probability_normed(self, key_words):
        q_vec = self.idf_m * self.words_frequency(key_words)
        q_norm = np.linalg.norm(q_vec)

        q_vec = q_vec / q_norm

        d_norms = np.linalg.norm(self.terms_matrix, axis=1)
        term_normed = self.terms_matrix / d_norms[:,np.newaxis]
        return q_vec @ term_normed.T


    def find_n_articles(self, key_words, n):
        if not self.terms_matrix:
            self.compute_terms_matrix()
        if not self.idf_m:
            self.idf()
            self.idf_matrix_format()
            
        probs = self.calculate_probability_normed(key_words)
            
        return list(self.filenames[i] for i in np.argsort(probs)[::-1][:n])

In [4]:
engine = SearchEngine(get_all_txt("wikiarticles_2"))

print(engine.find_n_articles(["research"], 5))


['wikiarticles_2/Godwin_Laboratory,_University_of_Cambridge.txt', 'wikiarticles_2/Pawel_Tabakow.txt', 'wikiarticles_2/Fair_dealing_in_United_Kingdom_law.txt', 'wikiarticles_2/G._Michael_Purdy.txt', 'wikiarticles_2/Danny_Welch.txt']


In [3]:
print(engine.words_count)

55129
