In [None]:
import heapq
import itertools
import os

import numpy as np
import scipy.sparse.linalg
from nltk.stem.porter import PorterStemmer
from scipy import sparse

# Ranking modes accepted by SearchEngine.find_n_articles(mode=...):
#   NORMED - score against the normalised terms matrix directly,
#   SVD    - score against the factors produced by SearchEngine.compute_svd().
NORMED, SVD = 0, 1

def get_all_txt(path):
    """Collect the paths of all non-empty ``.txt`` files below ``path``.

    The directory tree rooted at ``path`` is walked recursively with
    ``os.walk``. A file is kept when its name ends with ``".txt"`` and its
    size on disk is greater than zero bytes.

    Args:
        path: Root directory to search.

    Returns:
        A list of file paths (each one ``os.path.join(directory, name)``),
        sorted lexicographically. A missing or empty directory yields ``[]``.
    """
    txt_files = []
    for root, _, files in os.walk(path):
        for name in files:
            if not name.endswith(".txt"):
                continue
            full_path = os.path.join(root, name)
            if os.path.getsize(full_path) > 0:
                txt_files.append(full_path)

    # os.walk yields entries in whatever order the filesystem reports them, so
    # sort to make the result (and any slice a caller takes of it) reproducible.
    txt_files.sort()
    return txt_files

    
def article_reader(file_name):
    """Lazily yield the whitespace-separated words of a text file.

    Words are split on any run of whitespace (spaces, tabs, newlines), so the
    yielded tokens never carry a trailing newline and blank lines produce no
    tokens at all.

    Args:
        file_name: Path of the text file to read.

    Yields:
        Each non-empty word of the file, in reading order.
    """
    with open(file_name, 'r') as file:
        # Iterate the file object directly so only one line is held in memory.
        for line in file:
            # split() without an argument drops empty strings and strips the
            # line terminator; split(" ") would yield tokens such as "word\n".
            yield from line.split()

                    
def word_trim(word):
    """Return the Porter stem of ``word``.

    A single ``PorterStemmer`` is created on first use and cached on the
    function object, so the stemmer is not rebuilt for every token.

    Args:
        word: The token to stem.

    Returns:
        Whatever ``PorterStemmer.stem(word)`` returns for the token.
    """
    stemmer = getattr(word_trim, "_stemmer", None)
    if stemmer is None:
        stemmer = word_trim._stemmer = PorterStemmer()
    return stemmer.stem(word)
    
                    

class SearchEngine:
    """Vector-space search engine over a collection of text files.

    Every file becomes one column of a sparse terms-by-documents matrix whose
    rows are stemmed words. ``compute_terms_matrix`` fills the matrix with
    word counts, weights each row by its inverse document frequency and
    scales every column to unit Euclidean length. ``compute_svd`` optionally
    builds a rank-``current_k`` approximation of that matrix. Queries are
    scored by the dot product between the normalised query vector and the
    document columns (i.e. a cosine similarity).

    Typical use::

        engine = SearchEngine(list_of_paths)
        engine.compute_terms_matrix()
        engine.compute_svd()            # only needed for mode=SVD
        engine.find_n_articles(["some", "words"], 5, mode=SVD)
    """

    def __init__(self, filenames, k=None):
        """Read all files once to build the vocabulary.

        Args:
            filenames: Sequence of paths of the documents to index.
            k: Number of singular values kept by ``compute_svd``. Defaults to
                one third of the number of files (but at least 1).
        """
        self.word_map = None      # stem -> row index in the terms matrix
        self.index_map = None     # row index -> stem
        self.bag_of_words = None  # set of all stems seen in the files
        self.terms_matrix = None  # CSR matrix, words x files
        self.idf_m = None         # per-word inverse document frequency

        self.u = None             # left singular vectors from compute_svd
        self.dvt = None           # diag(singular values) @ V^T, column-scaled

        self.filenames = filenames
        self.files_count = len(filenames)

        self.compute_bag_of_words()
        self.create_mappings()
        self.words_count = len(self.index_map)

        if k is None:
            # svds needs k >= 1, so never default to 0 for tiny collections.
            self.current_k = max(1, self.files_count // 3)
        else:
            self.current_k = k

    def compute_bag_of_words(self):
        """Collect the set of stems occurring in any of the files."""
        self.bag_of_words = set()
        for file in self.filenames:
            for word in article_reader(file):
                self.bag_of_words.add(word_trim(word))

    def create_mappings(self):
        """Assign a row index to every stem and print the vocabulary size.

        The vocabulary is sorted first so that the word -> index assignment
        does not depend on set iteration order and is the same on every run.
        """
        self.index_map = sorted(self.bag_of_words)
        self.word_map = {word: i for i, word in enumerate(self.index_map)}
        print(f"Words quantity: {len(self.index_map)}")
        print("__________________________")

    def compute_terms_matrix(self):
        """Build the IDF-weighted, column-normalised terms matrix.

        Populates ``self.terms_matrix`` (CSR, ``words_count x files_count``,
        float32) and ``self.idf_m``.
        """
        print("Terms matrix computing...")
        rows, cols, counts = [], [], []
        for col, file_name in enumerate(self.filenames):
            # Count per file in a plain dict; the sparse matrix is assembled
            # once at the end instead of being updated cell by cell.
            per_file = {}
            for word in article_reader(file_name):
                row = self.word_map[word_trim(word)]
                per_file[row] = per_file.get(row, 0) + 1
            rows.extend(per_file.keys())
            counts.extend(per_file.values())
            cols.extend([col] * len(per_file))

        self.terms_matrix = sparse.csr_matrix(
            (
                np.asarray(counts, dtype=np.float32),
                (np.asarray(rows, dtype=np.intp), np.asarray(cols, dtype=np.intp)),
            ),
            shape=(self.words_count, self.files_count),
            dtype=np.float32,
        )
        print("Done.\n__________________________")

        print("IDF formatting...")
        self.idf()
        self.idf_matrix_format()
        print("Done.\n__________________________")

        print("Terms matrix normalizing...")
        d_norms = np.asarray(sparse.linalg.norm(self.terms_matrix, axis=0)).ravel()
        # Columns whose norm is 0 contain no non-zero entry; scaling them by 0
        # leaves them untouched and avoids dividing by zero.
        scale = np.zeros(self.files_count, dtype=self.terms_matrix.dtype)
        has_mass = d_norms > 0
        scale[has_mass] = 1.0 / d_norms[has_mass]
        # Right-multiplying by a diagonal matrix scales every column at once.
        self.terms_matrix = sparse.csr_matrix(
            self.terms_matrix.dot(sparse.diags(scale))
        )
        print("Done.\n__________________________")

    def compute_svd(self):
        """Compute the truncated SVD used by the ``SVD`` ranking mode.

        Stores ``self.u`` (left singular vectors) and ``self.dvt``
        (``diag(d) @ V^T``), with the columns of ``self.dvt`` rescaled so that
        each column of ``self.u @ self.dvt`` has unit Euclidean length.
        """
        print("Computing SVD decomposition...")
        self.u, d, vt = sparse.linalg.svds(self.terms_matrix, k=self.current_k)
        print("Done.\n__________________________")

        print("Computing D @ V.T...")
        # Scaling row i of V^T by d[i] is exactly diag(d) @ V^T.
        self.dvt = d[:, np.newaxis] * vt
        print("Done.\n__________________________")

        print("Preparing D @ V.T to give normalized Ak matrix...")
        # The columns of u are orthonormal, so ||u @ x|| == ||x||: the norm of
        # each column of A_k = u @ dvt can be read off dvt directly instead of
        # multiplying by u once per document.
        col_norms = np.linalg.norm(self.dvt, axis=0)
        col_norms[col_norms == 0] = 1  # leave all-zero columns as zeros
        self.dvt /= col_norms
        print("Done.\n__________________________")

    def words_frequency(self, words):
        """Count the known stems of ``words`` in a ``1 x words_count`` CSR row.

        Words whose stem is not in the vocabulary are ignored.
        """
        result = sparse.lil_matrix((1, self.words_count), dtype=np.float32)
        for word in words:
            trimmed = word_trim(word)
            if trimmed in self.word_map:
                result[0, self.word_map[trimmed]] += 1
        return result.tocsr()

    def idf(self):
        """Compute ``self.idf_m[w] = log(files_count / document_frequency(w))``.

        The document frequency of a word is the number of non-zero entries in
        its row of ``self.terms_matrix``. Words with document frequency 0 get
        an IDF of 0 rather than an infinite value.
        """
        doc_freq = np.bincount(
            self.terms_matrix.nonzero()[0], minlength=self.words_count
        )
        self.idf_m = np.zeros(self.words_count, dtype=np.float32)
        seen = doc_freq > 0
        self.idf_m[seen] = np.log(self.files_count / doc_freq[seen])

    def idf_matrix_format(self):
        """Multiply every row of ``self.terms_matrix`` by that word's IDF."""
        # Left-multiplying by a diagonal matrix scales every row at once.
        self.terms_matrix = sparse.csr_matrix(
            sparse.diags(self.idf_m).dot(self.terms_matrix)
        )

    def _normalized_query(self, key_words):
        """Return the unit-length IDF-weighted query row, or None if it is all zero."""
        weighted = sparse.csr_matrix(
            self.words_frequency(key_words).multiply(self.idf_m)
        )
        q_norm = sparse.linalg.norm(weighted)
        if q_norm == 0:
            return None
        return weighted / q_norm

    def calculate_probability_normed(self, key_words):
        """Score every document against the query using the terms matrix.

        Returns:
            A sparse ``1 x files_count`` row of similarity scores. It has no
            non-zero entries when none of ``key_words`` carries any weight.
        """
        q_vec = self._normalized_query(key_words)
        if q_vec is None:
            return sparse.csr_matrix((1, self.files_count), dtype=np.float32)
        return q_vec.dot(self.terms_matrix)

    def calculate_probability_svd(self, key_words):
        """Score every document against the query using the SVD factors.

        Returns:
            A dense ``1 x files_count`` array of similarity scores. It is all
            zeros when none of ``key_words`` carries any weight.
        """
        q_vec = self._normalized_query(key_words)
        if q_vec is None:
            return np.zeros((1, self.files_count), dtype=self.dvt.dtype)
        return q_vec.dot(self.u).dot(self.dvt)

    def find_n_articles(self, key_words, n, mode=NORMED):
        """Return the ``n`` best-matching files for ``key_words``.

        Args:
            key_words: Iterable of query words (stemmed internally).
            n: Maximum number of results.
            mode: ``NORMED`` to score against the terms matrix, ``SVD`` to
                score against the SVD factors.

        Returns:
            A list of at most ``n`` ``(filename, score)`` tuples, highest
            score first. Documents with a score of exactly 0 are omitted.

        Raises:
            RuntimeError: If the terms matrix (or, for ``SVD`` mode, the SVD)
                has not been computed yet.
            ValueError: If ``mode`` is neither ``NORMED`` nor ``SVD``.
        """
        if self.terms_matrix is None:
            raise RuntimeError("Terms Matrix not calculated")

        if mode == NORMED:
            probs = self.calculate_probability_normed(key_words)
        elif mode == SVD:
            if self.u is None or self.dvt is None:
                raise RuntimeError("SVD not computed")
            probs = self.calculate_probability_svd(key_words)
        else:
            raise ValueError(f"Unknown mode: {mode!r}")

        results = [(self.filenames[i], probs[0, i]) for i in probs.nonzero()[1]]
        return heapq.nlargest(n, results, key=lambda t: t[1])

In [None]:
# Index at most the first 5000 files returned by get_all_txt for the
# "wikiarticles_2" directory, then build the weighted terms matrix and its SVD.
article_paths = get_all_txt("wikiarticles_2")[:5000]
engine = SearchEngine(article_paths)
engine.compute_terms_matrix()
engine.compute_svd()

In [None]:
# Run the same two-word query through both ranking modes and print the
# five best matches of each, SVD first.
query_words = ["research", "water"]
print(engine.find_n_articles(query_words, 5, mode=SVD))

print("_________________________________________")

print(engine.find_n_articles(query_words, 5, mode=NORMED))