In [None]:
import heapq
import itertools
import os
from itertools import product

import numpy as np
from nltk.corpus import wordnet
from nltk.stem.porter import PorterStemmer
from scipy import sparse
from sklearn.decomposition import TruncatedSVD


def get_all_txt(path):
    """Collect the paths of all non-empty ``.txt`` files below ``path``.

    The directory tree is traversed with ``os.walk``; a file is kept when
    its name ends with ``.txt`` and its size on disk is greater than zero.

    Args:
        path: Root directory to search.

    Returns:
        A list of file paths (root joined with file name), in the order
        ``os.walk`` reports them.
    """
    matches = []
    for root, _dirs, names in os.walk(path):
        for name in names:
            if not name.endswith(".txt"):
                continue
            candidate = os.path.join(root, name)
            # Skip empty files.
            if os.path.getsize(candidate) > 0:
                matches.append(candidate)
    return matches
    
def article_reader(file_name):
    """Yield the whitespace-separated tokens of a text file, one at a time.

    The file is read line by line, so the whole article never has to be
    held in memory. Tokens are split on any run of whitespace (spaces,
    tabs, newlines), which means no token carries a trailing newline and
    blank lines produce nothing.

    Args:
        file_name: Path of the text file to read.

    Yields:
        Each non-empty token, in file order.
    """
    with open(file_name, 'r') as file:
        for line in file:
            # str.split() with no argument drops empty strings and strips
            # the line terminator, unlike split(" ").
            yield from line.split()

                    
_STEMMER = None  # shared PorterStemmer instance, created on first use


def word_trim(word):
    """Return the Porter stem of ``word``.

    A single ``PorterStemmer`` is created lazily and reused for every call
    instead of constructing a new stemmer per word.

    Args:
        word: The token to stem.

    Returns:
        Whatever ``PorterStemmer.stem`` returns for ``word``.
    """
    global _STEMMER
    if _STEMMER is None:
        _STEMMER = PorterStemmer()
    return _STEMMER.stem(word)
    
                    

class SearchEngine:
    """TF-IDF search engine over a collection of text files.

    Terms are the tokens produced by ``article_reader`` after being passed
    through ``word_trim``. The term-document matrix has one row per term
    and one column per file. Queries are ranked by cosine similarity
    between the weighted query vector and the unit-length document columns.

    Attributes are documented where they are assigned in ``__init__``.
    """

    def __init__(self, filenames):
        """Build the vocabulary for ``filenames``.

        Only the vocabulary and the term <-> index mappings are computed
        here; the term-document matrix is built by
        ``compute_terms_matrix`` (called on demand by ``find_n_articles``).

        Args:
            filenames: Sequence of paths of the files to index.
        """
        self.word_map = None             # term -> row index
        self.index_map = None            # row index -> term
        self.bag_of_words = None         # set of all terms in the corpus
        self.terms_matrix = None         # IDF-weighted counts, terms x files (CSR)
        self.terms_matrix_normed = None  # same, every column scaled to unit length
        self.idf_m = None                # per-term weight, see idf()

        self.filenames = filenames
        self.files_count = len(filenames)

        self.compute_bag_of_words()
        self.create_mappings()
        self.words_count = len(self.index_map)

    def compute_bag_of_words(self):
        """Collect the set of distinct terms found in all files."""
        self.bag_of_words = set()
        for file in self.filenames:
            for word in article_reader(file):
                self.bag_of_words.add(word_trim(word))

    def create_mappings(self):
        """Assign a row index to every term and print the vocabulary size.

        Terms are sorted first so the index assignment does not depend on
        set iteration order.
        """
        self.index_map = sorted(self.bag_of_words)
        self.word_map = {word: i for i, word in enumerate(self.index_map)}
        print(len(self.index_map))

    def compute_terms_matrix(self):
        """Build the weighted term-document matrix and its normalised copy.

        Sets ``terms_matrix`` (counts scaled by ``idf_m``), ``idf_m`` and
        ``terms_matrix_normed`` (columns scaled to unit Euclidean length).
        """
        rows, cols, counts = [], [], []
        for doc_idx, file_name in enumerate(self.filenames):
            # Count terms per document in a plain dict, then emit one
            # (row, col, count) triple per distinct term.
            doc_counts = {}
            for word in article_reader(file_name):
                term_idx = self.word_map[word_trim(word)]
                doc_counts[term_idx] = doc_counts.get(term_idx, 0) + 1
            rows.extend(doc_counts.keys())
            cols.extend([doc_idx] * len(doc_counts))
            counts.extend(doc_counts.values())

        self.terms_matrix = sparse.csr_matrix(
            (
                np.asarray(counts, dtype=np.float32),
                (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64)),
            ),
            shape=(self.words_count, self.files_count),
        )

        self.idf()
        self.idf_matrix_format()

        # Scale every document column to unit length. Columns whose norm is
        # zero (documents without any term) are left as all zeros.
        d_norms = sparse.linalg.norm(self.terms_matrix, axis=0)
        inv_norms = np.zeros(self.files_count, dtype=np.float32)
        np.divide(1.0, d_norms, out=inv_norms, where=d_norms > 0)
        self.terms_matrix_normed = (self.terms_matrix @ sparse.diags(inv_norms)).tocsr()

    def words_frequency(self, words):
        """Return a 1 x ``words_count`` CSR row of term counts for ``words``.

        Each word is passed through ``word_trim``. Words that are not part
        of the vocabulary are ignored instead of raising ``KeyError``.
        """
        result = sparse.lil_matrix((1, self.words_count), dtype=np.float32)
        for word in words:
            term_idx = self.word_map.get(word_trim(word))
            if term_idx is not None:
                result[0, term_idx] += 1
        return result.tocsr()

    def idf(self):
        """Compute ``idf_m[t] = files_count / (number of files containing t)``.

        Terms that occur in no file get weight 0 rather than a division by
        zero.

        NOTE(review): the textbook IDF is ``log(N / df)``; this keeps the
        plain ratio used by the original code -- confirm that is intended.
        """
        term_rows, _ = self.terms_matrix.nonzero()
        # One non-zero entry per (term, file) pair, so counting row indices
        # gives the document frequency of every term.
        doc_freq = np.bincount(term_rows, minlength=self.words_count).astype(np.float32)
        self.idf_m = np.zeros(self.words_count, dtype=np.float32)
        np.divide(np.float32(self.files_count), doc_freq, out=self.idf_m, where=doc_freq > 0)

    def idf_matrix_format(self):
        """Multiply every row of ``terms_matrix`` by its ``idf_m`` weight.

        Not idempotent: each call applies the weights once more.
        """
        self.terms_matrix = (sparse.diags(self.idf_m) @ self.terms_matrix).tocsr()

    def _query_vector(self, key_words):
        """Return the IDF-weighted query as a unit-length 1 x ``words_count`` CSR row.

        If the query contains no known term the all-zero row is returned
        unchanged, so no division by zero takes place.
        """
        q_vec = sparse.csr_matrix(self.words_frequency(key_words).multiply(self.idf_m))
        q_norm = sparse.linalg.norm(q_vec)
        if q_norm > 0:
            q_vec = q_vec / q_norm
        return q_vec

    def calculate_probability(self, key_words):
        """Cosine similarity of the query with every document.

        Computed from the un-normalised ``terms_matrix``; the result agrees
        with ``calculate_probability_normed`` up to rounding.

        Returns:
            1-D ``numpy`` array with one score per file (0 for documents
            with zero norm).
        """
        q_vec = self._query_vector(key_words)
        d_norms = sparse.linalg.norm(self.terms_matrix, axis=0)
        dots = (q_vec @ self.terms_matrix).toarray().ravel()
        scores = np.zeros_like(dots)
        np.divide(dots, d_norms, out=scores, where=d_norms > 0)
        return scores

    def calculate_probability_normed(self, key_words):
        """Cosine similarity of the query with every document.

        Uses the pre-normalised ``terms_matrix_normed``.

        Returns:
            Sparse 1 x ``files_count`` matrix of scores.
        """
        return self._query_vector(key_words) @ self.terms_matrix_normed

    def find_n_articles(self, key_words, n):
        """Return the ``n`` best matching files for ``key_words``.

        The term-document matrix is built first if it is not available yet.

        Args:
            key_words: Iterable of query words.
            n: Maximum number of results.

        Returns:
            List of ``(filename, score)`` tuples, highest score first.
            Files with a score of zero are never included, so the list may
            be shorter than ``n`` (and is empty for a query without any
            known term).
        """
        if self.terms_matrix_normed is None:
            # compute_terms_matrix sets terms_matrix, idf_m and
            # terms_matrix_normed in one go.
            self.compute_terms_matrix()

        probs = sparse.csr_matrix(self.calculate_probability_normed(key_words))
        # probs has a single row, so its stored indices are file indices.
        hits = (
            (self.filenames[col], score)
            for col, score in zip(probs.indices, probs.data)
            if score != 0
        )
        return heapq.nlargest(n, hits, key=lambda t: t[1])

In [None]:
# Index every non-empty .txt article found under "wikiarticles_2" and build
# the weighted term-document matrix up front.
engine = SearchEngine(
    filenames=get_all_txt(path="wikiarticles_2"),
)
engine.compute_terms_matrix()


In [None]:
# Show the five best-scoring articles for a two-word query.
print(
    engine.find_n_articles(key_words=["research", "water"], n=5)
)

In [None]:
# Second engine restricted to the first 1000 files returned by get_all_txt.
en2 = SearchEngine(
    filenames=get_all_txt(path="wikiarticles_2")[:1000],
)
en2.compute_terms_matrix()

In [None]:
# Compare every WordNet sense of "playing" with every sense of "play" and
# print their Wu-Palmer similarity. `wordnet` and `product` come from the
# import cell at the top of the notebook.
gam1 = wordnet.synsets("playing")
gam2 = wordnet.synsets("play")

for (w1, w2) in product(gam1, gam2):
    print(f"{w1.name()}: {w1.definition()}\n{w2.name()}: {w2.definition()}")
    print(w1.wup_similarity(w2))
    print("-----------------------------------")