In [1]:
#pip install --upgrade gensim
from gensim import corpora
from gensim import models
from gensim import similarities
from collections import defaultdict
#nltk.download('stopwords')
from nltk.corpus import stopwords
import re

In [2]:
documents = []
with open('stepik_courses.txt', 'r') as s_c, open('coursera_courses.txt', 'r') as c_c, open('openedu_courses.txt', 'r') as o_c:
    documents.extend([line[2:line.find(", 'https")-1].strip() for line in s_c])
    documents.extend([line[4:line.find(", 'https")-3].strip() for line in c_c])
    documents.extend(list(map(lambda x: eval(x)[0], [line.strip() for line in o_c])))

In [3]:
courses_info = []
with open('stepik_courses.txt', 'r') as s_c, open('coursera_courses.txt', 'r') as c_c, open('openedu_courses.txt', 'r') as o_c:
    courses_info.extend([line[1:-2] for line in s_c])
    courses_info.extend([line[3:-2] for line in c_c])
    courses_info.extend([line[2:-2] for line in o_c])

In [4]:
class Stemmer_:
    _vowel = "[аеиоуыэюя]"
    _non_vowel = "[^аеиоуыэюя]"

    _re_rv = re.compile(_vowel)
    _re_r1 = re.compile(_vowel + _non_vowel)

    _re_perfective_gerund = re.compile(
        r"(((?P<ignore>[ая])(в|вши|вшись))|(ив|ивши|ившись|ыв|ывши|ывшись))$"
    )
    _re_adjective = re.compile(
        r"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|"
        r"ую|юю|ая|яя|ою|ею)$"
    )
    _re_participle = re.compile(
        r"(((?P<ignore>[ая])(ем|нн|вш|ющ|щ))|(ивш|ывш|ующ))$"
    )
    _re_reflexive = re.compile(
        r"(ся|сь)$"
    )
    _re_verb = re.compile(
        r"(((?P<ignore>[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|"
        r"нно))|(ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|"
        r"ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю))$"
    )
    _re_noun = re.compile(
        r"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|"
        r"ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$"
    )
    _re_superlative = re.compile(
        r"(ейш|ейше)$"
    )
    _re_derivational = re.compile(
        r"(ост|ость)$"
    )
    _re_i = re.compile(
        r"и$"
    )
    _re_nn = re.compile(
        r"((?<=н)н)$"
    )
    _re_ = re.compile(
        r"ь$"
    )

    def stem(self, word):
        """
        Gets the stem.
        """

        rv_pos, r2_pos = self._find_rv(word), self._find_r2(word)
        word = self._step_1(word, rv_pos)
        word = self._step_2(word, rv_pos)
        word = self._step_3(word, r2_pos)
        word = self._step_4(word, rv_pos)
        return word

    def _find_rv(self, word):
        
        rv_match = self._re_rv.search(word)
        if not rv_match:
            return len(word)
        return rv_match.end()

    def _find_r2(self, word):

        r1_match = self._re_r1.search(word)
        if not r1_match:
            return len(word)
        r2_match = self._re_r1.search(word, r1_match.end())
        if not r2_match:
            return len(word)
        return r2_match.end()

    def _cut(self, word, ending, pos):

        match = ending.search(word, pos)
        if match:
            try:
                ignore = match.group("ignore") or ""
            except IndexError:
                # No ignored characters in pattern.
                return True, word[:match.start()]
            else:
                # Do not cut ignored part.
                return True, word[:match.start() + len(ignore)]
        else:
            return False, word

    def _step_1(self, word, rv_pos):
        match, word = self._cut(word, self._re_perfective_gerund, rv_pos)
        if match:
            return word
        _, word = self._cut(word, self._re_reflexive, rv_pos)
        match, word = self._cut(word, self._re_adjective, rv_pos)
        if match:
            _, word = self._cut(word, self._re_participle, rv_pos)
            return word
        match, word = self._cut(word, self._re_verb, rv_pos)
        if match:
            return word
        _, word = self._cut(word, self._re_noun, rv_pos)
        return word

    def _step_2(self, word, rv_pos):
        _, word = self._cut(word, self._re_i, rv_pos)
        return word

    def _step_3(self, word, r2_pos):
        _, word = self._cut(word, self._re_derivational, r2_pos)
        return word

    def _step_4(self, word, rv_pos):
        _, word = self._cut(word, self._re_superlative, rv_pos)
        match, word = self._cut(word, self._re_nn, rv_pos)
        if not match:
            _, word = self._cut(word, self._re_, rv_pos)
        return word

In [5]:
stemmer = Stemmer_()
stoplist = set(stopwords.words('russian'))

In [6]:
texts = [
    [stemmer.stem(word) for word in document.lower().split() if word not in stoplist]
    for document in documents
]

In [7]:
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [53]:
tf_idf = models.TfidfModel(corpus)
corpus_= tf_idf[corpus]
lsi = models.LsiModel(corpus_, id2word=dictionary, num_topics = 500) #<=========

In [54]:
index = similarities.MatrixSimilarity(lsi[corpus_])

In [67]:
def query(query, dic = dictionary, stemmer = stemmer, model = lsi, index = index):
    q_bow = dictionary.doc2bow(list(map(stemmer.stem, query.lower().split())))
    q_lsi = model[q_bow]
    sims = index[q_lsi]
    print(f'query: {query}\n') 
    sims = sorted(enumerate(sims), key = lambda item: -item[1])[:10]
    for doc_position, doc_score in sims:
        print(doc_score, courses_info[doc_position])

In [68]:
query('Теория вероятностей матстатистика')

query: Теория вероятностей матстатистика

0.98615897 'Теория вероятностей', 'https://stepik.org/course/98940/promo'
0.98615897 'Теория вероятностей', 'https://stepik.org/course/94188/promo'
0.98615897 'Теория вероятности', 'https://stepik.org/course/52970/promo'
0.98615897 'Теория вероятностей', 'https://stepik.org/course/3089/promo'
0.98615897 'Теория вероятностей', 'https://stepik.org/course/3066/promo'
0.98615897 Теория вероятностей', 'СПбГЭТУ «ЛЭТИ»', '/course/eltech/probability_theory/'
0.86690706 Введение в теорию вероятностей', 'МФТИ', '/course/mipt/PROBTH/'
0.8237488 'Теория вероятностей для начинающих''', 'https://www.coursera.org/learn/probability-theory-basics, '''Moscow Institute of Physics and Technology'''
0.8004688 'Теория вероятностей и ее приложения''', 'https://www.coursera.org/learn/prob-theory, '''HSE University'''
0.79300666 'Теория вероятностей - II (дискретные случайные процессы)', 'https://stepik.org/course/57281/promo'
