In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from scipy.sparse import save_npz
import pickle
from razdel import tokenize

from nltk.corpus import stopwords
import pymorphy2


In [2]:
def tokenize_and_base(text):
    morph = pymorphy2.MorphAnalyzer()

    russian_stopwords = set(stopwords.words('russian'))
    additional_stopwords = {'и', 'припев', 'но', 'я', 'в', 'но', 'что', 'мой', 'свой', 'весь', 'всё', 'на', 'мы', 'c', 'a','вест','это', 'сам'}
    russian_stopwords.update(additional_stopwords)
    words = [i.text for i in tokenize(text)]
    processed_words = []
    for word in words:
        # Remove punctuation
        if not word.isalpha():
            continue
        # Remove stopwords
        if word in russian_stopwords:
            continue
        # lemmatiza
        parsed_word = morph.parse(word)[0]
        normal_form = parsed_word.normal_form
        # Check stopword in lematized
        if normal_form in russian_stopwords:
            continue
        processed_words.append(normal_form)
    return processed_words


In [3]:
df_songs = pd.read_csv('data_my/kish_songs_without_duplcates.csv')

In [4]:
vectorizer = TfidfVectorizer(tokenizer=tokenize_and_base, token_pattern=None)

corpus = vectorizer.fit_transform(df_songs['lyrics'].to_list())

In [5]:
# save vectorizer
with open('data_my/vectorizer.pk', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)
# save corpus
save_npz('data_my/corpus.npz', corpus)

In [14]:
corpus[0].max()

0.5025617872476633

In [9]:
query = 'Песня о 2-х друзьях, на которых напали разбойники'
query_vector = vectorizer.transform([query])
similarity_matrix = cosine_similarity(query_vector, corpus)
similarity_array = np.ravel(similarity_matrix)
best_indexes = np.argsort(similarity_array)[::-1]
top10_index = best_indexes[:10]
top10_similarities = similarity_array[top10_index]
top10_titles = df_songs.iloc[top10_index]['title']

pd.DataFrame({'Название': top10_titles[1:], 'Совпадение': top10_similarities[1:]})

Unnamed: 0,Название,Совпадение
70,Два Друга и Разбойники,0.205547
43,Песня Мушкетёров,0.110314
37,Собрание,0.10653
152,В Париж - Домой,0.099017
172,Бунтарь,0.072206
197,На краю (Последняя ария Тодда),0.065986
115,Рыцарь,0.062172
47,Карапуз,0.055436
76,Пират,0.05076


In [10]:
similarity_array.min()

0.0

In [8]:
top10_titles.to_list()

['Пивной Череп',
 'Милое дело',
 'Разговор с гоблином',
 'Иван Факов',
 'Паника в селе',
 'Если Мясо Мужики',
 'Вино Хоббитов',
 'Наблюдатель',
 'Мёртвый Анархист',
 'Защитник Свиней']

In [19]:
top10_index

array([  0,  97, 118,  28,  71, 170,  41, 108,  15, 223], dtype=int64)