In [104]:
import pandas as pd
import numpy as np

from analysis.data_parsing.word_vectorizer import WordVectorizer
from analysis.data_parsing.word_data_parser import WordDataParser

# Load the interest-group table; the first CSV column becomes the index.
df = pd.read_csv('../data/interest_data/interest_groups.csv', index_col=0)

# Keep only the numeric columns and turn them into a plain NumPy matrix.
numeric_columns = df.select_dtypes(include=['number'])
vectors = np.array(numeric_columns)

# Candidate tag names, written as a single whitespace-separated string.
tags_text = ('музыка кино книги спорт технологии развлечения путешествия животные '
             'наука история еда природа мода')

# The parser wraps the word-vector model loaded from disk.
wv = WordVectorizer('../data/model.bin')
parser = WordDataParser(wv)

# Words extracted from the raw tag text (the recorded output shows lemmatized forms).
tag_lemmas = parser.raw_text_to_words(tags_text)

# `tags` is the list of surface forms that later cells index into.
tags = tags_text.split()
print('tags:', tags)
print("tag's lemmas:", tag_lemmas)


tags: ['музыка', 'кино', 'книги', 'спорт', 'технологии', 'развлечения', 'путешествия', 'животные', 'наука', 'история', 'еда', 'природа', 'мода']
tag's lemmas: ['музыка' 'кино' 'книга' 'спорт' 'технология' 'развлечение' 'путешествие'
 'животное' 'наука' 'история' 'еда' 'природа' 'мода']


In [105]:
# Look up a vector for every tag lemma; element [1] of what get_vector returns is kept.
tag_vectors = []
for lemma in tag_lemmas:
    tag_vectors.append(parser.vectorizer.get_vector(lemma)[1])

# Print the surface form of every tag whose lookup produced None.
for idx, vec in enumerate(tag_vectors):
    if vec is None:
        print(tags[idx])

In [106]:
from sklearn.metrics.pairwise import cosine_distances

# Cosine distance between every row of `vectors` and every tag vector:
# rows of `dist` correspond to rows of `vectors`, columns to tags.
dist = cosine_distances(vectors, tag_vectors)
print(dist.shape)

# For each row, the column index of the closest tag.
interest_tags = np.argmin(dist, axis=1)
# For each tag, the row positions assigned to it.
tag_interests = [np.where(interest_tags == i)[0] for i in range(len(tags))]

print(interest_tags, tag_interests)

# `members` holds row *positions* produced by np.where, while `df` was read with
# index_col=0, so its index labels need not be 0..n-1. Look the description up
# by position (.iloc) instead of by label to stay aligned with `vectors`.
descriptions = df['description']
for tag, members in zip(tags, tag_interests):
    print(tag, len(members))
    for pos in members:
        print('    ', descriptions.iloc[pos])

(128, 13)
[ 8  8 11  4  4  4  3  1  0 11  8  9  0  8 11 11  1 12  0  3 12  1  3  1
 12 11 10  5  4  4  3  7  1 11 10 10  4  9 11  6 12  7 10  7  8  8 10  4
  5 11  1 10  6  6  0  6  1  8  6  5  5  2  4 10 12  1  5 12 12 12 10 11
  2  0 11  8  6  8 12 12  7 11  4 11  8  1  5  8  7  8  7  9 11  5  9  8
  2  8  4  1  3  7 11  7  9  4  0  1 10 11  7  5  4 10  2  4  2  4 10 10
  7  7  8  8  4  0  1  5] [array([  8,  12,  18,  54,  73, 106, 125]), array([  7,  16,  21,  23,  32,  50,  56,  65,  85,  99, 107, 126]), array([ 61,  72,  96, 114, 116]), array([  6,  19,  22,  30, 100]), array([  3,   4,   5,  28,  29,  36,  47,  62,  82,  98, 105, 112, 115,
       117, 124]), array([ 27,  48,  59,  60,  66,  86,  93, 111, 127]), array([39, 52, 53, 55, 58, 76]), array([ 31,  41,  43,  80,  88,  90, 101, 103, 110, 120, 121]), array([  0,   1,  10,  13,  44,  45,  57,  75,  77,  84,  87,  89,  95,
        97, 122, 123]), array([ 11,  37,  91,  94, 104]), array([ 26,  34,  35,  42,  46,  51,  63,  70

In [107]:
# Single-column frame holding the chosen tag index for each row of `vectors`,
# in the same order as `interest_tags`.
df_tags = pd.DataFrame({'tag': interest_tags})
print(df_tags)

# Persist the assignment next to the input data (the default index is written too).
output_path = '../data/interest_data/group_tags.csv'
df_tags.to_csv(output_path)


     tag
0      8
1      8
2     11
3      4
4      4
..   ...
123    8
124    4
125    0
126    1
127    5

[128 rows x 1 columns]
