In [1]:
from nltk import WordPunctTokenizer
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pickle as pkl

In [2]:
# Shared tokenizer instance; acquire_documents uses it to split each line into tokens.
tokenizer = WordPunctTokenizer()

In [3]:
def acquire_documents(filename):
    """Load a one-document-per-line corpus and prepare it for Doc2Vec.

    Args:
        filename: Path to the corpus file *without* the ``.txt`` extension
            (the extension is appended here).

    Returns:
        A ``(lemmas, tokens, documents)`` tuple where ``lemmas`` is the list
        of raw lines, ``tokens`` is the tokenized form of each line, and
        ``documents`` holds one ``TaggedDocument`` per line, tagged with the
        line's index.
    """
    # The context manager closes the file on exit.
    with open(filename + '.txt', 'r', encoding='utf-8') as corpus_file:
        # NOTE: split('\n') yields a trailing empty entry when the file ends
        # with a newline; kept as-is so line indices stay unchanged.
        lemmas = corpus_file.read().split('\n')
    tokens = tokenizer.tokenize_sents(lemmas)
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokens)]
    return lemmas, tokens, documents

In [4]:
# Load the training corpus from Train_lemmas.txt: raw lines, token lists, tagged documents.
train_texts, train_tokens, train_documents = acquire_documents('Train_lemmas')

In [5]:
# Size of the Doc2Vec embeddings; also used below to build the output file names.
dim = 300

In [6]:
# Build the Doc2Vec model on the tagged training documents only.
# vector_size=dim sets the embedding size; min_count=1 keeps even words seen once.
model = Doc2Vec(train_documents, vector_size=dim, window=2, min_count=1, workers=4)

In [7]:
# Persist the model to disk so it can be reloaded later.
model.save('Train_only.model')

In [8]:
# Infer one embedding per training document from its token list (progress shown by tqdm).
train_vectors = [model.infer_vector(document.words) for document in tqdm(train_documents)]

100%|██████████| 87719/87719 [01:35<00:00, 920.37it/s] 


In [13]:
# Load the test corpus from Test_lemmas.txt: raw lines, token lists, tagged documents.
test_texts, test_tokens, test_documents = acquire_documents('Test_lemmas')

In [14]:
# Infer one embedding per test document with the model built on the training set.
test_vectors = [model.infer_vector(document.words) for document in tqdm(test_documents)]

100%|██████████| 999/999 [00:01<00:00, 880.53it/s]


In [15]:
# Load the validation corpus from Val_lemmas.txt: raw lines, token lists, tagged documents.
val_texts, val_tokens, val_documents = acquire_documents('Val_lemmas')

In [16]:
# Infer one embedding per validation document with the model built on the training set.
val_vectors = [model.infer_vector(document.words) for document in tqdm(val_documents)]

100%|██████████| 998/998 [00:01<00:00, 877.74it/s]


In [17]:
# Load the per-category keyword lists and keep the first 300 entries of each as a set
# for fast membership tests.
# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.
with open('top_threehundred_sorted_cutathundred.pkl', 'rb') as t:
    top_word_sets = {k: set(val[:300]) for k, val in pkl.load(t).items()}

In [19]:
# Display the keyword set of one category as a quick sanity check.
top_word_sets['anime']

{'алхимик',
 'анимация',
 'аниме',
 'анимешник',
 'анимэ',
 'арка',
 'берсерка',
 'бибоп',
 'блич',
 'боевой',
 'вампир',
 'гиас',
 'дворецкий',
 'девочка',
 'длинный',
 'досматривать',
 'дубляж',
 'ева',
 'евангелиона',
 'замок',
 'заполнять',
 'затягивать',
 'кп',
 'лайт',
 'манга',
 'мангак',
 'мех',
 'меч',
 'миядзаки',
 'мозг',
 'мувик',
 'мульт',
 'мультик',
 'мультфильм',
 'нарут',
 'наруто',
 'необычно',
 'объектив',
 'овашка',
 'озвучка',
 'онгоинг',
 'опенинг',
 'отсмотреть',
 'охотиться',
 'палатка',
 'пересматривать',
 'повседневность',
 'подросток',
 'подсказывать',
 'покемон',
 'полнометражка',
 'полнометражный',
 'призрак',
 'продолжение',
 'просматривать',
 'просмотр',
 'развиваться',
 'раскрывать',
 'рисовка',
 'романтика',
 'самурай',
 'седзе',
 'сейя',
 'сенэн',
 'сериал',
 'синкай',
 'сиська',
 'создатель',
 'сосед',
 'тайтлы',
 'тетрадка',
 'тетрадь',
 'том',
 'ух',
 'фансервис',
 'филлер',
 'хвост',
 'хентай',
 'ценить',
 'штамп',
 'штука',
 'эльфийский',
 'эндинг'}

In [20]:
categs = ['anime', 'art', 'books', 'films', 'food', 'football', 'games', 'music', 'nature', 'travel']
# Per-category divisors applied to each keyword share; all ones means "unweighted".
# An alternative is to scale by each category's keyword-set size relative to the largest:
# len(v) / max(len(s) for s in top_word_sets.values()) for every set v.
weights = [1] * len(categs)

def words_from_tokens(tokens, word_sets=None):
    """Compute, per category, the share of tokens that are category keywords.

    Args:
        tokens: Sequence of tokens for a single text.
        word_sets: Optional mapping from category name to a collection of
            keywords. Defaults to the notebook-level ``top_word_sets``.

    Returns:
        A float32 numpy array with one entry per category in ``categs``:
        ``(matching token count / len(tokens)) / weights[i]``. An empty token
        sequence yields an all-zero vector instead of a ZeroDivisionError.
    """
    if word_sets is None:
        word_sets = top_word_sets
    vec = np.zeros(len(categs), dtype='float32')
    total = len(tokens)
    if total == 0:
        # Nothing to count; avoid dividing by zero.
        return vec
    for i, categ in enumerate(categs):
        keywords = word_sets[categ]
        count = sum(1 for t in tokens if t in keywords)
        vec[i] = (count / total) / weights[i]
    return vec

In [21]:
def vectors_from_corpus(corpus):
    """Return the category keyword vector of every tokenized text in ``corpus``.

    The result is a list in corpus order; a tqdm progress bar is shown while
    iterating.
    """
    features = []
    for doc_tokens in tqdm(corpus):
        features.append(words_from_tokens(doc_tokens))
    return features

In [22]:
# Category keyword-share features for every training document.
train_word_vectors = vectors_from_corpus(train_tokens)

100%|██████████| 87719/87719 [00:07<00:00, 11096.16it/s]


In [23]:
# Category keyword-share features for every test document.
test_word_vectors = vectors_from_corpus(test_tokens)

100%|██████████| 999/999 [00:00<00:00, 10277.01it/s]


In [24]:
# Category keyword-share features for every validation document.
val_word_vectors = vectors_from_corpus(val_tokens)

100%|██████████| 998/998 [00:00<00:00, 10256.45it/s]


In [25]:
# Load the per-category trigram lists and keep the first 300 entries of each as a set.
# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.
with open('trigrams_threehundred_sorted.pkl', 'rb') as t:
    top_tri_sets = {k: set(val[:300]) for k, val in pkl.load(t).items()}

In [26]:
categs = ['anime', 'art', 'books', 'films', 'food', 'football', 'games', 'music', 'nature', 'travel']
# Per-category divisors applied to each trigram score; all ones means "unweighted".
# An alternative is to scale by each category's trigram-set size relative to the largest:
# len(v) / max(len(s) for s in top_tri_sets.values()) for every set v.
weights = [1] * len(categs)

def tris_from_text(text, tri_sets=None):
    """Compute, per category, how many of its trigrams occur in ``text``.

    Args:
        text: A single text; in this notebook it is a raw string line, so
            ``trig in text`` is a substring test and ``len(text)`` a
            character count.
        tri_sets: Optional mapping from category name to a collection of
            trigrams. Defaults to the notebook-level ``top_tri_sets``.

    Returns:
        A float32 numpy array with one entry per category in ``categs``:
        ``(number of category trigrams found / len(text)) / weights[i]``.
        An empty text yields an all-zero vector instead of a
        ZeroDivisionError.
    """
    if tri_sets is None:
        tri_sets = top_tri_sets
    vec = np.zeros(len(categs), dtype='float32')
    length = len(text)
    if length == 0:
        # No content to match against; avoid dividing by zero.
        return vec
    for i, categ in enumerate(categs):
        count = sum(1 for trig in tri_sets[categ] if trig in text)
        vec[i] = (count / length) / weights[i]
    return vec

In [27]:
def trigrams_from_corpus(corpus):
    """Return the category trigram vector of every text in ``corpus``.

    The result is a list in corpus order; a tqdm progress bar is shown while
    iterating.
    """
    features = []
    for document in tqdm(corpus):
        features.append(tris_from_text(document))
    return features

In [28]:
# Category trigram features for every raw training text line.
train_tri_vectors = trigrams_from_corpus(train_texts)

100%|██████████| 87719/87719 [01:54<00:00, 766.71it/s]


In [29]:
# Category trigram features for every raw test text line.
test_tri_vectors = trigrams_from_corpus(test_texts)

100%|██████████| 999/999 [00:01<00:00, 711.39it/s]


In [30]:
# Category trigram features for every raw validation text line.
val_tri_vectors = trigrams_from_corpus(val_texts)

100%|██████████| 998/998 [00:01<00:00, 712.23it/s]


In [31]:
def joint_vectors(vectors, word_vectors, tri_vectors):
    """Concatenate the three feature vectors of each document into one row.

    Rows are built by index over ``range(len(vectors))`` and returned as a
    single numpy array; a tqdm progress bar is shown while iterating.
    """
    rows = []
    for idx in tqdm(range(len(vectors))):
        parts = (vectors[idx], word_vectors[idx], tri_vectors[idx])
        rows.append(np.concatenate(parts))
    return np.array(rows)

In [32]:
# Full training feature matrix: Doc2Vec embedding + keyword features + trigram features.
train = joint_vectors(train_vectors, train_word_vectors, train_tri_vectors)

100%|██████████| 87719/87719 [00:00<00:00, 186101.55it/s]


In [33]:
# Full test feature matrix: Doc2Vec embedding + keyword features + trigram features.
test = joint_vectors(test_vectors, test_word_vectors, test_tri_vectors)

100%|██████████| 999/999 [00:00<00:00, 132246.87it/s]


In [34]:
# Full validation feature matrix: Doc2Vec embedding + keyword features + trigram features.
val = joint_vectors(val_vectors, val_word_vectors, val_tri_vectors)

100%|██████████| 998/998 [00:00<00:00, 131624.28it/s]


In [35]:
# Save the full training matrix; the file-name suffix is dim plus the 20 category features.
with open(f'Train_{dim + 20}.pkl', 'wb') as tr:
    pkl.dump(train, tr)

In [36]:
# Save the full test matrix; the file-name suffix is dim plus the 20 category features.
with open(f'Test_{dim + 20}.pkl', 'wb') as tst:
    pkl.dump(test, tst)

In [37]:
# Save the full validation matrix; the file-name suffix is dim plus the 20 category features.
with open(f'Val_{dim + 20}.pkl', 'wb') as vl:
    pkl.dump(val, vl)

In [38]:
def joint_vectors_tris(vectors, tri_vectors):
    """Concatenate each document's base vector with its trigram vector.

    Rows are built by index over ``range(len(vectors))`` and returned as a
    single numpy array; a tqdm progress bar is shown while iterating.
    """
    rows = []
    for idx in tqdm(range(len(vectors))):
        pair = (vectors[idx], tri_vectors[idx])
        rows.append(np.concatenate(pair))
    return np.array(rows)

In [39]:
# Training matrix of Doc2Vec embeddings joined with trigram features only.
train_tri = joint_vectors_tris(train_vectors, train_tri_vectors)

100%|██████████| 87719/87719 [00:00<00:00, 207506.92it/s]


In [40]:
# Test matrix of Doc2Vec embeddings joined with trigram features only.
test_tri = joint_vectors_tris(test_vectors, test_tri_vectors)

100%|██████████| 999/999 [00:00<00:00, 177051.88it/s]


In [41]:
# Validation matrix of Doc2Vec embeddings joined with trigram features only.
val_tri = joint_vectors_tris(val_vectors, val_tri_vectors)

100%|██████████| 998/998 [00:00<00:00, 128932.28it/s]


In [43]:
# Save the embedding+trigram training matrix; the suffix is dim plus the 10 trigram features.
with open(f'Train_{dim + 10}_tris.pkl', 'wb') as tr1:
    pkl.dump(train_tri, tr1)

In [44]:
# Save the embedding+trigram test matrix; the suffix is dim plus the 10 trigram features.
with open(f'Test_{dim + 10}_tris.pkl', 'wb') as tst1:
    pkl.dump(test_tri, tst1)

In [45]:
# Save the embedding+trigram validation matrix; the suffix is dim plus the 10 trigram features.
with open(f'Val_{dim + 10}_tris.pkl', 'wb') as vl1:
    pkl.dump(val_tri, vl1)

In [46]:
def joint_vectors_twenty(word_vectors, tri_vectors):
    """Concatenate each document's keyword vector with its trigram vector.

    Rows are built by index over ``range(len(word_vectors))`` and returned as
    a single numpy array; a tqdm progress bar is shown while iterating.
    """
    rows = []
    for idx in tqdm(range(len(word_vectors))):
        pair = (word_vectors[idx], tri_vectors[idx])
        rows.append(np.concatenate(pair))
    return np.array(rows)

In [47]:
# Training matrix of keyword + trigram features only (no embeddings), built with the
# helper defined for this purpose.
train_tw = joint_vectors_twenty(train_word_vectors, train_tri_vectors)

100%|██████████| 87719/87719 [00:00<00:00, 312972.51it/s]


In [48]:
# Test matrix of keyword + trigram features only (no embeddings), built with the
# helper defined for this purpose.
test_tw = joint_vectors_twenty(test_word_vectors, test_tri_vectors)

100%|██████████| 999/999 [00:00<00:00, 118838.02it/s]


In [49]:
# Validation matrix of keyword + trigram features only (no embeddings), built with the
# helper defined for this purpose.
val_tw = joint_vectors_twenty(val_word_vectors, val_tri_vectors)

100%|██████████| 998/998 [00:00<00:00, 167691.51it/s]


In [53]:
# Save the keyword+trigram training matrix.
with open('Train_twenty.pkl', 'wb') as tw1:
    pkl.dump(train_tw, tw1)

In [54]:
# Save the keyword+trigram test matrix.
with open('Test_twenty.pkl', 'wb') as tw2:
    pkl.dump(test_tw, tw2)

In [55]:
# Save the keyword+trigram validation matrix.
with open('Val_twenty.pkl', 'wb') as tw3:
    pkl.dump(val_tw, tw3)