In [1]:
from nltk import WordPunctTokenizer
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pickle as pkl



In [2]:
# Shared tokenizer instance; acquire_documents() reads this module-level name.
tokenizer = WordPunctTokenizer()

In [3]:
def acquire_documents(filename):
    """Load a lemmatised corpus file and prepare it for Doc2Vec.

    Reads ``<filename>.txt`` as UTF-8 and treats every newline-separated
    line as one document.

    Args:
        filename: Path of the corpus file *without* the ``.txt`` extension.

    Returns:
        A ``(lemmas, tokens, documents)`` tuple where ``lemmas`` is the list
        of raw line strings, ``tokens`` is the per-line token lists produced
        by the module-level ``tokenizer``, and ``documents`` is a list of
        ``TaggedDocument`` objects, each tagged with its line index.
    """
    # The context manager closes the file on exit, so no explicit close()
    # call is needed afterwards.
    with open(filename + '.txt', 'r', encoding='utf-8') as corpus_file:
        # split('\n') is kept on purpose: line indices stay stable. Note that
        # a file ending with a newline yields a final empty document.
        lemmas = corpus_file.read().split('\n')
    tokens = tokenizer.tokenize_sents(lemmas)
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokens)]
    return lemmas, tokens, documents

In [4]:
# Load the training split: raw lines, token lists and TaggedDocuments.
train_texts, train_tokens, train_documents = acquire_documents('Train_lemmas')

In [5]:
# Size of the Doc2Vec document vectors; also used to build the output file names.
dim = 300

In [6]:
# Train Doc2Vec on the training documents only (test/val vectors are inferred later).
# NOTE(review): no seed is passed and workers=4, so results are presumably not
# reproducible run-to-run - confirm against the gensim documentation.
model = Doc2Vec(train_documents, vector_size=dim, window=2, min_count=1, workers=4)

In [7]:
# Persist the trained model to the working directory.
model.save('Train_only.model')

In [8]:
# Infer a Doc2Vec vector for every training document from its token list.
# NOTE(review): vectors are re-inferred rather than read from the trained model,
# presumably so that all splits are embedded the same way - confirm.
train_vectors = [model.infer_vector(i.words) for i in tqdm(train_documents)]

100%|██████████████████████████████████| 87719/87719 [01:13<00:00, 1194.71it/s]


In [9]:
# Load the test split: raw lines, token lists and TaggedDocuments.
test_texts, test_tokens, test_documents = acquire_documents('Test_lemmas')

In [10]:
# Infer a Doc2Vec vector for every test document with the train-only model.
test_vectors = [model.infer_vector(i.words) for i in tqdm(test_documents)]

100%|██████████████████████████████████████| 999/999 [00:00<00:00, 1122.28it/s]


In [11]:
# Load the validation split: raw lines, token lists and TaggedDocuments.
val_texts, val_tokens, val_documents = acquire_documents('Val_lemmas')

In [12]:
# Infer a Doc2Vec vector for every validation document with the train-only model.
val_vectors = [model.infer_vector(i.words) for i in tqdm(val_documents)]

100%|██████████████████████████████████████| 998/998 [00:00<00:00, 1212.01it/s]


In [13]:
# Load the per-category word lists and keep the first 300 entries of each as a
# set, so membership tests in words_from_tokens() are constant-time.
# NOTE(review): presumably the lists are ordered by PPMI score - confirm.
# SECURITY: pickle.load can execute arbitrary code; only load files you produced.
with open('top_threehundred_ppmi.pkl', 'rb') as t:
    top_word_sets = {categ: set(words[:300]) for categ, words in pkl.load(t).items()}
# The `with` block has already closed the file; no explicit close() is needed.

In [14]:
# Sanity check: display the word set loaded for the 'anime' category.
top_word_sets['anime']

{'айвазовский',
 'алхимик',
 'анимация',
 'аниме',
 'анимешник',
 'анимэ',
 'аппетит',
 'арка',
 'аудитория',
 'берсерка',
 'бибоп',
 'блич',
 'бог',
 'боевой',
 'брат',
 'брейгель',
 'бронировать',
 'вампир',
 'ван',
 'вести',
 'вещество',
 'вещь',
 'включать',
 'водопад',
 'восторг',
 'впечатлить',
 'вредно',
 'вселенная',
 'вспомнить',
 'вставлять',
 'второй',
 'вы',
 'вывод',
 'выделять',
 'выставляться',
 'гарнир',
 'гениальный',
 'гиас',
 'голливуд',
 'голубой',
 'дворецкий',
 'девочка',
 'действие',
 'день',
 'диалог',
 'дикий',
 'диск',
 'длинный',
 'долгий',
 'досматривать',
 'доставлять',
 'достоинство',
 'драма',
 'другой',
 'дубляж',
 'душа',
 'дюрер',
 'ева',
 'евангелиона',
 'елка',
 'жанр',
 'жарить',
 'животный',
 'замок',
 'занести',
 'записывать',
 'заполнять',
 'запоминаться',
 'затягивать',
 'знакомый',
 'изобразительный',
 'изящный',
 'иллюстратор',
 'индустрия',
 'ирония',
 'исключительно',
 'искусствовед',
 'исторический',
 'исчезать',
 'кабачок',
 'камень',
 'ка

In [15]:
categs = ['anime', 'art', 'books', 'films', 'food', 'football', 'games', 'music', 'nature', 'travel']
# weights = [len(v) / max([len(v) for v in top_sets.values()]) for v in top_sets.values()] # normal weights per quantity
weights = [1] * len(categs) # non-existent weights

def words_from_tokens(tokens, word_sets=None):
    """Compute the share of a document's tokens that belong to each category.

    For every category in ``categs`` the number of tokens found in that
    category's word set is divided by the total number of tokens and then by
    the category weight from ``weights``.

    Args:
        tokens: Sequence of token strings for one document.
        word_sets: Optional mapping ``category -> set of words``. Defaults to
            the module-level ``top_word_sets`` when omitted.

    Returns:
        A ``float32`` numpy array with one entry per category, in the order
        of ``categs``. An empty ``tokens`` sequence yields an all-zero vector
        instead of raising ``ZeroDivisionError``.
    """
    if word_sets is None:
        word_sets = top_word_sets
    vec = np.zeros(len(categs), dtype='float32')
    n_tokens = len(tokens)
    if n_tokens == 0:
        # Nothing to count; avoid dividing by zero below.
        return vec
    for i, categ in enumerate(categs):
        vocab = word_sets[categ]
        # Every occurrence counts, so repeated tokens are counted repeatedly.
        count = sum(1 for tok in tokens if tok in vocab)
        vec[i] = (count / n_tokens) / weights[i]
    return vec

In [16]:
def vectors_from_corpus(corpus):
    """Build one category word-share vector per tokenised document.

    Every element of ``corpus`` is passed to ``words_from_tokens`` while a
    tqdm progress bar tracks the pass.

    Returns:
        A list of the resulting vectors, in corpus order.
    """
    vectors = []
    for doc_tokens in tqdm(corpus):
        vectors.append(words_from_tokens(doc_tokens))
    return vectors

In [17]:
# Category word-share features for the training documents.
train_word_vectors = vectors_from_corpus(train_tokens)

100%|█████████████████████████████████| 87719/87719 [00:05<00:00, 15640.82it/s]


In [18]:
# Category word-share features for the test documents.
test_word_vectors = vectors_from_corpus(test_tokens)

100%|██████████████████████████████████████| 999/999 [00:00<00:00, 8613.97it/s]


In [19]:
# Category word-share features for the validation documents.
val_word_vectors = vectors_from_corpus(val_tokens)

100%|█████████████████████████████████████| 998/998 [00:00<00:00, 10313.03it/s]


In [20]:
# Load the per-category trigram lists and keep the first 300 entries of each as
# a set for use in tris_from_text().
# NOTE(review): presumably the lists are ordered by PPMI score - confirm.
# SECURITY: pickle.load can execute arbitrary code; only load files you produced.
with open('300 trigrams ppmi.pkl', 'rb') as t:
    top_tri_sets = {categ: set(trigrams[:300]) for categ, trigrams in pkl.load(t).items()}
# The `with` block has already closed the file; no explicit close() is needed.

In [26]:
categs = ['anime', 'art', 'books', 'films', 'food', 'football', 'games', 'music', 'nature', 'travel']
# weights = [len(v) / max([len(v) for v in top_sets.values()]) for v in top_sets.values()] # normal weights per quantity
weights = [1] * len(categs) # non-existent weights

def tris_from_text(text, tri_sets=None):
    """Compute a per-category trigram-presence score for one document.

    For every category in ``categs`` the number of that category's trigrams
    that occur in ``text`` (``trig in text``) is divided by ``len(text)`` and
    then by the category weight from ``weights``. Each trigram is counted at
    most once, no matter how often it occurs in the text.

    Args:
        text: The document; the notebook passes the raw line string, so the
            membership test is a substring search and the length is in
            characters.
        tri_sets: Optional mapping ``category -> set of trigrams``. Defaults
            to the module-level ``top_tri_sets`` when omitted.

    Returns:
        A ``float32`` numpy array with one entry per category, in the order
        of ``categs``. An empty ``text`` yields an all-zero vector instead of
        raising ``ZeroDivisionError``.
    """
    if tri_sets is None:
        tri_sets = top_tri_sets
    vec = np.zeros(len(categs), dtype='float32')
    text_len = len(text)
    if text_len == 0:
        # Empty documents (e.g. a trailing blank line in the corpus file)
        # would otherwise divide by zero.
        return vec
    for i, categ in enumerate(categs):
        count = sum(1 for trig in tri_sets[categ] if trig in text)
        vec[i] = (count / text_len) / weights[i]
    return vec

In [29]:
def trigrams_from_corpus(corpus):
    """Build one category trigram-presence vector per document.

    Every element of ``corpus`` is passed to ``tris_from_text`` while a tqdm
    progress bar tracks the pass.

    Returns:
        A list of the resulting vectors, in corpus order.
    """
    return list(map(tris_from_text, tqdm(corpus)))

In [30]:
# Category trigram features for the training documents (computed on the raw line strings).
train_tri_vectors = trigrams_from_corpus(train_texts)



  0%|                                                | 0/87719 [00:00<?, ?it/s]

  0%|                                      | 77/87719 [00:00<01:55, 756.46it/s]

  0%|                                     | 145/87719 [00:00<01:59, 731.59it/s]

  0%|                                     | 206/87719 [00:00<02:06, 690.16it/s]

  0%|                                     | 277/87719 [00:00<02:06, 691.76it/s]

  0%|▏                                    | 357/87719 [00:00<02:04, 703.62it/s]

  0%|▏                                    | 419/87719 [00:00<02:09, 674.90it/s]

  1%|▏                                    | 479/87719 [00:00<02:21, 614.40it/s]

  1%|▏                                    | 547/87719 [00:00<02:17, 632.58it/s]

  1%|▎                                    | 629/87719 [00:00<02:08, 679.03it/s]

  1%|▎                                    | 712/87719 [00:01<02:01, 714.38it/s]

  1%|▎                                    | 787/87719 [00:01<02:00, 724.23it/s]

  1%|▎                    

 15%|█████▏                             | 13079/87719 [00:22<02:14, 553.97it/s]

 15%|█████▏                             | 13135/87719 [00:22<02:14, 554.56it/s]

 15%|█████▎                             | 13203/87719 [00:22<02:11, 565.88it/s]

 15%|█████▎                             | 13263/87719 [00:22<02:10, 571.88it/s]

 15%|█████▎                             | 13328/87719 [00:22<02:10, 571.18it/s]

 15%|█████▎                             | 13388/87719 [00:22<02:08, 577.29it/s]

 15%|█████▎                             | 13447/87719 [00:22<02:07, 580.29it/s]

 15%|█████▍                             | 13525/87719 [00:23<02:02, 605.75it/s]

 15%|█████▍                             | 13587/87719 [00:23<02:01, 609.56it/s]

 16%|█████▍                             | 13649/87719 [00:23<02:01, 611.24it/s]

 16%|█████▍                             | 13711/87719 [00:23<02:06, 583.53it/s]

 16%|█████▌                             | 13796/87719 [00:23<01:58, 623.66it/s]

 16%|█████▌                 

 33%|███████████▌                       | 28871/87719 [00:44<01:15, 781.66it/s]

 33%|███████████▌                       | 28961/87719 [00:44<01:12, 812.04it/s]

 33%|███████████▌                       | 29068/87719 [00:44<01:09, 846.84it/s]

 33%|███████████▋                       | 29154/87719 [00:44<01:09, 848.57it/s]

 33%|███████████▋                       | 29240/87719 [00:44<01:14, 786.17it/s]

 33%|███████████▋                       | 29320/87719 [00:44<01:14, 780.87it/s]

 34%|███████████▋                       | 29401/87719 [00:44<01:14, 784.00it/s]

 34%|███████████▊                       | 29495/87719 [00:44<01:13, 792.72it/s]

 34%|███████████▊                       | 29575/87719 [00:44<01:14, 784.82it/s]

 34%|███████████▊                       | 29654/87719 [00:45<01:20, 719.48it/s]

 34%|███████████▊                       | 29728/87719 [00:45<01:26, 668.31it/s]

 34%|███████████▉                       | 29797/87719 [00:45<01:30, 640.70it/s]

 34%|███████████▉           

 53%|██████████████████▌                | 46604/87719 [01:05<00:45, 899.69it/s]

 53%|██████████████████▋                | 46697/87719 [01:05<00:45, 895.83it/s]

 53%|██████████████████▋                | 46787/87719 [01:05<00:46, 874.28it/s]

 53%|██████████████████▋                | 46875/87719 [01:05<00:49, 828.86it/s]

 54%|██████████████████▋                | 46974/87719 [01:05<00:47, 863.39it/s]

 54%|██████████████████▊                | 47067/87719 [01:06<00:46, 881.89it/s]

 54%|██████████████████▊                | 47173/87719 [01:06<00:44, 904.48it/s]

 54%|██████████████████▊                | 47265/87719 [01:06<00:47, 848.98it/s]

 54%|██████████████████▉                | 47354/87719 [01:06<00:47, 858.46it/s]

 54%|██████████████████▉                | 47441/87719 [01:06<00:52, 764.09it/s]

 54%|██████████████████▉                | 47527/87719 [01:06<00:51, 782.19it/s]

 54%|██████████████████▉                | 47613/87719 [01:06<00:49, 802.20it/s]

 54%|███████████████████    

 72%|█████████████████████████          | 62916/87719 [01:27<00:30, 812.22it/s]

 72%|█████████████████████████▏         | 62998/87719 [01:27<00:30, 797.60it/s]

 72%|█████████████████████████▏         | 63078/87719 [01:27<00:31, 779.45it/s]

 72%|█████████████████████████▏         | 63160/87719 [01:27<00:31, 787.61it/s]

 72%|█████████████████████████▏         | 63258/87719 [01:27<00:30, 807.18it/s]

 72%|█████████████████████████▎         | 63351/87719 [01:27<00:29, 840.27it/s]

 72%|█████████████████████████▎         | 63442/87719 [01:27<00:29, 824.51it/s]

 72%|█████████████████████████▎         | 63529/87719 [01:27<00:28, 836.60it/s]

 73%|█████████████████████████▍         | 63614/87719 [01:27<00:29, 824.77it/s]

 73%|█████████████████████████▍         | 63697/87719 [01:28<00:29, 813.90it/s]

 73%|█████████████████████████▍         | 63779/87719 [01:28<00:30, 793.68it/s]

 73%|█████████████████████████▍         | 63859/87719 [01:28<00:30, 790.68it/s]

 73%|███████████████████████

 92%|████████████████████████████████   | 80359/87719 [01:48<00:09, 804.72it/s]

 92%|████████████████████████████████   | 80453/87719 [01:48<00:08, 840.58it/s]

 92%|████████████████████████████████▏  | 80562/87719 [01:48<00:08, 874.67it/s]

 92%|████████████████████████████████▏  | 80656/87719 [01:48<00:07, 892.58it/s]

 92%|████████████████████████████████▏  | 80747/87719 [01:48<00:08, 834.61it/s]

 92%|████████████████████████████████▎  | 80837/87719 [01:49<00:08, 821.45it/s]

 92%|████████████████████████████████▎  | 80921/87719 [01:49<00:08, 818.76it/s]

 92%|████████████████████████████████▎  | 81004/87719 [01:49<00:08, 784.51it/s]

 92%|████████████████████████████████▎  | 81117/87719 [01:49<00:07, 825.84it/s]

 93%|████████████████████████████████▍  | 81206/87719 [01:49<00:07, 840.14it/s]

 93%|████████████████████████████████▍  | 81298/87719 [01:49<00:07, 861.72it/s]

 93%|████████████████████████████████▍  | 81417/87719 [01:49<00:06, 910.80it/s]

 93%|███████████████████████

In [31]:
# Category trigram features for the test documents (computed on the raw line strings).
test_tri_vectors = trigrams_from_corpus(test_texts)



  0%|                                                  | 0/999 [00:00<?, ?it/s]

  9%|███▌                                    | 89/999 [00:00<00:01, 806.10it/s]

 14%|█████▌                                 | 143/999 [00:00<00:01, 687.21it/s]

 21%|████████                               | 206/999 [00:00<00:01, 650.49it/s]

 25%|█████████▉                             | 254/999 [00:00<00:01, 586.00it/s]

 32%|████████████▎                          | 315/999 [00:00<00:01, 587.16it/s]

 40%|███████████████▍                       | 397/999 [00:00<00:00, 639.47it/s]

 48%|██████████████████▌                    | 475/999 [00:00<00:00, 674.94it/s]

 55%|█████████████████████▌                 | 551/999 [00:00<00:00, 698.25it/s]

 62%|████████████████████████▏              | 619/999 [00:00<00:00, 603.23it/s]

 71%|███████████████████████████▌           | 705/999 [00:01<00:00, 662.45it/s]

 80%|███████████████████████████████        | 795/999 [00:01<00:00, 717.85it/s]

 88%|█████████████████████

In [32]:
# Category trigram features for the validation documents (computed on the raw line strings).
val_tri_vectors = trigrams_from_corpus(val_texts)



  0%|                                                  | 0/998 [00:00<?, ?it/s]

  7%|██▊                                     | 70/998 [00:00<00:01, 699.51it/s]

 13%|█████                                  | 131/998 [00:00<00:01, 663.11it/s]

 19%|███████▍                               | 190/998 [00:00<00:01, 639.20it/s]

 23%|█████████▏                             | 234/998 [00:00<00:01, 560.45it/s]

 28%|███████████                            | 284/998 [00:00<00:01, 533.79it/s]

 33%|████████████▊                          | 328/998 [00:00<00:01, 478.65it/s]

 40%|███████████████▌                       | 397/998 [00:00<00:01, 514.03it/s]

 46%|██████████████████▏                    | 464/998 [00:00<00:00, 552.35it/s]

 53%|████████████████████▌                  | 525/998 [00:00<00:00, 567.38it/s]

 61%|███████████████████████▋               | 606/998 [00:01<00:00, 621.24it/s]

 67%|██████████████████████████▏            | 669/998 [00:01<00:00, 610.50it/s]

 74%|█████████████████████

In [33]:
def joint_vectors(vectors, word_vectors, tri_vectors):
    """Concatenate the three feature vectors of every document into one row.

    Args:
        vectors: Per-document vectors; its length decides how many rows are built.
        word_vectors: Per-document vectors, indexed in parallel with ``vectors``.
        tri_vectors: Per-document vectors, indexed in parallel with ``vectors``.

    Returns:
        A numpy array built from the concatenated rows, in document order.
    """
    rows = []
    for idx in tqdm(range(len(vectors))):
        parts = [vectors[idx], word_vectors[idx], tri_vectors[idx]]
        rows.append(np.concatenate(parts))
    return np.array(rows)

In [34]:
# Final training matrix: Doc2Vec vector + word features + trigram features per document.
train = joint_vectors(train_vectors, train_word_vectors, train_tri_vectors)



  0%|                                                | 0/87719 [00:00<?, ?it/s]

 22%|██████▉                         | 18875/87719 [00:00<00:00, 185494.75it/s]

 47%|███████████████                 | 41289/87719 [00:00<00:00, 188250.79it/s]

 74%|███████████████████████▋        | 64805/87719 [00:00<00:00, 199737.65it/s]

100%|████████████████████████████████| 87719/87719 [00:00<00:00, 218382.64it/s]

In [35]:
# Final test matrix: Doc2Vec vector + word features + trigram features per document.
test = joint_vectors(test_vectors, test_word_vectors, test_tri_vectors)



  0%|                                                  | 0/999 [00:00<?, ?it/s]

100%|████████████████████████████████████| 999/999 [00:00<00:00, 110916.95it/s]

In [36]:
# Final validation matrix: Doc2Vec vector + word features + trigram features per document.
val = joint_vectors(val_vectors, val_word_vectors, val_tri_vectors)



  0%|                                                  | 0/998 [00:00<?, ?it/s]

100%|████████████████████████████████████| 998/998 [00:00<00:00, 110794.19it/s]

In [37]:
# Save the training matrix. The file name carries the feature count:
# dim Doc2Vec dimensions + 20 (10 word-category + 10 trigram-category features).
with open('Train_'+str(dim+20)+'.pkl', 'wb') as tr:
    pkl.dump(train, tr)

In [38]:
# Save the test matrix. The file name carries the feature count:
# dim Doc2Vec dimensions + 20 (10 word-category + 10 trigram-category features).
with open('Test_'+str(dim+20)+'.pkl', 'wb') as tst:
    pkl.dump(test, tst)

In [39]:
# Save the validation matrix. The file name carries the feature count:
# dim Doc2Vec dimensions + 20 (10 word-category + 10 trigram-category features).
with open('Val_'+str(dim+20)+'.pkl', 'wb') as vl:
    pkl.dump(val, vl)