In [1]:
from nltk import WordPunctTokenizer
from tqdm import tqdm
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import pickle as pkl



In [2]:
# Shared tokenizer instance; acquire_documents() uses it to split every corpus line into tokens.
tokenizer = WordPunctTokenizer()

In [3]:
def acquire_documents(filename):
    """Load a lemmatised corpus file and prepare it for Doc2Vec.

    Reads ``<filename>.txt`` as UTF-8, treats every newline-separated line as
    one document, tokenises each line with the module-level ``tokenizer`` and
    wraps each token list in a ``TaggedDocument`` tagged with its line index.

    Args:
        filename: Path of the corpus file *without* the ``.txt`` extension.

    Returns:
        A tuple ``(lemmas, tokens, documents)`` where ``lemmas`` is the list of
        raw lines, ``tokens`` the per-line token lists and ``documents`` the
        corresponding ``TaggedDocument`` objects (tag = line index).
    """
    # The context manager closes the file on exit, so no explicit close() is needed.
    with open(filename + '.txt', 'r', encoding='utf-8') as corpus_file:
        # split('\n') keeps a trailing empty string when the file ends with a
        # newline; that empty line becomes an (empty) document of its own.
        lemmas = corpus_file.read().split('\n')
    tokens = tokenizer.tokenize_sents(lemmas)
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokens)]
    return lemmas, tokens, documents

In [4]:
# Load the training corpus: raw lines, their token lists, and TaggedDocuments for Doc2Vec.
train_texts, train_tokens, train_documents = acquire_documents('Train_lemmas')

In [5]:
# Doc2Vec embedding size; also used below to build the names of the output pickle files.
dim = 300

In [6]:
# Train Doc2Vec on the training documents only; each document vector has `dim` components.
# NOTE(review): no `seed` is passed and workers=4, so training is presumably not
# reproducible from run to run — confirm against the gensim documentation.
model = Doc2Vec(train_documents, vector_size=dim, window=2, min_count=1, workers=4)

In [7]:
# Persist the trained model to disk.
model.save('Train_only.model')

In [8]:
# Infer an embedding for every training document with the trained model (tqdm shows progress).
# NOTE(review): vectors are re-inferred instead of being read from the trained model —
# presumably so train/test/val embeddings are produced the same way; confirm.
train_vectors = [model.infer_vector(i.words) for i in tqdm(train_documents)]

100%|███████████████████████████████████| 87844/87844 [02:20<00:00, 624.09it/s]


In [9]:
# Save the inferred training embeddings as one NumPy array; the file name carries `dim`.
with open('Train_{}.pkl'.format(dim), 'wb') as tw001:
    pkl.dump(np.array(train_vectors), tw001)

In [10]:
# Load the test corpus in the same three representations as the training corpus.
test_texts, test_tokens, test_documents = acquire_documents('Test_lemmas')

In [11]:
# Infer test embeddings with the model that was trained on the training documents.
test_vectors = [model.infer_vector(i.words) for i in tqdm(test_documents)]

100%|█████████████████████████████████████| 1000/1000 [00:04<00:00, 210.24it/s]


In [12]:
# Save the inferred test embeddings as one NumPy array; the file name carries `dim`.
with open('Test_{}.pkl'.format(dim), 'wb') as tw002:
    pkl.dump(np.array(test_vectors), tw002)

In [13]:
# Load the validation corpus in the same three representations as the training corpus.
val_texts, val_tokens, val_documents = acquire_documents('Val_lemmas')

In [14]:
# Infer validation embeddings with the model that was trained on the training documents.
val_vectors = [model.infer_vector(i.words) for i in tqdm(val_documents)]

100%|█████████████████████████████████████| 1000/1000 [00:01<00:00, 631.02it/s]


In [15]:
# Save the inferred validation embeddings as one NumPy array; the file name carries `dim`.
with open('Val_{}.pkl'.format(dim), 'wb') as tw003:
    pkl.dump(np.array(val_vectors), tw003)

In [13]:
# Load a dict from the pickle and keep the first 300 items of every value as a set,
# so that membership tests in words_from_tokens() are cheap.
# NOTE: pickle.load can execute arbitrary code — only load pickles from a trusted source.
# The `with` block closes the file on exit; the former explicit t.close() was redundant.
with open('top_threehundred_sorted_cutathundred.pkl', 'rb') as t:
    top_word_sets = {k:set(val[:300]) for k, val in (pkl.load(t)).items()}

In [14]:
# Sanity check: display the word set stored under the 'anime' key.
top_word_sets['anime']

{'акир',
 'алхимик',
 'аниме',
 'анимешка',
 'анимешник',
 'анимешный',
 'анимэ',
 'арка',
 'арт',
 'бибоп',
 'блич',
 'боевой',
 'вампир',
 'википедия',
 'волос',
 'героиня',
 'гиас',
 'дворецкий',
 'девочка',
 'досмотреть',
 'доставлять',
 'дубляж',
 'дух',
 'ева',
 'евангелиона',
 'замок',
 'звать',
 'июль',
 'кадр',
 'картинка',
 'концовка',
 'кп',
 'кровь',
 'лайт',
 'манга',
 'мех',
 'милый',
 'миядзака',
 'мульт',
 'мультик',
 'мультфильм',
 'мусор',
 'нарисованный',
 'нарута',
 'наруто',
 'наткнуться',
 'овашка',
 'опенинг',
 'пафос',
 'пересматривать',
 'пересмотреть',
 'персонаж',
 'повседневность',
 'поделиться',
 'покемон',
 'полнометражка',
 'полнометражный',
 'призрак',
 'продолжение',
 'происходящий',
 'просмотр',
 'рейтинг',
 'рисовка',
 'саба',
 'самурай',
 'сантиметр',
 'седзе',
 'сейя',
 'сенэн',
 'сериал',
 'серия',
 'синкай',
 'сиська',
 'скрытый',
 'смерть',
 'тайтл',
 'творение',
 'тетрадка',
 'тетрадь',
 'туалет',
 'унесённый',
 'ученик',
 'фансервис',
 'фантази

In [15]:
categs = ['anime', 'art', 'books', 'films', 'food', 'football', 'games', 'music', 'nature', 'travel']
# weights = [len(v) / max([len(v) for v in top_sets.values()]) for v in top_sets.values()] # normal weights per quantity
weights = [1] * len(categs) # non-existent weights

def words_from_tokens(tokens, word_sets=None):
    """Compute per-category keyword frequencies for one tokenised document.

    For every category in ``categs`` the function counts how many tokens occur
    in that category's word set, divides the count by the number of tokens and
    then by the category's entry in ``weights``.

    Args:
        tokens: Sequence of tokens of a single document.
        word_sets: Optional mapping ``category -> set of words``. Defaults to
            the module-level ``top_word_sets``.

    Returns:
        A float32 NumPy array with one value per entry of ``categs``, in the
        same order. An empty ``tokens`` yields all zeros.
    """
    if word_sets is None:
        word_sets = top_word_sets
    # Guard against division by zero for empty documents (count is 0 there anyway).
    denom = max(len(tokens), 1)
    vec = [
        (sum(1 for t in tokens if t in word_sets[categ]) / denom) / weight
        for categ, weight in zip(categs, weights)
    ]
    return np.array(vec, dtype='float32')

In [16]:
def vectors_from_corpus(corpus):
    """Build one word-category feature vector per document of ``corpus``.

    Iterates ``corpus`` under a tqdm progress bar, applies
    ``words_from_tokens`` to every element and returns the results as a list
    in corpus order.
    """
    collected = []
    for doc_tokens in tqdm(corpus):
        collected.append(words_from_tokens(doc_tokens))
    return collected

In [17]:
# Word-category features for the training set, computed from the token lists.
train_word_vectors = vectors_from_corpus(train_tokens)

100%|█████████████████████████████████| 87844/87844 [00:04<00:00, 18606.89it/s]


In [18]:
# Word-category features for the test set, computed from the token lists.
test_word_vectors = vectors_from_corpus(test_tokens)

100%|███████████████████████████████████| 1000/1000 [00:00<00:00, 16382.21it/s]


In [19]:
# Word-category features for the validation set, computed from the token lists.
val_word_vectors = vectors_from_corpus(val_tokens)

100%|███████████████████████████████████| 1000/1000 [00:00<00:00, 14072.72it/s]


In [20]:
# Load a dict from the pickle and keep the first 300 items of every value as a set;
# tris_from_text() iterates these sets per category.
# NOTE: pickle.load can execute arbitrary code — only load pickles from a trusted source.
# The `with` block closes the file on exit; the former explicit t.close() was redundant.
with open('trigrams_threehundred_sorted.pkl', 'rb') as t:
    top_tri_sets = {k:set(val[:300]) for k, val in (pkl.load(t)).items()}

In [24]:
# Same category list and unit weights as used for the word features (re-declared in this cell).
categs = ['anime', 'art', 'books', 'films', 'food', 'football', 'games', 'music', 'nature', 'travel']
# weights = [len(v) / max([len(v) for v in top_sets.values()]) for v in top_sets.values()] # normal weights per quantity
weights = [1] * len(categs) # non-existent weights

def tris_from_text(text, tri_sets=None):
    """Compute per-category trigram hit rates for one document.

    For every category in ``categs`` the function counts how many of that
    category's trigrams satisfy ``trig in text``, divides the count by
    ``len(text)`` and then by the category's entry in ``weights``.

    Args:
        tokens: n/a
        text: The document; the notebook passes the raw line (a ``str``), so
            ``trig in text`` is a substring test and ``len(text)`` is the
            number of characters.
        tri_sets: Optional mapping ``category -> set of trigrams``. Defaults
            to the module-level ``top_tri_sets``.

    Returns:
        A float32 NumPy array with one value per entry of ``categs``, in the
        same order.
    """
    if tri_sets is None:
        tri_sets = top_tri_sets
    # Guard against division by zero for empty documents.
    # NOTE(review): normalising by character count rather than by number of
    # trigrams looks unusual — confirm this is intended.
    denom = max(len(text), 1)
    vec = [
        (sum(1 for trig in tri_sets[categ] if trig in text) / denom) / weight
        for categ, weight in zip(categs, weights)
    ]
    return np.array(vec, dtype='float32')

In [25]:
def trigrams_from_corpus(corpus):
    """Build one trigram-category feature vector per document of ``corpus``.

    Iterates ``corpus`` under a tqdm progress bar, applies ``tris_from_text``
    to every element and returns the results as a list in corpus order.
    """
    collected = []
    for doc_text in tqdm(corpus):
        collected.append(tris_from_text(doc_text))
    return collected

In [26]:
# Trigram-category features for the training set, computed from the raw text lines.
train_tri_vectors = trigrams_from_corpus(train_texts)


  0%|                                                | 0/87844 [00:00<?, ?it/s]
  0%|                                    | 214/87844 [00:00<00:40, 2138.50it/s]
  0%|▏                                   | 439/87844 [00:00<00:40, 2169.96it/s]
  1%|▎                                   | 694/87844 [00:00<00:38, 2271.14it/s]
  1%|▍                                   | 931/87844 [00:00<00:37, 2299.36it/s]
  1%|▍                                  | 1167/87844 [00:00<00:37, 2316.74it/s]
  2%|▌                                  | 1402/87844 [00:00<00:37, 2326.15it/s]
  2%|▋                                  | 1622/87844 [00:00<00:37, 2286.37it/s]
  2%|▋                                  | 1875/87844 [00:00<00:36, 2353.92it/s]
  2%|▊                                  | 2099/87844 [00:00<00:36, 2317.93it/s]
  3%|▉                                  | 2354/87844 [00:01<00:35, 2382.52it/s]
  3%|█                                  | 2588/87844 [00:01<00:36, 2361.76it/s]
  3%|█                                 

 54%|██████████████████▎               | 47227/87844 [00:21<00:16, 2525.12it/s]
 54%|██████████████████▍               | 47483/87844 [00:21<00:16, 2440.36it/s]
 54%|██████████████████▍               | 47745/87844 [00:21<00:16, 2490.47it/s]
 55%|██████████████████▌               | 48004/87844 [00:21<00:15, 2517.63it/s]
 55%|██████████████████▋               | 48258/87844 [00:21<00:15, 2516.26it/s]
 55%|██████████████████▊               | 48518/87844 [00:21<00:15, 2539.88it/s]
 56%|██████████████████▉               | 48773/87844 [00:21<00:15, 2534.78it/s]
 56%|██████████████████▉               | 49030/87844 [00:21<00:15, 2544.71it/s]
 56%|███████████████████               | 49285/87844 [00:21<00:15, 2538.26it/s]
 56%|███████████████████▏              | 49540/87844 [00:21<00:15, 2446.08it/s]
 57%|███████████████████▎              | 49794/87844 [00:22<00:15, 2471.60it/s]
 57%|███████████████████▎              | 50042/87844 [00:22<00:15, 2473.66it/s]
 57%|███████████████████▍              |

In [27]:
# Trigram-category features for the test set, computed from the raw text lines.
test_tri_vectors = trigrams_from_corpus(test_texts)


  0%|                                                 | 0/1000 [00:00<?, ?it/s]
 20%|███████▏                             | 195/1000 [00:00<00:00, 1948.59it/s]
 39%|██████████████▌                      | 392/1000 [00:00<00:00, 1954.18it/s]
 61%|██████████████████████▋              | 614/1000 [00:00<00:00, 2013.29it/s]
 86%|███████████████████████████████▉     | 862/1000 [00:00<00:00, 2133.45it/s]
100%|████████████████████████████████████| 1000/1000 [00:00<00:00, 2041.61it/s]

In [28]:
# Trigram-category features for the validation set, computed from the raw text lines.
val_tri_vectors = trigrams_from_corpus(val_texts)


  0%|                                                 | 0/1000 [00:00<?, ?it/s]
 20%|███████▌                             | 204/1000 [00:00<00:00, 2038.52it/s]
 40%|██████████████▋                      | 396/1000 [00:00<00:00, 1994.58it/s]
 63%|███████████████████████▎             | 630/1000 [00:00<00:00, 2086.39it/s]
 86%|███████████████████████████████▉     | 863/1000 [00:00<00:00, 2153.41it/s]
100%|████████████████████████████████████| 1000/1000 [00:00<00:00, 2076.61it/s]

In [29]:
def joint_vectors(vectors, word_vectors, tri_vectors):
    """Concatenate three aligned feature lists row by row.

    For every index of ``vectors`` the matching entries of ``vectors``,
    ``word_vectors`` and ``tri_vectors`` are joined (in that order) into one
    vector. Progress is shown with tqdm.

    Returns:
        A NumPy array built from the list of concatenated rows.
    """
    rows = []
    for idx in tqdm(range(len(vectors))):
        parts = [vectors[idx], word_vectors[idx], tri_vectors[idx]]
        rows.append(np.concatenate(parts))
    return np.array(rows)

In [30]:
# Full training feature matrix: Doc2Vec embedding + word-category + trigram-category features.
train = joint_vectors(train_vectors, train_word_vectors, train_tri_vectors)


  0%|                                                | 0/87844 [00:00<?, ?it/s]
 31%|█████████▉                      | 27357/87844 [00:00<00:00, 273349.33it/s]
 62%|███████████████████▊            | 54305/87844 [00:00<00:00, 271820.78it/s]
 93%|█████████████████████████████▊  | 81703/87844 [00:00<00:00, 272406.53it/s]
100%|████████████████████████████████| 87844/87844 [00:00<00:00, 267324.04it/s]

In [31]:
# Full test feature matrix: Doc2Vec embedding + word-category + trigram-category features.
test = joint_vectors(test_vectors, test_word_vectors, test_tri_vectors)


  0%|                                                 | 0/1000 [00:00<?, ?it/s]
100%|██████████████████████████████████| 1000/1000 [00:00<00:00, 124882.51it/s]

In [32]:
# Full validation feature matrix: Doc2Vec embedding + word-category + trigram-category features.
val = joint_vectors(val_vectors, val_word_vectors, val_tri_vectors)


  0%|                                                 | 0/1000 [00:00<?, ?it/s]
100%|██████████████████████████████████| 1000/1000 [00:00<00:00, 124915.98it/s]

In [33]:
# Save the full training matrix. dim+20 in the file name = `dim` embedding
# components + 10 word-category + 10 trigram-category features.
with open('Train_'+str(dim+20)+'.pkl', 'wb') as tr:
    pkl.dump(train, tr)

In [34]:
# Save the full test matrix. dim+20 in the file name = `dim` embedding
# components + 10 word-category + 10 trigram-category features.
with open('Test_'+str(dim+20)+'.pkl', 'wb') as tst:
    pkl.dump(test, tst)

In [35]:
# Save the full validation matrix. dim+20 in the file name = `dim` embedding
# components + 10 word-category + 10 trigram-category features.
with open('Val_'+str(dim+20)+'.pkl', 'wb') as vl:
    pkl.dump(val, vl)

In [36]:
def joint_vectors_tris(vectors, tri_vectors):
    """Concatenate two aligned feature lists row by row.

    For every index of ``vectors`` the matching entries of ``vectors`` and
    ``tri_vectors`` are joined (in that order) into one vector. Progress is
    shown with tqdm.

    Returns:
        A NumPy array built from the list of concatenated rows.
    """
    rows = []
    for idx in tqdm(range(len(vectors))):
        pair = [vectors[idx], tri_vectors[idx]]
        rows.append(np.concatenate(pair))
    return np.array(rows)

In [37]:
# Training features without the word-category part: Doc2Vec embedding + trigram-category features.
train_tri = joint_vectors_tris(train_vectors, train_tri_vectors)


  0%|                                                | 0/87844 [00:00<?, ?it/s]
 38%|████████████▎                   | 33804/87844 [00:00<00:00, 337799.51it/s]
 73%|███████████████████████▎        | 63870/87844 [00:00<00:00, 325655.59it/s]
100%|████████████████████████████████| 87844/87844 [00:00<00:00, 325119.07it/s]

In [38]:
# Test features without the word-category part: Doc2Vec embedding + trigram-category features.
test_tri = joint_vectors_tris(test_vectors, test_tri_vectors)


  0%|                                                 | 0/1000 [00:00<?, ?it/s]
100%|██████████████████████████████████| 1000/1000 [00:00<00:00, 142755.66it/s]

In [39]:
# Validation features without the word-category part: Doc2Vec embedding + trigram-category features.
val_tri = joint_vectors_tris(val_vectors, val_tri_vectors)


  0%|                                                 | 0/1000 [00:00<?, ?it/s]
100%|██████████████████████████████████| 1000/1000 [00:00<00:00, 124908.54it/s]

In [40]:
# Save embedding + trigram features for training. dim+10 = `dim` embedding
# components + 10 trigram-category features.
with open('Train_'+str(dim+10)+'_tris.pkl', 'wb') as tr1:
    pkl.dump(train_tri, tr1)

In [41]:
# Save embedding + trigram features for test. dim+10 = `dim` embedding
# components + 10 trigram-category features.
with open('Test_'+str(dim+10)+'_tris.pkl', 'wb') as tst1:
    pkl.dump(test_tri, tst1)

In [42]:
# Save embedding + trigram features for validation. dim+10 = `dim` embedding
# components + 10 trigram-category features.
with open('Val_'+str(dim+10)+'_tris.pkl', 'wb') as vl1:
    pkl.dump(val_tri, vl1)

In [43]:
def joint_vectors_twenty(word_vectors, tri_vectors):
    """Concatenate word-category and trigram-category features row by row.

    For every index of ``word_vectors`` the matching entries of
    ``word_vectors`` and ``tri_vectors`` are joined (in that order) into one
    vector. Progress is shown with tqdm.

    Returns:
        A NumPy array built from the list of concatenated rows.
    """
    rows = []
    for idx in tqdm(range(len(word_vectors))):
        pair = [word_vectors[idx], tri_vectors[idx]]
        rows.append(np.concatenate(pair))
    return np.array(rows)

In [44]:
# Training features without the Doc2Vec part: word-category + trigram-category features.
# Calls joint_vectors_twenty, the helper defined for this pairing; the previous call to
# joint_vectors_tris performed the same concatenation but left that helper unused.
train_tw = joint_vectors_twenty(train_word_vectors, train_tri_vectors)


  0%|                                                | 0/87844 [00:00<?, ?it/s]
 46%|██████████████▌                 | 40077/87844 [00:00<00:00, 400448.63it/s]
100%|████████████████████████████████| 87844/87844 [00:00<00:00, 444830.25it/s]

In [45]:
# Test features without the Doc2Vec part: word-category + trigram-category features.
# Calls joint_vectors_twenty, the helper defined for this pairing; the previous call to
# joint_vectors_tris performed the same concatenation but left that helper unused.
test_tw = joint_vectors_twenty(test_word_vectors, test_tri_vectors)


  0%|                                                 | 0/1000 [00:00<?, ?it/s]
100%|██████████████████████████████████| 1000/1000 [00:00<00:00, 166566.22it/s]

In [46]:
# Validation features without the Doc2Vec part: word-category + trigram-category features.
# Calls joint_vectors_twenty, the helper defined for this pairing; the previous call to
# joint_vectors_tris performed the same concatenation but left that helper unused.
val_tw = joint_vectors_twenty(val_word_vectors, val_tri_vectors)


  0%|                                                 | 0/1000 [00:00<?, ?it/s]
100%|██████████████████████████████████| 1000/1000 [00:00<00:00, 166486.88it/s]

In [47]:
# Save the word + trigram training features ("twenty" = 10 word-category + 10 trigram-category values).
with open('Train_twenty.pkl', 'wb') as tw1:
    pkl.dump(train_tw, tw1)

In [48]:
# Save the word + trigram test features ("twenty" = 10 word-category + 10 trigram-category values).
with open('Test_twenty.pkl', 'wb') as tw2:
    pkl.dump(test_tw, tw2)

In [49]:
# Save the word + trigram validation features ("twenty" = 10 word-category + 10 trigram-category values).
with open('Val_twenty.pkl', 'wb') as tw3:
    pkl.dump(val_tw, tw3)

In [52]:
def joint_vector_words(vectors, word_vectors):
    """Concatenate embeddings and word-category features row by row.

    For every index of ``vectors`` the matching entries of ``vectors`` and
    ``word_vectors`` are joined (in that order) into one vector. Progress is
    shown with tqdm.

    Returns:
        A NumPy array built from the list of concatenated rows.
    """
    rows = []
    for idx in tqdm(range(len(vectors))):
        pair = [vectors[idx], word_vectors[idx]]
        rows.append(np.concatenate(pair))
    return np.array(rows)

In [57]:
# Training features without the trigram part: Doc2Vec embedding + word-category features.
train_words = joint_vector_words(train_vectors, train_word_vectors)


  0%|                                                | 0/87844 [00:00<?, ?it/s]
 37%|███████████▉                    | 32833/87844 [00:00<00:00, 328097.20it/s]
 74%|███████████████████████▊        | 65275/87844 [00:00<00:00, 326747.74it/s]
100%|████████████████████████████████| 87844/87844 [00:00<00:00, 324912.36it/s]

In [58]:
# Test features without the trigram part: Doc2Vec embedding + word-category features.
test_words = joint_vector_words(test_vectors, test_word_vectors)


  0%|                                                 | 0/1000 [00:00<?, ?it/s]
100%|██████████████████████████████████| 1000/1000 [00:00<00:00, 142716.80it/s]

In [59]:
# Validation features without the trigram part: Doc2Vec embedding + word-category features.
val_words = joint_vector_words(val_vectors, val_word_vectors)


  0%|                                                 | 0/1000 [00:00<?, ?it/s]
100%|██████████████████████████████████| 1000/1000 [00:00<00:00, 124904.82it/s]

In [60]:
# Save embedding + word features for training. dim+10 = `dim` embedding
# components + 10 word-category features.
with open('Train_'+str(dim+10)+'_allwords.pkl', 'wb') as w1:
    pkl.dump(train_words, w1)

In [61]:
# Save embedding + word features for test. dim+10 = `dim` embedding
# components + 10 word-category features.
with open('Test_'+str(dim+10)+'_allwords.pkl', 'wb') as w2:
    pkl.dump(test_words, w2)

In [62]:
# Save embedding + word features for validation. dim+10 = `dim` embedding
# components + 10 word-category features.
with open('Val_'+str(dim+10)+'_allwords.pkl', 'wb') as w3:
    pkl.dump(val_words, w3)