In [3]:
import re
import gensim
import logging
import nltk.data
import pandas as pd
from nltk.corpus import stopwords
from gensim.models import word2vec

from tqdm import tqdm
import numpy as np
import wget
import zipfile
import random
import umap.plot

from sklearn.decomposition import TruncatedSVD

In [7]:
# Show INFO-level log records (timestamp : level : message) in the cell output,
# so progress messages from libraries that use `logging` are visible.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [8]:
# Load the previously saved Word2Vec model from disk.
# `model` is a notebook-level global: get_matrix() reads `model.wv` and
# `model.vector_size` directly instead of taking the model as an argument.
model_file = './all_cyberleninka_model/all_cyberleninka.model'
model = gensim.models.Word2Vec.load(model_file)

2022-03-11 14:44:41,999 : INFO : loading Word2Vec object from ./all_cyberleninka_model/all_cyberleninka.model
2022-03-11 14:44:44,077 : INFO : loading wv recursively from ./all_cyberleninka_model/all_cyberleninka.model.wv.* with mmap=None
2022-03-11 14:44:44,078 : INFO : loading vectors from ./all_cyberleninka_model/all_cyberleninka.model.wv.vectors.npy with mmap=None
2022-03-11 14:44:48,178 : INFO : loading syn1neg from ./all_cyberleninka_model/all_cyberleninka.model.syn1neg.npy with mmap=None
2022-03-11 14:44:53,625 : INFO : setting ignored attribute cum_table to None
2022-03-11 14:45:40,774 : INFO : Word2Vec lifecycle event {'fname': './all_cyberleninka_model/all_cyberleninka.model', 'datetime': '2022-03-11T14:45:40.774495', 'gensim': '4.1.2', 'python': '3.8.12 (default, Oct 12 2021, 03:01:40) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19043-SP0', 'event': 'loaded'}


In [9]:
# Read the candidate nouns, one per line.
# Empty entries are dropped: split('\n') yields '' for a trailing newline or a
# blank line, and such an entry is not a word, yet it would still be passed to
# the model.wv[word] lookup in get_matrix().
with open('very_clean_list.txt', encoding='utf8') as f:
    all_nouns = [noun for noun in f.read().split('\n') if noun]

In [10]:
# Sanity check: number of entries read from very_clean_list.txt.
len(all_nouns)

32468

In [19]:
def get_matrix(all_nouns):
    """Stack the embedding vectors of the given words into one matrix.

    Relies on the notebook-level global ``model``: each word is looked up
    through ``model.wv[word]``, so every word has to be retrievable there.

    Args:
        all_nouns: sequence of words; row ``i`` of the result belongs to
            ``all_nouns[i]``.

    Returns:
        A numpy array of shape ``(len(all_nouns), model.vector_size)``.
    """
    keyed_vectors = model.wv
    matrix = np.zeros((len(all_nouns), model.vector_size))
    # Iterating over a 2-D array yields row views, so writing into ``row``
    # fills ``matrix`` in place.
    for row, noun in zip(matrix, all_nouns):
        row[:] = keyed_vectors[noun]
    return matrix

def lsa_matrix(vectors_of_words, n_components, n_iter=100):
    """Project the word vectors with a truncated SVD.

    Args:
        vectors_of_words: matrix with one row per word (e.g. the output of
            ``get_matrix``).
        n_components: number of components passed to ``TruncatedSVD``.
        n_iter: ``n_iter`` setting passed to ``TruncatedSVD``.

    Returns:
        The result of ``TruncatedSVD.fit_transform`` on the input matrix.
        ``random_state`` is fixed to 42.
    """
    svd = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=42)
    return svd.fit_transform(vectors_of_words)

In [20]:
def sort_results2(lsa_data, all_nouns, all_lsa):
    """Reorder words and their LSA rows by descending score.

    Args:
        lsa_data: one score per word; the sort key.
        all_nouns: the words, aligned with ``lsa_data``.
        all_lsa: per-word rows (or values), aligned with ``lsa_data``.

    Returns:
        ``(words, rows)`` as numpy arrays, both reordered so that the entry
        with the largest score comes first.
    """
    # Ascending argsort along axis 0, reversed to get descending order, then
    # flattened so it can index 1-D and 2-D arrays alike.
    descending = np.argsort(lsa_data, axis=0)[::-1].ravel()
    words = np.array(all_nouns)[descending]
    rows = np.array(all_lsa)[descending]
    return words, rows

In [25]:
def get_n_iterations2(all_nouns, iterations):
    """Repeatedly split word groups into three parts by their first LSA component.

    Level ``'0'`` holds the whole word list as a single group. In round ``i``
    (0-based) every group of level ``str(i)`` is embedded with ``get_matrix``,
    reduced with ``lsa_matrix`` to ``i + 2`` components (``n_iter=200``),
    sorted by the first component in descending order and cut into three
    consecutive parts. The first two parts have ``len(group) // 3`` items; the
    last part takes the remainder. The round index is printed as progress.

    Note: a group with fewer than three words gives two empty leading parts.

    Args:
        all_nouns: the words to start from.
        iterations: number of splitting rounds.

    Returns:
        ``(dict_iters, dict_iters_num)``: dictionaries keyed by the level as a
        string. The first maps a level to its list of word groups, the second
        to the matching lists of LSA rows (level ``'0'`` is an empty list).
    """
    groups_by_level = {'0': [all_nouns]}
    rows_by_level = {'0': []}
    for level in range(iterations):
        print(level)
        child_key = str(level + 1)
        child_groups = []
        child_rows = []
        groups_by_level[child_key] = child_groups
        rows_by_level[child_key] = child_rows
        for group in groups_by_level[str(level)]:
            reduced = lsa_matrix(get_matrix(group), level + 2, 200)
            # Column 0 is the first component of every word: the sort key.
            ranked_words, ranked_rows = sort_results2(reduced[:, 0], group, reduced)
            third = len(ranked_words) // 3
            cuts = (slice(None, third), slice(third, 2 * third), slice(2 * third, None))
            child_groups.extend(ranked_words[cut] for cut in cuts)
            child_rows.extend(ranked_rows[cut] for cut in cuts)

    return groups_by_level, rows_by_level

In [26]:
%%time
# Run seven successive splitting rounds over the full noun list.
# `res` is the (dict_iters, dict_iters_num) tuple returned by get_n_iterations2.
res = get_n_iterations2(all_nouns, 7)

0
1
2
3
4
5
6
Wall time: 3min 15s


In [27]:
# NOTE(review): this import sits in a late cell; it would normally live in the
# import cell at the top of the notebook.
import pickle
# Persist `res` (the tuple returned by get_n_iterations2) to disk.
with open('large_1_without_clust.pkl', 'wb') as f:
    pickle.dump(res, f)