In [87]:
import os

import numpy as np
import pandas as pd
from pymystem3 import Mystem

news_df = pd.DataFrame(data=None, columns=['filename', 'text'])
news_df_index = 0
m = Mystem(entire_input=False)

with open('stop-words (Russian).txt', 'r') as file:
    stop_words = file.read().split('\n')

def append_method_stats(filename, text):
    global news_df_index
    news_df.loc[news_df_index] = [filename, text]
    news_df_index += 1

for dirname, _, filenames in os.walk('texts'):
    for filename in filenames:
        filename = os.path.join(dirname, filename)
        with open(filename, 'r') as file:
            text = file.read()
            text = m.lemmatize(text)
            for word in text:
                if word in stop_words:
                    text.remove(word)
            text = ' '.join(text)
            append_method_stats(filename, text)
            
news_df.to_csv('news_df.csv', index=False)

In [132]:
from sklearn.preprocessing import normalize
from scipy import sparse


def tfidf_vectorize(texts):
    m = Mystem(entire_input=False)
    words_set = set()
    documents_count = len(texts)
    words_dict = {}
    texts_words_dict = {}
    
    for i in range(len(texts)):
        if not isinstance(texts[i], (list, tuple, np.ndarray)):
            all_words = m.lemmatize(texts[i])
        else:
            all_words = texts[i]
        
        for word in all_words:
            words_set.add(word)
            
            if i not in texts_words_dict:
                texts_words_dict[i] = {}
            
            if word not in texts_words_dict[i]:
                texts_words_dict[i][word] = 1
            else:
                texts_words_dict[i][word] += 1
        for word, _ in texts_words_dict[i].items():
            if word not in words_dict:
                words_dict[word] = 1
            else:
                words_dict[word] += 1

    num_words = len(words_set)
            
    vectors = np.zeros((len(texts), num_words))
    for i in range(len(texts)):
        vector = np.zeros(num_words)
        for j, word in enumerate(words_set):
            if word in texts_words_dict[i]:
                tf = texts_words_dict[i][word] / len(texts[i])
                idf = np.log((1 + documents_count) / (1 + words_dict[word])) + 1
                vector[j] = tf * idf
        
        vector_norm = np.sqrt(np.sum(vector ** 2))
        for k in range(num_words):
            vector[k] /= vector_norm
        vectors[i] = vector
        
    return sparse.csr_matrix(vectors), list(words_set)

def freq_vectorize(texts):
    m = Mystem(entire_input=False)
    words_set = set()
    texts_words_dict = {}
    
    for i in range(len(texts)):
        if not isinstance(texts[i], (list, tuple, np.ndarray)):
            all_words = m.lemmatize(texts[i])
        else:
            all_words = texts[i]
            
        for word in all_words:
            words_set.add(word)
            
            if i not in texts_words_dict:
                texts_words_dict[i] = {}
            
            if word not in texts_words_dict[i]:
                texts_words_dict[i][word] = 1
            else:
                texts_words_dict[i][word] += 1

    num_words = len(words_set)
            
    vectors = np.zeros((len(texts), num_words))
    for i in range(len(texts)):
        vector = np.zeros(num_words)
        for j, word in enumerate(words_set):
            if word in texts_words_dict[i]:
                vector[j] = texts_words_dict[i][word]
        vectors[i] = vector
        
    return sparse.csr_matrix(vectors), list(words_set)

def normfreq_vectorize(texts):
    m = Mystem(entire_input=False)
    words_set = set()
    texts_words_dict = {}
    
    for i in range(len(texts)):
        if not isinstance(texts[i], (list, tuple, np.ndarray)):
            all_words = m.lemmatize(texts[i])
        else:
            all_words = texts[i]
            
        for word in all_words:
            words_set.add(word)
            
            if i not in texts_words_dict:
                texts_words_dict[i] = {}
            
            if word not in texts_words_dict[i]:
                texts_words_dict[i][word] = 1
            else:
                texts_words_dict[i][word] += 1

    num_words = len(words_set)
            
    vectors = np.zeros((len(texts), num_words))
    for i in range(len(texts)):
        vector = np.zeros(num_words)
        for j, word in enumerate(words_set):
            if word in texts_words_dict[i]:
                vector[j] = texts_words_dict[i][word]
        vectors[i] = vector
        
    return normalize(sparse.csr_matrix(vectors), norm="l1", axis=1), list(words_set)

def vectorize_texts(df, method='tfidf'):
    if method == 'tfidf':
        vectors, terms = tfidf_vectorize(df['text'])
        return vectors, terms
    elif method == 'freq':
        vectors, terms = freq_vectorize(df['text'])
        return vectors, terms
    elif method == 'normfreq':
        vectors, terms = normfreq_vectorize(df['text'])
        return vectors, terms
    
tfidf_vectors, tfidf_terms = vectorize_texts(news_df, method='tfidf')
freq_vectors, freq_terms = vectorize_texts(news_df, method='freq')
normfreq_vectors, normfreq_terms = vectorize_texts(news_df, method='normfreq')

sparse.save_npz('vectors/tfidf_vectors.npz', tfidf_vectors)
np.save('vectors/tfidf_terms.npy', tfidf_terms)
sparse.save_npz('vectors/freq_vectors.npz', freq_vectors)
np.save('vectors/freq_terms.npy', freq_terms)
# print(freq_vectors[:10])
sparse.save_npz('vectors/normfreq_vectors.npz', normfreq_vectors)
np.save('vectors/normfreq_terms.npy', normfreq_terms)

  (0, 66)	2.0
  (0, 83)	1.0
  (0, 85)	3.0
  (0, 92)	1.0
  (0, 142)	1.0
  (0, 256)	2.0
  (0, 266)	1.0
  (0, 308)	1.0
  (0, 310)	1.0
  (0, 498)	1.0
  (0, 510)	1.0
  (0, 584)	1.0
  (0, 664)	1.0
  (0, 758)	1.0
  (0, 824)	1.0
  (0, 833)	1.0
  (0, 1011)	1.0
  (0, 1060)	1.0
  (0, 1075)	1.0
  (0, 1129)	2.0
  (0, 1133)	1.0
  (0, 1153)	1.0
  (0, 1165)	1.0
  (0, 1182)	1.0
  (0, 1190)	3.0
  :	:
  (9, 1273)	3.0
  (9, 1277)	1.0
  (9, 1294)	2.0
  (9, 1303)	1.0
  (9, 1412)	1.0
  (9, 1445)	1.0
  (9, 1490)	1.0
  (9, 1495)	1.0
  (9, 1500)	1.0
  (9, 1515)	1.0
  (9, 1533)	1.0
  (9, 1590)	2.0
  (9, 1598)	2.0
  (9, 1650)	2.0
  (9, 1673)	1.0
  (9, 1805)	1.0
  (9, 1919)	1.0
  (9, 1921)	1.0
  (9, 1981)	3.0
  (9, 2019)	1.0
  (9, 2062)	1.0
  (9, 2083)	2.0
  (9, 2134)	2.0
  (9, 2188)	1.0
  (9, 2265)	1.0


In [123]:
tfidf_vectors.shape

(74, 2325)

In [131]:
print(sparse.csr_matrix([1, 2, 3, 4, 5]))

  (0, 0)	1
  (0, 1)	2
  (0, 2)	3
  (0, 3)	4
  (0, 4)	5


In [124]:
freq_vectors.shape

(74, 2325)

In [125]:
normfreq_vectors.shape

(74, 2325)

In [126]:
import time


def add_document_to_collection(filepath):
    news_df = pd.read_csv('news_df.csv')
    news_df_index = news_df.shape[0]

    with open('stop-words (Russian).txt', 'r') as file:
        stop_words = file.read().split('\n')

    with open(filepath, 'r') as file:
        text = file.read()
        text = m.lemmatize(text)
        for word in text:
            if word in stop_words:
                text.remove(word)
        text = ' '.join(text)
        news_df.loc[news_df_index] = [filename, text]

    news_df.to_csv('news_df.csv', index=False)
    print(news_df.shape)

    start_time = time.time()
    tfidf_vectors, tfidf_terms = vectorize_texts(news_df, method='tfidf')
    print(tfidf_vectors.shape)
    print(f"TF-IDF model vectorization time: {time.time() - start_time}")

    start_time = time.time()
    freq_vectors, freq_terms = vectorize_texts(news_df, method='freq')
    print(freq_vectors.shape)
    print(f"Frequency model vectorization time: {time.time() - start_time}")

    start_time = time.time()
    normfreq_vectors, normfreq_terms = vectorize_texts(news_df, method='normfreq')
    print(normfreq_vectors.shape)
    print(f"Normalized frequency model vectorization time: {time.time() - start_time}")

    sparse.save_npz('vectors/tfidf_vectors.npz', tfidf_vectors)
    np.save('vectors/tfidf_terms.npy', tfidf_terms)
    sparse.save_npz('vectors/freq_vectors.npz', freq_vectors)
    np.save('vectors/freq_terms.npy', freq_terms)
    sparse.save_npz('vectors/normfreq_vectors.npz', normfreq_vectors)
    np.save('vectors/normfreq_terms.npy', normfreq_terms)

add_document_to_collection('Финляндия.txt')

In [127]:
def choose_top_10(array):
    top_10_indexes = []
    top_10_values = []
    for i in range(10):
        max_index = np.argmax(array)
        top_10_indexes.append(max_index)
        top_10_values.append(array[max_index])
        array[max_index] = 0
    return top_10_indexes, top_10_values

def get_top_terms(vectors, terms):
    news_df = pd.read_csv('news_df.csv')
    text_names = news_df['filename']
    df = pd.DataFrame(data=None, columns=['text', 'top_terms'])
    df_index = 0
    
    rows, _ = vectors.shape
    vectors = vectors.toarray()
    for i in range(rows):
        top_10_indexes, top_10_values = choose_top_10(vectors[i, :])
        top_terms_json = {}
        for index, value in zip(top_10_indexes, top_10_values):
            top_terms_json[terms[index]] = value
        df.loc[df_index] = [text_names[i], top_terms_json]
        df_index += 1
    return df
        
top_tf_idf_terms = get_top_terms(tfidf_vectors, tfidf_terms)
top_tf_idf_terms.to_csv('top_tf_idf_terms.csv', index=False)

In [128]:
top_freq_terms = get_top_terms(freq_vectors, freq_terms)
top_freq_terms.to_csv('top_freq_terms.csv', index=False)

In [129]:
top_normfreq_terms = get_top_terms(normfreq_vectors, normfreq_terms)
top_normfreq_terms.to_csv('top_normfreq_terms.csv', index=False)