In [38]:
import sys
sys.path.append('..')

In [39]:
from dlnlputils.data import build_vocabulary, tokenize_corpus, vectorize_texts

In [40]:
# Toy corpus: four short Russian texts, one text per line of the literal.
raw_corpus = """Казнить нельзя, помиловать. Нельзя наказывать.
Казнить, нельзя помиловать. Нельзя освободить.
Нельзя не помиловать.
Обязательно освободить."""
texts = raw_corpus.splitlines()

# Tokenize every text; the printed result shows lower-cased tokens with the
# punctuation removed.
# NOTE(review): min_token_size=1 presumably keeps even one-character tokens —
# confirm against dlnlputils.
tokenized_texts = tokenize_corpus(texts, min_token_size=1)
print(tokenized_texts)

# `vocab` maps token -> column index; judging by the printed values, `freq`
# holds, per column, the share of texts that contain the token
# (e.g. 0.75 = 3 of the 4 texts).
vocab, freq = build_vocabulary(tokenized_texts, min_count=1)
print(vocab, freq)

[['казнить', 'нельзя', 'помиловать', 'нельзя', 'наказывать'], ['казнить', 'нельзя', 'помиловать', 'нельзя', 'освободить'], ['нельзя', 'не', 'помиловать'], ['обязательно', 'освободить']]
{'помиловать': 0, 'нельзя': 1, 'казнить': 2, 'освободить': 3, 'наказывать': 4, 'не': 5, 'обязательно': 6} [0.75 0.75 0.5  0.5  0.25 0.25 0.25]


In [41]:
import scipy.sparse

# Raw count matrix: one row per text, one column per vocabulary entry.
# DOK format is used because the matrix is filled one cell at a time.
n_texts, n_tokens = len(tokenized_texts), len(vocab)
result = scipy.sparse.dok_matrix((n_texts, n_tokens), dtype='float32')

for row_idx, doc_tokens in enumerate(tokenized_texts):
    for tok in doc_tokens:
        col_idx = vocab.get(tok)
        if col_idx is None:
            # Token is not in the vocabulary: it contributes nothing.
            continue
        result[row_idx, col_idx] += 1

In [42]:
# Show the sparse count matrix as a labelled table: rows are the source
# texts, columns are the vocabulary tokens.
# Column labels come from iterating `vocab`, so this relies on the dict's
# iteration order matching its column indices (true for the vocab printed above).
import pandas as pd

dense_counts = result.toarray()
df = pd.DataFrame(dense_counts, index=texts, columns=list(vocab))
df

Unnamed: 0,помиловать,нельзя,казнить,освободить,наказывать,не,обязательно
"Казнить нельзя, помиловать. Нельзя наказывать.",1.0,2.0,1.0,0.0,1.0,0.0,0.0
"Казнить, нельзя помиловать. Нельзя освободить.",1.0,2.0,1.0,1.0,0.0,0.0,0.0
Нельзя не помиловать.,1.0,1.0,0.0,0.0,0.0,1.0,0.0
Обязательно освободить.,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [43]:
# Work on a copy: the following cells rebind/convert `temp` only, so `result`
# keeps the raw counts.
temp = result.copy()

In [44]:
# Dense view of the count matrix (rows = texts, columns = vocabulary indices).
temp.toarray()

array([[1., 2., 1., 0., 1., 0., 0.],
       [1., 2., 1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 1.]], dtype=float32)

In [45]:
# Mean over every cell of the matrix, zeros included (total count / number of cells).
temp.mean()

0.5357143

In [46]:
# Print the stored "(row, column)  value" entries after converting to CSC format.
print(temp.tocsc())

  (0, 0)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (0, 1)	2.0
  (1, 1)	2.0
  (2, 1)	1.0
  (0, 2)	1.0
  (1, 2)	1.0
  (1, 3)	1.0
  (3, 3)	1.0
  (0, 4)	1.0
  (2, 5)	1.0
  (3, 6)	1.0


In [47]:
# Term frequency: rescale every text (row) so that its counts sum to 1.
temp = temp.tocsr()
row_totals = temp.sum(axis=1)  # one total per row
sr = 1 / row_totals
tf = temp.multiply(sr)  # the per-row factor is broadcast across the columns
pd.DataFrame(tf.toarray())

Unnamed: 0,0,1,2,3,4,5,6
0,0.2,0.4,0.2,0.0,0.2,0.0,0.0
1,0.2,0.4,0.2,0.2,0.0,0.0,0.0
2,0.333333,0.333333,0.0,0.0,0.0,0.333333,0.0
3,0.0,0.0,0.0,0.5,0.0,0.0,0.5


In [48]:
# Column-wise counterpart of the row normalisation: rescale every token
# (column) so that its counts sum to 1 across the texts.
temp = temp.tocsc()
column_totals = temp.sum(axis=0)  # one total per column
sc = 1 / column_totals
column_normalized = temp.multiply(sc)
pd.DataFrame(column_normalized.toarray())

Unnamed: 0,0,1,2,3,4,5,6
0,0.333333,0.4,0.5,0.0,1.0,0.0,0.0
1,0.333333,0.4,0.5,0.5,0.0,0.0,0.0
2,0.333333,0.2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.5,0.0,0.0,1.0


In [49]:
# TF weighted by inverse document frequency: every column of `tf` is divided
# by the matching entry of `freq` (one value per vocabulary column, broadcast
# across the rows). Only `tf` and `freq` are read here, so `temp` is left alone.
tfidf = tf.multiply(1 / freq)
pd.DataFrame(tfidf.toarray())

Unnamed: 0,0,1,2,3,4,5,6
0,0.266667,0.533333,0.4,0.0,0.8,0.0,0.0
1,0.266667,0.533333,0.4,0.4,0.0,0.0,0.0
2,0.444444,0.444444,0.0,0.0,0.0,1.333333,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,2.0


In [50]:
# Inverse document frequency laid out like the count matrix: a cell holds
# 1 / freq[column] where the token occurs in the text, and 0 elsewhere.
temp = temp.tocsr()
presence = (temp > 0).astype('float32')  # 1.0 where the count is positive
inverse_doc_freq = 1 / freq
idf = presence.multiply(inverse_doc_freq)
pd.DataFrame(idf.toarray())

Unnamed: 0,0,1,2,3,4,5,6
0,1.333333,1.333333,2.0,0.0,4.0,0.0,0.0
1,1.333333,1.333333,2.0,2.0,0.0,0.0,0.0
2,1.333333,1.333333,0.0,0.0,0.0,4.0,0.0
3,0.0,0.0,0.0,2.0,0.0,0.0,4.0


In [55]:
import numpy as np

# Final weighting: log(1 + tf) * idf, computed on dense arrays.
tf_dense = tf.toarray()
idf_dense = idf.toarray()
t = np.log(tf_dense + 1) * idf_dense

# Standardise every column (token) across the texts: subtract the column mean
# and divide by the sample standard deviation (ddof=1).
mean = t.mean(axis=0)
std = t.std(axis=0, ddof=1)
t_standartized = (t - mean) / std
mean, std, t_standartized

(array([0.21744177, 0.32020885, 0.1823216 , 0.29389334, 0.1823216 ,
        0.28768212, 0.4054651 ], dtype=float32),
 array([0.15937145, 0.21566401, 0.21052684, 0.38517493, 0.36464316,
        0.57536423, 0.8109302 ], dtype=float32),
 array([[ 0.16096792,  0.59546685,  0.8660254 , -0.76301265,  1.5000001 ,
         -0.5       , -0.5       ],
        [ 0.16096792,  0.59546685,  0.8660254 ,  0.18368238, -0.50000006,
         -0.5       , -0.5       ],
        [ 1.042435  ,  0.2938242 , -0.8660254 , -0.76301265, -0.50000006,
          1.5       , -0.5       ],
        [-1.364371  , -1.4847579 , -0.8660254 ,  1.342343  , -0.50000006,
         -0.5       ,  1.5       ]], dtype=float32))

In [58]:
# Column indices ordered by ascending frequency; tokens with equal frequency
# are ordered by the token string itself.
tokens_in_order = sorted(vocab, key=lambda token: (freq[vocab[token]], token))
sorted_vocab = [vocab[token] for token in tokens_in_order]
sorted_vocab

[4, 5, 6, 2, 3, 1, 0]

In [59]:
# rearrange t_standartized: reorder its columns according to sorted_vocab,
# so column k of the result is column sorted_vocab[k] of t_standartized
t_standartized_rearranged = t_standartized[:, sorted_vocab]
t_standartized_rearranged

array([[ 1.5000001 , -0.5       , -0.5       ,  0.8660254 , -0.76301265,
         0.59546685,  0.16096792],
       [-0.50000006, -0.5       , -0.5       ,  0.8660254 ,  0.18368238,
         0.59546685,  0.16096792],
       [-0.50000006,  1.5       , -0.5       , -0.8660254 , -0.76301265,
         0.2938242 ,  1.042435  ],
       [-0.50000006, -0.5       ,  1.5       , -0.8660254 ,  1.342343  ,
        -1.4847579 , -1.364371  ]], dtype=float32)

In [63]:
# Print the rearranged matrix one row per line; every value is rounded to two
# decimals and followed by a single space (so each line ends with a space).
for row in t_standartized_rearranged:
    print("".join(f"{round(value, 2):.2f} " for value in row))

1.50 -0.50 -0.50 0.87 -0.76 0.60 0.16 
-0.50 -0.50 -0.50 0.87 0.18 0.60 0.16 
-0.50 1.50 -0.50 -0.87 -0.76 0.29 1.04 
-0.50 -0.50 1.50 -0.87 1.34 -1.48 -1.36 


In [54]:
# Sanity check of ndarray.std defaults on a 3x3 grid holding 0..8: with no
# axis and no ddof the result is a single number for the whole array
# (population standard deviation of all nine values).
test = np.arange(3 * 3).reshape((3, 3))
print(test)
test.std()

[[0 1 2]
 [3 4 5]
 [6 7 8]]


2.581988897471611