In [31]:
import sys
# Put the parent directory on the module search path.
# NOTE(review): presumably this is what lets the project-local `dlnlputils`
# package (imported in the following cell) resolve — confirm the repo layout.
sys.path.append('..')

In [32]:
from dlnlputils.data import build_vocabulary, tokenize_corpus, vectorize_texts

In [33]:
# Toy corpus: one document per line of the literal below (four documents).
texts = """Казнить нельзя, помиловать. Нельзя наказывать.
Казнить, нельзя помиловать. Нельзя освободить. Нельзя!
Нельзя не помиловать.
Обязательно освободить.""".split("\n")

# Split every document into tokens. Per the printed output the tokens come back
# lowercased with punctuation removed; min_token_size=1 presumably keeps even
# one-character tokens — confirm against tokenize_corpus.
tokenized_texts = tokenize_corpus(texts, min_token_size=1)
print(tokenized_texts)
# `vocab` maps token -> integer id; `freq` is an array with one value per id.
# NOTE(review): judging by the printed values, freq[i] looks like the share of
# documents that contain token i (e.g. 0.75 = 3 of 4) — verify in build_vocabulary.
vocab, freq = build_vocabulary(tokenized_texts, min_count=1)
print(vocab, freq)

[['казнить', 'нельзя', 'помиловать', 'нельзя', 'наказывать'], ['казнить', 'нельзя', 'помиловать', 'нельзя', 'освободить', 'нельзя'], ['нельзя', 'не', 'помиловать'], ['обязательно', 'освободить']]
{'помиловать': 0, 'нельзя': 1, 'казнить': 2, 'освободить': 3, 'наказывать': 4, 'не': 5, 'обязательно': 6} [0.75 0.75 0.5  0.5  0.25 0.25 0.25]


In [34]:
import scipy.sparse

# Bag-of-words count matrix: one row per document, one column per vocabulary id.
matrix_shape = (len(tokenized_texts), len(vocab))
result = scipy.sparse.dok_matrix(matrix_shape, dtype='float32')

for text_i, text in enumerate(tokenized_texts):
    for token in text:
        if token not in vocab:
            continue  # out-of-vocabulary tokens are not counted
        # Each occurrence of a known token bumps its cell by one.
        result[text_i, vocab[token]] += 1

In [35]:
# Convert the sparse count matrix to a labelled pandas.DataFrame:
# one row per source text, one column per vocabulary token.
import pandas as pd

# `vocab` maps token -> column index of `result`. Order the labels by that index
# explicitly rather than relying on the dict's iteration order happening to match
# it; otherwise columns could be silently mislabelled.
column_labels = sorted(vocab, key=vocab.get)
df = pd.DataFrame(result.toarray(), columns=column_labels, index=texts)
df

Unnamed: 0,помиловать,нельзя,казнить,освободить,наказывать,не,обязательно
"Казнить нельзя, помиловать. Нельзя наказывать.",1.0,2.0,1.0,0.0,1.0,0.0,0.0
"Казнить, нельзя помиловать. Нельзя освободить. Нельзя!",1.0,3.0,1.0,1.0,0.0,0.0,0.0
Нельзя не помиловать.,1.0,1.0,0.0,0.0,0.0,1.0,0.0
Обязательно освободить.,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [36]:
# Work on a copy so the format conversions and rescaling in the cells below
# leave the original count matrix `result` untouched.
temp = result.copy()

In [37]:
# Dense view of the copy, displayed to check it matches the count table above.
temp.toarray()

array([[1., 2., 1., 0., 1., 0., 0.],
       [1., 3., 1., 1., 0., 0., 0.],
       [1., 1., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 1.]], dtype=float32)

In [38]:
# Print the stored entries in CSC (compressed sparse column) form as
# "(row, column) value" lines; the output is listed column by column.
print(temp.tocsc())

  (0, 0)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (0, 1)	2.0
  (1, 1)	3.0
  (2, 1)	1.0
  (0, 2)	1.0
  (1, 2)	1.0
  (1, 3)	1.0
  (3, 3)	1.0
  (0, 4)	1.0
  (2, 5)	1.0
  (3, 6)	1.0


In [56]:
# Term frequency: scale every row of counts so that it sums to 1.
temp = temp.tocsr()
row_totals = temp.sum(axis=1)  # total counted tokens per document, one value per row
sr = 1 / row_totals
tf = temp.multiply(sr)  # the per-row factor is broadcast across that row's columns
pd.DataFrame(tf.toarray())

Unnamed: 0,0,1,2,3,4,5,6
0,0.2,0.4,0.2,0.0,0.2,0.0,0.0
1,0.166667,0.5,0.166667,0.166667,0.0,0.0,0.0
2,0.333333,0.333333,0.0,0.0,0.0,0.333333,0.0
3,0.0,0.0,0.0,0.5,0.0,0.0,0.5


In [53]:
# Column-wise normalisation: scale every column of counts so that it sums to 1.
temp = temp.tocsc()
column_totals = temp.sum(axis=0)  # total occurrences of each token over all documents
sc = 1 / column_totals
pd.DataFrame(temp.multiply(sc).toarray())

Unnamed: 0,0,1,2,3,4,5,6
0,0.333333,0.333333,0.5,0.0,1.0,0.0,0.0
1,0.333333,0.5,0.5,0.5,0.0,0.0,0.0
2,0.333333,0.166667,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.5,0.0,0.0,1.0


In [57]:
# TF-IDF: weight the term frequencies `tf` (computed in an earlier cell) by
# 1 / freq, one factor per vocabulary column.
# NOTE(review): `freq` looks like the share of documents containing each token
# (see the printed vocabulary output), which would make 1 / freq an un-logged
# inverse document frequency — confirm against build_vocabulary.
temp = temp.tocsr()  # not used by the lines below; only affects the format `temp` is left in
tfidf = tf.multiply(1 / freq)
pd.DataFrame(tfidf.toarray())

Unnamed: 0,0,1,2,3,4,5,6
0,0.266667,0.533333,0.4,0.0,0.8,0.0,0.0
1,0.222222,0.666667,0.333333,0.333333,0.0,0.0,0.0
2,0.444444,0.444444,0.0,0.0,0.0,1.333333,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,2.0


In [None]:
# Presence-weighted variant: replace each count by 1.0 where the token occurs in
# the document, then scale each column by 1 / freq (no term-frequency factor).
temp = temp.tocsr()
# (temp > 0) is a boolean sparse matrix; cast to float32 before scaling.
idf = (temp > 0).astype('float32').multiply(1 / freq)
pd.DataFrame(idf.toarray())