# Реализация алгоритма TF-IDF

https://ru.wikipedia.org/wiki/TF-IDF

In [2]:
from typing import Dict, List, Tuple

import numpy as np

In [32]:
from collections import Counter

def tf_idf(corpus: List[List[str]]) -> Tuple[np.ndarray, Dict[str, int], List[Dict[str, float]]]:
    """
    Compute the TF-IDF matrix of a tokenized corpus.

    TF of a token is its count in a document divided by the document length.
    IDF of a token is log10(number of documents / number of documents that
    contain the token).

    :param corpus: List[List[str]] - the text corpus to encode
        outer list - documents
        inner list - tokens of one document
    :return: tuple of three items:
        - np.ndarray of shape (len(corpus), vocabulary size) - TF-IDF matrix:
          rows are documents, columns are tokens in order of first appearance
        - dict mapping token -> number of documents containing the token
        - list with one dict per document mapping token -> term frequency
    """
    vocabulary = {}
    tfs = []
    for doc in corpus:
        n_tokens_in_doc = len(doc)
        tf = {t: n / n_tokens_in_doc for t, n in Counter(doc).items()}
        tfs.append(tf)

        # Each distinct token of the document adds one to its document frequency.
        for token in tf:
            vocabulary[token] = vocabulary.get(token, 0) + 1

    n_docs = len(corpus)
    # IDF depends only on the token, so compute it once per token
    # instead of once per (document, token) pair.
    idfs = [np.log10(n_docs / doc_freq) for doc_freq in vocabulary.values()]
    columns = {token: col for col, token in enumerate(vocabulary)}

    # Pre-allocating keeps the result 2-D even for an empty corpus
    # and lets us touch only the non-zero cells.
    matrix = np.zeros((n_docs, len(vocabulary)))
    for row, tf in enumerate(tfs):
        for token, value in tf.items():
            col = columns[token]
            matrix[row, col] = value * idfs[col]

    return matrix, vocabulary, tfs



In [33]:
# Toy corpus: three documents, each already split into tokens.
corpus = [
    ['дает', 'корова', 'молоко'],
    ['молоко', 'молоко', 'кефир'],
    ['облако', 'кефир', 'кефир'],

]

# tf_idf returns the TF-IDF matrix together with the per-token document
# counts and the per-document term-frequency dicts; show the matrix.
m, vac, tfs = tf_idf(corpus)
m

array([[0.15904042, 0.15904042, 0.05869709, 0.        , 0.        ],
       [0.        , 0.        , 0.11739417, 0.05869709, 0.        ],
       [0.        , 0.        , 0.        , 0.11739417, 0.15904042]])

In [34]:
vac

{'дает': 1, 'корова': 1, 'молоко': 2, 'кефир': 2, 'облако': 1}

In [35]:
tfs

[{'дает': 0.3333333333333333,
  'корова': 0.3333333333333333,
  'молоко': 0.3333333333333333},
 {'молоко': 0.6666666666666666, 'кефир': 0.3333333333333333},
 {'облако': 0.3333333333333333, 'кефир': 0.6666666666666666}]

In [29]:
# Вариант №2

from collections import Counter

def tf_idf_2(corpus: List[List[str]]) -> np.ndarray:
    """
    TF-IDF encoding of a tokenized corpus (matrix-first variant).

    The matrix is first filled with term frequencies and then every column
    is scaled by the IDF of its token.

    :param corpus: List[List[str]] - the text corpus to encode
        outer list - documents
        inner list - tokens of one document
    :return: np.ndarray - TF-IDF matrix: rows are documents, columns are
        tokens in order of first appearance in the corpus
    """
    n_docs = len(corpus)

    doc_freq = {}    # token -> number of documents containing it
    term_freqs = []  # one {token: share of the document} dict per document
    for tokens in corpus:
        total = len(tokens)
        shares = {}
        for token, count in Counter(tokens).items():
            shares[token] = count / total
            doc_freq[token] = doc_freq.get(token, 0) + 1
        term_freqs.append(shares)

    # Column index of every token, following dict insertion order.
    column_of = {token: col for col, token in enumerate(doc_freq)}

    # Step 1: plain term frequencies.
    result = np.zeros((n_docs, len(doc_freq)))
    for row, shares in enumerate(term_freqs):
        for token, share in shares.items():
            result[row, column_of[token]] = share

    # Step 2: scale each token's column by its inverse document frequency.
    for token, col in column_of.items():
        result[:, col] *= np.log10(n_docs / doc_freq[token])

    return result

In [30]:
# Same toy corpus as used for tf_idf, so the two results can be compared.
corpus = [
    ['дает', 'корова', 'молоко'],
    ['молоко', 'молоко', 'кефир'],
    ['облако', 'кефир', 'кефир'],

]

tf_idf_2(corpus)

array([[0.15904042, 0.15904042, 0.05869709, 0.        , 0.        ],
       [0.        , 0.        , 0.11739417, 0.05869709, 0.        ],
       [0.        , 0.        , 0.        , 0.11739417, 0.15904042]])

In [31]:
# Pre-tokenized version of the three sentences used by the reference
# implementation further below.
corpus = [
    ['data', 'science', 'is', 'one', 'of', 'the', 'most', 'important', 'fields', 'of', 'science'],
    ['this', 'is', 'one', 'of', 'the', 'best', 'data', 'science', 'courses'],
    ['data', 'scientists', 'analyze', 'data'],
]

tf_idf_2(corpus)

array([[0.        , 0.03201659, 0.0160083 , 0.0160083 , 0.03201659,
        0.0160083 , 0.04337466, 0.04337466, 0.04337466, 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.0195657 , 0.0195657 , 0.0195657 , 0.0195657 ,
        0.0195657 , 0.        , 0.        , 0.        , 0.05301347,
        0.05301347, 0.05301347, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.11928031, 0.11928031]])

Сравнить с:
https://www.sefidian.com/2022/07/28/understanding-tf-idf-with-python-example/

In [28]:
import pandas as pd
import numpy as np

corpus = ['data science is one of the most important fields of science',
          'this is one of the best data science courses',
          'data scientists analyze data']

# Tokenize every document exactly once, always on single spaces, so that the
# vocabulary, TF and IDF stages all see the same tokens (mixing split(' ')
# and split() can give a document frequency of zero for some "words").
docs_tokens = [doc.split(' ') for doc in corpus]

words_set = set()
for words in docs_tokens:
    words_set.update(words)

print('Number of words in the corpus:',len(words_set))
print('The words in the corpus: \n', words_set)

n_docs = len(corpus)          # Number of documents in the corpus
n_words_set = len(words_set)  # Number of unique words in the corpus

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=list(words_set))

# Compute Term Frequency (TF)
for i, words in enumerate(docs_tokens):
    for w in words:
        # .loc writes into df_tf itself; chained df_tf[w][i] = ... may write
        # into a temporary copy and is a no-op under pandas copy-on-write.
        df_tf.loc[i, w] += 1 / len(words)

print("IDF of: ")

idf = {}

for w in words_set:
    # Number of documents in the corpus that contain this word
    k = sum(w in words for words in docs_tokens)

    idf[w] = np.log10(n_docs / k)

    print(f'{w:>15}: {idf[w]:>10}' )

# Copy once, after all IDF values are known.
df_tf_idf = df_tf.copy()

# TF-IDF: scale every word's TF column by that word's IDF.
for w in words_set:
    df_tf_idf[w] = df_tf[w] * idf[w]

df_tf_idf

Number of words in the corpus: 14
The words in the corpus: 
 {'fields', 'of', 'best', 'most', 'courses', 'important', 'the', 'is', 'scientists', 'one', 'data', 'this', 'science', 'analyze'}
IDF of: 
         fields: 0.47712125471966244
             of: 0.17609125905568124
           best: 0.47712125471966244
           most: 0.47712125471966244
        courses: 0.47712125471966244
      important: 0.47712125471966244
            the: 0.17609125905568124
             is: 0.17609125905568124
     scientists: 0.47712125471966244
            one: 0.17609125905568124
           data:        0.0
           this: 0.47712125471966244
        science: 0.17609125905568124
        analyze: 0.47712125471966244


Unnamed: 0,fields,of,best,most,courses,important,the,is,scientists,one,data,this,science,analyze
0,0.043375,0.032017,0.0,0.043375,0.0,0.043375,0.016008,0.016008,0.0,0.016008,0.0,0.0,0.032017,0.0
1,0.0,0.019566,0.053013,0.0,0.053013,0.0,0.019566,0.019566,0.0,0.019566,0.0,0.053013,0.019566,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.11928,0.0,0.0,0.0,0.0,0.11928
