In [2]:
import math
from collections import Counter
from typing import List, Optional

import nltk
import numpy as np
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize

# Download the NLTK 'treebank' corpus that the usage section below reads from.
nltk.download('treebank')

# Text preprocessing interface
class Preprocessor:
    """Turns raw text into a list of tokens.

    Any object exposing a compatible ``tokenize(text)`` method can be used
    in its place by the vectorizer.
    """

    @staticmethod
    def tokenize(text: str) -> List[str]:
        """Return the tokens of *text* as produced by NLTK's ``word_tokenize``."""
        tokens = word_tokenize(text)
        return tokens

# TF-IDF vectorizer
class TFIDFVectorizer:
    """Builds TF-IDF vectors for text documents.

    Term frequency is the term count divided by the number of tokens in the
    document; inverse document frequency is the smoothed form
    ``log((1 + N) / (1 + df)) + 1``, where ``N`` is the number of fitted
    documents and ``df`` the number of them containing the term.
    """

    def __init__(self, preprocessor: Optional["Preprocessor"] = None):
        """Create a vectorizer.

        Args:
            preprocessor: Object with a ``tokenize(text) -> List[str]``
                method. A fresh ``Preprocessor`` is created when omitted.
        """
        # Build the default lazily: an instance placed in the signature would
        # be created once at definition time and shared by every vectorizer.
        self.preprocessor = preprocessor if preprocessor is not None else Preprocessor()
        # term -> column index in the TF-IDF matrix
        self.vocabulary = {}
        # term -> smoothed inverse document frequency
        self.idf_scores = {}

    def fit(self, documents: List[str]) -> "TFIDFVectorizer":
        """Build the vocabulary and compute the IDF of every term.

        Any state from a previous ``fit`` call is discarded.

        Args:
            documents: Raw text documents to learn from.

        Returns:
            The vectorizer itself, so calls can be chained.
        """
        # One set per document: a term counts once per document for IDF.
        doc_tokens = [set(self.preprocessor.tokenize(doc)) for doc in documents]
        total_docs = len(doc_tokens)

        # Single pass over all documents instead of re-scanning them per term.
        doc_freq = Counter(term for tokens in doc_tokens for term in tokens)

        # Sorted terms give a column order that is stable across runs
        # (set iteration order depends on string hash randomisation).
        terms = sorted(doc_freq)

        # Assign fresh dicts so a refit never mixes in stale terms or indices.
        self.vocabulary = {term: idx for idx, term in enumerate(terms)}
        self.idf_scores = {
            term: math.log((1 + total_docs) / (1 + doc_freq[term])) + 1
            for term in terms
        }
        return self

    def transform(self, documents: List[str]) -> np.ndarray:
        """Convert documents into a TF-IDF matrix.

        Args:
            documents: Raw text documents to vectorize.

        Returns:
            Array of shape ``(len(documents), len(self.vocabulary))``. Terms
            that are not in the fitted vocabulary are ignored; a document
            with no tokens yields an all-zero row.
        """
        tfidf_matrix = np.zeros((len(documents), len(self.vocabulary)))

        for row, doc in enumerate(documents):
            tokens = self.preprocessor.tokenize(doc)
            if not tokens:
                # Nothing to count; the row stays zero.
                continue
            total_terms = len(tokens)

            for term, count in Counter(tokens).items():
                col = self.vocabulary.get(term)
                if col is None:
                    # Out-of-vocabulary term.
                    continue
                tfidf_matrix[row, col] = (count / total_terms) * self.idf_scores[term]

        return tfidf_matrix

    def fit_transform(self, documents: List[str]) -> np.ndarray:
        """Fit on *documents* and return their TF-IDF matrix."""
        self.fit(documents)
        return self.transform(documents)

# Usage: rebuild the first 50 treebank sentences as plain strings and vectorize them
sentences = [" ".join(words) for words in treebank.sents()[:50]]

vectorizer = TFIDFVectorizer()
vectorizer.fit(sentences)
tfidf_matrix = vectorizer.transform(sentences)

# Cosine similarity check
def cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine of the angle between *vec1* and *vec2*.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as *vec1*.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)`` as a Python float, or ``0.0``
        when either vector has zero norm (for example the TF-IDF row of an
        empty document), where the ratio would otherwise be 0/0 = NaN.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # A zero vector has no direction; treat it as dissimilar to everything.
        return 0.0
    return float(np.dot(vec1, vec2) / norm_product)

# Examples: compare sentence 0 with sentence 1 and with sentence 40
similarity_0_1, similarity_0_40 = (
    cosine_similarity(tfidf_matrix[0], tfidf_matrix[other]) for other in (1, 40)
)

print(f"Cosine similarity between 0 and 1: {similarity_0_1:.4f}")
print(f"Cosine similarity between 0 and 40: {similarity_0_40:.4f}")


Cosine similarity between 0 and 1: 0.1199
Cosine similarity between 0 and 40: 0.0374


[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!
