<a href="https://colab.research.google.com/github/RomGor1/Methods-of-semantic-information-processing/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [78]:
import math
from collections import Counter

In [79]:
class MyCountVectorizer:
    """Minimal bag-of-words vectorizer over whitespace-separated tokens.

    Tokens come from ``str.split()`` only: nothing is lowercased and no
    punctuation is stripped here, so ``"word"`` and ``"word."`` are
    different terms.

    Attributes:
        vocabulary_: ``None`` until ``fit`` has run; afterwards a dict mapping
            each token to its column index, with tokens in sorted order.
    """

    def __init__(self):
        self.vocabulary_ = None

    def fit(self, texts):
        """Build the vocabulary from ``texts``.

        Args:
            texts: iterable of strings; each one is split on whitespace.

        Returns:
            This vectorizer, so calls can be chained
            (``MyCountVectorizer().fit(a).transform(b)``).
        """
        unique_words = set()
        for text in texts:
            unique_words.update(text.split())
        # Sorting makes the column order deterministic regardless of set order.
        self.vocabulary_ = {word: idx for idx, word in enumerate(sorted(unique_words))}
        return self

    def transform(self, texts):
        """Count vocabulary terms in every text.

        Args:
            texts: iterable of strings.

        Returns:
            A list with one row per text; ``row[j]`` is how many times the
            term with index ``j`` occurs in that text. Tokens that are not in
            the vocabulary are ignored.

        Raises:
            ValueError: if ``fit`` has not been called yet.
        """
        if self.vocabulary_ is None:
            # The vectorizer has no vocabulary before fit().
            raise ValueError("метод fit.")

        num_words = len(self.vocabulary_)
        matrix = []
        for text in texts:
            row = [0] * num_words
            for word, count in Counter(text.split()).items():
                column = self.vocabulary_.get(word)
                if column is not None:
                    row[column] = count
            matrix.append(row)

        return matrix

    def fit_transform(self, texts):
        """Fit the vocabulary on ``texts`` and return their count matrix."""
        self.fit(texts)
        return self.transform(texts)


In [80]:
class MyTfidfVectorizer:
    """TF-IDF vectorizer layered on top of ``MyCountVectorizer``.

    Term frequency is the raw count produced by the count vectorizer. The
    inverse document frequency is smoothed:
    ``idf = ln((n_docs + 1) / (doc_freq + 1)) + 1``.
    The resulting rows are not normalized.

    Attributes:
        count_vectorizer: the underlying count vectorizer (owns the vocabulary).
        idf: ``None`` until ``fit`` has run; afterwards a list with one IDF
            weight per vocabulary column.
    """

    def __init__(self):
        self.count_vectorizer = MyCountVectorizer()
        self.idf = None

    def fit(self, texts):
        """Learn the vocabulary and the per-term IDF weights from ``texts``.

        Args:
            texts: sized collection of strings (``len(texts)`` is used as the
                number of documents).

        Returns:
            This vectorizer, so calls can be chained.
        """
        self.count_vectorizer.fit(texts)
        matrix = self.count_vectorizer.transform(texts)

        num_texts = len(texts)
        num_words = len(self.count_vectorizer.vocabulary_)

        # Document frequency: in how many texts each term occurs at least once.
        doc_counts = [0] * num_words
        for row in matrix:
            for column, count in enumerate(row):
                if count > 0:
                    doc_counts[column] += 1

        # The +1 in numerator and denominator avoids division by zero; the
        # trailing +1 keeps terms that occur in every text from getting weight 0.
        self.idf = [
            math.log((num_texts + 1) / (doc_count + 1)) + 1
            for doc_count in doc_counts
        ]
        return self

    def transform(self, texts):
        """Turn ``texts`` into TF-IDF rows using the fitted vocabulary and IDF.

        Args:
            texts: collection of strings.

        Returns:
            A list with one row per text; each value is ``count * idf`` for
            the corresponding vocabulary column.

        Raises:
            ValueError: if ``fit`` has not been called yet.
        """
        if self.idf is None:
            # No IDF weights exist before fit().
            raise ValueError("метод fit.")

        matrix = self.count_vectorizer.transform(texts)

        tfidf_matrix = []
        for text in matrix:
            tfidf_text = [tf * idf for tf, idf in zip(text, self.idf)]
            tfidf_matrix.append(tfidf_text)

        return tfidf_matrix

    def fit_transform(self, texts):
        """Fit on ``texts`` and return their TF-IDF matrix."""
        self.fit(texts)
        return self.transform(texts)

In [81]:

# Demo corpus. Each sentence is lowercased and its whitespace runs are
# collapsed to single spaces; punctuation stays attached to the adjacent word.
texts = [
    " ".join(sentence.lower().split())
    for sentence in (
        "Пример первого текста для обработки.",
        "Второй текст для анализа.",
        "Еще один пример текста.",
    )
]

print("Тексты после токенизации:")
for line in texts:
    print(line)

Тексты после токенизации:
пример первого текста для обработки.
второй текст для анализа.
еще один пример текста.


In [82]:

# Learn the vocabulary from the demo corpus, then count term occurrences
# in the same texts (the two steps that fit_transform performs).
count_vectorizer = MyCountVectorizer()
count_vectorizer.fit(texts)
count_matrix = count_vectorizer.transform(texts)

# One printed row per text; columns follow the vocabulary indices shown below.
print("\nCountVectorizer Matrix:")
for counts in count_matrix:
    print(counts)

print("\nСловарь (термины):", count_vectorizer.vocabulary_)


CountVectorizer Matrix:
[0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0]
[1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0]
[0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1]

Словарь (термины): {'анализа.': 0, 'второй': 1, 'для': 2, 'еще': 3, 'обработки.': 4, 'один': 5, 'первого': 6, 'пример': 7, 'текст': 8, 'текста': 9, 'текста.': 10}


In [83]:
# Fit the TF-IDF weights on the demo corpus, then weight the same texts
# (the two steps that fit_transform performs).
tfidf_vectorizer = MyTfidfVectorizer()
tfidf_vectorizer.fit(texts)
tfidf_matrix = tfidf_vectorizer.transform(texts)

# One printed row per text, in the same column order as the count matrix.
print("\nTF-IDF Matrix:")
for weights in tfidf_matrix:
    print(weights)


TF-IDF Matrix:
[0.0, 0.0, 1.2876820724517808, 0.0, 1.6931471805599454, 0.0, 1.6931471805599454, 1.2876820724517808, 0.0, 1.6931471805599454, 0.0]
[1.6931471805599454, 1.6931471805599454, 1.2876820724517808, 0.0, 0.0, 0.0, 0.0, 0.0, 1.6931471805599454, 0.0, 0.0]
[0.0, 0.0, 0.0, 1.6931471805599454, 0.0, 1.6931471805599454, 0.0, 1.2876820724517808, 0.0, 0.0, 1.6931471805599454]
