<a href="https://colab.research.google.com/github/RomGor1/Methods-of-semantic-information-processing/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Download the small Russian spaCy pipeline that spacy.load("ru_core_news_sm") needs.
# The installer's own output advises restarting the runtime afterwards.
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('ru_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [14]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Load the Russian pipeline once; preprocessing() reuses this shared object.
nlp = spacy.load("ru_core_news_sm")

In [15]:
def preprocessing(texts: list[str]) -> list[list[str]]:
    """Lowercase and tokenize each text, keeping only alphabetic tokens.

    Args:
        texts: Raw input documents.

    Returns:
        One list of token strings per input document, in input order.
        Tokens whose ``is_alpha`` flag is false are dropped.
    """
    # nlp.pipe runs the pipeline over the texts as a batched stream instead
    # of one nlp() call per document; it yields the docs in input order.
    docs = nlp.pipe(text.lower() for text in texts)
    return [[token.text for token in doc if token.is_alpha] for doc in docs]

In [16]:
def make_dict(texts: list[list[str]]) -> dict:
    """Build a word -> integer id mapping over all tokenized texts.

    Ids are assigned in order of first appearance, starting at 0, so the
    first distinct word gets 0, the next new word gets 1, and so on.

    Args:
        texts: Tokenized documents (a list of token lists).

    Returns:
        Dictionary mapping each distinct word to its id.
    """
    vocabulary = {}
    for tokens in texts:
        for token in tokens:
            # setdefault only inserts when the token is new, so the current
            # size of the mapping is exactly the next free id.
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

In [17]:
def count_num_words(texts: list[list[str]]) -> list[int]:
    """Return the number of tokens in each tokenized text, in input order."""
    return list(map(len, texts))

In [18]:
def bag_of_words(texts: list[str]):
    """Fit a CountVectorizer on the raw texts.

    Args:
        texts: Raw (untokenized) documents; tokenization is left to
            CountVectorizer's defaults.

    Returns:
        A ``(matrix, feature_names)`` pair: the fitted document-term
        matrix converted to a dense array, and the vectorizer's
        feature names.
    """
    bow_model = CountVectorizer()
    dense_counts = bow_model.fit_transform(texts).toarray()
    vocabulary = bow_model.get_feature_names_out()
    return dense_counts, vocabulary

In [19]:
def tfidf(texts: list[str]):
    """Fit a TfidfVectorizer on the raw texts.

    Args:
        texts: Raw (untokenized) documents; tokenization is left to
            TfidfVectorizer's defaults.

    Returns:
        A ``(matrix, feature_names)`` pair: the fitted TF-IDF matrix
        converted to a dense array, and the vectorizer's feature names.
    """
    tfidf_model = TfidfVectorizer()
    dense_weights = tfidf_model.fit_transform(texts).toarray()
    vocabulary = tfidf_model.get_feature_names_out()
    return dense_weights, vocabulary

In [20]:
# Two short sample documents to run through every step.
texts = [
    "Пример первого текста для обработки.",
    "Второй текст для анализа.",
]

# spaCy-based steps: tokenize, then derive the vocabulary and token counts.
processed_texts = preprocessing(texts)
word_dict = make_dict(processed_texts)
word_counts = count_num_words(processed_texts)

# scikit-learn steps: these vectorize the raw texts, not the spaCy tokens.
bow_matrix, bow_feature_names = bag_of_words(texts)
tfidf_matrix, tfidf_feature_names = tfidf(texts)

In [21]:
# Print every intermediate result as "label value", in pipeline order.
report = (
    ("Processed Texts:", processed_texts),
    ("Word Dictionary:", word_dict),
    ("Word Counts:", word_counts),
    ("Bag of Words Matrix:\n", bow_matrix),
    ("Bag of Words Feature Names:", bow_feature_names),
    ("TF-IDF Matrix:\n", tfidf_matrix),
    ("TF-IDF Feature Names:", tfidf_feature_names),
)
for label, value in report:
    print(label, value)

Processed Texts: [['пример', 'первого', 'текста', 'для', 'обработки'], ['второй', 'текст', 'для', 'анализа']]
Word Dictionary: {'пример': 0, 'первого': 1, 'текста': 2, 'для': 3, 'обработки': 4, 'второй': 5, 'текст': 6, 'анализа': 7}
Word Counts: [5, 4]
Bag of Words Matrix:
 [[0 0 1 1 1 1 0 1]
 [1 1 1 0 0 0 1 0]]
Bag of Words Feature Names: ['анализа' 'второй' 'для' 'обработки' 'первого' 'пример' 'текст' 'текста']
TF-IDF Matrix:
 [[0.         0.         0.33517574 0.47107781 0.47107781 0.47107781
  0.         0.47107781]
 [0.53404633 0.53404633 0.37997836 0.         0.         0.
  0.53404633 0.        ]]
TF-IDF Feature Names: ['анализа' 'второй' 'для' 'обработки' 'первого' 'пример' 'текст' 'текста']
