<a href="https://colab.research.google.com/github/SirvavialTAG/NLP/blob/main/NLP_lab_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pymorphy3 nltk langdetect



In [2]:
# Standard library
from math import log
from string import punctuation

# Third-party
import nltk
import numpy as np
import pymorphy3
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages the rest of the notebook relies on.
for resource in ('punkt_tab', 'stopwords', 'wordnet', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [3]:
# Sample corpus in English: each list element is one document (a single
# sentence) written as adjacent string literals that Python concatenates.
english_texts = [
    "About two years ago, in 1808, after returning to St. Petersburg from his "
    "trip to the estates, Pierre unwittingly became the head of St. Petersburg "
    "Freemasonry.",
    "He set up mess halls and funeral lodges, recruited new members, took care "
    "of connecting the various lodges and acquiring authentic acts.",
    "He gave his money for the construction of temples and "
    "replenished, as much as he could, the collection of alms, for which most "
    "of the members were stingy and careless.",
    "He supported the poor house set "
    "up by the order in St. Petersburg almost alone at his own expense."
]

# Sample corpus in Russian with the same number of documents (four), one
# sentence per list element.
russian_texts = [
    "Года два тому назад, в 1808 году, вернувшись в Петербург из своей поездки "
    "по имениям, Пьер невольно стал во главе петербургского масонства.",
    "Он устроивал столовые и надгробные ложи, вербовал новых членов, заботился "
    "о соединении различных лож и о приобретении подлинных актов.",
    "Он давал свои деньги на устройство храмин и пополнял, насколько мог, "
    "сборы милостыни, на которые большинство членов были скупы и неаккуратны.",
    "Он почти один на свои средства поддерживал дом бедных, устроенный орденом "
    "в Петербурге."
]

In [4]:
def tokenize_and_delete_stopwords(text: str, language: str) -> list[str]:
    """Tokenize ``text``, lowercase it and drop punctuation and stop words.

    Args:
        text: Raw text to split into tokens with NLTK's ``word_tokenize``.
        language: Name of the NLTK stop-word list to filter against
            (this notebook passes ``'english'`` and ``'russian'``).

    Returns:
        Lowercased tokens that are neither pure punctuation nor stop words,
        in their original order.
    """
    punctuation_chars = set(punctuation)
    stop_words = set(stopwords.words(language))

    filtered_tokens = []
    for token in word_tokenize(text):
        # Drop tokens made up only of punctuation characters. A plain
        # ``token not in punctuation`` is a substring test against the
        # punctuation string, so multi-character tokens such as "..." or the
        # "``" / "''" quote tokens were not filtered out.
        if all(char in punctuation_chars for char in token):
            continue
        word = token.lower()
        if word not in stop_words:
            filtered_tokens.append(word)
    return filtered_tokens

In [5]:
def preprocessing_russian_text(texts: list[str]) -> list[list[str]]:
    """Tokenize, filter and lemmatize each Russian text.

    Args:
        texts: Raw Russian documents.

    Returns:
        One list of lemmas per input document, in input order. Each lemma is
        the ``normal_form`` of the first parse pymorphy3 returns for a token.
    """
    # Build the analyzer once and reuse it for every document instead of
    # constructing a new one on each loop iteration.
    morph = pymorphy3.MorphAnalyzer()

    result_texts = []
    for text in texts:
        filtered_tokens = tokenize_and_delete_stopwords(text, 'russian')
        # parse() returns candidate analyses; the first one is taken as-is.
        lemmas = [morph.parse(token)[0].normal_form for token in filtered_tokens]
        result_texts.append(lemmas)

    return result_texts

In [6]:
def preprocessing_english_text(texts: list[str]) -> list[list[str]]:
    """Tokenize, filter and lemmatize each English text.

    Args:
        texts: Raw English documents.

    Returns:
        One list of WordNet lemmas per input document, in input order.
    """
    # Loop-invariant objects are created once rather than once per document.
    lemmatizer = nltk.WordNetLemmatizer()
    # Maps the first letter of the tag returned by nltk.pos_tag to the
    # WordNet part-of-speech constant expected by the lemmatizer.
    tag_dict = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }

    result_texts = []
    for text in texts:
        filtered_tokens = tokenize_and_delete_stopwords(text, 'english')
        # NOTE(review): tagging runs on lowercased, stop-word-filtered tokens,
        # so the tagger does not see the original sentence; this may affect
        # tag (and therefore lemma) quality - confirm whether that is intended.
        tagged_words = nltk.pos_tag(filtered_tokens)
        # Tags with an unmapped first letter fall back to NOUN.
        lemmas = [
            lemmatizer.lemmatize(word, pos=tag_dict.get(tag[0], wordnet.NOUN))
            for word, tag in tagged_words
        ]
        result_texts.append(lemmas)

    return result_texts

In [7]:
# Lemmatize both corpora, then print the token lists (English first, then Russian).
result_english_texts = preprocessing_english_text(english_texts)
result_russian_texts = preprocessing_russian_text(russian_texts)

for processed_corpus in (result_english_texts, result_russian_texts):
    print(processed_corpus)

[['two', 'year', 'ago', '1808', 'return', 'st.', 'petersburg', 'trip', 'estates', 'pierre', 'unwittingly', 'become', 'head', 'st.', 'petersburg', 'freemasonry'], ['set', 'mess', 'halls', 'funeral', 'lodge', 'recruit', 'new', 'member', 'take', 'care', 'connect', 'various', 'lodge', 'acquire', 'authentic', 'act'], ['give', 'money', 'construction', 'temples', 'replenish', 'much', 'could', 'collection', 'alms', 'member', 'stingy', 'careless'], ['support', 'poor', 'house', 'set', 'order', 'st.', 'petersburg', 'almost', 'alone', 'expense']]
[['год', 'тот', 'назад', '1808', 'год', 'вернуться', 'петербург', 'свой', 'поездка', 'имение', 'пьер', 'невольно', 'стать', 'глава', 'петербургский', 'масонство'], ['устроивать', 'столовая', 'надгробный', 'ложа', 'вербовать', 'новый', 'член', 'заботиться', 'соединение', 'различный', 'ложа', 'приобретение', 'подлинный', 'акт'], ['давать', 'свой', 'деньга', 'устройство', 'храмина', 'пополнять', 'насколько', 'мочь', 'сбор', 'милостыня', 'который', 'большинст

In [8]:
def make_dict(processed_texts: list[list[str]]) -> dict[str, int]:
    """Count how often each word occurs across all processed texts.

    Args:
        processed_texts: Documents, each given as a list of tokens.

    Returns:
        A dict mapping every distinct word to its total number of
        occurrences, with keys inserted in sorted order.
    """
    word_counts: dict[str, int] = {}
    for text in processed_texts:
        for word in text:
            word_counts[word] = word_counts.get(word, 0) + 1

    # Rebuild the dict from sorted items so iteration order is alphabetical.
    return dict(sorted(word_counts.items()))

In [9]:
# Build the vocabulary (word -> total count) for each language, then show
# the Russian one followed by the English one.
rus_word_dict = make_dict(result_russian_texts)
eng_word_dict = make_dict(result_english_texts)

print(rus_word_dict, eng_word_dict, sep="\n")

{'1808': 1, 'акт': 1, 'бедный': 1, 'большинство': 1, 'вербовать': 1, 'вернуться': 1, 'глава': 1, 'год': 2, 'давать': 1, 'деньга': 1, 'дом': 1, 'заботиться': 1, 'имение': 1, 'который': 1, 'ложа': 2, 'масонство': 1, 'милостыня': 1, 'мочь': 1, 'надгробный': 1, 'назад': 1, 'насколько': 1, 'неаккуратный': 1, 'невольно': 1, 'новый': 1, 'орден': 1, 'петербург': 2, 'петербургский': 1, 'поддерживать': 1, 'подлинный': 1, 'поездка': 1, 'пополнять': 1, 'приобретение': 1, 'пьер': 1, 'различный': 1, 'сбор': 1, 'свой': 3, 'скупой': 1, 'соединение': 1, 'средство': 1, 'стать': 1, 'столовая': 1, 'тот': 1, 'устроенный': 1, 'устроивать': 1, 'устройство': 1, 'храмина': 1, 'член': 2}
{'1808': 1, 'acquire': 1, 'act': 1, 'ago': 1, 'almost': 1, 'alms': 1, 'alone': 1, 'authentic': 1, 'become': 1, 'care': 1, 'careless': 1, 'collection': 1, 'connect': 1, 'construction': 1, 'could': 1, 'estates': 1, 'expense': 1, 'freemasonry': 1, 'funeral': 1, 'give': 1, 'halls': 1, 'head': 1, 'house': 1, 'lodge': 2, 'member': 2,

In [10]:
# Bag of Words
def count_num_words(texts: list[list[str]], word_dict: dict[str, int]) -> np.ndarray:
    """Build a bag-of-words count matrix.

    Args:
        texts: Documents, each given as a list of tokens.
        word_dict: Vocabulary; only its keys are used. Columns follow the
            sorted order of these keys.

    Returns:
        An integer array of shape ``(len(texts), len(word_dict))`` where entry
        ``[i, j]`` is the number of times the j-th vocabulary word occurs in
        the i-th document.

    Raises:
        ValueError: If a document contains a word that is not a key of
            ``word_dict``.
    """
    sorted_words = sorted(word_dict.keys())
    # Precompute word -> column so each lookup is a dict access instead of a
    # linear ``list.index`` scan inside the inner loop.
    column_of = {word: col for col, word in enumerate(sorted_words)}

    matrix_words = np.zeros((len(texts), len(sorted_words)), dtype=int)
    for i, text in enumerate(texts):
        for word in text:
            col_index = column_of.get(word)
            if col_index is None:
                # Same exception type that ``list.index`` raised before.
                raise ValueError(f"{word!r} is not in word_dict")
            matrix_words[i, col_index] += 1

    return matrix_words

In [11]:
# Compute the bag-of-words matrices for both corpora, then print them
# (Russian first, then English).
rus_words_matrix = count_num_words(result_russian_texts, rus_word_dict)
eng_words_matrix = count_num_words(result_english_texts, eng_word_dict)

print("Русский текст:\n", rus_words_matrix)
print("\nАнглийский текст:\n", eng_words_matrix)

Русский текст:
 [[1 0 0 0 0 1 1 2 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1
  0 0 0 1 0 1 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 1 0 0 2 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0
  0 1 0 0 1 0 0 1 0 0 1]
 [0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1
  1 0 0 0 0 0 0 0 1 1 1]
 [0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 1
  0 0 1 0 0 0 1 0 0 0 0]]

Английский текст:
 [[1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 2 1 0 0 0 1
  0 2 0 0 0 0 1 1 1 0 1]
 [0 1 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 2 1 1 0 0 1 0 0 0 0 1 0 0
  1 0 0 0 1 0 0 0 0 1 0]
 [0 0 0 0 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0
  0 0 1 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 0 0
  1 1 0 1 0 0 0 0 0 0 0]]


In [12]:
# TF-IDF
def calculate_tf_idf(matrix: np.ndarray) -> np.ndarray:
    """Compute a TF-IDF matrix from a document-term count matrix.

    TF is each count divided by the total count of its row (document).
    IDF is ``log(N / (1 + M))`` where ``N`` is the number of rows and ``M``
    is the number of rows in which the term has a non-zero count. With this
    formula a term present in ``N - 1`` documents gets weight 0 and a term
    present in all ``N`` documents gets a negative weight.

    Args:
        matrix: 2-D array (or nested list) of term counts with one row per
            document and one column per term.

    Returns:
        A float array of the same shape holding ``tf * idf``. Rows whose
        counts sum to zero yield all-zero TF values instead of NaN.
    """
    counts = np.asarray(matrix)

    # TF: guard the division so an empty document (row total 0) stays at 0
    # rather than producing NaN from 0 / 0.
    row_totals = counts.sum(axis=1, keepdims=True)
    tf = np.divide(
        counts,
        row_totals,
        out=np.zeros(counts.shape, dtype=float),
        where=row_totals != 0,
    )
    # IDF
    N = counts.shape[0]
    M = (counts > 0).sum(axis=0)
    idf = np.log(N / (1 + M))
    # TF-IDF
    tf_idf_matrix = tf * idf

    return tf_idf_matrix

In [13]:
# Compute the TF-IDF matrices for both corpora, then print them
# (Russian first, then English).
result_rus_matrix = calculate_tf_idf(rus_words_matrix)
result_eng_matrix = calculate_tf_idf(eng_words_matrix)

print("Русский текст:\n", result_rus_matrix)
print("\nАнглийский текст:\n", result_eng_matrix)

Русский текст:
 [[0.0433217  0.         0.         0.         0.         0.0433217
  0.0433217  0.0866434  0.         0.         0.         0.
  0.0433217  0.         0.         0.0433217  0.         0.
  0.         0.0433217  0.         0.         0.0433217  0.
  0.         0.01798013 0.0433217  0.         0.         0.0433217
  0.         0.         0.0433217  0.         0.         0.
  0.         0.         0.         0.0433217  0.         0.0433217
  0.         0.         0.         0.         0.        ]
 [0.         0.04951051 0.         0.         0.04951051 0.
  0.         0.         0.         0.         0.         0.04951051
  0.         0.         0.09902103 0.         0.         0.
  0.04951051 0.         0.         0.         0.         0.04951051
  0.         0.         0.         0.         0.04951051 0.
  0.         0.04951051 0.         0.04951051 0.         0.
  0.         0.04951051 0.         0.         0.04951051 0.
  0.         0.04951051 0.         0.         0.0