<a href="https://colab.research.google.com/github/SirvavialTAG/NLP/blob/main/NLP_lab_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pymorphy3 nltk



In [2]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
# Fetch the NLTK data used below: the WordNet corpus for WordNetLemmatizer and
# the English perceptron tagger model for nltk.pos_tag. Without them a fresh
# kernel fails with LookupError.
for resource in ('wordnet', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource, quiet=True)

In [3]:
# Sample English passage that serves as the shared input for the
# lemmatization and stemming cells. Adjacent string literals are joined into
# one single-line string, so every fragment ends with a space.
text = (
    "About two years ago, in 1808, after returning to St. Petersburg from his "
    "trip to the estates, Pierre unwittingly became the head of St. Petersburg "
    "Freemasonry. He set up mess halls and funeral lodges, recruited new "
    "members, took care of connecting the various lodges and acquiring "
    "authentic acts. He gave his money for the construction of temples and "
    "replenished, as much as he could, the collection of alms, for which most "
    "of the members were stingy and careless. He supported the poor house set "
    "up by the order in St. Petersburg almost alone at his own expense."
)

**Лемматизация**

In [4]:
def get_wordnet_pos(tag: str):
    """Map a part-of-speech tag to the matching WordNet POS constant.

    Only the first character of the tag is inspected: ``J`` selects the
    adjective constant, ``V`` the verb, ``N`` the noun and ``R`` the adverb.

    Args:
        tag: POS tag string, e.g. one of the tags returned by
            ``nltk.pos_tag``.

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag is
        empty or starts with any other character.
    """
    tag_dict = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing instead of indexing keeps an empty tag from raising IndexError;
    # dict.get already returns None for a missing key.
    return tag_dict.get(tag[:1])


# POS-tag the whitespace-separated words so that each one can be lemmatized
# with the WordNet part of speech that matches its tag.
lemmatizer = WordNetLemmatizer()
words = text.split()
tagged_words = nltk.pos_tag(words)

# Pair every word with its WordNet POS (None when the tag has no counterpart).
word_pos_pairs = ((word, get_wordnet_pos(tag)) for word, tag in tagged_words)

# Words without a WordNet POS are passed through unchanged.
lemmatized_text = [
    word if pos is None else lemmatizer.lemmatize(word, pos=pos)
    for word, pos in word_pos_pairs
]

joined_lemmatized_text = " ".join(lemmatized_text)
print("Лемматизация:", joined_lemmatized_text)

Лемматизация: About two year ago, in 1808, after return to St. Petersburg from his trip to the estates, Pierre unwittingly become the head of St. Petersburg Freemasonry. He set up mess hall and funeral lodges, recruit new members, take care of connect the various lodge and acquire authentic acts. He give his money for the construction of temple and replenished, as much as he could, the collection of alms, for which most of the member be stingy and careless. He support the poor house set up by the order in St. Petersburg almost alone at his own expense.


**Стемминг**

In [5]:
# Stem every whitespace-separated word of `text` with the Porter stemmer and
# glue the stems back together with single spaces.
stemmer = PorterStemmer()
stemmed_text = list(map(stemmer.stem, text.split()))
joined_stemmed_text = " ".join(stemmed_text)
print("Стемминг:", joined_stemmed_text)

Стемминг: about two year ago, in 1808, after return to st. petersburg from hi trip to the estates, pierr unwittingli becam the head of st. petersburg freemasonry. he set up mess hall and funer lodges, recruit new members, took care of connect the variou lodg and acquir authent acts. he gave hi money for the construct of templ and replenished, as much as he could, the collect of alms, for which most of the member were stingi and careless. he support the poor hous set up by the order in st. petersburg almost alon at hi own expense.


**Токенизация**

In [6]:
def tokenize(text: str, ascii_only: bool = True) -> list[str]:
  """Split text into single-character tokens.

  Args:
    text: Input string to tokenize.
    ascii_only: When True (the default), characters outside the ASCII range
      are dropped, so e.g. Cyrillic letters produce no tokens. When False,
      every character of ``text`` becomes a token.

  Returns:
    One-character strings in their original order. Whitespace and
    punctuation are kept as tokens.
  """
  if not ascii_only:
    return list(text)
  return [char for char in text if char.isascii()]

In [7]:
# The Cyrillic sample exercises the ASCII filter of tokenize (its letters are
# dropped); the English sample is kept character by character.
test_text_1 = "Текст для проверки работы пользовательской функции токенизации."
test_text_2 = (
    "Text for verifying the operation of a custom tokenization function."
)

test_tokenize_text_1 = tokenize(test_text_1)
test_tokenize_text_2 = tokenize(test_text_2)

print("Тест 1: ", test_tokenize_text_1)
# Reuse the stored result instead of tokenizing the same text a second time.
print("Тест 2: ", test_tokenize_text_2)

Тест 1:  [' ', ' ', ' ', ' ', ' ', ' ', '.']
Тест 2:  ['T', 'e', 'x', 't', ' ', 'f', 'o', 'r', ' ', 'v', 'e', 'r', 'i', 'f', 'y', 'i', 'n', 'g', ' ', 't', 'h', 'e', ' ', 'o', 'p', 'e', 'r', 'a', 't', 'i', 'o', 'n', ' ', 'o', 'f', ' ', 'a', ' ', 'c', 'u', 's', 't', 'o', 'm', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', ' ', 'f', 'u', 'n', 'c', 't', 'i', 'o', 'n', '.']


**Векторизация**

In [8]:
def vectorize(tokens: list[str]) -> list[int]:
  """Convert single-character tokens into their integer code points.

  Args:
    tokens: Sequence of one-character strings.

  Returns:
    The ``ord`` value of each token, in the same order.
  """
  return list(map(ord, tokens))

In [9]:
# Convert both token lists from the tokenization check into character codes.
test_vectorize_text_1, test_vectorize_text_2 = (
    vectorize(tokens)
    for tokens in (test_tokenize_text_1, test_tokenize_text_2)
)

print("Тест 3: ", test_vectorize_text_1)
print("Тест 4: ", test_vectorize_text_2)

Тест 3:  [32, 32, 32, 32, 32, 32, 46]
Тест 4:  [84, 101, 120, 116, 32, 102, 111, 114, 32, 118, 101, 114, 105, 102, 121, 105, 110, 103, 32, 116, 104, 101, 32, 111, 112, 101, 114, 97, 116, 105, 111, 110, 32, 111, 102, 32, 97, 32, 99, 117, 115, 116, 111, 109, 32, 116, 111, 107, 101, 110, 105, 122, 97, 116, 105, 111, 110, 32, 102, 117, 110, 99, 116, 105, 111, 110, 46]


**Токенизация и векторизация после лемматизации**

In [10]:
# Run the lemmatized text through the character tokenizer and the vectorizer.
tokenize_lemmatized_text = tokenize(joined_lemmatized_text)
vectorize_lemmatized_text = vectorize(tokenize_lemmatized_text)

# Print only the first 45 items of each sequence to keep the output short.
for preview in (tokenize_lemmatized_text, vectorize_lemmatized_text):
    print(preview[:45])

['A', 'b', 'o', 'u', 't', ' ', 't', 'w', 'o', ' ', 'y', 'e', 'a', 'r', ' ', 'a', 'g', 'o', ',', ' ', 'i', 'n', ' ', '1', '8', '0', '8', ',', ' ', 'a', 'f', 't', 'e', 'r', ' ', 'r', 'e', 't', 'u', 'r', 'n', ' ', 't', 'o', ' ']
[65, 98, 111, 117, 116, 32, 116, 119, 111, 32, 121, 101, 97, 114, 32, 97, 103, 111, 44, 32, 105, 110, 32, 49, 56, 48, 56, 44, 32, 97, 102, 116, 101, 114, 32, 114, 101, 116, 117, 114, 110, 32, 116, 111, 32]


**Токенизация и векторизация после стемминга**

In [11]:
# Run the stemmed text through the character tokenizer and the vectorizer.
tokenize_stemmed_text = tokenize(joined_stemmed_text)
vectorize_stemmed_text = vectorize(tokenize_stemmed_text)

# Print only the first 45 items of each sequence to keep the output short.
for preview in (tokenize_stemmed_text, vectorize_stemmed_text):
    print(preview[:45])

['a', 'b', 'o', 'u', 't', ' ', 't', 'w', 'o', ' ', 'y', 'e', 'a', 'r', ' ', 'a', 'g', 'o', ',', ' ', 'i', 'n', ' ', '1', '8', '0', '8', ',', ' ', 'a', 'f', 't', 'e', 'r', ' ', 'r', 'e', 't', 'u', 'r', 'n', ' ', 't', 'o', ' ']
[97, 98, 111, 117, 116, 32, 116, 119, 111, 32, 121, 101, 97, 114, 32, 97, 103, 111, 44, 32, 105, 110, 32, 49, 56, 48, 56, 44, 32, 97, 102, 116, 101, 114, 32, 114, 101, 116, 117, 114, 110, 32, 116, 111, 32]
