<a href="https://colab.research.google.com/github/SavageGinny/MLP-Jupiters/blob/main/Lab2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Загружаем библиотеки

In [5]:
import nltk
import string
import pandas as pd
import math
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer




# Download the NLTK resources this notebook relies on:
# tokenizer data (punkt_tab), stop-word lists, and the WordNet corpus.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Текст

In [6]:
# Sample corpus: three short English sentences passed to the examples below.
texts = [
    "Natural language processing is amazing!",
    "Machine learning and deep learning are part of AI.",
    "AI is revolutionizing the world."
]

Лемматизация и токенизация

In [7]:
def preprocessing(texts: list[str]) -> list[list[str]]:
    """Tokenize, filter, and lemmatize each input text.

    Every text is lower-cased and split with ``word_tokenize``. Tokens that
    are not purely alphanumeric (``str.isalnum``) or that appear in NLTK's
    English stop-word list are dropped; the remaining tokens are lemmatized
    as verbs (``pos='v'``).

    Args:
        texts: Raw input documents.

    Returns:
        One list of lemmatized tokens per input document, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words("english"))
    preprocessed_texts = []

    for text in texts:
        # Lower-case first so the stop-word lookup below is case-insensitive.
        tokens = word_tokenize(text.lower())
        # Keep alphanumeric tokens only (drops punctuation) and remove stop words.
        tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        # Lemmatize every token as a verb, e.g. "processing" -> "process".
        tokens = [lemmatizer.lemmatize(word, pos='v') for word in tokens]
        preprocessed_texts.append(tokens)

    return preprocessed_texts

Смотрим, что получилось

In [8]:
# Display the token lists produced for the sample texts.
preprocessing(texts)

[['natural', 'language', 'process', 'amaze'],
 ['machine', 'learn', 'deep', 'learn', 'part', 'ai'],
 ['ai', 'revolutionize', 'world']]

Векторизация

In [9]:
def make_dict(texts: list[str]) -> dict:
    """Build an ``index -> word`` vocabulary from the preprocessed texts.

    Args:
        texts: Raw input documents; they are run through ``preprocessing``.

    Returns:
        A dict mapping consecutive integers (starting at 0) to the distinct
        preprocessed tokens in sorted order.
    """
    unique_words = set()
    for tokens in preprocessing(texts):
        unique_words.update(tokens)
    return dict(enumerate(sorted(unique_words)))


In [10]:
# Display the index -> word vocabulary built from the sample texts.
make_dict(texts)

{0: 'ai',
 1: 'amaze',
 2: 'deep',
 3: 'language',
 4: 'learn',
 5: 'machine',
 6: 'natural',
 7: 'part',
 8: 'process',
 9: 'revolutionize',
 10: 'world'}

Подсчет слов в текстах

In [11]:
def count_num_word(texts: list[str]) -> list[int]:
    """Count the tokens left in each text after preprocessing.

    Args:
        texts: Raw input documents; they are run through ``preprocessing``.

    Returns:
        The number of preprocessed tokens per document, in input order.
    """
    return list(map(len, preprocessing(texts)))

In [12]:
# Display the per-document token counts for the sample texts.
count_num_word(texts)

[4, 6, 3]

In [15]:
def bag_of_words(texts: list[str]) -> pd.DataFrame:
    """Bag-of-words counts computed with scikit-learn's ``CountVectorizer``.

    The vectorizer is created with default settings and fitted directly on
    the raw ``texts``.

    NOTE(review): a later cell defines another ``bag_of_words``; once that
    cell runs, this version is shadowed.

    Args:
        texts: Raw input documents.

    Returns:
        A DataFrame with one row per document and one column per feature
        name reported by the fitted vectorizer.
    """
    counter = CountVectorizer()
    counts = counter.fit_transform(texts)
    return pd.DataFrame(counts.toarray(), columns=counter.get_feature_names_out())

In [17]:
def bag_of_words(texts: list[str]) -> pd.DataFrame:
    """Hand-written bag-of-words over the preprocessed texts.

    Args:
        texts: Raw input documents; they are run through ``preprocessing``.

    Returns:
        A DataFrame with one row per document and one column per distinct
        preprocessed token (columns sorted); each cell holds how many times
        that token occurs in that document.
    """
    token_lists = preprocessing(texts)
    vocab = sorted({token for tokens in token_lists for token in tokens})

    rows = []
    for tokens in token_lists:
        occurrences = Counter(tokens)
        # Counter yields 0 for vocabulary terms absent from this document.
        rows.append([occurrences[term] for term in vocab])

    return pd.DataFrame(rows, columns=vocab)

In [18]:
# Display the bag-of-words table for the sample texts.
bag_of_words(texts)

Unnamed: 0,ai,amaze,deep,language,learn,machine,natural,part,process,revolutionize,world
0,0,1,0,1,0,0,1,0,1,0,0
1,1,0,1,0,2,1,0,1,0,0,0
2,1,0,0,0,0,0,0,0,0,1,1


In [21]:
def tf_idf(texts: list[str]) -> pd.DataFrame:
    """TF-IDF weights computed with scikit-learn's ``TfidfVectorizer``.

    The vectorizer is created with default settings and fitted directly on
    the raw ``texts``.

    NOTE(review): a later cell defines another ``tf_idf``; once that cell
    runs, this version is shadowed.

    Args:
        texts: Raw input documents.

    Returns:
        A DataFrame with one row per document and one column per feature
        name reported by the fitted vectorizer.
    """
    weigher = TfidfVectorizer()
    weights = weigher.fit_transform(texts)
    return pd.DataFrame(weights.toarray(), columns=weigher.get_feature_names_out())

In [23]:
def tf_idf(texts: list[str]) -> pd.DataFrame:
    """Hand-written TF-IDF over the preprocessed texts.

    For each document, TF is the token's count divided by the document's
    token count (0 when the document has no tokens). IDF is
    ``ln(N / (df + 1)) + 1``, where ``N`` is the number of documents and
    ``df`` is the number of documents containing the token.

    NOTE(review): this IDF differs from scikit-learn's smoothed form,
    ``ln((1 + N) / (1 + df)) + 1`` -- confirm which formula is intended.

    Args:
        texts: Raw input documents; they are run through ``preprocessing``.

    Returns:
        A DataFrame with one row per document and one column per distinct
        preprocessed token (columns sorted), holding ``tf * idf``.
    """
    preprocessed_texts = preprocessing(texts)
    vocab = sorted(set(word for text in preprocessed_texts for word in text))

    tf_matrix = []
    # Document frequency per word; turned into IDF weights after the loop.
    idf = {word: 0 for word in vocab}
    num_docs = len(preprocessed_texts)

    for text in preprocessed_texts:
        word_counts = Counter(text)
        total_words = len(text)
        # Guard against division by zero for documents with no tokens left.
        tf_vector = [word_counts[word] / total_words if total_words > 0 else 0 for word in vocab]
        tf_matrix.append(tf_vector)

        # Count each word at most once per document.
        for word in set(text):
            idf[word] += 1

    # The +1 in the denominator and the trailing +1 smooth the IDF.
    idf = {word: math.log(num_docs / (count + 1)) + 1 for word, count in idf.items()}

    tfidf_matrix = [[tf * idf[word] for word, tf in zip(vocab, tf_vector)] for tf_vector in tf_matrix]

    return pd.DataFrame(tfidf_matrix, columns=vocab)

In [24]:
# Display the TF-IDF table for the sample texts.
tf_idf(texts)

Unnamed: 0,ai,amaze,deep,language,learn,machine,natural,part,process,revolutionize,world
0,0.0,0.351366,0.0,0.351366,0.0,0.0,0.351366,0.0,0.351366,0.0,0.0
1,0.166667,0.0,0.234244,0.0,0.468488,0.234244,0.0,0.234244,0.0,0.0,0.0
2,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.468488,0.468488
