In [1]:
import nltk
import string
from datasets import load_dataset

In [2]:
# Fetch the 'punkt_tab' NLTK package; presumably the tokenizer data that
# nltk.word_tokenize needs in recent NLTK releases — TODO confirm for the installed version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\R1sed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Download the NLTK data this notebook needs:
nltk.download('punkt') # for tokenization
nltk.download('wordnet') # for lemmatization
nltk.download('omw-1.4') # often needed for correct lemmatization (WordNet)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\R1sed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\R1sed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\R1sed\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Load the ag_news dataset (Hugging Face Datasets)
dataset = load_dataset("ag_news")

In [5]:
# Take the first example from the training split and show its raw text
first_example = dataset["train"][0]
first_text = first_example["text"]
print("Исходный текст:")
print(first_text)

Исходный текст:
Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.


In [6]:
# Step 1: tokenization without any extra cleaning
# (case and punctuation are kept, so punctuation marks come out as separate tokens)
tokens = nltk.word_tokenize(first_text)
print("\nТокены (без дополнительной очистки):")
print(tokens)


Токены (без дополнительной очистки):
['Wall', 'St.', 'Bears', 'Claw', 'Back', 'Into', 'the', 'Black', '(', 'Reuters', ')', 'Reuters', '-', 'Short-sellers', ',', 'Wall', 'Street', "'s", 'dwindling\\band', 'of', 'ultra-cynics', ',', 'are', 'seeing', 'green', 'again', '.']


In [7]:
# Preprocessing pipeline that ends with stemming
def preprocess_with_stemming(text):
    """Lower-case, strip punctuation, tokenize and Porter-stem ``text``.

    Stemming trims word endings by rule, without consulting a dictionary of
    lexical forms, so the results need not be real words.

    Every character of ``string.punctuation`` is deleted (not replaced by a
    space), which means words joined only by punctuation are fused into one
    token, e.g. ``dwindling\\band`` becomes ``dwindlingband``.

    Args:
        text: The raw input string.

    Returns:
        A list with one stemmed string per token of the cleaned text.
    """
    # Translation table that maps every punctuation character to "deleted".
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_remover)
    words = nltk.word_tokenize(cleaned)
    porter = nltk.PorterStemmer()
    return list(map(porter.stem, words))

In [8]:
# Preprocessing pipeline that ends with lemmatization
def preprocess_with_lemmatization(text):
    """Lower-case, strip punctuation, tokenize and WordNet-lemmatize ``text``.

    Lemmatization maps a word to a base form using known lexical forms rather
    than cutting suffixes by rule.

    The lemmatizer is called without a part-of-speech argument, so NLTK's
    default applies (noun, per the NLTK documentation). Verb forms such as
    ``making`` are therefore left as they are, and a few short words are
    treated as plurals (``has`` -> ``ha``).

    Every character of ``string.punctuation`` is deleted (not replaced by a
    space), so words joined only by punctuation are fused into one token.

    Args:
        text: The raw input string.

    Returns:
        A list with one lemmatized string per token of the cleaned text.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    words = nltk.word_tokenize(without_punctuation)
    wordnet = nltk.WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemmas.append(wordnet.lemmatize(word))
    return lemmas

In [9]:
# Compare the stemming and lemmatization results on the first training text
stemmed_result = preprocess_with_stemming(first_text)
lemmatized_result = preprocess_with_lemmatization(first_text)
print("\nТокены после стемминга:")
print(stemmed_result)
print("\nТокены после лемматизации:")
print(lemmatized_result)


Токены после стемминга:
['wall', 'st', 'bear', 'claw', 'back', 'into', 'the', 'black', 'reuter', 'reuter', 'shortsel', 'wall', 'street', 'dwindlingband', 'of', 'ultracyn', 'are', 'see', 'green', 'again']

Токены после лемматизации:
['wall', 'st', 'bear', 'claw', 'back', 'into', 'the', 'black', 'reuters', 'reuters', 'shortsellers', 'wall', 'street', 'dwindlingband', 'of', 'ultracynics', 'are', 'seeing', 'green', 'again']


In [10]:
# Slice the rows first and only then pick the column: this yields a plain list
# with training examples 1..5 and avoids materialising the entire "text" column
# on `datasets` versions where column access returns a full Python list.
demo_texts = dataset["train"][1:6]["text"]
print(demo_texts[0])

Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.


In [11]:
# Stem every demo text; stemmed_result is rebound to a list of token lists (one per text).
stemmed_result = list(map(preprocess_with_stemming, demo_texts))

In [12]:
# Lemmatize every demo text; lemmatized_result is rebound to a list of token lists (one per text).
lemmatized_result = list(map(preprocess_with_lemmatization, demo_texts))

In [13]:
def base_preprocess(text):
    """Return the lower-cased, punctuation-free, whitespace-split words of ``text``.

    No stemming or lemmatization is applied; this is the baseline the other
    pipelines are compared against. Characters from ``string.punctuation`` are
    deleted outright, so words joined only by punctuation are fused together.

    Args:
        text: The raw input string.

    Returns:
        A list of word strings (empty for empty or whitespace-only input).
    """
    # Translation table that maps every punctuation character to "deleted".
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_remover).split()

In [14]:
# Baseline tokens (lower-cased, punctuation removed, whitespace split) for every demo text.
base_result = list(map(base_preprocess, demo_texts))

In [15]:
# For each demo text, print the baseline, stemmed and lemmatized token lists one after another.
for text_index, stemmed_tokens in enumerate(stemmed_result):
    print("Необработанные токены:")
    print(base_result[text_index])
    print("Stemmed токены:")
    print(stemmed_tokens)
    print("Lemmatized Токены:")
    print(lemmatized_result[text_index])

Необработанные токены:
['carlyle', 'looks', 'toward', 'commercial', 'aerospace', 'reuters', 'reuters', 'private', 'investment', 'firm', 'carlyle', 'groupwhich', 'has', 'a', 'reputation', 'for', 'making', 'welltimed', 'and', 'occasionallycontroversial', 'plays', 'in', 'the', 'defense', 'industry', 'has', 'quietly', 'placedits', 'bets', 'on', 'another', 'part', 'of', 'the', 'market']
Stemmed токены:
['carlyl', 'look', 'toward', 'commerci', 'aerospac', 'reuter', 'reuter', 'privat', 'invest', 'firm', 'carlyl', 'groupwhich', 'ha', 'a', 'reput', 'for', 'make', 'welltim', 'and', 'occasionallycontroversi', 'play', 'in', 'the', 'defens', 'industri', 'ha', 'quietli', 'placedit', 'bet', 'on', 'anoth', 'part', 'of', 'the', 'market']
Lemmatized Токены:
['carlyle', 'look', 'toward', 'commercial', 'aerospace', 'reuters', 'reuters', 'private', 'investment', 'firm', 'carlyle', 'groupwhich', 'ha', 'a', 'reputation', 'for', 'making', 'welltimed', 'and', 'occasionallycontroversial', 'play', 'in', 'the', '

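Видно, что стемминг грубо обрезает слова, создавая несуществующие формы; лемматизация работает аккуратнее и оставляет реальные слова, хотя без указания части речи тоже ошибается (например, «has» → «ha»).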

In [16]:
def find_difference(base, stemmed, lemmatized):
    """Print the tokens whose stem and lemma differ, one tab-separated row each.

    The three lists are paired by position: row ``k`` shows ``base[k]``,
    ``stemmed[k]`` and ``lemmatized[k]``. The caller is responsible for
    passing lists that line up token by token; if they were produced by
    different tokenizers the "word" column may not match the other two.

    Lists of unequal length are compared up to the shortest one instead of
    raising ``IndexError``.

    Args:
        base: Reference tokens shown in the first column.
        stemmed: Stemmed tokens.
        lemmatized: Lemmatized tokens.

    Returns:
        None. The table is written to standard output.
    """
    print("Слово\tСтемминг\tЛемматизация")
    # zip() stops at the shortest list, so a length mismatch cannot index out of range.
    for token, stem, lemma in zip(base, stemmed, lemmatized):
        if stem != lemma:
            # Rows use the same tab separator as the header line.
            print('\t'.join((token, stem, lemma)))

In [17]:
# List the words of the first demo text for which the stem and the lemma differ.
find_difference(base_result[0], stemmed_result[0], lemmatized_result[0])

Слово	Стемминг	Лемматизация
carlyle carlyl carlyle
commercial commerci commercial
aerospace aerospac aerospace
reuters reuter reuters
reuters reuter reuters
private privat private
investment invest investment
carlyle carlyl carlyle
reputation reput reputation
making make making
welltimed welltim welltimed
occasionallycontroversial occasionallycontroversi occasionallycontroversial
defense defens defense
industry industri industry
quietly quietli quietly
placedits placedit placedits
another anoth another


#### То же самое для датасета IMDB Movie Reviews

In [18]:
# Load the IMDB dataset and print its summary (splits, features, row counts).
imdb_dataset = load_dataset("imdb")
print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [19]:
# Slice the rows first and only then pick the column: this yields a plain list
# with the first five training reviews and avoids materialising the entire "text"
# column on `datasets` versions where column access returns a full Python list.
imdb_texts = imdb_dataset["train"][:5]["text"]
print(imdb_texts[0])

I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, eve

In [20]:
# Run the three pipelines over the IMDB sample: baseline split, stemming, lemmatization.
imdb_base_result = list(map(base_preprocess, imdb_texts))
imdb_stemmed_result = list(map(preprocess_with_stemming, imdb_texts))
imdb_lemmatized_result = list(map(preprocess_with_lemmatization, imdb_texts))

In [21]:
# For each IMDB review, print the baseline, stemmed and lemmatized token lists one after another.
for review_index, stemmed_tokens in enumerate(imdb_stemmed_result):
    print("Необработанные токены:")
    print(imdb_base_result[review_index])
    print("Stemmed токены:")
    print(stemmed_tokens)
    print("Lemmatized Токены:")
    print(imdb_lemmatized_result[review_index])

Необработанные токены:
['i', 'rented', 'i', 'am', 'curiousyellow', 'from', 'my', 'video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it', 'was', 'first', 'released', 'in', '1967', 'i', 'also', 'heard', 'that', 'at', 'first', 'it', 'was', 'seized', 'by', 'us', 'customs', 'if', 'it', 'ever', 'tried', 'to', 'enter', 'this', 'country', 'therefore', 'being', 'a', 'fan', 'of', 'films', 'considered', 'controversial', 'i', 'really', 'had', 'to', 'see', 'this', 'for', 'myselfbr', 'br', 'the', 'plot', 'is', 'centered', 'around', 'a', 'young', 'swedish', 'drama', 'student', 'named', 'lena', 'who', 'wants', 'to', 'learn', 'everything', 'she', 'can', 'about', 'life', 'in', 'particular', 'she', 'wants', 'to', 'focus', 'her', 'attentions', 'to', 'making', 'some', 'sort', 'of', 'documentary', 'on', 'what', 'the', 'average', 'swede', 'thought', 'about', 'certain', 'political', 'issues', 'such', 'as', 'the', 'vietnam', 'war', 'and', 'race', 'issues', 'in'

In [22]:
# List the words of the first IMDB review for which the stem and the lemma differ.
find_difference(imdb_base_result[0], imdb_stemmed_result[0], imdb_lemmatized_result[0])

Слово	Стемминг	Лемматизация
rented rent rented
because becaus because
controversy controversi controversy
surrounded surround surrounded
released releas released
seized seiz seized
us us u
tried tri tried
this thi this
country countri country
therefore therefor therefore
being be being
considered consid considered
controversial controversi controversial
really realli really
this thi this
centered center centered
named name named
everything everyth everything
focus focu focus
attentions attent attention
making make making
documentary documentari documentary
average averag average
political polit political
issues issu issue
as as a
issues issu issue
united unit united
asking ask asking
ordinary ordinari ordinary
politics polit politics
classmates classmat classmate
married marri married
this thi this
considered consid considered
pornographic pornograph pornographic
really realli really
nudity nuditi nudity
cheaply cheapli cheaply
countrymen countrymen countryman
shocking shock shocking
r

### Очистка от стоп-слов

In [23]:
# Download the NLTK stop-word corpus and keep the English list as a set,
# so the membership tests in the filters below are cheap.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\R1sed\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [30]:
# Show the English stop words in alphabetical order (sorted() accepts the set directly).
print(sorted(stop_words))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she', "she's", 'should', "should've", 'shouldn', "shouldn't", 'so', 'some',

In [25]:
# Fetch row 0 and then its text, as the first-example cell does, instead of
# pulling the whole "text" column of the training split just to read one item.
example1 = dataset["train"][0]["text"]
print(example1)

Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.


In [31]:
# Show the characters treated as punctuation by the filters below.
# A set has no defined order, so the printed order may differ between runs.
print(set(string.punctuation))

{'$', '.', '#', '|', '*', ':', ';', '[', '!', "'", ']', '&', '~', '+', '%', '>', '_', '@', '\\', '{', ')', '}', '<', '(', '=', '^', '/', '`', '-', '?', ',', '"'}


In [44]:
def stopwords_preprocess(text):
    """Tokenize ``text`` and drop stop words and punctuation-only tokens.

    The text is lower-cased and tokenized with ``nltk.word_tokenize`` first;
    filtering happens on the resulting tokens. A token is dropped when it is in
    the module-level ``stop_words`` set or consists solely of characters from
    ``string.punctuation``. Punctuation inside a token is kept (``sci-fi``).

    Because contractions are split by the tokenizer, fragments that are not in
    the stop-word list survive, e.g. ``'s`` and ``n't``.

    Args:
        text: The raw input string.

    Returns:
        A list of the remaining lower-cased tokens, in their original order.
    """
    punctuation_chars = set(string.punctuation)
    kept_tokens = []
    for token in nltk.word_tokenize(text.lower()):
        if token in stop_words:
            continue
        # A token whose characters are all punctuation (including a single mark) is noise.
        if set(token) <= punctuation_chars:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [52]:
def stopwords_preprocess2(text):
    """Drop stop words from whitespace-split ``text``, then tokenize the rest.

    Unlike ``stopwords_preprocess``, stop words are removed *before*
    tokenization, on lower-cased whitespace-delimited words. Whole contractions
    that are in the module-level ``stop_words`` set are removed this way, but a
    stop word with punctuation attached (``this,`` or ``(the``) is not
    recognised and survives once the tokenizer splits the punctuation off.

    After tokenization with ``nltk.word_tokenize``, tokens consisting solely of
    characters from ``string.punctuation`` are dropped.

    Args:
        text: The raw input string.

    Returns:
        A list of the remaining lower-cased tokens, in their original order.
    """
    punctuation_chars = set(string.punctuation)
    # Stop-word removal on whitespace-delimited words.
    surviving_words = [word for word in text.lower().split() if word not in stop_words]
    candidate_tokens = nltk.word_tokenize(' '.join(surviving_words))
    # Keep only tokens that contain at least one non-punctuation character.
    return [token for token in candidate_tokens if not set(token) <= punctuation_chars]

In [45]:
# Stop-word filtering (tokenize first, then filter) on the first ag_news training text.
stopwords_preprocess(example1)

['wall',
 'st.',
 'bears',
 'claw',
 'back',
 'black',
 'reuters',
 'reuters',
 'short-sellers',
 'wall',
 'street',
 "'s",
 'dwindling\\band',
 'ultra-cynics',
 'seeing',
 'green']

In [46]:
# Fetch row 0 of the test split and then its text, instead of pulling the
# whole "text" column of the split just to read one item.
example2 = imdb_dataset["test"][0]["text"]
print(example2)

I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to alway

In [47]:
# Stop-word filtering (tokenize first, then filter) on the first IMDB test review.
stopwords_preprocess(example2)

['love',
 'sci-fi',
 'willing',
 'put',
 'lot',
 'sci-fi',
 'movies/tv',
 'usually',
 'underfunded',
 'under-appreciated',
 'misunderstood',
 'tried',
 'like',
 'really',
 'good',
 'tv',
 'sci-fi',
 'babylon',
 '5',
 'star',
 'trek',
 'original',
 'silly',
 'prosthetics',
 'cheap',
 'cardboard',
 'sets',
 'stilted',
 'dialogues',
 'cg',
 "n't",
 'match',
 'background',
 'painfully',
 'one-dimensional',
 'characters',
 'overcome',
 "'sci-fi",
 'setting',
 "'m",
 'sure',
 'think',
 'babylon',
 '5',
 'good',
 'sci-fi',
 'tv',
 "'s",
 "'s",
 'clichéd',
 'uninspiring',
 'us',
 'viewers',
 'might',
 'like',
 'emotion',
 'character',
 'development',
 'sci-fi',
 'genre',
 'take',
 'seriously',
 'cf',
 'star',
 'trek',
 'may',
 'treat',
 'important',
 'issues',
 'yet',
 'serious',
 'philosophy',
 "'s",
 'really',
 'difficult',
 'care',
 'characters',
 'simply',
 'foolish',
 'missing',
 'spark',
 'life',
 'actions',
 'reactions',
 'wooden',
 'predictable',
 'often',
 'painful',
 'watch',
 'maker

In [53]:
# Same review through the second variant (filter whitespace-split words first, then tokenize).
stopwords_preprocess2(example2)

['love',
 'sci-fi',
 'willing',
 'put',
 'lot',
 'sci-fi',
 'movies/tv',
 'usually',
 'underfunded',
 'under-appreciated',
 'misunderstood',
 'tried',
 'like',
 'this',
 'really',
 'did',
 'good',
 'tv',
 'sci-fi',
 'babylon',
 '5',
 'star',
 'trek',
 'the',
 'original',
 'silly',
 'prosthetics',
 'cheap',
 'cardboard',
 'sets',
 'stilted',
 'dialogues',
 'cg',
 'match',
 'background',
 'painfully',
 'one-dimensional',
 'characters',
 'can',
 'not',
 'overcome',
 "'sci-fi",
 'setting',
 'i',
 "'m",
 'sure',
 'think',
 'babylon',
 '5',
 'good',
 'sci-fi',
 'tv',
 'not',
 'clichéd',
 'uninspiring',
 'us',
 'viewers',
 'might',
 'like',
 'emotion',
 'character',
 'development',
 'sci-fi',
 'genre',
 'take',
 'seriously',
 'cf',
 'star',
 'trek',
 'may',
 'treat',
 'important',
 'issues',
 'yet',
 'serious',
 'philosophy',
 'really',
 'difficult',
 'care',
 'characters',
 'simply',
 'foolish',
 'missing',
 'spark',
 'life',
 'actions',
 'reactions',
 'wooden',
 'predictable',
 'often',
 'p