# Text preprocessing

## Устновка зависимостей и данных

In [1]:
pip install nltk datasets

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     --------- ------------------------------ 10.2/41.5 kB ? eta -:--:--
     ---------------------------- --------- 30.7/41.5 kB 660.6 kB/s eta 0:00:01
     -------------------------------------- 41.5/41.5 kB 401.4 kB/s eta 0:00:00
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collectin


[notice] A new release of pip is available: 24.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import re
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Загрузка необходимых ресурсов NLTK
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Сергей\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Сергей\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
dataset = load_dataset("cnn_dailymail", "3.0.0", split="train")

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

## Реализация функции для предобработки текстовых данных

In [8]:
def preprocess_text(text):
    # 1. Приведение к нижнему регистру
    text = text.lower()

    # 2. Удаление лишних пробелов
    text = re.sub(r'\s+', ' ', text).strip()

    # 3. Удаление цифр
    text = re.sub(r'\d+', '', text)

    # 4. Удаление знаков препинания
    text = re.sub(r'[^\w\s]', '', text)

    # 5. Удаление URL-ссылок
    text = re.sub(r'http\S+|www\S+', '', text)

    # 6. Токенизация (на основе регулярного выражения)
    tokens = re.findall(r'\b\w+\b', text)

    # 7. Удаление стоп-слов
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 8. Стэмминг
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Соединение обработанных токенов обратно в строку
    return " ".join(tokens)

In [9]:
print(f"Пример данных: {dataset[0]}")

Пример данных: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK 

In [10]:
texts = [item['article'] for item in dataset if 'article' in item]
raw_text = " ".join(texts[:100])

processed_text = preprocess_text(raw_text)

print(f"Длина исходного текста: {len(raw_text)} символов")
print(f"Длина обработанного текста: {len(processed_text)} символов")
print(f"Первые 100 слов обработанного текста: {processed_text.split()[:100]}")

Длина исходного текста: 358630 символов
Длина обработанного текста: 216375 символов
Первые 100 слов обработанного текста: ['london', 'england', 'reuter', 'harri', 'potter', 'star', 'daniel', 'radcliff', 'gain', 'access', 'report', 'million', 'million', 'fortun', 'turn', 'monday', 'insist', 'money', 'wont', 'cast', 'spell', 'daniel', 'radcliff', 'harri', 'potter', 'harri', 'potter', 'order', 'phoenix', 'disappoint', 'gossip', 'columnist', 'around', 'world', 'young', 'actor', 'say', 'plan', 'fritter', 'cash', 'away', 'fast', 'car', 'drink', 'celebr', 'parti', 'dont', 'plan', 'one', 'peopl', 'soon', 'turn', 'suddenli', 'buy', 'massiv', 'sport', 'car', 'collect', 'someth', 'similar', 'told', 'australian', 'interview', 'earlier', 'month', 'dont', 'think', 'ill', 'particularli', 'extravag', 'thing', 'like', 'buy', 'thing', 'cost', 'pound', 'book', 'cd', 'dvd', 'radcliff', 'abl', 'gambl', 'casino', 'buy', 'drink', 'pub', 'see', 'horror', 'film', 'hostel', 'part', 'ii', 'current', 'six', 'plac

## Создание Мешка слов и N - грамм

In [11]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

def bag_of_words(text):
    words = text.split()  # Разделяем текст на слова
    return Counter(words)

def generate_ngrams(text, n):
    tokens = text.split()
    ngrams = zip(*[tokens[i:] for i in range(n)])  # Создаем сдвиги для n-грамм
    return [" ".join(gram) for gram in ngrams]

In [13]:
bow = bag_of_words(processed_text)
bigrams = generate_ngrams(processed_text, 2)
fourgrams = generate_ngrams(processed_text, 4)

In [16]:
print("Мешок слов:")
for word, count in list(bow.items())[:30]:
    print(f"{word}: {count}")

Мешок слов:
london: 34
england: 25
reuter: 23
harri: 13
potter: 7
star: 16
daniel: 3
radcliff: 4
gain: 4
access: 11
report: 109
million: 49
fortun: 4
turn: 39
monday: 26
insist: 6
money: 21
wont: 7
cast: 3
spell: 1
order: 22
phoenix: 2
disappoint: 4
gossip: 1
columnist: 2
around: 29
world: 62
young: 26
actor: 19
say: 159


In [17]:
print("\n2. Первые 10 2-грамм:")
print("\n".join(bigrams[:10]))

print("\n3. Первые 10 4-грамм:")
print("\n".join(fourgrams[:10]))


2. Первые 10 2-грамм:
london england
england reuter
reuter harri
harri potter
potter star
star daniel
daniel radcliff
radcliff gain
gain access
access report

3. Первые 10 4-грамм:
london england reuter harri
england reuter harri potter
reuter harri potter star
harri potter star daniel
potter star daniel radcliff
star daniel radcliff gain
daniel radcliff gain access
radcliff gain access report
gain access report million
access report million million
