# Подготовка данных для тренировки модели

- Для начала соберём единый текстовый корпус из литературных произведений, чтобы оценить метрики модели на старте
- В качестве источника используем хорошо структурированный датасет русской литературы с платформы [Kaggle](https://www.kaggle.com/datasets/d0rj3228/russian-literature)

In [3]:
# Switch the working directory to the parent folder so that the relative
# `data/...` paths used below resolve from there.
# NOTE(review): this depends on where the kernel was started and is not
# idempotent — re-running the cell moves up one more level.
%cd ..

/Users/rzaripov/Desktop/ITMO/avito-ds-internship-task


In [8]:
import os
import zipfile
import glob
from tqdm.auto import tqdm
import re
from collections import Counter
import string
from datasets import load_dataset
import getpass
# Ask for the Hugging Face token only when it is not already present in the
# environment: an exported token is never overwritten and "Restart & Run All"
# does not stop at an interactive prompt. getpass keeps the token out of the
# notebook source and of the recorded output.
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass.getpass("Enter your HF_TOKEN: ")

  from .autonotebook import tqdm as notebook_tqdm


# [russian-literature](https://www.kaggle.com/datasets/d0rj3228/russian-literature)

In [6]:
# Folder (relative to the current working directory) that holds the downloaded
# archive, the extracted texts and the merged corpus file.
data_dir = 'data/russian-literature'

In [7]:
# Create the data folder (including parents); exist_ok=True makes the cell safe to re-run.
os.makedirs(data_dir, exist_ok=True)

In [None]:
#!/bin/bash
!curl -L -o {data_dir}/russian-literature.zip\
  https://www.kaggle.com/api/v1/datasets/download/d0rj3228/russian-literature

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 20.5M  100 20.5M    0     0  7168k      0  0:00:02  0:00:02 --:--:-- 8937k


In [None]:
# Unpack the archive into data_dir: -o overwrites existing files without prompting,
# -q suppresses the per-file listing, -d sets the extraction directory.
# NOTE(review): -O utf-8 presumably selects the encoding of archived file names and is
# not supported by every unzip build — confirm on the target platform.
!unzip -O utf-8 -o -q {data_dir}/russian-literature.zip -d {data_dir}

In [8]:
def process_file(file_path):
    """Return the contents of a UTF-8 text file flattened to a single line.

    Newlines, tabs and any other runs of whitespace become single spaces;
    leading and trailing whitespace is dropped.
    """
    with open(file_path, encoding="utf-8") as source:
        raw = source.read()

    # str.split() without arguments splits on every whitespace run (newlines
    # and tabs included) and discards empty pieces, so re-joining the pieces
    # with one space normalizes all spacing in a single step.
    words = raw.split()
    return " ".join(words)

In [9]:
def process_all_files(output_file):
    """Merge the cleaned texts of the selected dataset folders into one file.

    Every ``*.txt`` file found (recursively) under ``{data_dir}/prose`` and
    ``{data_dir}/publicism`` is passed through ``process_file`` and appended to
    ``output_file``, with a single space after each non-empty text.

    Args:
        output_file: Path of the corpus file to create; an existing file is
            overwritten.
    """
    folders_to_process = ["prose", "publicism"]

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for folder in folders_to_process:
            folder_path = f'{data_dir}/{folder}'

            if not os.path.isdir(folder_path):
                # Report the gap instead of silently building a smaller corpus.
                print(f"Skipping missing folder {folder_path}")
                continue

            txt_pattern = os.path.join(folder_path, "**", "*.txt")
            # glob returns paths in file-system order, which can differ between
            # machines; sorting makes the resulting corpus reproducible.
            txt_files = sorted(glob.glob(txt_pattern, recursive=True))

            print(f"Found {len(txt_files)} files in {folder_path}")

            for txt_file in tqdm(txt_files, desc=f"Processing {folder_path}"):
                text = process_file(txt_file)
                if text:
                    outfile.write(text + " ")

In [10]:
# Path of the merged literature corpus, stored inside data_dir.
output_file = f'{data_dir}/training_data_russian_literature.txt'

In [11]:
# Build the merged corpus from the prose and publicism folders.
process_all_files(output_file)

Found 269 files in data/russian-literature/prose


Processing data/russian-literature/prose: 100%|██████████| 269/269 [00:00<00:00, 548.88it/s]


Found 26 files in data/russian-literature/publicism


Processing data/russian-literature/publicism: 100%|██████████| 26/26 [00:00<00:00, 2469.14it/s]


In [15]:
# Size of the merged corpus in GiB (bytes / 1024**3).
os.path.getsize(output_file) / 1024 ** 3

0.06160872895270586

In [18]:
def analyze_symbols(filename):
    """Count the non-whitespace characters of a UTF-8 text file.

    Returns a list of ``(character, count)`` tuples ordered from the most to
    the least frequent character.
    """
    with open(filename, encoding='utf-8') as source:
        frequencies = Counter(source.read())

    # most_common() yields the pairs sorted by descending count; whitespace
    # characters are left out of the report.
    return [
        (char, count)
        for char, count in frequencies.most_common()
        if not char.isspace()
    ]

# Character statistics of the literature corpus.
# NOTE(review): depends on `output_file` from an earlier cell; run the notebook top to
# bottom, otherwise this line fails with a NameError.
symbols = analyze_symbols(output_file)

NameError: name 'output_file' is not defined

In [19]:
# Show the (character, count) pairs, most frequent first.
symbols

[('о', 3205703),
 ('е', 2458264),
 ('а', 2291190),
 ('и', 1909490),
 ('н', 1807461),
 ('т', 1731853),
 ('с', 1486543),
 ('л', 1374696),
 ('в', 1278746),
 ('р', 1189235),
 ('к', 944286),
 ('м', 893448),
 ('д', 867177),
 (',', 830774),
 ('у', 810240),
 ('п', 699358),
 ('я', 624182),
 ('ь', 576643),
 ('ы', 536645),
 ('г', 523339),
 ('б', 494544),
 ('з', 465634),
 ('ч', 460831),
 ('.', 390535),
 ('й', 316944),
 ('ж', 312867),
 ('ш', 256599),
 ('х', 247373),
 ('ю', 190488),
 ('-', 146158),
 ('ц', 97758),
 ('щ', 86729),
 ('–', 80611),
 ('э', 72278),
 ('!', 63656),
 ('Н', 63043),
 ('—', 62599),
 ('В', 54288),
 ('П', 52162),
 ('?', 50259),
 ('О', 41789),
 ('А', 40603),
 (';', 40595),
 (':', 37350),
 ('С', 37130),
 ('К', 34873),
 ('ф', 34276),
 ('И', 33208),
 ('М', 32682),
 ('Д', 28547),
 ('Т', 28342),
 ('Я', 26460),
 ('e', 24060),
 ('…', 23733),
 ('«', 19061),
 ('»', 18906),
 ('Г', 17665),
 ('Б', 17332),
 ('Л', 14904),
 ('ё', 14235),
 ('Ч', 14062),
 ('"', 13906),
 ('Р', 13515),
 ('a', 12583),


# [ru_news](https://huggingface.co/datasets/IlyaGusev/ru_news)

In [10]:
# Open the train split in streaming mode, so examples are fetched lazily while
# iterating instead of downloading the whole dataset first.
dataset = load_dataset('IlyaGusev/ru_news', split="train", streaming=True)

In [11]:
# Regular expression that strips emoji and other pictographic symbols.
# NOTE(review): several ranges below overlap, and the wide ones
# (U+24C2–U+1F251, U+10000–U+10FFFF) also match non-emoji characters such as
# CJK ideographs — confirm that removing them is intended.
emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols and pictographs
    u"\U0001F680-\U0001F6FF"  # transport and map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"  # miscellaneous symbols
    u"\U00002702-\U000027B0"
    u"\U00002702-\U000027B0"  # same range as the previous line (no extra effect)
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u2640-\u2642" 
    u"\u2600-\u2B55"
    u"\u200d"  # zero-width joiner
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # variation selector-16
    u"\u3030"
    "]+", flags=re.UNICODE)

In [12]:
output_filename = "data/ru_news.txt"
max_news_count = 100000  # number of leading dataset examples to process

# Write the cleaned news texts into one file, each followed by a single space.
with open(output_filename, 'w', encoding='utf-8') as file:
    for i, example in tqdm(enumerate(dataset), total=max_news_count, desc="Обработка новостей"):
        if i >= max_news_count:
            break

        # Strip emoji first and normalize whitespace afterwards: removing an emoji
        # can leave leading/trailing or doubled spaces, and a text made only of
        # emoji and whitespace must end up empty so that it is skipped below.
        cleaned_text = emoji_pattern.sub('', example["text"])
        # split() without arguments splits on any whitespace run (newlines and tabs
        # included) and drops empty pieces, so the join collapses and trims at once.
        cleaned_text = ' '.join(cleaned_text.split())

        if cleaned_text:
            file.write(cleaned_text + ' ')

Обработка новостей:   0%|          | 0/100000 [00:00<?, ?it/s]

Обработка новостей: 100%|██████████| 100000/100000 [00:22<00:00, 4364.24it/s]


In [24]:
# Character statistics of the news corpus; this rebinds `symbols`, replacing the
# statistics computed earlier for the literature corpus.
symbols = analyze_symbols(output_filename)

In [25]:
# Show the (character, count) pairs for the news corpus, most frequent first.
symbols

[('о', 14018919),
 ('е', 10762604),
 ('а', 10427393),
 ('и', 10246328),
 ('н', 8588432),
 ('т', 8129835),
 ('с', 7155343),
 ('р', 7129403),
 ('в', 5799274),
 ('л', 5265821),
 ('к', 4184997),
 ('д', 3804829),
 ('м', 3733835),
 ('п', 3578049),
 ('у', 3035290),
 ('я', 2374514),
 ('ы', 2299874),
 ('г', 2108336),
 ('б', 2039534),
 ('з', 1966629),
 (',', 1886530),
 ('ь', 1756781),
 ('й', 1648078),
 ('ч', 1557462),
 ('.', 1464816),
 ('х', 1096102),
 ('ж', 1055872),
 ('ц', 797438),
 ('ю', 750233),
 ('ш', 684673),
 ('"', 494046),
 ('щ', 493404),
 ('С', 488934),
 ('-', 447273),
 ('ф', 399760),
 ('В', 371253),
 ('э', 368910),
 ('П', 355319),
 ('1', 341693),
 ('0', 332229),
 ('К', 328899),
 ('А', 328395),
 ('Р', 308544),
 ('2', 292569),
 ('М', 271437),
 ('О', 259714),
 ('Н', 235432),
 ('n', 207985),
 ('Т', 206674),
 ('И', 184506),
 ('«', 180647),
 ('»', 179956),
 ('Д', 172841),
 ('Б', 146128),
 ('Г', 134438),
 ('5', 132722),
 ('У', 121146),
 ('3', 118439),
 ('e', 114213),
 ('Ф', 112714),
 ('4', 10