# Подготовка данных для тренировки модели

- Для начала соберём единый текст из литературных произведений, чтобы оценить метрики модели на старте
- В качестве источника возьмём хорошо структурированный датасет произведений русской литературы, который я нашёл на платформе [Kaggle](https://www.kaggle.com/datasets/d0rj3228/russian-literature)

In [3]:
# Switch the kernel's working directory to the parent folder; the relative
# data/... paths used in the cells below are resolved from there.
# NOTE(review): not idempotent - re-running this cell climbs one more level.
%cd ..

/Users/rzaripov/Desktop/ITMO/avito-ds-internship-task


In [17]:
import os
import zipfile
import glob
from tqdm.auto import tqdm
import re
from collections import Counter
import string

In [6]:
# Folder (relative to the current working directory) that holds the downloaded
# archive, the extracted dataset and the assembled corpus file.
data_dir = 'data/russian-literature'

In [7]:
# Create the data folder (with any missing parents); a no-op if it already exists.
os.makedirs(data_dir, exist_ok=True)

In [None]:
#!/bin/bash
!curl -L -o {data_dir}/russian-literature.zip\
  https://www.kaggle.com/api/v1/datasets/download/d0rj3228/russian-literature

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 20.5M  100 20.5M    0     0  7168k      0  0:00:02  0:00:02 --:--:-- 8937k


In [None]:
# Extract the archive into data_dir: -o overwrites existing files without
# prompting, -q suppresses the per-file listing, -d sets the target folder.
# NOTE(review): -O (archive file-name charset) is not supported by every unzip
# build - confirm it works on the machine running this notebook.
!unzip -O utf-8 -o -q {data_dir}/russian-literature.zip -d {data_dir}

In [8]:
def process_file(file_path):
    """Load a UTF-8 text file and return its content as a single line.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into
    one space, and leading/trailing whitespace is dropped.
    """
    with open(file_path, "r", encoding="utf-8") as source:
        raw = source.read()

    # str.split() without arguments splits on any whitespace run, so tabs and
    # newlines are normalised together with repeated spaces.
    words = raw.split()
    return " ".join(words)

In [9]:
def process_all_files(output_file, folders_to_process=("prose", "publicism")):
    """Concatenate the cleaned text of all .txt files into one corpus file.

    For every folder name in ``folders_to_process`` (looked up inside the
    notebook-level ``data_dir``), all ``*.txt`` files are collected
    recursively, cleaned with ``process_file`` and appended to
    ``output_file``, each followed by a single space. ``output_file`` is
    overwritten if it already exists.

    Args:
        output_file: Path of the corpus file to write.
        folders_to_process: Names of the ``data_dir`` sub-folders to read;
            defaults to the prose and publicism folders.
    """
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for folder in folders_to_process:
            folder_path = f'{data_dir}/{folder}'

            if not os.path.exists(folder_path):
                # Keep going, but make the skip visible instead of silently
                # producing a smaller corpus.
                print(f"Skipping missing folder {folder_path}")
                continue

            txt_pattern = os.path.join(folder_path, "**", "*.txt")
            # glob returns matches in arbitrary, filesystem-dependent order;
            # sorting makes the generated corpus reproducible across runs.
            txt_files = sorted(glob.glob(txt_pattern, recursive=True))

            print(f"Found {len(txt_files)} files in {folder_path}")

            for txt_file in tqdm(txt_files, desc=f"Processing {folder_path}"):
                text = process_file(txt_file)
                # Files that are empty after cleaning contribute nothing.
                if text:
                    outfile.write(text + " ")

In [10]:
# Path of the single text file that will hold the concatenated corpus.
output_file = f'{data_dir}/training_data_russian_literature.txt'

In [11]:
# Build the corpus file from the extracted dataset folders.
process_all_files(output_file)

Found 269 files in data/russian-literature/prose


Processing data/russian-literature/prose: 100%|██████████| 269/269 [00:00<00:00, 548.88it/s]


Found 26 files in data/russian-literature/publicism


Processing data/russian-literature/publicism: 100%|██████████| 26/26 [00:00<00:00, 2469.14it/s]


In [15]:
# Size of the assembled corpus file in GiB (bytes / 2**30).
os.stat(output_file).st_size / 2 ** 30

0.06160872895270586

In [18]:
def analyze_symbols(filename):
    """Count how often each non-whitespace character occurs in a UTF-8 file.

    Returns a list of ``(character, count)`` tuples ordered from the most to
    the least frequent character.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        content = handle.read()

    frequencies = Counter(content)

    # Whitespace is not of interest here; drop those entries before ranking.
    for blank in [ch for ch in frequencies if ch.isspace()]:
        del frequencies[blank]

    return frequencies.most_common()

# Character frequencies of the assembled corpus, most frequent first.
symbols = analyze_symbols(output_file)

In [19]:
# Display the (character, count) pairs computed above.
symbols

[('о', 3205703),
 ('е', 2458264),
 ('а', 2291190),
 ('и', 1909490),
 ('н', 1807461),
 ('т', 1731853),
 ('с', 1486543),
 ('л', 1374696),
 ('в', 1278746),
 ('р', 1189235),
 ('к', 944286),
 ('м', 893448),
 ('д', 867177),
 (',', 830774),
 ('у', 810240),
 ('п', 699358),
 ('я', 624182),
 ('ь', 576643),
 ('ы', 536645),
 ('г', 523339),
 ('б', 494544),
 ('з', 465634),
 ('ч', 460831),
 ('.', 390535),
 ('й', 316944),
 ('ж', 312867),
 ('ш', 256599),
 ('х', 247373),
 ('ю', 190488),
 ('-', 146158),
 ('ц', 97758),
 ('щ', 86729),
 ('–', 80611),
 ('э', 72278),
 ('!', 63656),
 ('Н', 63043),
 ('—', 62599),
 ('В', 54288),
 ('П', 52162),
 ('?', 50259),
 ('О', 41789),
 ('А', 40603),
 (';', 40595),
 (':', 37350),
 ('С', 37130),
 ('К', 34873),
 ('ф', 34276),
 ('И', 33208),
 ('М', 32682),
 ('Д', 28547),
 ('Т', 28342),
 ('Я', 26460),
 ('e', 24060),
 ('…', 23733),
 ('«', 19061),
 ('»', 18906),
 ('Г', 17665),
 ('Б', 17332),
 ('Л', 14904),
 ('ё', 14235),
 ('Ч', 14062),
 ('"', 13906),
 ('Р', 13515),
 ('a', 12583),
