## Second try

In [16]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import sentencepiece as spm
#import pandas as pd

import json


#from sklearn.model_selection import train_test_split


# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#### Tokenizer

In [None]:
# Collect the `text` field of every record in train.jsonl.
texts = []
with open('train.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line is not a JSON document; json.loads('') would raise.
            continue
        texts.append(json.loads(line)['text'])

# Dump the texts to corpus.txt, one newline-terminated entry per record;
# this file is the input of the SentencePiece trainer.
with open('corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines(text + '\n' for text in texts)


In [17]:
# Train a unigram SentencePiece model on corpus.txt. The `tokenizer` prefix
# yields the `tokenizer.model` file that later cells load.
# Special-token ids are pinned explicitly: <pad>=0, <unk>=1, <s>=2, </s>=3.
trainer_options = dict(
    input='corpus.txt',
    model_prefix='tokenizer',
    vocab_size=20000,
    model_type='unigram',
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)
spm.SentencePieceTrainer.train(**trainer_options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: corpus.txt
  input_format: 
  model_prefix: tokenizer
  model_type: UNIGRAM
  vocab_size: 20000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: 2
  eos_id: 3
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differ

##### TEST

In [18]:
# Load the trained tokenizer from its model file.
sp = spm.SentencePieceProcessor()
sp.Load('tokenizer.model')

# Smoke test: encode one sample sentence both as ids and as subword pieces.
text = "Отличный русский текст для TextCNN классификации!"
tokens = sp.Encode(text, out_type=int)  # token ids
pieces = sp.Encode(text, out_type=str)  # subword pieces

# Print the same five report lines, in the same order, from a single table.
for caption, value in (
    ("ОРИГИНАЛ:", text),
    ("IDS:", tokens),
    ("ТОКЕНЫ:", pieces),
    ("Vocab size:", sp.GetPieceSize()),
    ("Длина:", len(tokens)),
):
    print(caption, value)


ОРИГИНАЛ: Отличный русский текст для TextCNN классификации!
IDS: [1069, 403, 12968, 78, 3124, 13140, 1, 1213, 36, 2111, 1554, 9090, 10]
ТОКЕНЫ: ['▁Отличный', '▁русский', '▁текст', '▁для', '▁T', 'ext', 'CNN', '▁класс', 'и', 'ф', 'ик', 'ации', '!']
Vocab size: 20000
Длина: 13


#### Data loader

In [None]:
class JsonLDataset(Dataset):
    """Text-classification dataset read from a JSON Lines file.

    Every non-blank line of the file must be a JSON object with a ``text``
    and a ``label`` key. All records are loaded into memory up front; texts
    are tokenized with SentencePiece when an item is accessed and are
    truncated or right-padded to exactly ``max_length`` token ids.

    Args:
        file_path: Path to the ``.jsonl`` file.
        tokenizer_path: Path to the trained SentencePiece model file.
        max_length: Fixed number of token ids returned per sample.
    """

    def __init__(self, file_path, tokenizer_path='tokenizer.model', max_length=256):
        self.file_path = file_path
        self.max_length = max_length
        self.data = []

        self.sp = spm.SentencePieceProcessor()
        self.sp.load(tokenizer_path)
        # The pad id never changes for a loaded model, so look it up once here
        # rather than on every __getitem__ call.
        self.pad_id = self.sp.PieceToId('<pad>')

        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Blank lines are not JSON documents; json.loads('') raises.
                    continue
                sample = json.loads(line)
                self.data.append({
                    'text': sample['text'],
                    'label': sample['label']
                })

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for record ``idx`` as ``torch.long`` tensors.

        ``token_ids`` always has length ``max_length``: longer encodings are cut
        off at the end, shorter ones are padded on the right with the pad id.
        """
        item = self.data[idx]

        # Slicing is a no-op for short sequences, so one code path handles both
        # truncation and padding (the pad list is empty when nothing is missing).
        tokens = self.sp.encode(item['text'], out_type=int)[:self.max_length]
        tokens = tokens + [self.pad_id] * (self.max_length - len(tokens))

        text_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(item['label'], dtype=torch.long)

        return text_tensor, label_tensor

    

# Build the train/test datasets with the default tokenizer and max length.
train_dataset = JsonLDataset('train.jsonl')
test_dataset = JsonLDataset('test.jsonl')

# Both loaders share batch size and worker count; only the training loader shuffles.
loader_kwargs = dict(batch_size=32, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)


## Model