## Second try

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

import sentencepiece as spm
#import pandas as pd

import json


#from sklearn.model_selection import train_test_split


# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

#### Tokenizer

In [None]:
# Collect the 'text' field of every record in the JSONL training file
# (one JSON object per line).
texts = []
with open('train.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line holds no record; json.loads('') would raise.
            continue
        texts.append(json.loads(line)['text'])

# Write the texts out as a plain-text corpus, one text per line, which is the
# input file handed to the SentencePiece trainer.
with open('corpus.txt', 'w', encoding='utf-8') as f:
    f.writelines(text + '\n' for text in texts)


In [None]:
# Train a unigram SentencePiece model with a 20000-piece vocabulary on the
# corpus file; output files use the 'tokenizer' prefix. The special-token ids
# are pinned explicitly: pad=0, unk=1, bos=2, eos=3.
spm.SentencePieceTrainer.train(
    input='corpus.txt', model_prefix='tokenizer',
    model_type='unigram', vocab_size=20000,
    pad_id=0, unk_id=1, bos_id=2, eos_id=3,
)

## Load tokenizer

In [None]:
# Build the tokenizer directly from the trained SentencePiece model file.
sp = spm.SentencePieceProcessor(model_file='tokenizer.model')

##### TEST

In [None]:
# Smoke test: reload the tokenizer from its model file and encode one sample sentence.
sp = spm.SentencePieceProcessor(model_file='tokenizer.model')

text = "–û—Ç–ª–∏—á–Ω—ã–π —Ä—É—Å—Å–∫–∏–π —Ç–µ–∫—Å—Ç –¥–ª—è TextCNN –∫–ª–∞—Å—Å–∏—Ñ–∏–∫–∞—Ü–∏–∏!"
pieces = sp.Encode(text, out_type=str)  # subword pieces
tokens = sp.Encode(text, out_type=int)  # integer ids

# Print the original text, both encodings, the vocabulary size and the id count.
for label, value in (
    ("–û–†–ò–ì–ò–ù–ê–õ:", text),
    ("IDS:", tokens),
    ("–¢–û–ö–ï–ù–´:", pieces),
    ("Vocab size:", sp.GetPieceSize()),
    ("–î–ª–∏–Ω–∞:", len(tokens)),
):
    print(label, value)


#### Data loader

In [None]:
class JsonLDataset(Dataset):
    """Text-classification dataset backed by a JSONL file.

    Every non-empty line of the file is parsed as a JSON object and its
    ``'text'`` and ``'label'`` fields are kept in memory. Texts are tokenized
    with a SentencePiece model on access and truncated or right-padded so that
    every returned sequence has exactly ``max_length`` ids.

    Args:
        file_path: Path to the JSONL file.
        tokenizer_path: Path to the SentencePiece model file.
        max_length: Length of every returned id sequence.
    """

    def __init__(self, file_path, tokenizer_path='tokenizer.model', max_length=256):
        self.file_path = file_path
        self.max_length = max_length
        self.data = []

        self.sp = spm.SentencePieceProcessor()
        self.sp.load(tokenizer_path)
        # Resolve the padding id once here instead of on every __getitem__ call.
        self.pad_id = self.sp.PieceToId('<pad>')

        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # A blank line holds no record; json.loads('') would raise.
                    continue
                sample = json.loads(line)
                self.data.append({
                    'text': sample['text'],
                    'label': sample['label']
                })

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for record ``idx`` as two long tensors.

        ``token_ids`` has exactly ``max_length`` entries: longer encodings are
        cut off at the end, shorter ones are right-padded with the pad id.
        """
        item = self.data[idx]

        # After truncation the length is at most max_length, so the pad count
        # below is never negative.
        tokens = self.sp.encode(item['text'], out_type=int)[:self.max_length]
        tokens = tokens + [self.pad_id] * (self.max_length - len(tokens))

        text_tensor = torch.tensor(tokens, dtype=torch.long)
        label_tensor = torch.tensor(item['label'], dtype=torch.long)

        return text_tensor, label_tensor

    

# Build the train and test datasets with the default tokenizer path and max_length.
train_dataset, test_dataset = (
    JsonLDataset(path) for path in ('train.jsonl', 'test.jsonl')
)

# Batches of 32 with num_workers=4; only the training data is shuffled.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32, num_workers=4)
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32, num_workers=4)


## Model

In [None]:
class TextCNN(nn.Module):
    """Convolutional text classifier.

    Pipeline: token embedding -> one Conv1d branch per kernel width -> ReLU ->
    max over the sequence axis -> concatenation -> dropout -> linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding size (also the Conv1d input channel count).
        n_filters: Output channels of every convolution branch.
        filter_sizes: Kernel widths, one convolution branch per entry.
        output_dim: Size of the final linear output.
        dropout: Dropout probability applied before the linear layer.
        pad_idx: Token id passed to the embedding as ``padding_idx``.
    """

    def __init__(self, vocab_size, embed_dim, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()

        # pad_idx is handed to nn.Embedding as its padding entry.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=pad_idx,
        )

        # Parallel convolutions: one branch per kernel width in filter_sizes.
        branches = []
        for width in filter_sizes:
            branches.append(nn.Conv1d(embed_dim, n_filters, width))
        self.convs = nn.ModuleList(branches)

        # Each branch contributes n_filters pooled features to the classifier input.
        n_features = n_filters * len(filter_sizes)
        self.fc = nn.Linear(in_features=n_features, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, text):
        """Map a ``[batch, len]`` tensor of token ids to ``[batch, output_dim]`` scores."""
        # [batch, len] -> [batch, len, dim] -> [batch, dim, len] (channels first for Conv1d)
        channels_first = self.embedding(text).transpose(1, 2)

        # Convolution + ReLU + max-pool over the full remaining length, per branch.
        features = []
        for conv in self.convs:
            activation = F.relu(conv(channels_first))
            window = activation.size(2)
            features.append(F.max_pool1d(activation, window).squeeze(2))

        # Concatenate the branch features, apply dropout, then classify.
        merged = torch.cat(features, dim=1)
        return self.fc(self.dropout(merged))

In [None]:
# The embedding table gets one row per piece in the tokenizer vocabulary.
VOCAB_SIZE = sp.GetPieceSize()

# TextCNN with 300-dim embeddings and 100 filters for each of the kernel
# widths 3, 4 and 5; output_dim=3; pad_idx=0.
model = TextCNN(
    vocab_size=VOCAB_SIZE,
    pad_idx=0,
    embed_dim=300,
    filter_sizes=[3, 4, 5],
    n_filters=100,
    output_dim=3,
    dropout=0.5,
)
model = model.to(device)


##### Model statistics

In [None]:
# Show the module structure, then the total number of parameter elements.
print(model)
param_count = sum(tensor.numel() for tensor in model.parameters())
print(f"–ü–∞—Ä–∞–º–µ—Ç—Ä—ã: {param_count}")

##### Optimizer and Loss

In [None]:
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Cross-entropy loss, moved to the selected device.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

## THE CYCLE 🌀

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy_percent)``.

    When ``optimizer`` is given, the model is put in train mode and updated
    after every batch. Otherwise the model is put in eval mode and gradient
    tracking is disabled.

    The loss is averaged per sample: each batch loss is weighted by the batch
    size, so a smaller final batch does not skew the epoch average. This
    assumes ``criterion`` returns the batch mean, as ``nn.CrossEntropyLoss()``
    does with its default reduction.

    Raises:
        ValueError: if ``loader`` yields no samples.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    n_correct = 0
    n_samples = 0

    with torch.set_grad_enabled(is_training):
        for tokens, labels in loader:
            tokens, labels = tokens.to(device), labels.to(device)

            if is_training:
                # Reset the gradients accumulated by the previous step.
                optimizer.zero_grad()

            predictions = model(tokens)
            loss = criterion(predictions, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            # Metrics: the predicted class is the index of the largest score.
            batch_size = labels.size(0)
            loss_sum += loss.item() * batch_size
            _, predicted = predictions.max(1)
            n_correct += predicted.eq(labels).sum().item()
            n_samples += batch_size

    if n_samples == 0:
        raise ValueError("loader produced no samples")

    return loss_sum / n_samples, 100. * n_correct / n_samples


best_val_loss = float('inf')
best_model_path = 'textcnn_best.pth'


N_EPOCHS = 10

for epoch in tqdm(range(N_EPOCHS), desc="Epoch"):
    train_loss_avg, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)

    # NOTE(review): test_loader is also what drives checkpoint selection below;
    # consider a separate validation split if the test score must stay unbiased.
    val_loss_avg, test_acc = _run_epoch(model, test_loader, criterion, device)

    # Keep only the weights with the lowest evaluation loss seen so far.
    if val_loss_avg < best_val_loss:
        best_val_loss = val_loss_avg
        torch.save(model.state_dict(), best_model_path)
        print(f'‚úÖ –ù–û–í–ê–Ø –õ–£–ß–®–ê–Ø! Val Loss: {val_loss_avg:.4f}')
    print(f'EpochT: {epoch+1:02}, Loss: {val_loss_avg:.4f}, Acc: {test_acc:.2f}%')

    # End of epoch: report the training metrics.
    print(f'Epoch: {epoch+1:02}, Loss: {train_loss_avg:.4f}, Acc: {train_acc:.2f}%')

print("‚úÖDONE‚úÖ")