In [1]:
import random
import re
import unicodedata

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
from torch.utils.data import Dataset, random_split, DataLoader
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Document-type names as they appear in the HINHTHUC column, listed in
# class-id order: the position of a name in this tuple is its integer label.
_DOCUMENT_TYPES = (
    'Công văn',
    'Quyết định',
    'Báo cáo',
    'Thông báo',
    'Kế hoạch',
    'Tờ trình',
    'Thư mời',
    'Đơn',
    'Giấy mời',
)

# Lookup table from document-type name to integer class id (0..8).
classes = {name: class_id for class_id, name in enumerate(_DOCUMENT_TYPES)}

# Shared tokenizer: loaded once at module level and reused by transforms_data
# for every sample.
# NOTE(review): 'bert-base-uncased' is presumably an English-vocabulary
# checkpoint, while the class names above are Vietnamese — confirm that this
# vocabulary is the intended one.
PRETRAINED_TOKENIZER_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_TOKENIZER_NAME)

class CreateData(Dataset):
    """Map-style dataset backed by a CSV file of labelled text samples.

    Each row supplies the raw text in column ``TRICH_YEU`` and the
    document-type name in column ``HINHTHUC``. Items are encoded on access by
    the module-level ``transforms_data`` function.
    """

    def __init__(self, csv_file):
        """Read ``csv_file`` (UTF-8) fully into memory as ``self.data_frame``."""
        self.csv_file = csv_file
        self.data_frame = pd.read_csv(csv_file, encoding='utf-8')

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.data_frame)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, class_id)`` for row ``index``.

        The lookup uses ``.loc``, i.e. it is by index label. The frame keeps
        the default index that ``read_csv`` assigns, so the label equals the
        row position.
        """
        frame = self.data_frame
        raw_text = frame.loc[index, 'TRICH_YEU']
        raw_label = frame.loc[index, 'HINHTHUC']

        token_ids, attention_mask, class_id = transforms_data(raw_text, raw_label)
        return token_ids, attention_mask, class_id
    

def transforms_data(input_text, label, max_length=128):
    """Convert one (text, document-type) pair into model-ready values.

    The text is lower-cased, stripped of combining marks, and encoded with the
    module-level ``tokenizer``, padded/truncated to ``max_length`` tokens.

    Args:
        input_text: Raw text of one sample; must be a ``str`` (``.lower()`` is
            called on it).
        label: Document-type name; must be a key of the module-level
            ``classes`` mapping.
        max_length: Length every encoded sequence is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded.

    Returns:
        A tuple ``(input_ids, attention_mask, class_id)``: the first entry of
        the tokenizer's ``input_ids`` and ``attention_mask`` outputs, and the
        integer id looked up in ``classes``.

    Raises:
        KeyError: If ``label`` is not present in ``classes``.
    """
    # NFD splits an accented letter into its base letter followed by combining
    # marks (Unicode category "Mn"); dropping those marks leaves the base letter.
    # NOTE(review): letters with no canonical decomposition (e.g. 'đ') pass
    # through unchanged — confirm whether they should be mapped to ASCII too.
    decomposed = unicodedata.normalize('NFD', input_text.lower())
    plain_text = ''.join(ch for ch in decomposed if unicodedata.category(ch) != 'Mn')

    encoded = tokenizer(
        plain_text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    # Only one text is encoded per call, so index 0 selects its sequence.
    input_tok = encoded['input_ids'][0]
    attention_mask = encoded['attention_mask'][0]

    return input_tok, attention_mask, classes[label]

In [16]:
# Load the labelled CSV into a CreateData dataset and display its sample count.
# NOTE(review): the path is absolute and machine-specific — consider reading it
# from a configurable data directory.
dataset = CreateData(csv_file='/home/duypd/ThisPC-DuyPC/khangta-thesis/Dataset/final_data.csv')
len(dataset)

101243

In [17]:
# Peek at one encoded sample; indexing dispatches to CreateData.__getitem__.
# The (misspelled) name `imput_msk` is kept because the next cell displays it.
input_tok, imput_msk, label = dataset[78]
input_tok

tensor([  101,  2310, 20098,  2278, 27699,  2072, 24209,  6672,  2102,  1102,
         2239, 12731,  2050,  6187,  2278,  7570,  4907, 13843, 19610,  2654,
         1102, 19098,  3070,  2084,  2232, 15990,  2319,  4229,  1010, 11382,
         2368, 12835,  4048,  1102,  4887, 10722,  2002, 27468, 27793,  4017,
        16371, 10085, 22794,  2100,  1996, 14163,  5063, 27793,  4017, 16371,
        10085,  1010,  6887, 19098,  3070,  2084,  2232, 15990,  2319,  1010,
        24110,  2260,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [18]:
# Attention mask of the sample unpacked in the previous cell. In the output
# below, the 1s line up with the non-zero token ids and the 0s with the
# zero-valued padding positions.
imput_msk

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

In [19]:
# Split the dataset into 80% train and 20% validation.
# The split draws from a seeded generator so that the same rows land in each
# subset after every kernel restart; without it the validation set changes
# from run to run and results are not comparable.
SPLIT_SEED = 42
train_size = int(0.8 * len(dataset))
valid_size = len(dataset) - train_size  # remainder, so the two sizes always sum to len(dataset)
train_dataset, valid_dataset = random_split(
    dataset,
    [train_size, valid_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Create DataLoaders for train and validation; only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

In [22]:
# Sanity check: pull a single batch from the training loader and print the
# sizes of the token-id and mask tensors, plus the batch of labels.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    sample, msk, label = first_batch
    print(sample.size())
    print(msk.size())
    print(label)

torch.Size([32, 128])
torch.Size([32, 128])
tensor([2, 3, 7, 2, 3, 3, 3, 5, 1, 7, 5, 2, 7, 6, 6, 5, 2, 5, 3, 7, 1, 1, 0, 7,
        8, 5, 5, 1, 5, 1, 5, 2])
