In [1]:
import os
import pandas as pd

# ฟังก์ชันสำหรับอ่านไฟล์จากโฟลเดอร์ตามลำดับ
def read_data_from_folder(folder_path):
    data = []
    # เรียงลำดับชื่อไฟล์ตามลำดับตัวเลข
    file_names = sorted(os.listdir(folder_path))
    for file_name in file_names:
        if file_name.endswith(".txt"):
            file_path = os.path.join(folder_path, file_name)

            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    line = line.strip()
                    if line:  # ข้ามแถวว่าง
                        parts = line.split("\t")
                        if len(parts) == 4:  # ถ้ามี 4 คอลัมน์
                            data.append(parts)
                        elif len(parts) == 3:  # ถ้ามี 3 คอลัมน์ เติมค่า default สำหรับ `tag`
                            parts.insert(2, "O")  # ใส่ค่า "O" ที่ตำแหน่ง index 2
                            data.append(parts)
                        else:
                            print(f"Invalid line in {file_name}: {line}")
    return data

# ฟังก์ชันสำหรับรวบรวมและบันทึกข้อมูล
def process_and_save_data(input_folder, output_file):
    data = read_data_from_folder(input_folder)
    df = pd.DataFrame(data, columns=["word", "pos", "tag", "class"])
    df.to_csv(output_file, index=False, encoding="utf-8")
    print(f"Saved processed data to {output_file}")

# โฟลเดอร์ข้อมูล (แก้ไขให้ตรงกับโครงสร้างใน Kaggle)
train_folder = "train"
test_folder = "test"
eval_folder = "eval"

# เซฟข้อมูลเป็นไฟล์ CSV
process_and_save_data(train_folder, "train_data.csv")
process_and_save_data(eval_folder, "eval_data.csv")
process_and_save_data(test_folder, "test_data.csv")


Saved processed data to train_data.csv
Saved processed data to eval_data.csv
Saved processed data to test_data.csv


In [2]:
import pandas as pd

# โหลดข้อมูล
train_data = pd.read_csv('train_data.csv')
eval_data = pd.read_csv('eval_data.csv')
test_data = pd.read_csv('test_data.csv')

# ตรวจสอบตัวอย่างข้อมูล
print(train_data.head())
print(eval_data.head())
print(test_data.head())

                          word pos    tag  class
0  สภาสังคมสงเคราะห์แห่งประเทศ  NN  B_ORG  B_CLS
1                          ไทย  NN  E_ORG  I_CLS
2                          จี้  VV      O  I_CLS
3                          ศาล  NN      O  I_CLS
4                      ไฟเขียว  VV      O  I_CLS
    word pos    tag  class
0   โฆษก  NN      O  B_CLS
1   กอส.  NN  B_ORG  I_CLS
2  ตำหนิ  VV      O  I_CLS
3   แมนฯ  NN  B_ORG  I_CLS
4      _  NN  I_ORG  I_CLS
     word pos tag  class
0     รัฐ  NN   O  B_CLS
1  ถังแตก  VV   O  I_CLS
2     วิก  NN   O  I_CLS
3       _  NN   O  I_CLS
4       7  NN   O  I_CLS


In [3]:
def group_sentences(data):
    sentences = []
    sentence = []
    
    for _, row in data.iterrows():
        word, tag, cls = row['word'], row['tag'], row['class']
        
        # Start new sentence if B_CLS found
        if cls == 'B_CLS':
            if sentence:  # Save previous sentence if exists
                sentences.append(sentence)
            sentence = [(word, tag)]
            
        # Continue current sentence for I_CLS
        elif cls == 'I_CLS':
            sentence.append((word, tag))
            
        # End sentence at E_CLS
        elif cls == 'E_CLS':
            sentence.append((word, tag))
            sentences.append(sentence)
            sentence = []
    
    # Add last sentence if exists
    if sentence:
        sentences.append(sentence)
    
    return sentences

# Create sentence groups
train_sentences = group_sentences(train_data)
eval_sentences = group_sentences(eval_data) 
test_sentences = group_sentences(test_data)

# Print sample to verify
print("Sample sentence:")
print(train_sentences[0])
print(f"\nTotal sentences: {len(train_sentences)}")

Sample sentence:
[('สภาสังคมสงเคราะห์แห่งประเทศ', 'B_ORG'), ('ไทย', 'E_ORG'), ('จี้', 'O'), ('ศาล', 'O'), ('ไฟเขียว', 'O'), ('ขาย', 'O'), ('สินทรัพย์', 'O'), ('ปู', 'O'), ('ทาง', 'O'), ('ปรับ', 'O'), ('โครงสร้าง', 'O')]

Total sentences: 220789


In [4]:
%pip install -U transformers datasets
%pip install sentencepiece
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# โหลด Tokenizer และ Model
tokenizer = AutoTokenizer.from_pretrained("lst-nectec/HoogBERTa-NER-lst20")
model = AutoModelForTokenClassification.from_pretrained("lst-nectec/HoogBERTa-NER-lst20").to("cuda")


# ดูรายละเอียดโมเดล (เช่น จำนวน Labels)
print(model.config.num_labels)


  from .autonotebook import tqdm as notebook_tqdm


48


In [6]:
# Tokenize ข้อมูล
def tokenize_and_align_labels(sentences):
    tokenized_inputs = []
    labels = []
    
    for sentence in sentences:
        words, tags = zip(*sentence)
        tokenized_input = tokenizer(list(words), is_split_into_words=True, truncation=True, padding=True, max_length=128)
        word_ids = tokenized_input.word_ids()  # ติดตาม index ของคำ
        label_ids = []
        
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # สำหรับตำแหน่ง padding
            else:
                label_ids.append(get_tag_id(tags[word_id]))  # ใช้ฟังก์ชัน safe mapping
        tokenized_inputs.append(tokenized_input)
        labels.append(label_ids)
    
    return tokenized_inputs, labels

# Read tag mapping from CSV
tag_mapping_df = pd.read_csv('tag_list.csv')

# Create mapping dictionaries
tag_to_id = dict(zip(tag_mapping_df['tag'], tag_mapping_df['class']))
id_to_tag = dict(zip(tag_mapping_df['class'], tag_mapping_df['tag']))

# Create safe mapping function with default value 0
def get_tag_id(tag):
    return tag_to_id.get(tag, 0)  # Returns 0 for unknown tags

# Tokenize train, eval, และ test
train_encodings, train_labels = tokenize_and_align_labels(train_sentences)
eval_encodings, eval_labels = tokenize_and_align_labels(eval_sentences)
test_encodings, test_labels = tokenize_and_align_labels(test_sentences)

In [7]:
train_labels

[[-100, 1, 1, 1, 1, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100],
 [-100, 1, 6, 6, 6, 6, 6, 7, 0, 0, 0, 0, 0, 0, 0, -100],
 [-100, 0, 0, 0, 0, -100],
 [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100],
 [-100, 0, 0, -100],
 [-100, 1, 1, 1, 0, 0, 1, 0, 0, 0, -100],
 [-100, 0, 0, 0, 0, 0, -100],
 [-100, 0, 1, 1, 0, 0, -100],
 [-100, 0, 0, 0, -100],
 [-100, 0, 0, -100],
 [-100, 0, 0, -100],
 [-100, 0, 0, 0, 0, 0, 0, 0, 0, 13, 5, 5, 5, 16, -100],
 [-100, 0, 0, 0, -100],
 [-100, 0, 1, 1, 0, -100],
 [-100, 0, 1, 0, 0, 0, 0, 1, 6, 7, 0, 0, 0, 0, 0, 0, -100],
 [-100, 0, 0, 0, 0, 0, 0, 0, 1, 7, 7, 7, 0, 0, -100],
 [-100, 2, 2, 2, 2, 8, 10, 10, 0, 0, 0, 1, 0, 0, 0, 0, 0, -100],
 [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100],
 [-100, 0, 0, 0, 0, 0, 0, 1, -100],
 [-100, 6, 7, 7, 7, -100],
 [-100,
  0,
  0,
  0,
  0,
  0,
  4,
  15,
  15,
  15,
  15,
  17,
  0,
  0,
  0,
  0,
  4,
  4,
  4,
  15,
  15,
  15,
  17,
  0,
  -100],
 [-100,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  6,
  7,
  0,
  0,
  0,
  4,
  4,
  4,
  15,
  15,
 

In [8]:
test_encodings

[{'input_ids': [0, 762, 32392, 855, 8999, 4, 282, 4, 107, 11623, 2751, 1246, 2989, 4215, 112, 4, 640, 15781, 1027, 55, 12011, 5008, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 45, 98, 3483, 2], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]},
 {'input_ids': [0, 406, 4055, 3219, 4, 49608, 14118, 4, 31, 988, 4, 208, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 20, 26756, 5, 4027, 3470, 3616, 6663, 7520, 4, 30, 5899, 20300, 29, 4, 423, 15, 2751, 20312, 304, 64, 4, 2982, 4, 235, 4, 229, 4, 845, 95, 372, 231, 4, 12, 225, 209, 423, 11, 4, 2098, 1440, 2912, 6394, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'att

In [9]:
print("Sample mappings:")
print("\nTag to ID examples:")
print({k: v for k, v in list(tag_to_id.items())[:5]})
print("\nID to Tag examples:")
print({k: v for k, v in list(id_to_tag.items())[:5]})

# Test unknown tag handling
print("\nUnknown tag test:")
print(f"Unknown tag 'TEST' maps to: {get_tag_id('TEST')}")

Sample mappings:

Tag to ID examples:
{'O': 0, 'B_ORG': 1, 'B_PER': 2, 'B_LOC': 3, 'B_MEA': 4}

ID to Tag examples:
{0: 'O', 1: 'B_ORG', 2: 'B_PER', 3: 'B_LOC', 4: 'B_MEA'}

Unknown tag test:
Unknown tag 'TEST' maps to: 0


In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

# 1. Modified NERDataset with proper padding
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, max_length=128):
        self.encodings = encodings
        self.labels = labels
        self.max_length = max_length

    def __getitem__(self, idx):
        # Get current encoding and labels
        encoding = self.encodings[idx]
        label = self.labels[idx]
        
        # Pad input_ids and attention_mask
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        
        # Ensure consistent length
        if len(input_ids) > self.max_length:
            input_ids = input_ids[:self.max_length]
            attention_mask = attention_mask[:self.max_length]
            label = label[:self.max_length]
        else:
            # Pad sequences
            padding_length = self.max_length - len(input_ids)
            input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
            attention_mask = attention_mask + [0] * padding_length
            label = label + [-100] * padding_length
        
        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(label)
        }

    def __len__(self):
        return len(self.encodings)

# 2. Create datasets with fixed length
train_dataset = NERDataset(train_encodings, train_labels)
eval_dataset = NERDataset(eval_encodings, eval_labels)
test_dataset = NERDataset(test_encodings, test_labels)

# 3. Create DataLoader with fixed batch size
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16)
# DataLoader สำหรับ test ต้องไม่มีการสุ่ม
test_loader = DataLoader(
    test_dataset, 
    batch_size=16,
    shuffle=False,  # ตั้งค่าเป็น False เพื่อรักษาลำดับ
    drop_last=False  # ไม่ทิ้ง batch สุดท้ายแม้จะไม่เต็ม batch_size
)


In [13]:
import torch
from torch.utils.data import Subset

# คำนวณจำนวนตัวอย่างที่ต้องการ (20% ของ train_loader)
num_samples = int(len(train_dataset) * 0.1)

# สุ่มเลือกตัวอย่าง
indices = torch.randperm(len(train_dataset)).tolist()[:num_samples]

# สร้าง Subset ของ train_dataset
subset_train_dataset = Subset(train_dataset, indices)

# สร้าง DataLoader ใหม่สำหรับ subset
train_loader = DataLoader(subset_train_dataset, batch_size=16, shuffle=True)

# ตรวจสอบจำนวนตัวอย่างใน subset_train_loader
print(f"Number of samples in subset_train_loader: {len(train_loader.dataset)}")

Number of samples in subset_train_loader: 22078


In [15]:
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer

def evaluate(model, dataloader, id_to_tag):
    model.eval()
    true_labels = []
    pred_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to("cuda")
            attention_mask = batch['attention_mask'].to("cuda")
            labels = batch['labels'].cpu().numpy()
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=2).cpu().numpy()
            
            # Process each sequence in the batch
            for label, pred in zip(labels, predictions):
                # Filter out padding (-100)
                valid_indices = label != -100
                true_sequence = label[valid_indices]
                pred_sequence = pred[valid_indices]
                
                # Convert to tag names
                true_tags = [id_to_tag[l] for l in true_sequence]
                pred_tags = [id_to_tag[p] for p in pred_sequence]
                
                true_labels.append(true_tags)
                pred_labels.append(pred_tags)
    
    # Convert to binary format using MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    true_binary = mlb.fit_transform([[tag] for tag in sum(true_labels, [])])
    pred_binary = mlb.transform([[tag] for tag in sum(pred_labels, [])])
    
    # Calculate metrics
    f1 = f1_score(true_binary, pred_binary, average='macro')
    report = classification_report(true_binary, pred_binary, 
                                target_names=mlb.classes_)
    
    return f1, report

In [16]:
from torch.cuda.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup
import torch.nn as nn

# 1. Initialize gradient scaler for mixed precision
scaler = GradScaler()

# 2. Setup optimized parameters
batch_size = 64  # increased batch size
accumulation_steps = 2  # gradient accumulation
num_epochs = 1
max_grad_norm = 1.0  # gradient clipping
warmup_ratio = 0.1

# 3. Setup optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-3)
total_steps = len(train_loader) * num_epochs // accumulation_steps
warmup_steps = int(total_steps * warmup_ratio)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

  scaler = GradScaler()


In [17]:
# 4. Optimized training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}')
    optimizer.zero_grad()
    
    for i, batch in enumerate(progress_bar):
        # Move batch to GPU
        input_ids = batch['input_ids'].to("cuda")
        attention_mask = batch['attention_mask'].to("cuda")
        labels = batch['labels'].to("cuda")
        
        # Mixed precision training
        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / accumulation_steps
        
        # Scaled backward pass
        scaler.scale(loss).backward()
        
        if (i + 1) % accumulation_steps == 0:
            # Gradient clipping
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            
            # Optimizer and scheduler step
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
        
        total_loss += loss.item() * accumulation_steps
        progress_bar.set_postfix({
            'loss': loss.item() * accumulation_steps,
            'lr': scheduler.get_last_lr()[0]
        })
        
    # Evaluation
    avg_loss = total_loss / len(train_loader)
    print(f"Average loss: {avg_loss:.4f}")

  with autocast():
Epoch 1: 100%|██████████| 1380/1380 [03:23<00:00,  6.78it/s, loss=0.761, lr=0]       

Average loss: 1.3698





In [18]:
val_f1, val_report = evaluate(model, eval_loader, id_to_tag)
print(f"\nEpoch {epoch+1}:")
print(f"Validation F1: {val_f1:.4f}")
print("\nValidation Report:")
print(val_report)


Epoch 1:
Validation F1: 0.0274

Validation Report:
              precision    recall  f1-score   support

       B_BRN       0.00      0.00      0.00        73
       B_DES       0.00      0.00      0.00      2620
       B_DTM       0.00      0.00      0.00      1979
       B_LOC       0.00      0.00      0.00      4611
       B_MEA       0.00      0.00      0.00      4042
       B_NUM       0.00      0.00      0.00      1642
       B_ORG       0.00      0.00      0.00      6390
       B_PER       0.00      0.00      0.00      7632
       B_TRM       0.00      0.00      0.00       327
       B_TTL       0.00      0.00      0.00      2235
       E_BRN       0.00      0.00      0.00        13
       E_DES       0.00      0.00      0.00       982
       E_DTM       0.00      0.00      0.00      1725
       E_LOC       0.00      0.00      0.00      4374
       E_MEA       0.00      0.00      0.00       866
       E_NUM       0.00      0.00      0.00       170
       E_ORG       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
test_df = pd.read_csv('test_data.csv')
test_df

Unnamed: 0,word,pos,tag,class
0,รัฐ,NN,O,B_CLS
1,ถังแตก,VV,O,I_CLS
2,วิก,NN,O,I_CLS
3,_,NN,O,I_CLS
4,7,NN,O,I_CLS
...,...,...,...,...
213086,ครหา,VV,O,I_CLS
213087,เกี่ยวกับ,VV,O,I_CLS
213088,ความ,FX,O,I_CLS
213089,ไม่,NG,O,I_CLS


In [14]:
def predict_tags(model, dataloader, id_to_tag):
    model.eval()
    all_predictions = []
    
    with torch.no_grad():
        # ใช้ enumerate เพื่อติดตามลำดับ batch
        for batch_idx, batch in enumerate(dataloader):
            input_ids = batch['input_ids'].to("cuda")
            attention_mask = batch['attention_mask'].to("cuda")
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=2).cpu().numpy()
            
            # Process each sequence in order
            for pred, mask in zip(predictions, batch['attention_mask'].numpy()):
                valid_indices = mask == 1
                pred_sequence = pred[valid_indices]
                pred_tags = [id_to_tag[p] for p in pred_sequence]
                all_predictions.extend(pred_tags)
    
    return all_predictions

# Get predictions
predictions = predict_tags(model, test_loader, id_to_tag)

# Update test_df with predictions
test_df['tag'] = predictions[:len(test_df)]

# Save updated test_df
test_df.to_csv('test_data_with_predictions.csv', index=False)

print(f"Total predictions: {len(predictions)}")
print("\nSample of updated test data:")
print(test_df[['word', 'tag']].head(10))

Total predictions: 257378

Sample of updated test data:
     word tag
0     รัฐ   O
1  ถังแตก   O
2     วิก   O
3       _   O
4       7   O
5       _   O
6      สี   O
7     ชวด   O
8   โบนัส   O
9  ธนาคาร   O


In [15]:
tag_mapping = {
    'O': 0,
    'B_ORG': 1, 
    'B_PER': 2,
    'B_LOC': 3,
    'B_MEA': 4,
    'I_DTM': 5,
    'I_ORG': 6,
    'E_ORG': 7,
    'I_PER': 8,
    'B_TTL': 9,
    'E_PER': 10,
    'B_DES': 11,
    'E_LOC': 12,
    'B_DTM': 13,
    'B_NUM': 14,
    'I_MEA': 15,
    'E_DTM': 16,
    'E_MEA': 17,
    'I_LOC': 18,
    'I_DES': 19,
    'E_DES': 20,
    'I_NUM': 21,
    'E_NUM': 22,
    'B_TRM': 23,
    'B_BRN': 24,
    'I_TRM': 25,
    'E_TRM': 26,
    'I_TTL': 27,
    'I_BRN': 28,
    'E_BRN': 29,
    'E_TTL': 30,
    'B_NAME': 31
}

In [16]:
test_df['numeric_tag'] = test_df['tag'].map(lambda x: tag_mapping.get(x, 0))
test_df

Unnamed: 0,word,pos,tag,class,numeric_tag
0,รัฐ,NN,O,B_CLS,0
1,ถังแตก,VV,O,I_CLS,0
2,วิก,NN,O,I_CLS,0
3,_,NN,O,I_CLS,0
4,7,NN,O,I_CLS,0
...,...,...,...,...,...
213086,ครหา,VV,O,I_CLS,0
213087,เกี่ยวกับ,VV,O,I_CLS,0
213088,ความ,FX,O,I_CLS,0
213089,ไม่,NG,O,I_CLS,0


In [17]:
out = pd.read_csv('sample_submission.csv')
out

Unnamed: 0,id,ne
0,03795_0,0.0
1,03795_1,0.0
2,03795_2,1.0
3,03795_3,6.0
4,03795_4,6.0
...,...,...
213086,04276_844,
213087,04276_845,
213088,04276_846,
213089,04276_847,


In [18]:
out['ne'] = test_df['numeric_tag']
out

Unnamed: 0,id,ne
0,03795_0,0
1,03795_1,0
2,03795_2,0
3,03795_3,0
4,03795_4,0
...,...,...
213086,04276_844,0
213087,04276_845,0
213088,04276_846,0
213089,04276_847,0


In [19]:
out.to_csv('sample_submission.csv', index=False)