In [1]:
import os
import pandas as pd

# ฟังก์ชันสำหรับอ่านไฟล์จากโฟลเดอร์
def read_data_from_folder(folder_path):
    data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".txt"):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    line = line.strip()
                    if line:  # ข้ามแถวว่าง
                        parts = line.split("\t")
                        if len(parts) == 4:  # ถ้ามี 4 คอลัมน์
                            data.append(parts)
                        elif len(parts) == 3:  # ถ้ามี 3 คอลัมน์ เติมค่า default สำหรับ `tag`
                            parts.insert(2, "O")  # ใส่ค่า "O" ที่ตำแหน่ง index 2
                            data.append(parts)
                        else:
                            print(f"Invalid line in {file_name}: {line}")
    return data

# ฟังก์ชันสำหรับรวบรวมและบันทึกข้อมูล
def process_and_save_data(input_folder, output_file):
    data = read_data_from_folder(input_folder)
    df = pd.DataFrame(data, columns=["word", "pos", "tag", "class"])
    df.to_csv(output_file, index=False, encoding="utf-8")
    print(f"Saved processed data to {output_file}")

# โฟลเดอร์ข้อมูล
train_folder = "train"
test_folder = "test"
eval_folder = "eval"

# เซฟข้อมูลเป็นไฟล์ CSV
process_and_save_data(train_folder, "train_data.csv")
process_and_save_data(eval_folder, "eval_data.csv")
process_and_save_data(test_folder, "test_data.csv")


Saved processed data to train_data.csv
Saved processed data to eval_data.csv
Saved processed data to test_data.csv


In [2]:
import pandas as pd

# โหลดข้อมูล
train_data = pd.read_csv('train_data.csv')
eval_data = pd.read_csv('eval_data.csv')
test_data = pd.read_csv('test_data.csv')

# ตรวจสอบตัวอย่างข้อมูล
print(train_data.head())
print(eval_data.head())
print(test_data.head())

                          word pos    tag  class
0  สภาสังคมสงเคราะห์แห่งประเทศ  NN  B_ORG  B_CLS
1                          ไทย  NN  E_ORG  I_CLS
2                          จี้  VV      O  I_CLS
3                          ศาล  NN      O  I_CLS
4                      ไฟเขียว  VV      O  I_CLS
    word pos    tag  class
0   โฆษก  NN      O  B_CLS
1   กอส.  NN  B_ORG  I_CLS
2  ตำหนิ  VV      O  I_CLS
3   แมนฯ  NN  B_ORG  I_CLS
4      _  NN  I_ORG  I_CLS
     word pos tag  class
0     รัฐ  NN   O  B_CLS
1  ถังแตก  VV   O  I_CLS
2     วิก  NN   O  I_CLS
3       _  NN   O  I_CLS
4       7  NN   O  I_CLS


In [3]:
eval_data['tag'][62605] = 'I_LOC'
eval_data['tag'][155660] = 'O'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  eval_data['tag'][62605] = 'I_LOC'
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Serie

In [4]:
eval_data['word'][62605]
eval_data['word'][155660]

'อาร์พี'

In [5]:
def group_sentences(data):
    sentences = []
    sentence = []
    for _, row in data.iterrows():
        word, tag = row['word'], row['tag']
        if word == "_":  # ตรวจสอบว่าจบประโยค
            if sentence:
                sentences.append(sentence)
                sentence = []
        else:
            sentence.append((word, tag))
    if sentence:
        sentences.append(sentence)  # เพิ่มประโยคสุดท้าย
    return sentences

train_sentences = group_sentences(train_data)
eval_sentences = group_sentences(eval_data)
test_sentences = group_sentences(test_data)

# ตัวอย่างประโยคหลังจัดกลุ่ม
print(train_sentences[0])  # [('สภา', 'B_ORG'), ('สังคมสงเคราะห์', 'I_ORG'), ...]


[('สภาสังคมสงเคราะห์แห่งประเทศ', 'B_ORG'), ('ไทย', 'E_ORG'), ('จี้', 'O'), ('ศาล', 'O'), ('ไฟเขียว', 'O'), ('ขาย', 'O'), ('สินทรัพย์', 'O'), ('ปู', 'O'), ('ทาง', 'O'), ('ปรับ', 'O'), ('โครงสร้าง', 'O'), ('สำนักงาน', 'B_ORG'), ('องค์การ', 'I_ORG'), ('พุทธศาสนิกสัมพันธ์', 'I_ORG'), ('แห่ง', 'I_ORG'), ('โลก', 'E_ORG'), ('วอน', 'O'), ('ศาล', 'O'), ('เร่ง', 'O'), ('เปิด', 'O'), ('ทาง', 'O'), ('สาย', 'O'), ('สินทรัพย์', 'O')]


In [None]:
# %pip install -U transformers datasets
# %pip install sentencepiece
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [7]:
import torch
print(torch.cuda.is_available())  # ถ้าผลลัพธ์เป็น True แสดงว่ามี GPU
print(torch.version.cuda)         # แสดงเวอร์ชันของ CUDA ที่รองรับ


True
11.8


In [8]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# โหลด Tokenizer และ Model
tokenizer = AutoTokenizer.from_pretrained("lst-nectec/HoogBERTa-NER-lst20")
model = AutoModelForTokenClassification.from_pretrained("lst-nectec/HoogBERTa-NER-lst20").to("cuda")


# ดูรายละเอียดโมเดล (เช่น จำนวน Labels)
print(model.config.num_labels)


  from .autonotebook import tqdm as notebook_tqdm


48


In [13]:
# Tokenize ข้อมูล
def tokenize_and_align_labels(sentences):
    tokenized_inputs = []
    labels = []
    
    for sentence in sentences:
        words, tags = zip(*sentence)
        tokenized_input = tokenizer(list(words), is_split_into_words=True, truncation=True, padding=True, max_length=128)
        word_ids = tokenized_input.word_ids()  # ติดตาม index ของคำ
        label_ids = []
        
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # สำหรับตำแหน่ง padding
            else:
                label_ids.append(get_tag_id(tags[word_id]))  # ใช้ฟังก์ชัน safe mapping
        tokenized_inputs.append(tokenized_input)
        labels.append(label_ids)
    
    return tokenized_inputs, labels

# Read tag mapping from CSV
tag_mapping_df = pd.read_csv('tag_list.csv')

# Create mapping dictionaries
tag_to_id = dict(zip(tag_mapping_df['tag'], tag_mapping_df['class']))
id_to_tag = dict(zip(tag_mapping_df['class'], tag_mapping_df['tag']))

# Create safe mapping function with default value 0
def get_tag_id(tag):
    return tag_to_id.get(tag, 0)  # Returns 0 for unknown tags

# Tokenize train, eval, และ test
train_encodings, train_labels = tokenize_and_align_labels(train_sentences)
eval_encodings, eval_labels = tokenize_and_align_labels(eval_sentences)
test_encodings, test_labels = tokenize_and_align_labels(test_sentences)

In [15]:
id_to_tag

{0: 'O',
 1: 'B_ORG',
 2: 'B_PER',
 3: 'B_LOC',
 4: 'B_MEA',
 5: 'I_DTM',
 6: 'I_ORG',
 7: 'E_ORG',
 8: 'I_PER',
 9: 'B_TTL',
 10: 'E_PER',
 11: 'B_DES',
 12: 'E_LOC',
 13: 'B_DTM',
 14: 'B_NUM',
 15: 'I_MEA',
 16: 'E_DTM',
 17: 'E_MEA',
 18: 'I_LOC',
 19: 'I_DES',
 20: 'E_DES',
 21: 'I_NUM',
 22: 'E_NUM',
 23: 'B_TRM',
 24: 'B_BRN',
 25: 'I_TRM',
 26: 'E_TRM',
 27: 'I_TTL',
 28: 'I_BRN',
 29: 'E_BRN',
 30: 'E_TTL',
 31: 'B_NAME'}

In [16]:
train_encodings

[{'input_ids': [0, 3791, 42611, 5791, 191, 112, 3524, 1218, 20018, 137, 4407, 1290, 56, 435, 1276, 1812, 1781, 3432, 52253, 1030, 156, 190, 4920, 1218, 1537, 202, 56, 255, 4407, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 124, 1161, 10259, 10839, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 96, 22, 2], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]},
 {'input_ids': [0, 624, 1618, 7542, 1105, 1618, 2509, 96, 9893, 8266, 897, 3685, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'input_ids': [0, 30, 40800, 29, 2], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]},
 {'input_ids': [0, 2367, 15, 1218, 1537, 202, 56, 15, 3791, 499

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

# 1. Modified NERDataset with proper padding
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, max_length=128):
        self.encodings = encodings
        self.labels = labels
        self.max_length = max_length

    def __getitem__(self, idx):
        # Get current encoding and labels
        encoding = self.encodings[idx]
        label = self.labels[idx]
        
        # Pad input_ids and attention_mask
        input_ids = encoding['input_ids']
        attention_mask = encoding['attention_mask']
        
        # Ensure consistent length
        if len(input_ids) > self.max_length:
            input_ids = input_ids[:self.max_length]
            attention_mask = attention_mask[:self.max_length]
            label = label[:self.max_length]
        else:
            # Pad sequences
            padding_length = self.max_length - len(input_ids)
            input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
            attention_mask = attention_mask + [0] * padding_length
            label = label + [-100] * padding_length
        
        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'labels': torch.tensor(label)
        }

    def __len__(self):
        return len(self.encodings)

# 2. Create datasets with fixed length
train_dataset = NERDataset(train_encodings, train_labels)
eval_dataset = NERDataset(eval_encodings, eval_labels)
test_dataset = NERDataset(test_encodings, test_labels)

# 3. Create DataLoader with fixed batch size
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16)
test_loader = DataLoader(
    test_dataset, 
    batch_size=16,
    shuffle=False,  # ตั้งค่าเป็น False เพื่อรักษาลำดับ
    drop_last=False  # ไม่ทิ้ง batch สุดท้ายแม้จะไม่เต็ม batch_size
)

In [18]:
model = AutoModelForTokenClassification.from_pretrained("lst-nectec/HoogBERTa-NER-lst20").to("cuda")

In [20]:
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer

def evaluate(model, dataloader, id_to_tag):
    model.eval()
    true_labels = []
    pred_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to("cuda")
            attention_mask = batch['attention_mask'].to("cuda")
            labels = batch['labels'].cpu().numpy()
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=2).cpu().numpy()
            
            # Process each sequence in the batch
            for label, pred in zip(labels, predictions):
                # Filter out padding (-100)
                valid_indices = label != -100
                true_sequence = label[valid_indices]
                pred_sequence = pred[valid_indices]
                
                # Convert to tag names
                true_tags = [id_to_tag[l] for l in true_sequence]
                pred_tags = [id_to_tag[p] for p in pred_sequence]
                
                true_labels.append(true_tags)
                pred_labels.append(pred_tags)
    
    # Convert to binary format using MultiLabelBinarizer
    mlb = MultiLabelBinarizer()
    true_binary = mlb.fit_transform([[tag] for tag in sum(true_labels, [])])
    pred_binary = mlb.transform([[tag] for tag in sum(pred_labels, [])])
    
    # Calculate metrics
    f1 = f1_score(true_binary, pred_binary, average='macro')
    report = classification_report(true_binary, pred_binary, 
                                target_names=mlb.classes_)
    
    return f1, report

In [29]:
from torch.cuda.amp import autocast, GradScaler
from transformers import get_linear_schedule_with_warmup
import torch.nn as nn

# 1. Initialize gradient scaler for mixed precision
scaler = GradScaler()

# 2. Setup optimized parameters
batch_size = 64  # increased batch size
accumulation_steps = 2  # gradient accumulation
num_epochs = 3
max_grad_norm = 1.0  # gradient clipping
warmup_ratio = 0.1

# 3. Setup optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-3)
total_steps = len(train_loader) * num_epochs // accumulation_steps
warmup_steps = int(total_steps * warmup_ratio)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

  scaler = GradScaler()


In [30]:
from torch.utils.data import Subset
import numpy as np

# Calculate size of 10% subset
total_size = len(train_dataset)
subset_size = int(0.1 * total_size)

# Create random indices for 10% of data
indices = np.random.choice(total_size, subset_size, replace=False)

# Create subset dataset
train_subset = Subset(train_dataset, indices)

# Create DataLoader with subset
train_loader = DataLoader(
    train_subset, 
    batch_size=16, 
    shuffle=True
)

print(f"Original dataset size: {total_size}")
print(f"Subset size (10%): {subset_size}")
print(f"Number of batches in subset: {len(train_loader_small)}")

Original dataset size: 406422
Subset size (10%): 40642
Number of batches in subset: 2541


In [28]:
# 4. Optimized training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}')
    optimizer.zero_grad()
    
    for i, batch in enumerate(progress_bar):
        # Move batch to GPU
        input_ids = batch['input_ids'].to("cuda")
        attention_mask = batch['attention_mask'].to("cuda")
        labels = batch['labels'].to("cuda")
        
        # Mixed precision training
        with autocast():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / accumulation_steps
        
        # Scaled backward pass
        scaler.scale(loss).backward()
        
        if (i + 1) % accumulation_steps == 0:
            # Gradient clipping
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            
            # Optimizer and scheduler step
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
        
        total_loss += loss.item() * accumulation_steps
        progress_bar.set_postfix({
            'loss': loss.item() * accumulation_steps,
            'lr': scheduler.get_last_lr()[0]
        })
        
    # Evaluation
    avg_loss = total_loss / len(train_loader)
    print(f"Average loss: {avg_loss:.4f}")

  with autocast():
Epoch 1:  36%|███▌      | 903/2541 [02:12<03:42,  7.36it/s, loss=1.85, lr=0.00196]  

In [26]:
from torch.utils.data import Subset
import numpy as np

# Calculate size of 10% subset
total_size = len(eval_dataset)
subset_size = int(0.1 * total_size)

# Create random indices for 10% of data
indices = np.random.choice(total_size, subset_size, replace=False)

# Create subset dataset
eval_subset = Subset(eval_dataset, indices)

# Create DataLoader with subset
eval_loader = DataLoader(
    eval_subset, 
    batch_size=16, 
    shuffle=True
)

print(f"Original dataset size: {total_size}")
print(f"Subset size (10%): {subset_size}")
print(f"Number of batches in subset: {len(eval_loader)}")

Original dataset size: 39990
Subset size (10%): 3999
Number of batches in subset: 250


In [27]:
val_f1, val_report = evaluate(model, eval_loader, id_to_tag)
print(f"\nEpoch {epoch+1}:")
print(f"Validation F1: {val_f1:.4f}")
print("\nValidation Report:")
print(val_report)


Epoch 1:
Validation F1: 0.0279

Validation Report:
              precision    recall  f1-score   support

       B_BRN       0.00      0.00      0.00         9
       B_DES       0.00      0.00      0.00       281
       B_DTM       0.00      0.00      0.00       231
       B_LOC       0.00      0.00      0.00       461
       B_MEA       0.00      0.00      0.00       403
       B_NUM       0.00      0.00      0.00       155
       B_ORG       0.00      0.00      0.00       571
       B_PER       0.00      0.00      0.00       763
       B_TRM       0.00      0.00      0.00        19
       B_TTL       0.00      0.00      0.00       220
       E_BRN       0.00      0.00      0.00         3
       E_DES       0.00      0.00      0.00        86
       E_DTM       0.00      0.00      0.00       192
       E_LOC       0.00      0.00      0.00       433
       E_MEA       0.00      0.00      0.00       111
       E_NUM       0.00      0.00      0.00        18
       E_ORG       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [47]:
# predictions = []
# model.eval()
# with torch.no_grad():
#     for batch in test_loader:
#         input_ids = batch['input_ids'].to("cuda")
#         attention_mask = batch['attention_mask'].to("cuda")

#         outputs = model(input_ids, attention_mask=attention_mask)
#         preds = torch.argmax(outputs.logits, dim=2).cpu().numpy()

#         for p in preds:
#             pred_seq = [id_to_tag[x] for x in p]
#             predictions.append(pred_seq)

In [59]:
# แสดงข้อมูลใน test_loader
for batch in test_loader:
    print(batch) 
    break  # แสดงเฉพาะ batch แรกเพื่อไม่ให้ข้อมูลเยอะเกินไป

{'input_ids': tensor([[    0,   762, 32392,  ...,     1,     1,     1],
        [    0,   282,     2,  ...,     1,     1,     1],
        [    0,   107, 11623,  ...,     1,     1,     1],
        ...,
        [    0,    12,   225,  ...,     1,     1,     1],
        [    0,  2098,  1440,  ...,     1,     1,     1],
        [    0,    19,    35,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[-100,   35,   35,  ..., -100, -100, -100],
        [-100,   35, -100,  ..., -100, -100, -100],
        [-100,   35,   35,  ..., -100, -100, -100],
        ...,
        [-100,   35,   35,  ..., -100, -100, -100],
        [-100,   35,   35,  ..., -100, -100, -100],
        [-100,   35,   35,  ..., -100, -100, -100]])}


In [84]:
test_df = pd.read_csv('test_data.csv')
test_df

Unnamed: 0,word,pos,tag,class
0,รัฐ,NN,O,B_CLS
1,ถังแตก,VV,O,I_CLS
2,วิก,NN,O,I_CLS
3,_,NN,O,I_CLS
4,7,NN,O,I_CLS
...,...,...,...,...
213086,ครหา,VV,O,I_CLS
213087,เกี่ยวกับ,VV,O,I_CLS
213088,ความ,FX,O,I_CLS
213089,ไม่,NG,O,I_CLS


In [85]:
def predict_tags(model, dataloader, id_to_tag):
    model.eval()
    all_predictions = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to("cuda")
            attention_mask = batch['attention_mask'].to("cuda")
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=2).cpu().numpy()
            
            # Process each sequence
            for pred, mask in zip(predictions, batch['attention_mask'].numpy()):
                # Filter out padding
                valid_indices = mask == 1
                pred_sequence = pred[valid_indices]
                
                # Convert to tag names
                pred_tags = [id_to_tag[p] for p in pred_sequence]
                all_predictions.extend(pred_tags)
    
    return all_predictions

# Get predictions
predictions = predict_tags(model, test_loader, id_to_tag)

# Update test_df with predictions
test_df['tag'] = predictions[:len(test_df)]

# Save updated test_df
test_df.to_csv('test_data_with_predictions.csv', index=False)

print(f"Total predictions: {len(predictions)}")
print("\nSample of updated test data:")
print(test_df[['word', 'tag']].head(10))

KeyError: 35

In [82]:
test_df

Unnamed: 0,word,pos,tag,class,numeric_tag
0,รัฐ,NN,O,B_CLS,0
1,ถังแตก,VV,O,I_CLS,0
2,วิก,NN,O,I_CLS,0
3,_,NN,O,I_CLS,0
4,7,NN,B_ORG,I_CLS,1
...,...,...,...,...,...
213086,ครหา,VV,O,I_CLS,0
213087,เกี่ยวกับ,VV,O,I_CLS,0
213088,ความ,FX,O,I_CLS,0
213089,ไม่,NG,O,I_CLS,0


In [79]:
tag_mapping = {
    'O': 0,
    'B_ORG': 1, 
    'B_PER': 2,
    'B_LOC': 3,
    'B_MEA': 4,
    'I_DTM': 5,
    'I_ORG': 6,
    'E_ORG': 7,
    'I_PER': 8,
    'B_TTL': 9,
    'E_PER': 10,
    'B_DES': 11,
    'E_LOC': 12,
    'B_DTM': 13,
    'B_NUM': 14,
    'I_MEA': 15,
    'E_DTM': 16,
    'E_MEA': 17,
    'I_LOC': 18,
    'I_DES': 19,
    'E_DES': 20,
    'I_NUM': 21,
    'E_NUM': 22,
    'B_TRM': 23,
    'B_BRN': 24,
    'I_TRM': 25,
    'E_TRM': 26,
    'I_TTL': 27,
    'I_BRN': 28,
    'E_BRN': 29,
    'E_TTL': 30,
    'B_NAME': 31
}

In [90]:
test_df['numeric_tag'] = test_df['tag'].map(lambda x: tag_mapping.get(x, 0))
test_df

Unnamed: 0,word,pos,tag,class,numeric_tag
0,รัฐ,NN,O,B_CLS,0
1,ถังแตก,VV,O,I_CLS,0
2,วิก,NN,O,I_CLS,0
3,_,NN,O,I_CLS,0
4,7,NN,O,I_CLS,0
...,...,...,...,...,...
213086,ครหา,VV,O,I_CLS,0
213087,เกี่ยวกับ,VV,O,I_CLS,0
213088,ความ,FX,O,I_CLS,0
213089,ไม่,NG,O,I_CLS,0


In [91]:
out = pd.read_csv('sample_submission.csv')
out

Unnamed: 0,id,ne
0,03795_0,0.0
1,03795_1,0.0
2,03795_2,1.0
3,03795_3,6.0
4,03795_4,6.0
...,...,...
213086,04276_844,
213087,04276_845,
213088,04276_846,
213089,04276_847,


In [92]:
out['ne'] = test_df['numeric_tag']

In [93]:
out

Unnamed: 0,id,ne
0,03795_0,0
1,03795_1,0
2,03795_2,0
3,03795_3,0
4,03795_4,0
...,...,...
213086,04276_844,0
213087,04276_845,0
213088,04276_846,0
213089,04276_847,0


In [None]:
out.to_csv('submission.csv', index=False)

In [None]:
# # 1. Prepare test data without tags
# def prepare_test_data(test_df):
#     # Remove existing tag column
#     test_df = test_df.drop('tag', axis=1)
#     # Add dummy tag column with 'O' for tokenization format
#     test_df['tag'] = 'O'
#     return test_df

# # 2. Prediction function
# def predict_safe(model, test_loader, id_to_tag):
#     model.eval()
#     all_predictions = []
    
#     with torch.no_grad():
#         for batch in tqdm(test_loader, desc="Predicting"):
#             input_ids = batch['input_ids'].to("cuda")
#             attention_mask = batch['attention_mask'].to("cuda")
            
#             outputs = model(input_ids=input_ids, attention_mask=attention_mask)
#             batch_preds = torch.argmax(outputs.logits, dim=2).cpu().numpy()
#             masks = batch['attention_mask'].cpu().numpy()
            
#             # Process each sequence
#             for pred, mask in zip(batch_preds, masks):
#                 valid_pred = []
#                 valid_indices = mask == 1
#                 pred = pred[valid_indices]
#                 valid_pred = [id_to_tag.get(p, 'O') for p in pred]
#                 all_predictions.append(valid_pred)
    
#     return all_predictions

# def format_results(test_df, predictions):
#     results = []
#     sentence_idx = 0
#     sentence_groups = test_df.groupby((test_df['word'] == '_').cumsum())
    
#     for _, group in sentence_groups:
#         if sentence_idx >= len(predictions):
#             print(f"Warning: More sentences than predictions. Stopping at {sentence_idx}")
#             break
            
#         pred_tags = predictions[sentence_idx]
#         word_idx = 0
        
#         for _, row in group.iterrows():
#             if row['word'] != '_':
#                 if word_idx < len(pred_tags):
#                     results.append({
#                         'word': row['word'],
#                         'pos': row['pos'],
#                         'predicted_tag': pred_tags[word_idx],
#                         'class': row['class']
#                     })
#                     word_idx += 1
#                 else:
#                     print(f"Warning: More words than predictions in sentence {sentence_idx}")
#                     results.append({
#                         'word': row['word'],
#                         'pos': row['pos'],
#                         'predicted_tag': 'O',
#                         'class': row['class']
#                     })
        
#         sentence_idx += 1
    
#     return pd.DataFrame(results)

# # 3. Process and save predictions
# test_df = pd.read_csv('test_data.csv')
# test_df_clean = prepare_test_data(test_df)

# # Re-process test data
# test_sentences = group_sentences(test_df_clean)
# test_encodings, test_labels = tokenize_and_align_labels(test_sentences)
# test_dataset = NERDataset(test_encodings, test_labels)
# test_loader = DataLoader(test_dataset, batch_size=16)

# # Get predictions
# predictions = predict_safe(model, test_loader, id_to_tag)
# results_df = format_results(test_df, predictions)
# # 4. Format results
# results = []
# current_pred_idx = 0

# for _, group in test_df.groupby((test_df['word'] == '_').cumsum()):
#     sentence_preds = predictions[current_pred_idx]
    
#     for word, pos, cls in zip(group['word'], group['pos'], group['class']):
#         if word != '_':  # Skip sentence separators
#             results.append({
#                 'word': word,
#                 'pos': pos,
#                 'predicted_tag': sentence_preds[len(results) % len(sentence_preds)],
#                 'class': cls
#             })
#     current_pred_idx += 1

# # 5. Save results
# results_df = pd.DataFrame(results)
# results_df.to_csv('predictions.csv', index=False)

# print("Sample predictions:")
# print(results_df.head(10))

Unnamed: 0,word,pos,tag,class
0,รัฐ,NN,O,B_CLS
1,ถังแตก,VV,O,I_CLS
2,วิก,NN,O,I_CLS
3,_,NN,O,I_CLS
4,7,NN,O,I_CLS
...,...,...,...,...
213086,ครหา,VV,O,I_CLS
213087,เกี่ยวกับ,VV,O,I_CLS
213088,ความ,FX,O,I_CLS
213089,ไม่,NG,O,I_CLS
