In [22]:
import os
import pandas as pd

# ฟังก์ชันสำหรับอ่านไฟล์จากโฟลเดอร์
def read_data_from_folder(folder_path):
    data = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".txt"):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    line = line.strip()
                    if line:  # ข้ามแถวว่าง
                        parts = line.split("\t")
                        if len(parts) == 4:  # ถ้ามี 4 คอลัมน์
                            data.append(parts)
                        elif len(parts) == 3:  # ถ้ามี 3 คอลัมน์ เติมค่า default สำหรับ `tag`
                            parts.insert(2, "O")  # ใส่ค่า "O" ที่ตำแหน่ง index 2
                            data.append(parts)
                        else:
                            print(f"Invalid line in {file_name}: {line}")
    return data

# ฟังก์ชันสำหรับรวบรวมและบันทึกข้อมูล
def process_and_save_data(input_folder, output_file):
    data = read_data_from_folder(input_folder)
    df = pd.DataFrame(data, columns=["word", "pos", "tag", "class"])
    df.to_csv(output_file, index=False, encoding="utf-8")
    print(f"Saved processed data to {output_file}")

# โฟลเดอร์ข้อมูล
train_folder = "train"
test_folder = "test"
eval_folder = "eval"

# เซฟข้อมูลเป็นไฟล์ CSV
process_and_save_data(train_folder, "train_data.csv")
process_and_save_data(eval_folder, "eval_data.csv")
process_and_save_data(test_folder, "test_data.csv")


Saved processed data to train_data.csv
Saved processed data to eval_data.csv
Saved processed data to test_data.csv


In [23]:
import pandas as pd
train = pd.read_csv("train_data.csv")
train

Unnamed: 0,word,pos,tag,class
0,สภาสังคมสงเคราะห์แห่งประเทศ,NN,B_ORG,B_CLS
1,ไทย,NN,E_ORG,I_CLS
2,จี้,VV,O,I_CLS
3,ศาล,NN,O,I_CLS
4,ไฟเขียว,VV,O,I_CLS
...,...,...,...,...
2804216,ระหว่าง,PS,O,I_CLS
2804217,เรียน,VV,O,I_CLS
2804218,กับ,PS,O,I_CLS
2804219,งาน,NN,O,I_CLS


In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Read tag list
tag_df = pd.read_csv('tag_list.csv')
print("Number of unique tags:", len(tag_df))
print("\nFirst few tags:")
print(tag_df.head())

Number of unique tags: 32

First few tags:
     tag  class
0      O      0
1  B_ORG      1
2  B_PER      2
3  B_LOC      3
4  B_MEA      4


In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# อ่านไฟล์ tag_list.csv
df_tags = pd.read_csv('tag_list.csv')

# สร้าง mapping dictionaries ตาม class ที่มีอยู่แล้วใน CSV
tag2idx = dict(zip(df_tags['tag'], df_tags['class']))
idx2tag = dict(zip(df_tags['class'], df_tags['tag']))

# แสดงผลลัพธ์
print("Tag to Index mapping:")
for tag, idx in tag2idx.items():
    print(f"{tag}: {idx}")

print("\nIndex to Tag mapping:")
for idx, tag in idx2tag.items():
    print(f"{idx}: {tag}")

# เก็บ mapping ไว้ใช้ต่อ
np.save('tag2idx.npy', tag2idx) 
np.save('idx2tag.npy', idx2tag)

# ตัวอย่างการใช้งาน
print("\nExample usage:")
print(f"Convert 'B_ORG' to index: {tag2idx['B_ORG']}")
print(f"Convert index 1 back to tag: {idx2tag[1]}")

Tag to Index mapping:
O: 0
B_ORG: 1
B_PER: 2
B_LOC: 3
B_MEA: 4
I_DTM: 5
I_ORG: 6
E_ORG: 7
I_PER: 8
B_TTL: 9
E_PER: 10
B_DES: 11
E_LOC: 12
B_DTM: 13
B_NUM: 14
I_MEA: 15
E_DTM: 16
E_MEA: 17
I_LOC: 18
I_DES: 19
E_DES: 20
I_NUM: 21
E_NUM: 22
B_TRM: 23
B_BRN: 24
I_TRM: 25
E_TRM: 26
I_TTL: 27
I_BRN: 28
E_BRN: 29
E_TTL: 30
B_NAME: 31

Index to Tag mapping:
0: O
1: B_ORG
2: B_PER
3: B_LOC
4: B_MEA
5: I_DTM
6: I_ORG
7: E_ORG
8: I_PER
9: B_TTL
10: E_PER
11: B_DES
12: E_LOC
13: B_DTM
14: B_NUM
15: I_MEA
16: E_DTM
17: E_MEA
18: I_LOC
19: I_DES
20: E_DES
21: I_NUM
22: E_NUM
23: B_TRM
24: B_BRN
25: I_TRM
26: E_TRM
27: I_TTL
28: I_BRN
29: E_BRN
30: E_TTL
31: B_NAME

Example usage:
Convert 'B_ORG' to index: 1
Convert index 1 back to tag: B_ORG


In [28]:
import pandas as pd
import numpy as np

# 1. โหลด mapping ที่สร้างไว้
tag2idx = np.load('tag2idx.npy', allow_pickle=True).item()
idx2tag = np.load('idx2tag.npy', allow_pickle=True).item()

# 2. ฟังก์ชันสำหรับอ่านและแปลงข้อมูล
def process_data_file(file_path):
    # อ่านข้อมูลจาก CSV
    df = pd.read_csv(file_path)
    
    # แปลง tags เป็น numeric โดยใช้ tag2idx mapping
    df['numeric_tag'] = df['tag'].map(tag2idx)
    
    return df

# 3. อ่านและแปลงข้อมูลทั้ง 3 ชุด
train_df = process_data_file('train_data.csv')
test_df = process_data_file('test_data.csv')
eval_df = process_data_file('eval_data.csv')

# 4. แสดงตัวอย่างผลลัพธ์
print("Sample from training data:")
print(train_df[['pos', 'tag', 'numeric_tag']].head())

# 5. ตรวจสอบการแปลงกลับ
print("\nVerify conversion back to tags:")
sample_numeric = train_df['numeric_tag'].iloc[0]
print(f"Numeric tag: {sample_numeric}")
print(f"Original tag: {idx2tag[sample_numeric]}")

# 6. บันทึกข้อมูลที่แปลงแล้ว
train_df.to_csv('processed_train.csv', index=False)
test_df.to_csv('processed_test.csv', index=False)
eval_df.to_csv('processed_eval.csv', index=False)

# 7. แสดงสถิติ
print("\nDataset statistics:")
print(f"Train samples: {len(train_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Eval samples: {len(eval_df)}")

Sample from training data:
  pos    tag  numeric_tag
0  NN  B_ORG          1.0
1  NN  E_ORG          7.0
2  VV      O          0.0
3  NN      O          0.0
4  VV      O          0.0

Verify conversion back to tags:
Numeric tag: 1.0
Original tag: B_ORG

Dataset statistics:
Train samples: 2804221
Test samples: 213091
Eval samples: 248470


In [31]:
import pandas as pd

# โหลดข้อมูลจากไฟล์ CSV
input_csv = "train_data.csv"  # ชื่อไฟล์ CSV ของคุณ
output_txt = "train_ner.txt"  # ชื่อไฟล์ที่จะแปลง

# อ่านข้อมูล CSV
df = pd.read_csv(input_csv)

# เลือกเฉพาะคอลัมน์ที่ต้องการ (word, pos, tag)
columns_to_use = ["word", "pos", "tag"]
df = df[columns_to_use]

# ตรวจสอบว่าข้อมูลเรียงตามลำดับประโยคหรือไม่ (ถ้ามีข้อมูลแบ่งประโยค)
# ถ้าไม่มีข้อมูลแบ่งประโยค ให้ใช้ logic แทรกบรรทัดว่างเองตามต้องการ

# เขียนข้อมูลให้อยู่ในรูปแบบ NER
with open(output_txt, "w", encoding="utf-8") as file:
    for _, row in df.iterrows():
        if row["word"] == "":
            file.write("\n")  # บรรทัดว่างสำหรับแยกประโยค
        else:
            file.write(f"{row['word']} {row['pos']} {row['tag']}\n")

print(f"แปลงข้อมูลเสร็จสิ้นและบันทึกไว้ใน {output_txt}")


Unnamed: 0,word,pos,tag,class,numeric_tag
0,สภาสังคมสงเคราะห์แห่งประเทศ,NN,B_ORG,B_CLS,1.0
1,ไทย,NN,E_ORG,I_CLS,7.0
2,จี้,VV,O,I_CLS,0.0
3,ศาล,NN,O,I_CLS,0.0
4,ไฟเขียว,VV,O,I_CLS,0.0
...,...,...,...,...,...
2804216,ระหว่าง,PS,O,I_CLS,0.0
2804217,เรียน,VV,O,I_CLS,0.0
2804218,กับ,PS,O,I_CLS,0.0
2804219,งาน,NN,O,I_CLS,0.0


In [32]:
# 1. Import required libraries
import pandas as pd
import torch
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

# 2. Load data
train_df = pd.read_csv('processed_train.csv')
test_df = pd.read_csv('processed_test.csv')
eval_df = pd.read_csv('processed_eval.csv')

print("Data loaded. Shapes:")
print(f"Train: {train_df.shape}")
print(f"Test: {test_df.shape}")
print(f"Eval: {eval_df.shape}")

# 3. Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

  from .autonotebook import tqdm as notebook_tqdm


Data loaded. Shapes:
Train: (2804221, 5)
Test: (213091, 5)
Eval: (248470, 5)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [36]:
train_df

Unnamed: 0,word,pos,tag,class,numeric_tag
0,สภาสังคมสงเคราะห์แห่งประเทศ,NN,B_ORG,B_CLS,1.0
1,ไทย,NN,E_ORG,I_CLS,7.0
2,จี้,VV,O,I_CLS,0.0
3,ศาล,NN,O,I_CLS,0.0
4,ไฟเขียว,VV,O,I_CLS,0.0
...,...,...,...,...,...
2804216,ระหว่าง,PS,O,I_CLS,0.0
2804217,เรียน,VV,O,I_CLS,0.0
2804218,กับ,PS,O,I_CLS,0.0
2804219,งาน,NN,O,I_CLS,0.0


In [34]:
# 4. Create Dataset class
class NERDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Group by sentence
        self.sentences = []
        self.labels = []
        current_sent = []
        current_labels = []
        
        for _, row in df.iterrows():
            if row['word'] in ['.', '"']:  # End of sentence markers
                if current_sent:
                    self.sentences.append(current_sent)
                    self.labels.append(current_labels)
                    current_sent = []
                    current_labels = []
            else:
                current_sent.append(row['word'])
                current_labels.append(int(row['numeric_tag']))
        
        # Encode all sentences
        self.encodings = tokenizer(
            self.sentences,
            is_split_into_words=True,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        
        # Align labels
        self.aligned_labels = self._align_labels()
    
    def _align_labels(self):
        aligned_labels = []
        for i, label in enumerate(self.labels):
            word_ids = self.encodings.word_ids(i)
            previous_word_idx = None
            label_ids = []
            
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
                
            aligned_labels.append(label_ids)
        return aligned_labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': torch.tensor(self.aligned_labels[idx])
        }

In [37]:
# 1. Check NaN values
print("Checking NaN values:")
print(train_df.isnull().sum())

# 2. Clean data by dropping NaN or filling them
train_df = train_df.fillna(0)  # หรือใช้ dropna() ถ้าต้องการลบแถวที่มี NaN
test_df = test_df.fillna(0)
eval_df = eval_df.fillna(0)

# 3. Convert float labels to integers
train_df['numeric_tag'] = train_df['numeric_tag'].astype(int)
test_df['numeric_tag'] = test_df['numeric_tag'].astype(int)
eval_df['numeric_tag'] = eval_df['numeric_tag'].astype(int)

print("\nData types after conversion:")
print(train_df.dtypes)

# 4. Verify no NaN values remain
print("\nVerifying NaN values after cleaning:")
print(train_df.isnull().sum())

# 5. Create datasets again
train_dataset = NERDataset(train_df, tokenizer)
test_dataset = NERDataset(test_df, tokenizer)
eval_dataset = NERDataset(eval_df, tokenizer)

print("\nDatasets created successfully!")
print(f"Train dataset size: {len(train_dataset)}")

Checking NaN values:
word              0
pos               0
tag               0
class             0
numeric_tag    1810
dtype: int64

Data types after conversion:
word           object
pos            object
tag            object
class          object
numeric_tag     int32
dtype: object

Verifying NaN values after cleaning:
word           0
pos            0
tag            0
class          0
numeric_tag    0
dtype: int64

Datasets created successfully!
Train dataset size: 19941


In [38]:
# 6. Create DataLoaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

# 7. Verify data
print("\nVerifying data:")
sample_batch = next(iter(train_loader))
print(f"Batch input shape: {sample_batch['input_ids'].shape}")
print(f"Batch label shape: {sample_batch['labels'].shape}")

# 8. Save prepared data
torch.save({
    'train_dataset': train_dataset,
    'test_dataset': test_dataset,
    'eval_dataset': eval_dataset
}, 'prepared_datasets.pt')

print("\nData preparation completed and saved!")


Verifying data:
Batch input shape: torch.Size([16, 128])
Batch label shape: torch.Size([16, 128])

Data preparation completed and saved!


<torch.utils.data.dataloader.DataLoader at 0x1f2001cd670>

In [39]:
import os

folder_path = "test/test/"  

all_data = []
all_sentences = []

i = 0

for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)  



    if file_name.endswith(".txt"):
        s = []  
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                i += 1
                tokens = line.strip().split("\t")  
                if len(tokens) == 3:
                    word, pos, label = tokens
                    all_data.append({"word": word, "pos": pos, "label": label})
                    s.append(word)  
        all_sentences.append(s) 


print(all_sentences[0][0:10]) 


print(i)

<torch.utils.data.dataloader.DataLoader at 0x1f1b3167380>