In [1]:
import os
import re

import pandas as pd

# Function that reads the files in a folder, in order
def read_data_from_folder(folder_path):
    """Read every ``.txt`` file in ``folder_path`` into a list of 4-column rows.

    Files are visited in natural (numeric-aware) file-name order, so ``2.txt``
    is read before ``10.txt``. Each non-blank line is stripped and split on
    tabs:

    * 4 fields are kept as-is (``word, pos, tag, class``);
    * 3 fields get the default tag ``"O"`` inserted at index 2;
    * any other field count is reported with ``print`` and skipped.

    Args:
        folder_path: Directory containing the tab-separated ``.txt`` files.

    Returns:
        list[list[str]]: One 4-item list per accepted line, in reading order.
    """
    def natural_key(name):
        # re.split with a capturing group alternates text runs and digit runs,
        # so odd positions are always digit runs. Comparing those as ints gives
        # numeric order without ever comparing an int against a str. The raw
        # name is a tie-breaker so "01.txt" / "1.txt" sort deterministically.
        pieces = re.split(r"(\d+)", name)
        return (
            [int(piece) if position % 2 else piece
             for position, piece in enumerate(pieces)],
            name,
        )

    data = []
    txt_names = [name for name in os.listdir(folder_path) if name.endswith(".txt")]
    for file_name in sorted(txt_names, key=natural_key):
        file_path = os.path.join(folder_path, file_name)

        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                line = line.strip()
                if not line:  # skip blank lines
                    continue

                parts = line.split("\t")
                if len(parts) == 4:  # already has all four columns
                    data.append(parts)
                elif len(parts) == 3:  # no `tag` column: fill in the default
                    parts.insert(2, "O")
                    data.append(parts)
                else:
                    print(f"Invalid line in {file_name}: {line}")
    return data

# Function that collects the data and saves it
def process_and_save_data(input_folder, output_file):
    """Parse the token files in ``input_folder`` and write them to one CSV.

    Args:
        input_folder: Folder handed to ``read_data_from_folder``.
        output_file: Destination CSV path; written as UTF-8 without the index.
    """
    column_names = ["word", "pos", "tag", "class"]
    pd.DataFrame(
        read_data_from_folder(input_folder),
        columns=column_names,
    ).to_csv(output_file, index=False, encoding="utf-8")
    print(f"Saved processed data to {output_file}")

# Data folders (adjust to match the directory layout on Kaggle)
train_folder, test_folder, eval_folder = "train", "test", "eval"

# Export each split to its own CSV file: train, then eval, then test
for split_folder, csv_name in (
    (train_folder, "train_data.csv"),
    (eval_folder, "eval_data.csv"),
    (test_folder, "test_data.csv"),
):
    process_and_save_data(split_folder, csv_name)


Saved processed data to train_data.csv
Saved processed data to eval_data.csv
Saved processed data to test_data.csv


In [2]:
# (tag, id) pairs; each id is the position of the tag in this fixed ordering.
tag_list = [
    (name, idx)
    for idx, name in enumerate((
        'O',
        'B_ORG', 'B_PER', 'B_LOC', 'B_MEA',
        'I_DTM', 'I_ORG', 'E_ORG', 'I_PER',
        'B_TTL', 'E_PER', 'B_DES', 'E_LOC',
        'B_DTM', 'B_NUM', 'I_MEA', 'E_DTM',
        'E_MEA', 'I_LOC', 'I_DES', 'E_DES',
        'I_NUM', 'E_NUM', 'B_TRM', 'B_BRN',
        'I_TRM', 'E_TRM', 'I_TTL', 'I_BRN',
        'E_BRN', 'E_TTL', 'B_NAME',
    ))
]
# Forward (tag -> id) and reverse (id -> tag) lookups over the same pairs.
tag_to_id = {name: idx for name, idx in tag_list}
id_to_tag = {idx: name for name, idx in tag_list}

def get_tag_id(tag):
    """Return the id of ``tag`` in ``tag_to_id``, or 0 (the id of 'O') if unknown."""
    if tag in tag_to_id:
        return tag_to_id[tag]
    return 0

In [3]:
# Display the tag -> id mapping
tag_to_id

{'O': 0,
 'B_ORG': 1,
 'B_PER': 2,
 'B_LOC': 3,
 'B_MEA': 4,
 'I_DTM': 5,
 'I_ORG': 6,
 'E_ORG': 7,
 'I_PER': 8,
 'B_TTL': 9,
 'E_PER': 10,
 'B_DES': 11,
 'E_LOC': 12,
 'B_DTM': 13,
 'B_NUM': 14,
 'I_MEA': 15,
 'E_DTM': 16,
 'E_MEA': 17,
 'I_LOC': 18,
 'I_DES': 19,
 'E_DES': 20,
 'I_NUM': 21,
 'E_NUM': 22,
 'B_TRM': 23,
 'B_BRN': 24,
 'I_TRM': 25,
 'E_TRM': 26,
 'I_TTL': 27,
 'I_BRN': 28,
 'E_BRN': 29,
 'E_TTL': 30,
 'B_NAME': 31}

In [4]:
import pandas as pd

# Load the three split CSVs.
# dtype=str keeps numeric-looking tokens (e.g. "7") as text, and
# keep_default_na=False stops pandas from turning literal tokens such as
# "NA", "null" or "nan" into missing values.
train_data = pd.read_csv('train_data.csv', dtype=str, keep_default_na=False)
eval_data = pd.read_csv('eval_data.csv', dtype=str, keep_default_na=False)
test_data = pd.read_csv('test_data.csv', dtype=str, keep_default_na=False)

# Preview the first rows of each split
print(train_data.head())
print(eval_data.head())
print(test_data.head())

                          word pos    tag  class
0  สภาสังคมสงเคราะห์แห่งประเทศ  NN  B_ORG  B_CLS
1                          ไทย  NN  E_ORG  I_CLS
2                          จี้  VV      O  I_CLS
3                          ศาล  NN      O  I_CLS
4                      ไฟเขียว  VV      O  I_CLS
    word pos    tag  class
0   โฆษก  NN      O  B_CLS
1   กอส.  NN  B_ORG  I_CLS
2  ตำหนิ  VV      O  I_CLS
3   แมนฯ  NN  B_ORG  I_CLS
4      _  NN  I_ORG  I_CLS
     word pos tag  class
0     รัฐ  NN   O  B_CLS
1  ถังแตก  VV   O  I_CLS
2     วิก  NN   O  I_CLS
3       _  NN   O  I_CLS
4       7  NN   O  I_CLS


In [5]:
# Tag vocabulary: 'O', then B_/I_/E_ for each entity type, then 'B_NAME'.
# NOTE(review): the assignment below rebinds `tag_to_id`, replacing the mapping
# built from `tag_list` with different ids; `id_to_tag` is not rebuilt here.
tags = (
    ['O']
    + [
        f'{prefix}_{entity}'
        for entity in ('ORG', 'PER', 'LOC', 'MEA', 'DTM',
                       'NUM', 'TTL', 'DES', 'TRM', 'BRN')
        for prefix in ('B', 'I', 'E')
    ]
    + ['B_NAME']
)

# id = position of the tag in `tags`
tag_to_id = dict(zip(tags, range(len(tags))))


In [6]:
# Display the mapping again; the ids now follow the order of `tags`
tag_to_id

{'O': 0,
 'B_ORG': 1,
 'I_ORG': 2,
 'E_ORG': 3,
 'B_PER': 4,
 'I_PER': 5,
 'E_PER': 6,
 'B_LOC': 7,
 'I_LOC': 8,
 'E_LOC': 9,
 'B_MEA': 10,
 'I_MEA': 11,
 'E_MEA': 12,
 'B_DTM': 13,
 'I_DTM': 14,
 'E_DTM': 15,
 'B_NUM': 16,
 'I_NUM': 17,
 'E_NUM': 18,
 'B_TTL': 19,
 'I_TTL': 20,
 'E_TTL': 21,
 'B_DES': 22,
 'I_DES': 23,
 'E_DES': 24,
 'B_TRM': 25,
 'I_TRM': 26,
 'E_TRM': 27,
 'B_BRN': 28,
 'I_BRN': 29,
 'E_BRN': 30,
 'B_NAME': 31}

In [None]:
import pandas as pd

def group_sentences_with_id(data, is_test=False, max_rows=200000, valid_tags=None):
    """Group token rows into sentences using the ``class`` boundary markers.

    Rows are read in order. ``B_CLS`` starts a new sentence (saving the one in
    progress, if any), ``I_CLS`` extends the current sentence and ``E_CLS``
    extends and closes it. Rows with any other ``class`` value are ignored, and
    rows whose ``tag`` is not in ``valid_tags`` are skipped entirely.

    Args:
        data: DataFrame with ``word``, ``tag`` and ``class`` columns.
        is_test: When True the row cap is disabled, so every row is grouped.
        max_rows: For non-test data, stop after this many rows (counted by
            position, including skipped rows). ``None`` disables the cap.
        valid_tags: Container of accepted tags. Defaults to the notebook-level
            ``tag_to_id`` mapping.

    Returns:
        list[dict]: ``{'sentence_id': int, 'words': [(word, tag), ...]}`` per
        sentence, with ids numbered from 0 in order of appearance. A sentence
        cut off by the row cap is still returned with the rows seen so far.
    """
    if valid_tags is None:
        valid_tags = tag_to_id

    sentences = []
    # An empty frame may have no columns at all; nothing to group either way.
    if len(data) == 0:
        return sentences

    # Only train/eval data is capped; the test split is always kept whole.
    row_limit = None if is_test else max_rows

    sentence = []
    sentence_id = 0

    # Iterate by position so the cap does not depend on the frame's index labels.
    rows = zip(data['word'], data['tag'], data['class'])
    for position, (word, tag, cls) in enumerate(rows):
        if row_limit is not None and position >= row_limit:
            break

        if tag not in valid_tags:
            continue

        if cls == 'B_CLS':
            # Start a new sentence, saving the previous one if it was left open
            if sentence:
                sentences.append({'sentence_id': sentence_id, 'words': sentence})
                sentence_id += 1
            sentence = [(word, tag)]
        elif cls == 'I_CLS':
            sentence.append((word, tag))
        elif cls == 'E_CLS':
            sentence.append((word, tag))
            sentences.append({'sentence_id': sentence_id, 'words': sentence})
            sentence = []
            sentence_id += 1

    # Keep a trailing sentence that never reached an E_CLS / next B_CLS
    if sentence:
        sentences.append({'sentence_id': sentence_id, 'words': sentence})

    return sentences

def create_dataframe(sentences):
    """Turn grouped sentences into a frame with one row per sentence.

    Args:
        sentences: Iterable of dicts with ``sentence_id`` and ``words``
            (a list of ``(word, tag)`` pairs).

    Returns:
        pandas.DataFrame with columns ``id``, ``tokens`` and ``ner_tags``,
        where ``tokens`` and ``ner_tags`` are parallel lists.
    """
    records = [
        {
            'id': entry['sentence_id'],
            'tokens': [token for token, _ in entry['words']],
            'ner_tags': [label for _, label in entry['words']],
        }
        for entry in sentences
    ]
    return pd.DataFrame(records)

# Group each split's token rows into sentences (is_test=True for the test split)
train_sentences, eval_sentences, test_sentences = (
    group_sentences_with_id(train_data),
    group_sentences_with_id(eval_data),
    group_sentences_with_id(test_data, is_test=True),
)

# One row per sentence: id, tokens, ner_tags
train_df, eval_df, test_df = map(
    create_dataframe,
    (train_sentences, eval_sentences, test_sentences),
)

# Save to CSV (optional)
for sentence_frame, csv_name in (
    (train_df, 'train_processed.csv'),
    (eval_df, 'eval_processed.csv'),
    (test_df, 'test_processed.csv'),
):
    sentence_frame.to_csv(csv_name, index=False)

# Print a sample to verify
print(train_df.head())

   id                                             tokens  \
0   0  [สภาสังคมสงเคราะห์แห่งประเทศ, ไทย, จี้, ศาล, ไ...   
1   1  [สำนักงาน, องค์การ, พุทธศาสนิกสัมพันธ์, แห่ง, ...   
2   2                           [ก่อน, กิจการ, พังพินาศ]   
3   3              [หลัง, จาก, _, พยายาม, กระเสือกกระสน]   
4   4                                   [หลัง, ล้มละลาย]   

                                            ner_tags  
0          [B_ORG, E_ORG, O, O, O, O, O, O, O, O, O]  
1  [B_ORG, I_ORG, I_ORG, I_ORG, E_ORG, O, O, O, O...  
2                                          [O, O, O]  
3                                    [O, O, O, O, O]  
4                                             [O, O]  


In [8]:
# Show the sentence-level training frame (columns: id, tokens, ner_tags)
train_df

Unnamed: 0,id,tokens,ner_tags
0,0,"[สภาสังคมสงเคราะห์แห่งประเทศ, ไทย, จี้, ศาล, ไ...","[B_ORG, E_ORG, O, O, O, O, O, O, O, O, O]"
1,1,"[สำนักงาน, องค์การ, พุทธศาสนิกสัมพันธ์, แห่ง, ...","[B_ORG, I_ORG, I_ORG, I_ORG, E_ORG, O, O, O, O..."
2,2,"[ก่อน, กิจการ, พังพินาศ]","[O, O, O]"
3,3,"[หลัง, จาก, _, พยายาม, กระเสือกกระสน]","[O, O, O, O, O]"
4,4,"[หลัง, ล้มละลาย]","[O, O]"
...,...,...,...
15831,15831,"[ให้, ทัน, เพื่อน]","[O, O, O]"
15832,15832,"[หลังจาก, ทิ้ง, มา, นาน]","[O, O, O, O]"
15833,15833,"[ส่วน, อนาคต, ต้อง, ฝึกซ้อม]","[O, O, O, O]"
15834,15834,"[ให้, หนัก, มาก, กว่า, นี้]","[O, O, O, O, O]"


In [9]:
def convert_data_to_df(df):
    """Flatten a sentence-level frame into one row per token.

    Args:
        df: DataFrame with list-valued ``tokens`` and ``ner_tags`` columns,
            one row per sentence. Any index is accepted.

    Returns:
        pandas.DataFrame with columns ``sentence_id`` (0-based position of the
        sentence in ``df``), ``words`` and ``labels`` (the tag string stored in
        ``ner_tags``, copied unchanged).

    Raises:
        IndexError: If a sentence has fewer tags than tokens.
    """
    sentence_id = []
    words = []
    labels = []

    # An empty frame may have no columns at all (e.g. built from an empty list).
    if len(df):
        # Walk the columns positionally: df['tokens'][i] is a label lookup and
        # fails (or picks the wrong row) for any index other than 0..n-1.
        for position, (tokens, tags) in enumerate(zip(df['tokens'], df['ner_tags'])):
            for token_idx, token in enumerate(tokens):
                sentence_id.append(position)
                words.append(token)
                labels.append(tags[token_idx])

    return pd.DataFrame(
        {"sentence_id": sentence_id, "words": words, "labels": labels}
    )

In [10]:
# Flatten the sentence-level frames to one row per token.
# NOTE(review): this rebinds `train_data` / `eval_data`, which earlier held the
# raw CSV rows (word / pos / tag / class); cells that expect those columns no
# longer work once this cell has run.
train_data = convert_data_to_df(train_df)
eval_data = convert_data_to_df(eval_df)
train_data.head(5)

Unnamed: 0,sentence_id,words,labels
0,0,สภาสังคมสงเคราะห์แห่งประเทศ,B_ORG
1,0,ไทย,E_ORG
2,0,จี้,O
3,0,ศาล,O
4,0,ไฟเขียว,O


In [11]:
# Tag -> id lookup; each id is the position of the tag in this fixed ordering.
tag_mapping = {
    name: idx
    for idx, name in enumerate((
        'O',
        'B_ORG', 'B_PER', 'B_LOC', 'B_MEA',
        'I_DTM', 'I_ORG', 'E_ORG', 'I_PER',
        'B_TTL', 'E_PER', 'B_DES', 'E_LOC',
        'B_DTM', 'B_NUM', 'I_MEA', 'E_DTM',
        'E_MEA', 'I_LOC', 'I_DES', 'E_DES',
        'I_NUM', 'E_NUM', 'B_TRM', 'B_BRN',
        'I_TRM', 'E_TRM', 'I_TTL', 'I_BRN',
        'E_BRN', 'E_TTL', 'B_NAME',
    ))
}

In [12]:
# Install simpletransformers into the kernel's environment.
# NOTE(review): the version is not pinned; consider pinning it for reproducibility.
%pip install simpletransformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
from simpletransformers.ner import NERModel
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Label names in mapping order; the same expression is passed as `labels` to NERModel
list(tag_to_id.keys())

['O',
 'B_ORG',
 'I_ORG',
 'E_ORG',
 'B_PER',
 'I_PER',
 'E_PER',
 'B_LOC',
 'I_LOC',
 'E_LOC',
 'B_MEA',
 'I_MEA',
 'E_MEA',
 'B_DTM',
 'I_DTM',
 'E_DTM',
 'B_NUM',
 'I_NUM',
 'E_NUM',
 'B_TTL',
 'I_TTL',
 'E_TTL',
 'B_DES',
 'I_DES',
 'E_DES',
 'B_TRM',
 'I_TRM',
 'E_TRM',
 'B_BRN',
 'I_BRN',
 'E_BRN',
 'B_NAME']

In [29]:
# 1. Import required libraries
import os
import pandas as pd
from simpletransformers.ner import NERModel

# 2. Make sure the output and cache folders exist
for directory in ('outputs', 'cache'):
    os.makedirs(directory, exist_ok=True)

In [30]:
# Training configuration passed to NERModel through `args`
model_args = dict(
    output_dir='outputs/ner_model',
    cache_dir='cache',
    num_train_epochs=5,
    learning_rate=1e-4,
    max_seq_length=128,
    train_batch_size=16,
    eval_batch_size=16,
    overwrite_output_dir=True,
    save_eval_checkpoints=False,
    save_model_every_epoch=True,
)

# Build the model; the label list is the key order of `tag_to_id`
model = NERModel(
    "bert",
    "bert-base-multilingual-cased",
    labels=list(tag_to_id),
    args=model_args,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Inspect the token-level frame that is passed to model.train_model
train_data

Unnamed: 0,sentence_id,words,labels
0,0,สภาสังคมสงเคราะห์แห่งประเทศ,B_ORG
1,0,ไทย,E_ORG
2,0,จี้,O
3,0,ศาล,O
4,0,ไฟเขียว,O
...,...,...,...
194533,15835,คน,O
194534,15835,เก่ง,O
194535,15835,ๆ,O
194536,15835,_,O


In [36]:
# Same inspection of the token-level training frame, repeated
train_data

Unnamed: 0,sentence_id,words,labels
0,0,สภาสังคมสงเคราะห์แห่งประเทศ,B_ORG
1,0,ไทย,E_ORG
2,0,จี้,O
3,0,ศาล,O
4,0,ไฟเขียว,O
...,...,...,...
194533,15835,คน,O
194534,15835,เก่ง,O
194535,15835,ๆ,O
194536,15835,_,O


In [37]:
def convert_to_st_format(df):
    """Convert raw token rows into the ``sentence_id / words / labels`` layout.

    A row whose ``class`` is ``'B_CLS'`` starts a new sentence; every other row
    belongs to the sentence currently in progress. Rows that appear before the
    first ``'B_CLS'`` form sentence 0 on their own.

    Args:
        df: DataFrame with ``word``, ``tag`` and ``class`` columns.

    Returns:
        pandas.DataFrame with columns ``sentence_id``, ``words`` and
        ``labels``, one row per input row, including the final sentence.
    """
    rows = []
    sentence_id = 0

    # An empty frame may have no columns at all; it simply yields no rows.
    if len(df):
        for word, tag, cls in zip(df['word'], df['tag'], df['class']):
            # B_CLS opens the next sentence unless nothing has been emitted yet.
            # Emitting rows as they are read means the last sentence is kept
            # without needing a flush after the loop.
            if cls == 'B_CLS' and rows:
                sentence_id += 1
            rows.append((sentence_id, word, tag))

    return pd.DataFrame(rows, columns=['sentence_id', 'words', 'labels'])

In [38]:
# NOTE(review): convert_to_st_format reads the 'word', 'tag' and 'class' columns,
# but `train_data` / `eval_data` were rebound to convert_data_to_df(...) output
# (columns: sentence_id, words, labels), which matches the recorded
# KeyError: 'word'. Presumably this was meant to run on the frames loaded from
# the *_data.csv files — confirm before re-enabling.
train_df = convert_to_st_format(train_data)
eval_df = convert_to_st_format(eval_data)


KeyError: 'word'

In [32]:
# Train the model on train_data, passing eval_data along as the evaluation set.
# NOTE(review): the recorded run ended with
# "FailedPreconditionError: runs is not a directory" — cause not visible here.
model.train_model(train_data, eval_data=eval_data)

100%|██████████| 29/29 [00:21<00:00,  1.37it/s]


FailedPreconditionError: runs is not a directory

In [16]:
# from simpletransformers.ner import NERModel
# import pandas as pd
# model = NERModel(
#     "bert", 
#     "bert-base-multilingual-cased", 
#     labels=tag_mapping,
#     args={"overwrite_output_dir": True, "train_batch_size": 32, "num_train_epochs": 3},
#     use_cuda=True
# )

In [17]:
# model.train_model(train_data, eval_data=eval_data)

In [18]:
def split_into_sentences(tokens, tokens_per_sentence=26):
    """Cut ``tokens`` into consecutive chunks of at most ``tokens_per_sentence`` items.

    Args:
        tokens: Sliceable sequence to split.
        tokens_per_sentence: Maximum length of each chunk.

    Returns:
        list: The chunks in order; only the last one may be shorter.
    """
    chunk_starts = range(0, len(tokens), tokens_per_sentence)
    return [tokens[start:start + tokens_per_sentence] for start in chunk_starts]

In [19]:
def prepare_data_for_model(data, tokens_per_sentence=26):
    """Split every sentence into fixed-size chunks of tokens and tags.

    Args:
        data: DataFrame with list-valued ``tokens`` and ``ner_tags`` columns.
        tokens_per_sentence: Maximum number of tokens per output row.

    Returns:
        pandas.DataFrame with columns ``tokens`` and ``ner_tags``, one row per
        chunk, in the original sentence order.
    """
    chunk_records = []
    for _, row in data.iterrows():
        sentence_tokens = row['tokens']
        sentence_tags = row['ner_tags']
        token_chunks = split_into_sentences(sentence_tokens, tokens_per_sentence)
        tag_chunks = split_into_sentences(sentence_tags, tokens_per_sentence)

        # Tokens and tags are cut at the same offsets, so chunks pair up one-to-one
        chunk_records.extend(
            {'tokens': token_chunk, 'ner_tags': tag_chunk}
            for token_chunk, tag_chunk in zip(token_chunks, tag_chunks)
        )

    return pd.DataFrame(chunk_records)

# Chunk the sentence-level frames; prepare_data_for_model reads the 'tokens'
# and 'ner_tags' columns.
# NOTE(review): the recorded run raised KeyError: 'tokens', so `train_df` did
# not hold those columns at that point — presumably it had been rebound by
# another cell; confirm the execution order.
train_df_prepared = prepare_data_for_model(train_df)
eval_df_prepared = prepare_data_for_model(eval_df)

KeyError: 'tokens'

In [83]:
# Show the chunked evaluation frame (columns: tokens, ner_tags)
eval_df_prepared

Unnamed: 0,tokens,ner_tags
0,"[โฆษก, กอส., ตำหนิ, แมนฯ, _, ซิตี้]","[O, B_ORG, O, B_ORG, I_ORG, E_ORG]"
1,"[โฆษก, หนังสือพิมพ์, _, ASTV, ตำหนิ, สพก.]","[O, B_ORG, I_ORG, E_ORG, O, B_ORG]"
2,"[ที่, บิดเบือน, ข้อเท็จจริง, ต่อ, คณะ, ฑูต, ต่...","[O, O, O, O, O, O, O, O, O, O, O, O]"
3,"[แต่, ยัง, ยินดี]","[O, O, O]"
4,"[ที่, จะ, สนับสนุน, การ, ดำเนินการ, เกี่ยวกับ,...","[O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
16472,"[เพราะ, ยัง, ไม่, เป็น]","[O, O, O, O]"
16473,"[ที่, แน่ชัด]","[O, O]"
16474,"[ว่า, ตลาด, หุ้น, จะ, บวก, ขึ้น, ได้, ต่อเนื่อ...","[O, O, O, O, O, O, O, O, O, O, O, B_TTL, B_PER..."
16475,"[นาง, ศาสตรา, _, เทียนทอง, _, กรรมการ, และ, ผู...","[B_TTL, B_PER, I_PER, E_PER, O, O, O, O, O, O,..."


In [None]:
# Load the sample submission to reuse its layout (columns in the output: id, ne)
out = pd.read_csv('sample_submission.csv')
out

Unnamed: 0,id,ne
0,03795_0,0
1,03795_1,0
2,03795_2,0
3,03795_3,0
4,03795_4,6
...,...,...
213086,04276_844,0
213087,04276_845,0
213088,04276_846,0
213089,04276_847,0


In [None]:
# NOTE(review): the recorded run raised KeyError: 'numeric_tag' — create_dataframe
# only produces 'id', 'tokens' and 'ner_tags', and nothing visible adds a
# 'numeric_tag' column to test_df. test_df also appears to hold one row per
# sentence while `out` has one row per token, so the rows would presumably not
# line up even with that column — confirm before relying on this assignment.
out['ne'] = test_df['numeric_tag']
out

KeyError: 'numeric_tag'

In [23]:
# Write the submission frame to disk.
# NOTE(review): this overwrites 'sample_submission.csv', the same file the frame
# was read from — consider a separate output name so the template is preserved.
out.to_csv('sample_submission.csv', index=False)