## Code for: https://www.kaggle.com/competitions/super-ai-ss-5-named-entity-recognition

# Preparation

In [1]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit->simpletransformers)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading simpletransformers-0.70.1-py3-none-any.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316

In [2]:
from simpletransformers.ner import NERModel, NERArgs
import numpy as np
import pandas as pd
import os

# Load Data

In [3]:
def load_tab_separated_dataset(folder_path):
    """Load every ``.txt`` file in ``folder_path`` into one token-level DataFrame.

    Each line is stripped and split on tabs into up to four fields
    (``words``, ``pos``, ``labels``, ``cls``). Lines with fewer fields are kept
    and padded with ``None``; in particular a blank line becomes a row whose
    ``words`` value is the empty string.

    Args:
        folder_path: Directory containing the tab-separated ``.txt`` files.

    Returns:
        pandas.DataFrame with columns ``sentence_id``, ``words``, ``pos``,
        ``labels`` and ``cls``. ``sentence_id`` is the (stringified) position of
        the source file in the sorted directory listing, counting non-``.txt``
        entries too, so all rows from one file share one id.

    Raises:
        ValueError: If a line has more than four tab-separated fields.
    """
    # NOTE(review): one id per *file* means a whole file is treated as a single
    # sequence downstream -- confirm this is intended given max_seq_length.
    columns = ['sentence_id', 'words', 'pos', 'labels', 'cls']
    data = []
    # os.listdir() returns entries in arbitrary order; sort so the ids are
    # reproducible across runs (the test loader sorts as well).
    for i, file_name in enumerate(sorted(os.listdir(folder_path))):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_no, line in enumerate(file, start=1):
                row = [str(i)] + line.strip().split('\t')
                if len(row) > len(columns):
                    raise ValueError(
                        f"{file_path}:{line_no}: expected at most "
                        f"{len(columns) - 1} tab-separated fields, got {len(row) - 1}"
                    )
                # Pad explicitly so the frame always has every column, even when
                # no line in the folder carries all four fields.
                row.extend([None] * (len(columns) - len(row)))
                data.append(row)
    return pd.DataFrame(data, columns=columns)

In [4]:
# Kaggle input locations of the training and evaluation splits.
train_path = "/kaggle/input/super-ai-ss-5-named-entity-recognition/train/train"
eval_path = "/kaggle/input/super-ai-ss-5-named-entity-recognition/eval/eval"

# Both splits go through the same loader, so they share one column layout.
train, eval_ = (
    load_tab_separated_dataset(split_path)
    for split_path in (train_path, eval_path)
)

In [5]:
# Tag names in id order: a tag's position in this list is its integer id.
labels = [
    'O', 'B_ORG', 'B_PER', 'B_LOC', 'B_MEA', 'I_DTM', 'I_ORG', 'E_ORG',
    'I_PER', 'B_TTL', 'E_PER', 'B_DES', 'E_LOC', 'B_DTM', 'B_NUM', 'I_MEA',
    'E_DTM', 'E_MEA', 'I_LOC', 'I_DES', 'E_DES', 'I_NUM', 'E_NUM', 'B_TRM',
    'B_BRN', 'I_TRM', 'E_TRM', 'I_TTL', 'I_BRN', 'E_BRN', 'E_TTL', 'B_NAME',
]
# Tag name -> integer id lookup derived from the ordered list above.
labels_map = {tag: idx for idx, tag in enumerate(labels)}

In [6]:
def _normalize_ner_frame(df):
    """Return a copy of ``df`` with unknown tags mapped to 'O' and word-less rows dropped.

    Args:
        df: Token-level frame with at least ``words`` and ``labels`` columns.

    Returns:
        A new DataFrame; the input frame is left untouched, so re-running the
        cell is safe.
    """
    known_tags = set(labels)  # set lookup instead of scanning the list per row
    cleaned = df.copy()
    # Any tag outside the known set (including missing values) becomes 'O'.
    cleaned['labels'] = cleaned['labels'].where(cleaned['labels'].isin(known_tags), 'O')
    # Calling dropna(inplace=True) on df['words'] only touches a temporary
    # Series and never changes the frame; drop on the frame itself instead.
    # NOTE: blank input lines are loaded with words == '' (not NaN), so those
    # rows are not removed by this step.
    return cleaned.dropna(subset=['words'])


train = _normalize_ner_frame(train)
eval_ = _normalize_ner_frame(eval_)

In [7]:
# Preview the training frame; the rendered output shows only its first and last rows.
train

Unnamed: 0,sentence_id,words,pos,labels,cls
0,0,สั่ง,VV,O,B_CLS
1,0,ขัง,VV,O,I_CLS
2,0,หนุ่ม,NN,O,I_CLS
3,0,อังกฤษ,NN,O,I_CLS
4,0,โกง,VV,O,I_CLS
...,...,...,...,...,...
2875114,3793,_,PU,O,I_CLS
2875115,3793,WWW.KOMCHADLUEK.NET,NN,O,E_CLS
2875116,3793,,,O,
2875117,3793,,,O,


In [8]:

# Preview the evaluation frame; the rendered output shows only its first and last rows.
eval_

Unnamed: 0,sentence_id,words,pos,labels,cls
0,0,บึง,NN,B_LOC,B_CLS
1,0,พลาญ,NN,I_LOC,I_CLS
2,0,ชัย,NN,E_LOC,I_CLS
3,0,โต้,VV,O,I_CLS
4,0,ไม่,NG,O,I_CLS
...,...,...,...,...,...
255033,473,เพื่อ,NN,I_ORG,I_CLS
255034,473,ประชาธิปไตย,NN,E_ORG,E_CLS
255035,473,,,O,
255036,473,,,O,


# Modeling

In [9]:
# Start from the library defaults and override only what this run needs.
ner_args = NERArgs()

# Optimisation schedule.
ner_args.num_train_epochs = 15
ner_args.learning_rate = 1e-4

# Batching and sequence length.
ner_args.train_batch_size = 64
ner_args.eval_batch_size = 512
ner_args.max_seq_length = 500

# Evaluation, preprocessing and output handling.
ner_args.evaluate_during_training = True
ner_args.use_multiprocessing = True
ner_args.overwrite_output_dir = True

In [10]:
# The checkpoint's classifier has 31 outputs while `labels` defines 32 tags
# (see the shape-mismatch message in this cell's output), so the classifier
# weights are newly initialised and must be trained before use.
model = NERModel(
    "auto",
    "thanaphatt1/WangchanBERTa-LST20",
    labels=labels,
    args=ner_args,
    use_cuda=True,
    ignore_mismatched_sizes=True,
)

config.json:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at thanaphatt1/WangchanBERTa-LST20 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([31]) in the checkpoint and torch.Size([32]) in the model instantiated
- classifier.weight: found shape torch.Size([31, 768]) in the checkpoint and torch.Size([32, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

In [11]:
# Fine-tune on sentence ids, words and labels only; the POS and clause columns
# are removed from both frames before they are handed to the model.
model.train_model(
    train.drop(['pos', 'cls'], axis='columns'),
    eval_data=eval_.drop(['pos', 'cls'], axis='columns'),
)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 3 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 4 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 5 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 6 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 7 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 8 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 9 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 10 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 11 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 12 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 13 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 14 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 15 of 15:   0%|          | 0/60 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


(900,
 defaultdict(list,
             {'global_step': [60,
               120,
               180,
               240,
               300,
               360,
               420,
               480,
               540,
               600,
               660,
               720,
               780,
               840,
               900],
              'train_loss': [0.25341618061065674,
               0.25406312942504883,
               0.1823754459619522,
               0.18113280832767487,
               0.11146297305822372,
               0.12987066805362701,
               0.12396552413702011,
               0.10839895904064178,
               0.11914973706007004,
               0.08421271294355392,
               0.06612429022789001,
               0.06171085685491562,
               0.06216401606798172,
               0.049355242401361465,
               0.04384682700037956],
              'eval_loss': [0.24252592027187347,
               0.2000247836112976,
               0.1865

In [12]:
# Evaluate the trained model on the eval split and display the metrics dict
# (loss, precision, recall, F1) as the cell output.
result, model_outputs, preds_list = model.eval_model(eval_)
result

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


{'eval_loss': 0.11978733539581299,
 'precision': 0.8139078498293515,
 'recall': 0.8483635716826752,
 'f1_score': 0.8307786099982581}

# Submission

In [13]:
def load_tab_separated_test(folder_path):
    """Read the unlabeled ``.txt`` files under ``folder_path`` into one DataFrame.

    Files are visited in sorted name order. Every line is stripped and split on
    tabs into ``words``, ``pos`` and ``cls``; ``sentence_id`` is the stringified
    position of the file in the sorted directory listing (non-``.txt`` entries
    are counted in that position but not read).

    Args:
        folder_path: Directory holding the tab-separated test files.

    Returns:
        pandas.DataFrame with columns ``sentence_id``, ``words``, ``pos``, ``cls``.
    """
    indexed_txt_files = [
        (position, name)
        for position, name in enumerate(sorted(os.listdir(folder_path)))
        if name.endswith('.txt')
    ]

    rows = []
    for position, name in indexed_txt_files:
        with open(os.path.join(folder_path, name), 'r', encoding='utf-8') as handle:
            # One row per line: the file's id followed by the tab-separated fields.
            rows.extend([str(position)] + raw.strip().split('\t') for raw in handle)

    return pd.DataFrame(rows, columns=['sentence_id', 'words', 'pos', 'cls'])

In [14]:
test_path = "/kaggle/input/super-ai-ss-5-named-entity-recognition/test/test"
# Load the test split and discard every row that has a missing field.
test_df = load_tab_separated_test(test_path).dropna()

In [15]:
# Flatten the word column into a plain Python list, then show a sample and the total count.
txt = list(test_df['words'])
print(txt[:30], len(txt))

['รัฐ', 'ถังแตก', 'วิก', '_', '7', '_', 'สี', 'ชวด', 'โบนัส', 'ธนาคาร', 'นครหลวงไทย', '_', 'สาขา', 'สยามสแควร์', 'ดัน', 'เข้า', 'สพช.', 'แต่', 'ถูก', 'เบรก', 'นาย', 'แอนศิริ', '_', 'วลัยกนก', '_', 'ผู้', 'รัฐมนตรี', '_', 'กล่าว', 'ว่า'] 213091


In [16]:
def split_into_sentences(tokens, tokens_per_sentence):
    """Cut ``tokens`` into consecutive chunks of at most ``tokens_per_sentence`` items.

    The chunks are plain fixed-size slices taken in order; only the last one
    may be shorter. An empty input yields an empty list.

    Args:
        tokens: Sliceable sequence of tokens.
        tokens_per_sentence: Chunk size, used as the ``range`` step.

    Returns:
        List of slices of ``tokens``.
    """
    chunk_starts = range(0, len(tokens), tokens_per_sentence)
    return [tokens[start:start + tokens_per_sentence] for start in chunk_starts]

In [17]:
# Feed the flat token stream to the model in fixed-size chunks of 200 tokens.
test_tokens = split_into_sentences(txt, 200)
# split_on_space=False: every input is already a list of tokens, not a raw string.
# The raw outputs get a real name instead of `_`, which conventionally marks a
# discarded value and is also IPython's "last output" variable.
predictions, raw_outputs = model.predict(test_tokens, split_on_space=False)
len(raw_outputs)

  0%|          | 0/3 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/3 [00:00<?, ?it/s]

  with amp.autocast():


1066

In [18]:
# Sanity check: the number of predicted entries should equal the number of input tokens.
sum(map(len, test_tokens)), sum(map(len, predictions))

(213091, 213091)

In [19]:
# Every entry of a predicted sentence is a dict whose values are tags;
# collect those tags in order into one flat list.
answers = [
    tag
    for sentence in predictions
    for token_dict in sentence
    for tag in token_dict.values()
]

In [20]:
# Load the competition's sample submission; its `ne` column is overwritten with the predictions below.
submission = pd.read_csv("/kaggle/input/super-ai-ss-5-named-entity-recognition/sample_submission.csv")

In [21]:
# Translate each predicted tag name to its integer id (-1 for a tag missing from
# labels_map), write the submission file, and preview the result.
submission['ne'] = [labels_map.get(tag, -1) for tag in answers]
submission.to_csv("submission.csv", index=False)
submission

Unnamed: 0,id,ne
0,03795_0,0
1,03795_1,0
2,03795_2,1
3,03795_3,6
4,03795_4,6
...,...,...
213086,04276_844,0
213087,04276_845,0
213088,04276_846,0
213089,04276_847,0
