In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit->simpletransformers)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading simpletransformers-0.70.1-py3-none-any.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.3/316

In [3]:
def load_tab_separated_dataset(folder_path):
    all_data = []
    
    for i, file_name in enumerate(os.listdir(folder_path)):
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)
            
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    parts = line.strip().split('\t')
                    all_data.append([str(i)] + parts)

    columns = ['sentence_id', 'words', 'pos', 'labels', 'cls']
    df = pd.DataFrame(all_data, columns=columns)
    
    return df

train_path = "/kaggle/input/super-ai-ss-5-named-entity-recognition/train/train"
eval_path = "/kaggle/input/super-ai-ss-5-named-entity-recognition/eval/eval"  

In [4]:
train = load_tab_separated_dataset(train_path)
eval_ = load_tab_separated_dataset(eval_path)

In [5]:
train

Unnamed: 0,sentence_id,words,pos,labels,cls
0,0,สั่ง,VV,O,B_CLS
1,0,ขัง,VV,O,I_CLS
2,0,หนุ่ม,NN,O,I_CLS
3,0,อังกฤษ,NN,O,I_CLS
4,0,โกง,VV,O,I_CLS
...,...,...,...,...,...
2875114,3793,_,PU,O,I_CLS
2875115,3793,WWW.KOMCHADLUEK.NET,NN,O,E_CLS
2875116,3793,,,,
2875117,3793,,,,


In [6]:
labels_map = {
    'O': 0,'B_ORG': 1,'B_PER': 2,'B_LOC': 3,'B_MEA': 4,'I_DTM': 5,'I_ORG': 6,'E_ORG': 7,'I_PER': 8,'B_TTL': 9,'E_PER': 10,
    'B_DES': 11,'E_LOC': 12,'B_DTM': 13,'B_NUM': 14,'I_MEA': 15,'E_DTM': 16,'E_MEA': 17,'I_LOC': 18,'I_DES': 19,'E_DES': 20,
    'I_NUM': 21,'E_NUM': 22,'B_TRM': 23,'B_BRN': 24,'I_TRM': 25,'E_TRM': 26,'I_TTL': 27,'I_BRN': 28,'E_BRN': 29,'E_TTL': 30,'B_NAME': 31
}
labels = list(labels_map.keys())

In [7]:
train['labels'] = train['labels'].apply(lambda x: x if x in labels else 'O')
train['words'].dropna(inplace = True)
train

Unnamed: 0,sentence_id,words,pos,labels,cls
0,0,สั่ง,VV,O,B_CLS
1,0,ขัง,VV,O,I_CLS
2,0,หนุ่ม,NN,O,I_CLS
3,0,อังกฤษ,NN,O,I_CLS
4,0,โกง,VV,O,I_CLS
...,...,...,...,...,...
2875114,3793,_,PU,O,I_CLS
2875115,3793,WWW.KOMCHADLUEK.NET,NN,O,E_CLS
2875116,3793,,,O,
2875117,3793,,,O,


In [8]:
eval_['labels'] = eval_['labels'].apply(lambda x: x if x in labels else 'O')
eval_['words'].dropna(inplace = True)
eval_

Unnamed: 0,sentence_id,words,pos,labels,cls
0,0,บึง,NN,B_LOC,B_CLS
1,0,พลาญ,NN,I_LOC,I_CLS
2,0,ชัย,NN,E_LOC,I_CLS
3,0,โต้,VV,O,I_CLS
4,0,ไม่,NG,O,I_CLS
...,...,...,...,...,...
255033,473,เพื่อ,NN,I_ORG,I_CLS
255034,473,ประชาธิปไตย,NN,E_ORG,E_CLS
255035,473,,,O,
255036,473,,,O,


In [9]:
from simpletransformers.ner import NERModel, NERArgs

In [10]:
ner_args = NERArgs()
ner_args.train_batch_size = 128
ner_args.eval_batch_size = 128
ner_args.use_multiprocessing = True
ner_args.evaluate_during_training = True
ner_args.num_train_epochs = 15
ner_args.learning_rate = 1e-4
ner_args.overwrite_output_dir = True
ner_args.n_gpu = 2

In [11]:
#ner_args.max_seq_length = 500
model = NERModel(
     "auto", "thanaphatt1/WangchanBERTa-LST20", 
     # "camembert", "airesearch/wangchanberta-base-att-spm-uncased", 
    args = ner_args, use_cuda = True, 
    labels = labels,
    ignore_mismatched_sizes = True
)

config.json:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at thanaphatt1/WangchanBERTa-LST20 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([31]) in the checkpoint and torch.Size([32]) in the model instantiated
- classifier.weight: found shape torch.Size([31, 768]) in the checkpoint and torch.Size([32, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

In [12]:
model.train_model(train.drop(columns=['pos', 'cls']), eval_data=eval_.drop(columns=['pos', 'cls']))

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 3 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 4 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 5 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 6 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 7 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 8 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 9 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 10 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 11 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 12 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 13 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 14 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 15 of 15:   0%|          | 0/30 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


(450,
 defaultdict(list,
             {'global_step': [30,
               60,
               90,
               120,
               150,
               180,
               210,
               240,
               270,
               300,
               330,
               360,
               390,
               420,
               450],
              'train_loss': [0.22100892663002014,
               0.1278696060180664,
               0.10790757834911346,
               0.08101481199264526,
               0.053763747215270996,
               0.05676133930683136,
               0.04728815704584122,
               0.03401298075914383,
               0.03931182995438576,
               0.03444664925336838,
               0.02379881776869297,
               0.025143597275018692,
               0.02678578346967697,
               0.02201424166560173,
               0.022041240707039833],
              'eval_loss': [0.23746337741613388,
               0.15417292341589928,
               0.144

In [13]:
result, model_outputs, preds_list = model.eval_model(eval_)
result

  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/4 [00:00<?, ?it/s]

  with amp.autocast():


{'eval_loss': 0.16562408581376076,
 'precision': 0.8471421080230729,
 'recall': 0.8685483870967742,
 'f1_score': 0.8577117069285903}

**Submission**

In [16]:
def load_tab_separated_dataset(folder_path,test_set = False):
    all_data = []
    sentence_id = 0
    for i, file_name in enumerate(sorted(os.listdir(folder_path))):
        if file_name.endswith('.txt'):
            file_path = os.path.join(folder_path, file_name)
            
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    if not line.strip():
                        sentence_id += 1
                        continue
                    
                    parts = line.strip().split('\t')
                    all_data.append([str(sentence_id)] + parts)
    if test_set:
        columns = ['sentence_id', 'words', 'pos' ,'cls']
    else:
        columns = ['sentence_id', 'words', 'pos', 'labels', 'cls']
    df = pd.DataFrame(all_data, columns=columns)
    return df.drop(columns=['pos', 'cls'])

test_path = "/kaggle/input/super-ai-ss-5-named-entity-recognition/test/test"
test_df = load_tab_separated_dataset(test_path,True)
test_df.dropna(inplace = True)
test_df

Unnamed: 0,sentence_id,words
0,0,รัฐ
1,0,ถังแตก
2,0,วิก
3,0,_
4,0,7
...,...,...
213086,5241,ครหา
213087,5241,เกี่ยวกับ
213088,5241,ความ
213089,5241,ไม่


In [17]:
# txt = []
# for i in test_df['words']:
#     txt.append(i)
# print(txt[:30],len(txt))

In [18]:
# def split_into_sentences(tokens, tokens_per_sentence):
#     sentences = []
#     for i in range(0, len(tokens), tokens_per_sentence):
#         sentence = tokens[i:i+tokens_per_sentence]
#         sentences.append(sentence)
#     return sentences

In [19]:
def group_words_by_sentence(df):
    if not {'sentence_id', 'words'}.issubset(df.columns):
        raise ValueError("The DataFrame must contain 'sentence_id' and 'words' columns.")

    grouped_sentences = df.groupby('sentence_id')['words'].apply(list).tolist()

    return grouped_sentences

group_sentences = group_words_by_sentence(test_df)
# group_sentences = split_into_sentences(txt, 50)
predictions,_ = model.predict(group_sentences, False)
len(_)

  0%|          | 0/5 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/41 [00:00<?, ?it/s]

  with amp.autocast():


5242

In [20]:
len(group_sentences),len(predictions),len(_)

(5242, 5242, 5242)

In [21]:
sum(len(sublist) for sublist in predictions),sum(len(sublist) for sublist in group_sentences)

(202204, 213091)

In [22]:
predictions[0]

[{'รัฐ': 'O'},
 {'ถังแตก': 'O'},
 {'วิก': 'B_ORG'},
 {'_': 'I_ORG'},
 {'7': 'I_ORG'},
 {'_': 'I_ORG'},
 {'สี': 'E_ORG'},
 {'ชวด': 'O'},
 {'โบนัส': 'O'},
 {'ธนาคาร': 'B_ORG'},
 {'นครหลวงไทย': 'I_ORG'},
 {'_': 'I_ORG'},
 {'สาขา': 'I_ORG'},
 {'สยามสแควร์': 'E_ORG'},
 {'ดัน': 'O'},
 {'เข้า': 'O'},
 {'สพช.': 'B_ORG'},
 {'แต่': 'O'},
 {'ถูก': 'O'},
 {'เบรก': 'O'}]

In [23]:
submission = pd.read_csv("/kaggle/input/super-ai-ss-5-named-entity-recognition/sample_submission.csv")
answers = []
for sentence in predictions:
    for token_dict in sentence:
        for _, tag in token_dict.items():
            answers.append(tag)
submission

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,ne
0,03795_0,0.0
1,03795_1,0.0
2,03795_2,1.0
3,03795_3,6.0
4,03795_4,6.0
...,...,...
213086,04276_844,
213087,04276_845,
213088,04276_846,
213089,04276_847,


In [24]:
answers[:10]

['O', 'O', 'B_ORG', 'I_ORG', 'I_ORG', 'I_ORG', 'E_ORG', 'O', 'O', 'B_ORG']

In [25]:
len(answers)

202204

In [26]:
submission['ne'] = answers
submission['ne'] = submission['ne'].apply(lambda x: labels_map.get(x, -1))
submission.to_csv("submission.csv", index=False)
submission

ValueError: Length of values (202204) does not match length of index (213091)