In [1]:
import pandas as pd
from transformers import RobertaTokenizerFast, RobertaForTokenClassification, pipeline
from attacut import tokenize
import torch
from sklearn.metrics import f1_score


2024-02-29 19:48:42.078882: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
sample_submission = pd.read_csv('csvs/sample_submission.csv')
df = pd.read_csv('csvs/test.csv')
df['pred'] = sample_submission['pred']
sample_submission.head()


Unnamed: 0,i,pred
0,0,O
1,1,O
2,2,O
3,3,
4,4,


In [7]:
df.head()

Unnamed: 0,i,word,pred
0,0,ใน,O
1,1,สมัย,O
2,2,หนึ่ง,O
3,3,_,
4,4,พระพุทธเจ้า,


In [8]:
df['end'] = 0
df['start'] = 0

In [10]:
all_sentences = []
sent = ''
for i, row in df.iterrows():
    all_sentences.append(str(row['word']).replace("_","[!und:]"))
    df.at[i, 'start'] = len(sent)
    sent += all_sentences[-1]
    df.at[i, 'end'] = len(sent)
    sent += ' _ '
sentence = ' _ '.join(all_sentences)

In [11]:
sentence[:100]

'ใน _ สมัย _ หนึ่ง _ [!und:] _ พระพุทธเจ้า _ ประทับ _ อยู่ _ วัด _ เชตวัน _ [!und:] _ เมือง _ สาวัตถี'

In [12]:
tokenizer = RobertaTokenizerFast.from_pretrained("lst-nectec/HoogBERTa-NER-lst20")
model = RobertaForTokenClassification.from_pretrained("lst-nectec/HoogBERTa-NER-lst20")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [21]:
tag_list = pd.read_csv('csvs/tag_list.csv')
tags = {row['tag']:row['class'] for _, row in tag_list.iterrows()}
tags

{'O': 0,
 'B_ORG': 1,
 'B_PER': 2,
 'B_LOC': 3,
 'B_MEA': 4,
 'I_DTM': 5,
 'I_ORG': 6,
 'E_ORG': 7,
 'I_PER': 8,
 'B_TTL': 9,
 'E_PER': 10,
 'B_DES': 11,
 'E_LOC': 12,
 'B_DTM': 13,
 'B_NUM': 14,
 'I_MEA': 15,
 'E_DTM': 16,
 'E_MEA': 17,
 'I_LOC': 18,
 'I_DES': 19,
 'E_DES': 20,
 'I_NUM': 21,
 'E_NUM': 22,
 'B_TRM': 23,
 'B_BRN': 24,
 'I_TRM': 25,
 'E_TRM': 26,
 'I_TTL': 27,
 'I_BRN': 28,
 'E_BRN': 29,
 'E_TTL': 30,
 'B_NAME': 31}

In [13]:
nlp = pipeline('token-classification', model=model, tokenizer=tokenizer, aggregation_strategy="none")

In [29]:
outputs = nlp(sentence)
print(outputs)

[{'entity': 'B_LOC', 'score': 0.9159466, 'index': 19, 'word': 'วัด</w>', 'start': 60, 'end': 63}, {'entity': 'I_LOC', 'score': 0.93994606, 'index': 20, 'word': '_</w>', 'start': 64, 'end': 65}, {'entity': 'E_LOC', 'score': 0.84722865, 'index': 21, 'word': 'เช', 'start': 66, 'end': 68}, {'entity': 'E_LOC', 'score': 0.8021171, 'index': 22, 'word': 'ตวัน</w>', 'start': 68, 'end': 72}, {'entity': 'B_LOC', 'score': 0.917551, 'index': 30, 'word': 'เมือง</w>', 'start': 85, 'end': 90}, {'entity': 'I_LOC', 'score': 0.8523873, 'index': 31, 'word': '_</w>', 'start': 91, 'end': 92}, {'entity': 'E_LOC', 'score': 0.80220824, 'index': 32, 'word': 'สาวัตถี</w>', 'start': 93, 'end': 100}, {'entity': 'B_NUM', 'score': 0.67212677, 'index': 161, 'word': '3</w>', 'start': 495, 'end': 496}, {'entity': 'B_NUM', 'score': 0.5691418, 'index': 222, 'word': '4</w>', 'start': 629, 'end': 630}, {'entity': 'B_NUM', 'score': 0.53849363, 'index': 446, 'word': '3</w>', 'start': 1214, 'end': 1215}, {'entity': 'B_MEA', '

In [30]:
output_dicts = {out['start']: out for out in outputs}

In [31]:
sent[:100]

'ใน _ สมัย _ หนึ่ง _ [!und:] _ พระพุทธเจ้า _ ประทับ _ อยู่ _ วัด _ เชตวัน _ [!und:] _ เมือง _ สาวัตถี'

In [34]:
df.iloc[8]

i             8
word     เชตวัน
pred         12
end          72
start        66
Name: 8, dtype: object

In [33]:
df['pred'] = 0
dummy_word = ''
for i, row in df.iterrows():
    if row['start'] in output_dicts:
        df.at[i, 'pred'] = tags[output_dicts[row['start']]['entity']]
        output_dicts.pop(row['start'])
    elif str(row['word']).isdigit():
        df.at[i, 'pred'] = tags['B_NUM']


In [35]:
df.head()

Unnamed: 0,i,word,pred,end,start
0,0,ใน,0,2,0
1,1,สมัย,0,9,5
2,2,หนึ่ง,0,17,12
3,3,_,0,27,20
4,4,พระพุทธเจ้า,0,41,30


In [36]:
df[['i', 'pred']].to_csv('csvs/baseline.csv', index=False)