In [None]:
import numpy as np
import pandas as pd
import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Install Simple Transformers

In [None]:
!pip install -q simpletransformers

# Read Data

In [None]:
from datasets import load_dataset

# Load the LST20 corpus from the data directory under /kaggle/input.
lst20 = load_dataset("lst20", data_dir="/kaggle/input/lst20-magi/LST20_Corpus")
lst20

In [None]:
# Materialise the 'train' and 'validation' splits as pandas DataFrames.
train_df = pd.DataFrame(lst20['train'])
validation_df = pd.DataFrame(lst20['validation'])
train_df

# Format Data

In [None]:
# Keep only the columns used downstream: the id, the tokens and the NER tag ids.
df_filter = ['id', 'tokens', 'ner_tags']
train_df = train_df[df_filter]
validation_df = validation_df[df_filter]
train_df

# Convert Data to df

In [None]:
# Label vocabulary: "O" followed by the B_/I_/E_ variants of each entity type.
# The position of a label in this list is the integer tag id used to index it,
# so the order (O, all B_*, all I_*, all E_*) must not change.
_ENTITY_TYPES = ["BRN", "DES", "DTM", "LOC", "MEA", "NUM", "ORG", "PER", "TRM", "TTL"]
NER_TAGS = ["O"] + [
    f"{prefix}_{entity}" for prefix in ("B", "I", "E") for entity in _ENTITY_TYPES
]
print(NER_TAGS)

In [None]:
def convert_data_to_df(df, tags=None):
  """Flatten a sentence-level frame into one row per token.

  Args:
    df: DataFrame with a 'tokens' column (a sequence of words per sentence)
      and a 'ner_tags' column (a sequence of integer tag ids per sentence,
      aligned with 'tokens').
    tags: Optional sequence mapping a tag id to its label string. Defaults to
      the module-level NER_TAGS.

  Returns:
    DataFrame with columns 'sentence_id' (0-based position of the sentence
    in df), 'words' and 'labels', one row per token.

  Raises:
    ValueError: If a sentence has a different number of tokens and tags.
  """
  if tags is None:
    tags = NER_TAGS
  sentence_id = []
  words = []
  labels = []
  # Iterate over the column values directly. Indexing with df['tokens'][i] is a
  # label lookup, so it fails on frames whose index is not exactly 0..n-1
  # (for example after filtering or sampling), and it is slow per token.
  for position, (tokens, tag_ids) in enumerate(zip(df['tokens'], df['ner_tags'])):
    if len(tokens) != len(tag_ids):
      raise ValueError(
          f"sentence {position}: {len(tokens)} tokens but {len(tag_ids)} tags"
      )
    sentence_id.extend([position] * len(tokens))
    words.extend(tokens)
    labels.extend(tags[tag_id] for tag_id in tag_ids)
  return pd.DataFrame(
      {"sentence_id": sentence_id, "words": words, "labels": labels}
  )

In [None]:
# Flatten both splits into the token-level (sentence_id, words, labels) format.
# NOTE(review): train_df is rebound from the sentence-level frame to the token-level
# frame, so re-running this cell on its own fails on the missing 'tokens' column;
# re-run the cells above first.
train_df = convert_data_to_df(train_df)
eval_data = convert_data_to_df(validation_df )
train_df

# Fine-tuning "xlm-roberta-base"

In [None]:
import logging
from simpletransformers.ner import NERModel, NERArgs
import torch

# Log at INFO level overall, but restrict the "transformers" logger to WARNING and above.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Training configuration for fine-tuning.
ner_args = NERArgs()
ner_args.train_batch_size = 128
ner_args.use_multiprocessing = True
# Evaluate on eval_data while training (train_model must therefore be given eval_data).
ner_args.evaluate_during_training = True
ner_args.eval_batch_size = 128
ner_args.num_train_epochs = 4
ner_args.learning_rate = 1e-4

# XLM-RoBERTa checkpoints are loaded with the Simple Transformers model type
# "xlmroberta". The previous "roberta" type selects the plain RoBERTa
# config/tokenizer classes, which do not match this checkpoint.
model = NERModel(
     "xlmroberta",
     "xlm-roberta-base",
     args=ner_args, use_cuda=torch.cuda.is_available(), labels=NER_TAGS
)

In [None]:
# Fine-tune on the training frame; eval_data is passed because evaluate_during_training is enabled.
model.train_model(train_df, eval_data=eval_data)

## Evaluate Model

In [None]:
# Evaluate the fine-tuned model on the validation frame and print the returned result.
result, model_outputs, preds_list = model.eval_model(eval_data)
print(result)

# Prediction and Submission

## Read Model

In [None]:
# Predictor 1: the best checkpoint written during fine-tuning, with 512-token sequences.
ner_args = NERArgs(
    eval_batch_size=128,
    use_multiprocessing=True,
    max_seq_length=512,
)
model1 = NERModel(
    model_type="auto",
    model_name="/kaggle/working/outputs/best_model",
    args=ner_args,
    use_cuda=torch.cuda.is_available(),
    labels=NER_TAGS,
)

In [None]:
# Predictor 2: the published WangchanBERTa-LST20 checkpoint.
# max_seq_length is intentionally left at the NERArgs default for this model.
ner_args = NERArgs(
    eval_batch_size=128,
    use_multiprocessing=True,
)
model2 = NERModel(
    model_type="auto",
    model_name="thanaphatt1/WangchanBERTa-LST20",
    args=ner_args,
    use_cuda=torch.cuda.is_available(),
    labels=NER_TAGS,
)

In [None]:
# Predictor 3: the same WangchanBERTa-LST20 checkpoint as model2, but with 512-token sequences.
ner_args = NERArgs(
    eval_batch_size=128,
    use_multiprocessing=True,
    max_seq_length=512,
)
model3 = NERModel(
    model_type="auto",
    model_name="thanaphatt1/WangchanBERTa-LST20",
    args=ner_args,
    use_cuda=torch.cuda.is_available(),
    labels=NER_TAGS,
)

## Load test data and collect tokens

In [None]:
# Load the competition test set (one token per row in the 'word' column).
# NOTE(review): pandas parses tokens such as "NA" or "null" as NaN by default —
# confirm the 'word' column contains no missing values before predicting.
test_df = pd.read_csv('/kaggle/input/nithan-chadok-name-entity-recognition/test.csv')
test_df

In [None]:
# Collect the 'word' column into a flat Python list of tokens and preview the first 30.
txt = test_df['word'].tolist()
print(txt[:30])

## Split sentences and Predict

In [None]:
def split_into_sentences(tokens, tokens_per_sentence):
    """Cut `tokens` into consecutive chunks of at most `tokens_per_sentence` items.

    The chunks are fixed-size slices, not linguistic sentences; the final
    chunk holds whatever remains and may be shorter. Empty input yields [].
    """
    chunk_starts = range(0, len(tokens), tokens_per_sentence)
    return [tokens[start:start + tokens_per_sentence] for start in chunk_starts]

# Feed model1 257-token chunks; split_on_space=False because each chunk is already a list of tokens.
test_tokens1 = split_into_sentences(tokens=txt, tokens_per_sentence=257)
predictions1 = model1.predict(to_predict=test_tokens1, split_on_space=False)

In [None]:
# Feed model2 short 15-token chunks; split_on_space=False because each chunk is already a list of tokens.
test_tokens2 = split_into_sentences(tokens=txt, tokens_per_sentence=15)
predictions2 = model2.predict(to_predict=test_tokens2, split_on_space=False)

In [None]:
# Feed model3 225-token chunks; split_on_space=False because each chunk is already a list of tokens.
test_tokens3 = split_into_sentences(tokens=txt, tokens_per_sentence=225)
predictions3 = model3.predict(to_predict=test_tokens3, split_on_space=False)

## Map predicted tags to class ids

In [None]:
# Load the lookup table that maps each tag string ('tag') to the class value ('class') used in the submission.
tag_df = pd.read_csv('/kaggle/input/nithan-chadok-name-entity-recognition/tag_list.csv')
tag_df

In [None]:
# Convert model1's predicted tag strings into submission class values.
# The tag -> class lookup is built once instead of filtering tag_df for every
# token; drop_duplicates keeps the first row per tag, like the old `.values[0]`.
tag_to_class = tag_df.drop_duplicates('tag').set_index('tag')['class'].to_dict()

# predictions1[0] is the list of per-chunk predictions; each chunk is a list of
# single-entry {word: tag} dicts, so flattening yields one class per token.
final_test_df1 = [
    tag_to_class[tag]
    for chunk in predictions1[0]
    for word_prediction in chunk
    for tag in word_prediction.values()
]

print(len(final_test_df1))
print(test_df[60:80])
print(final_test_df1[60:80])
print(set(final_test_df1))

# Fail fast on a length mismatch (for instance if a chunk was truncated by the
# model): every later prediction would otherwise be shifted against test_df.
if len(final_test_df1) != len(test_df):
    raise ValueError(
        f"model1 produced {len(final_test_df1)} predictions for {len(test_df)} test tokens"
    )

In [None]:
# Convert model2's predicted tag strings into submission class values.
# The tag -> class lookup is built once instead of filtering tag_df for every
# token; drop_duplicates keeps the first row per tag, like the old `.values[0]`.
tag_to_class = tag_df.drop_duplicates('tag').set_index('tag')['class'].to_dict()

# predictions2[0] is the list of per-chunk predictions; each chunk is a list of
# single-entry {word: tag} dicts, so flattening yields one class per token.
final_test_df2 = [
    tag_to_class[tag]
    for chunk in predictions2[0]
    for word_prediction in chunk
    for tag in word_prediction.values()
]

print(len(final_test_df2))
print(test_df[60:80])
print(final_test_df2[60:80])
print(set(final_test_df2))

# Fail fast on a length mismatch (for instance if a chunk was truncated by the
# model): every later prediction would otherwise be shifted against test_df.
if len(final_test_df2) != len(test_df):
    raise ValueError(
        f"model2 produced {len(final_test_df2)} predictions for {len(test_df)} test tokens"
    )

In [None]:
# Convert model3's predicted tag strings into submission class values.
# The tag -> class lookup is built once instead of filtering tag_df for every
# token; drop_duplicates keeps the first row per tag, like the old `.values[0]`.
tag_to_class = tag_df.drop_duplicates('tag').set_index('tag')['class'].to_dict()

# predictions3[0] is the list of per-chunk predictions; each chunk is a list of
# single-entry {word: tag} dicts, so flattening yields one class per token.
final_test_df3 = [
    tag_to_class[tag]
    for chunk in predictions3[0]
    for word_prediction in chunk
    for tag in word_prediction.values()
]

print(len(final_test_df3))
print(test_df[60:80])
print(final_test_df3[60:80])
print(set(final_test_df3))

# Fail fast on a length mismatch (for instance if a chunk was truncated by the
# model): every later prediction would otherwise be shifted against test_df.
if len(final_test_df3) != len(test_df):
    raise ValueError(
        f"model3 produced {len(final_test_df3)} predictions for {len(test_df)} test tokens"
    )

## Submit and Ensemble

In [None]:
# Load one independent copy of the sample submission per model.
sample_submission_path = '/kaggle/input/nithan-chadok-name-entity-recognition/sample_submission.csv'
submit_df1, submit_df2, submit_df3 = (
    pd.read_csv(sample_submission_path) for _ in range(3)
)
submit_df1

In [None]:
# Attach each model's predictions to its copy of the submission frame.
# Assigning a DataFrame to a column aligns on the index, so a prediction list
# that is too short would be padded with NaN without any error. Check the
# lengths explicitly and assign the raw lists positionally instead.
for model_label, class_predictions, submission in (
    ("model1", final_test_df1, submit_df1),
    ("model2", final_test_df2, submit_df2),
    ("model3", final_test_df3, submit_df3),
):
    if len(class_predictions) != len(submission):
        raise ValueError(
            f"{model_label}: {len(class_predictions)} predictions "
            f"for {len(submission)} submission rows"
        )

# The final_result* frames are kept so any later code referring to them still works.
final_result1 = pd.DataFrame(final_test_df1)
submit_df1['pred'] = final_test_df1
final_result2 = pd.DataFrame(final_test_df2)
submit_df2['pred'] = final_test_df2
final_result3 = pd.DataFrame(final_test_df3)
submit_df3['pred'] = final_test_df3
submit_df1

In [None]:
def _majority_vote(votes):
    """Return the most frequent prediction; on a tie, the first of the sorted modes."""
    return votes.mode()[0]


# Stack the three per-model submissions and take a per-row ('i') majority vote.
stacked_submissions = pd.concat([submit_df1, submit_df2, submit_df3])
df_ensemble = (
    stacked_submissions
    .groupby("i")["pred"]
    .apply(_majority_vote)
    .reset_index()
)
df_ensemble

In [None]:
# Write the ensembled predictions to the submission CSV, omitting the DataFrame index.
df_ensemble.to_csv('Wanghan15_Wanghan225_Robert257.csv', index=False)