# Setup

In [None]:
# Install the packages this notebook depends on (-qq keeps pip output quiet)
!pip install kaggle -qq
!pip install datasets -qq # for loading the dataset
!pip install transformers -qq # for training the model
!pip install seqeval -qq # for evaluation
!pip install pythainlp

In [None]:
# NOTE(review): presumably required by the PyThaiNLP clause tokenizer used later in this notebook — confirm.
!pip install python-crfsuite

In [None]:
# Download the competition files with the Kaggle CLI.
# NOTE(review): assumes Kaggle API credentials are already configured in this runtime — confirm.
!kaggle competitions download -c ss3-hackathon-online-natural-language-processing

In [None]:
# Unpack the downloaded archive; -o overwrites existing files without prompting.
!unzip -o /content/ss3-hackathon-online-natural-language-processing.zip

In [None]:
# Extract the gzip-compressed LST20 corpus tarball into the current working directory
!tar -xzf "/content/AIFORTHAI-LST20Corpus.tar.gz"

In [None]:
# Import Python library
import pandas as pd
import glob
import plotly.express as px
from tqdm import tqdm
from pathlib import Path
from collections import Counter, defaultdict
from datasets import load_dataset, load_metric
from pythainlp.tokenize import sent_tokenize

In [None]:
# Load the competition's sample submission and preview its first rows.
# This frame is reused later as the template for the final submission.
samplesubmit = pd.read_csv('/content/ne_sample_submission.csv')
samplesubmit.head(5)

In [None]:
# Load ne_list.txt with pandas' CSV reader and display it.
# NOTE(review): read_csv treats the first line as a header by default — confirm the file has one.
ne_list = pd.read_csv('/content/ne_list.txt')
ne_list

# Prepare the dataset for the named-entity recognition model

In [None]:
# Load the LST20 dataset through the `datasets` loader, reading the corpus
# files from the locally extracted directory passed as data_dir.
datasets = load_dataset("lst20", data_dir="/content/LST20_Corpus") 
datasets

In [None]:
# Turn each dataset split into its own pandas DataFrame
train_df, validation_df, test_df = (
    pd.DataFrame(datasets[split_name])
    for split_name in ('train', 'validation', 'test')
)

In [None]:
# Part-of-speech tag names.
_POS_TAGS = [
    "NN", "VV", "PU", "CC", "PS", "AX", "AV", "FX",
    "NU", "AJ", "CL", "PR", "NG", "PA", "XX", "IJ",
]

# Named-entity tag names: "O" first, then every entity type with the B_
# prefix, then all with I_, then all with E_. The position of a name in this
# list is the integer id used to look the label up.
_NER_TAGS = ["O"] + [
    f"{prefix}_{entity}"
    for prefix in ("B", "I", "E")
    for entity in ("BRN", "DES", "DTM", "LOC", "MEA", "NUM", "ORG", "PER", "TRM", "TTL")
]

# Clause-boundary tag names.
_CLAUSE_TAGS = ["O", "B_CLS", "I_CLS", "E_CLS"]

In [None]:
def convert_to_simple_transformer_format(df, tag_names=None):
  """Flatten a sentence-level frame into one row per token.

  Args:
    df: DataFrame with a 'tokens' column (one sequence of words per row) and
      a 'ner_tags' column (one sequence of integer tag ids per row, indexed
      in parallel with 'tokens').
    tag_names: Optional sequence that maps a tag id to its label string.
      When omitted, the module-level _NER_TAGS list is used.

  Returns:
    DataFrame with the columns 'sentence_id' (0-based position of the
    sentence within df), 'words' and 'labels', one row per token.

  Raises:
    IndexError: if a row has fewer tag ids than tokens, or a tag id is
      outside tag_names.
  """
  if tag_names is None:
    tag_names = _NER_TAGS

  sentence_id = []
  words = []
  labels = []

  # Walk the two columns positionally. Label-based access such as
  # df['tokens'][i] raises KeyError when the frame's index is not 0..n-1
  # (e.g. after filtering), and repeats a Series lookup for every token.
  for position, (tokens, tag_ids) in enumerate(zip(df['tokens'], df['ner_tags'])):
    for token_idx, word in enumerate(tokens):
      sentence_id.append(position)
      words.append(word)
      labels.append(tag_names[tag_ids[token_idx]])

  return pd.DataFrame(
      {"sentence_id": sentence_id, "words": words, "labels": labels}
  )

In [None]:
# Convert the three splits, in order, to the token-level layout
train_data, validation_data, test_data = map(
    convert_to_simple_transformer_format,
    (train_df, validation_df, test_df),
)

# Model

In [None]:
# simpletransformers provides the NERModel / NERArgs classes imported in the next cell.
!pip install simpletransformers
# NOTE(review): python-crfsuite is also installed in an earlier setup cell; this repeat looks redundant.
!pip install python-crfsuite

In [None]:
import torch
import pandas as pd
from simpletransformers.ner import NERModel, NERArgs

# Configure the model
ner_args = NERArgs()
ner_args.train_batch_size = 16
# Fixed: this was spelled `eval_batch_size_batch_size`, which only created an
# unused attribute and left the real evaluation batch size at its default.
ner_args.eval_batch_size = 16
ner_args.use_multiprocessing = True
# Evaluate during training; the train call must therefore be given eval_data.
ner_args.evaluate_during_training = True
# Allow re-running this notebook to overwrite a previous output directory.
ner_args.overwrite_output_dir = True
ner_args.num_train_epochs = 1
ner_args.max_seq_length = 128

# Fine-tune the pretrained Thai checkpoint for token classification over the
# _NER_TAGS label set; use the GPU only when one is available.
model = NERModel(
    "camembert",
    "airesearch/wangchanberta-base-att-spm-uncased",
    args=ner_args,
    use_cuda=torch.cuda.is_available(),
    labels=_NER_TAGS,
)

In [None]:
# Train the model.
# eval_data is supplied because ner_args.evaluate_during_training is enabled.
model.train_model(train_data, eval_data = validation_data)

In [None]:
# Evaluate the trained model on the validation split, keeping the returned
# metrics, raw model outputs and predicted label lists for inspection.
result, model_outputs, preds_list = model.eval_model(validation_data)

# Prepare the test data from the ne_test.txt file

In [None]:
#Check word inside list
def data_inside(data_list):
  """Return the total number of items across all inner sequences of data_list."""
  return sum(len(inner) for inner in data_list)

In [None]:
def blank_space(x):
  """Return '_' when x is the empty string; otherwise return x unchanged."""
  return '_' if x == '' else x

In [None]:
# Read ne_test.txt into a list with one cleaned entry per line.
# The context manager closes the file handle (it was previously left open).
# NOTE(review): the explicit encoding assumes the file is UTF-8, which is what
# the default decoding on a UTF-8 locale was already relying on — confirm.
with open("/content/ne_test.txt", 'r', encoding='utf-8') as f:
  # strip() removes the newline and surrounding whitespace; any remaining
  # non-breaking spaces are normalised to ordinary spaces.
  texts_test_raw = [line.strip().replace(u'\xa0', u' ') for line in f]

In [None]:
# Drop the final entry of the list.
# NOTE(review): presumably this removes a trailing empty line read from the file — confirm.
# Caution: this cell is not idempotent; every re-run removes one more entry.
texts_test_raw = texts_test_raw[:-1]

In [None]:
# Swap every empty entry for the '_' placeholder, updating the list in place
for idx, raw_entry in enumerate(texts_test_raw):
  texts_test_raw[idx] = blank_space(raw_entry)

In [None]:
# Show the entry count and a short preview; printing the whole list floods
# the cell output without adding information.
print(len(texts_test_raw))
print(texts_test_raw[:20])

In [None]:
from pythainlp.tokenize import clause_tokenize

# Group the flat list of test entries into clauses with PyThaiNLP's clause
# tokenizer; the result is passed to the NER model's predict call below.
# NOTE(review): the clause tokenizer is presumably the CRF model trained on LST20 — confirm in the PyThaiNLP docs.
my_token = clause_tokenize(texts_test_raw)


# Make predictions

In [None]:
#Test model
# Configure the model
ner_args = NERArgs()
ner_args.train_batch_size = 16
# Fixed: this was spelled `eval_batch_size_batch_size`, which only created an
# unused attribute and left the real evaluation batch size at its default.
ner_args.eval_batch_size = 16
ner_args.use_multiprocessing = True
ner_args.evaluate_during_training = True
ner_args.overwrite_output_dir = True
ner_args.num_train_epochs = 3
# Longer limit than the 128 used for training, so long clauses are not cut short.
ner_args.max_seq_length = 512

# Reload the checkpoint written to outputs/best_model during training.
best_model = NERModel(
    "camembert",
    "/content/outputs/best_model",
    args=ner_args,
    use_cuda=torch.cuda.is_available(),
    labels=_NER_TAGS,
)

In [None]:
# Make predictions with the model
# The input is already a list of token lists, so predict must not split on spaces
predictions, raw_outputs = best_model.predict(my_token, split_on_space=False)

In [None]:
# Preview the first few entries only; displaying the full nested list
# produces thousands of lines of output.
predictions[:3]

In [None]:
# Number of sequences returned by predict (one per entry of my_token)
len(predictions)

3648

In [None]:
# Total number of token-level predictions, summed over all sequences
data_inside(predictions)

69561

In [None]:
# Flatten the nested predictions into one list of labels.
# Each inner item is a dict, and its values are appended in order.
final_test = [
    label
    for sequence in predictions
    for token_prediction in sequence
    for label in token_prediction.values()
]

len(final_test)

69561

# Submission

In [None]:
# Display the sample submission that serves as the template for the final file
samplesubmit

Unnamed: 0,Id,Predicted
0,1,B_TTL
1,2,B_PER
2,3,I_PER
3,4,
4,5,
...,...,...
69556,69557,
69557,69558,
69558,69559,
69559,69560,


In [None]:
# Fail early, with a clear message, if the predictions do not line up
# one-to-one with the submission rows.
assert len(final_test) == len(samplesubmit), (
    f"got {len(final_test)} predictions for {len(samplesubmit)} submission rows"
)

# assign() returns a new frame, so the loaded sample submission is left
# untouched and this cell can be re-run safely (plain `=` only aliased it).
predicts = samplesubmit.assign(Predicted=final_test)
predicts

Unnamed: 0,Id,Predicted
0,1,B_TTL
1,2,B_PER
2,3,I_PER
3,4,E_PER
4,5,O
...,...,...
69556,69557,O
69557,69558,O
69558,69559,O
69559,69560,O


In [None]:
# Write the submission file without the DataFrame index.
# NOTE(review): every column of `predicts` is written, including the
# 'Unnamed: 0' column shown in the preview above — confirm the competition
# accepts the extra column.
predicts.to_csv('submission_nlp_02.csv',index=False)