# Set up the workspace
This section is only needed on Google Colab. Skip it if you are not running on Google Colab.

In [None]:
import os
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so files stored there can be reached
# from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Work from the project folder on the mounted drive so the relative paths used
# later in this notebook (config.ini, data/, model/, output/) resolve there.
os.chdir("/content/drive/MyDrive/AICUP2020/")

# Prepare the Environment

In [None]:
# !pip install -r requirements.txt
from datetime import datetime
from configparser import ConfigParser
import re
import numpy as np
import pandas as pd
import tensorflow as tf

import kashgari
from kashgari.embeddings import BertEmbedding, TransformerEmbedding
from bilstm_crf_model import BiLSTM_CRF_Model
from model_acceptance_callback import NERAcceptanceCallBack

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from utils import load_test_file, format_result, split_chunks, save_model, load_model
from data_generator import read_data, generate_dataset

In [None]:
# Base directory handed to the acceptance callback during training and read
# back afterwards for the training history.
MODEL_SAVE_DIR = 'model/'

# Hyper-parameters live in config.ini, resolved against the current working
# directory.
CONFIG_PATH = 'config.ini'
config = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail loudly here instead of hitting an opaque
# KeyError on the first section lookup below.
if not config.read(CONFIG_PATH):
  raise FileNotFoundError(
    f'Could not read {CONFIG_PATH!r}; check the current working directory.')

# [MODEL] section: network shape.
LSTM_UNITS = config.getint('MODEL', 'lstm_units')
DROPOUT = config.getfloat('MODEL', 'dropout')
# [TRAIN] section: optimisation schedule.
EPOCHS = config.getint('TRAIN', 'epochs')
BATCH_SIZE = config.getint('TRAIN', 'batch_size')
# [PREDICT] section: inference batch size.
PREDICT_BATCH_SIZE = config.getint('PREDICT', 'batch_size')

# Read Training Data

In [None]:
# read_data yields one entry per article; each entry is a sequence of items
# whose element 0 is a character and element 1 is that character's label.
#   *_x: characters per article, e.g. ['你', '好', '嗎', ...]
#   *_y: labels per article,     e.g. ['O', 'O', 'O', ...]
training_data = read_data(['data/TRAIN_FINAL'], end_flag='')
train_x = [[item[0] for item in article] for article in training_data]
train_y = [[item[1] for item in article] for article in training_data]

validation_data = read_data(['data/TEST_FINAL'], end_flag='')
valid_x = [[item[0] for item in article] for article in validation_data]
valid_y = [[item[1] for item in article] for article in validation_data]

In [None]:
# Split every article with split_chunks and flatten the per-article chunk
# lists into a single list per dataset. Characters and labels go through the
# same splitter; this assumes split_chunks cuts both at the same positions so
# the x/y chunks stay aligned -- TODO confirm against utils.split_chunks.
train_x_chunks = [
  chunk for article in train_x for chunk in split_chunks(article)
]
train_y_chunks = [
  chunk for article in train_y for chunk in split_chunks(article)
]

valid_x_chunks = [
  chunk for article in valid_x for chunk in split_chunks(article)
]
valid_y_chunks = [
  chunk for article in valid_y for chunk in split_chunks(article)
]

# Build Model

In [None]:
# BERT embedding feeding a BiLSTM-CRF tagger.
# Per-layer settings passed to BiLSTM_CRF_Model; the LSTM width and dropout
# rate come from config.ini, and the time-distributed layer gets no overrides.
hyper_parameters = dict(
  layer_blstm=dict(units=LSTM_UNITS, return_sequences=True),
  layer_dropout=dict(rate=DROPOUT),
  layer_time_distributed=dict(),
  layer_activation=dict(activation='softmax'),
)

# Embedding is loaded from the local checkpoint folder given here.
bert_embed = BertEmbedding('chinese_L-12_H-768_A-12/')
model = BiLSTM_CRF_Model(bert_embed, hyper_parameters=hyper_parameters)

# Load Model

In [None]:
# load model
# model, model_info = load_model(f'{MODEL_SAVE_DIR}/', BiLSTM_CRF_Model)
# model.crf_layer = model.layer_crf
# model.compile_model()
# if model_info is not None:
#   start_epoch = model_info['epoch']
#   monitor = model_info['monitor']
#   threshold = model_info['best']
# else:
#   start_epoch = 0
#   threshold = -np.Inf

# print(f'epoch: {start_epoch}\nmonitor: {monitor}\nbest: {threshold}')

# Train Model

In [None]:
# for loaded model
# ner_cb = NERAcceptanceCallBack(f'{MODEL_SAVE_DIR}/', 
#                 kash_model=model, 
#                 validate_data_x=valid_x_chunks, 
#                 validate_data_y=valid_y_chunks,
#                 monitor='f1-score',
#                 threshold=threshold,
#                 history_record_mode='keep')
# history = model.fit(train_x_chunks,
#            train_y_chunks,
#            x_validate=valid_x_chunks,
#            y_validate=valid_y_chunks,
#            epochs=start_epoch + EPOCHS,
#            batch_size=BATCH_SIZE,
#            callbacks=[ner_cb], 
#            fit_kwargs={'initial_epoch': start_epoch})

# for initial-built model
# The callback receives the model, the validation chunks and the metric name
# to monitor; history_record_mode='new' is used for a fresh run (the resumed
# variant above uses 'keep').
ner_cb = NERAcceptanceCallBack(f'{MODEL_SAVE_DIR}/',
                kash_model=model,
                validate_data_x=valid_x_chunks,
                validate_data_y=valid_y_chunks,
                monitor='f1-score',
                history_record_mode='new')
# Train for the number of epochs configured under [TRAIN] in config.ini
# instead of a hard-coded single epoch, mirroring the resumed-training variant
# above which runs start_epoch + EPOCHS.
history = model.fit(train_x_chunks,
           train_y_chunks,
           x_validate=valid_x_chunks,
           y_validate=valid_y_chunks,
           epochs=EPOCHS,
           batch_size=BATCH_SIZE,
           callbacks=[ner_cb])

# Check Performance

In [None]:
# Load history_0.csv from the history folder under the model directory
# (presumably written by NERAcceptanceCallBack during training -- verify)
# and display it as the cell output.
perf = pd.read_csv(f'{MODEL_SAVE_DIR}/history/history_0.csv')
perf

In [None]:
# Layout: two subplots on the top row and one subplot spanning the full width
# of the bottom row.
fig = make_subplots(
  rows=2,
  cols=2,
  specs=[[{}, {}], [{'colspan': 2}, None]],
  subplot_titles=(
    'Loss and Validate loss',
    'Accuracy and Validate accuracy',
    'F1 Score',
  ),
)

# (history column, legend label, subplot column) for the top-row curves.
top_row_curves = (
  ('loss', 'loss', 1),
  ('val_loss', 'validate loss', 1),
  ('accuracy', 'accuracy', 2),
  ('val_accuracy', 'validate accuracy', 2),
)
for column, label, subplot_col in top_row_curves:
  fig.add_trace(go.Scatter(y=perf[column], name=label), row=1, col=subplot_col)

# Every history column from the fifth onward goes on the bottom subplot.
for metric in perf.columns[4:]:
  fig.add_trace(go.Scatter(y=perf[metric], name=metric), row=2, col=1)

fig.update_layout(height=800, width=1000)
fig.show()

# Prediction

In [None]:
# Load the articles to run prediction on. predict() below walks this
# collection by position and turns each item into a list of tokens.
for_predict = load_test_file('./data/1214test.txt')

In [None]:
def predict(model: tf.keras.Model, for_predict):
    """Tag every article in *for_predict* and gather the entities in one table.

    Args:
        model: Trained tagger. Its ``predict`` is called with the chunks of a
            single article and is expected to return one label sequence per
            chunk. NOTE(review): annotated as ``tf.keras.Model`` although the
            notebook passes a ``BiLSTM_CRF_Model`` -- confirm the intended
            type.
        for_predict: Positionally indexable collection of articles. Each
            article is converted to tokens with ``list()`` and its position is
            used as ``article_id``.

    Returns:
        A ``pandas.DataFrame`` whose first column is ``article_id``, followed
        by the fields produced by ``format_result``. An empty DataFrame is
        returned when *for_predict* is empty.
    """
    # Collect one frame per article and concatenate once at the end; growing
    # a DataFrame with pd.concat inside the loop re-copies all accumulated
    # rows on every iteration.
    frames = []
    # Index-based access is kept on purpose: the container type returned by
    # load_test_file is not visible here, and indexing is what the original
    # contract relied on.
    for article_id in range(len(for_predict)):
        tokenized_text = list(for_predict[article_id])
        batched_text = split_chunks(tokenized_text)
        batched_labels = model.predict(batched_text)
        # Flatten the per-chunk label lists back into one sequence that lines
        # up with tokenized_text.
        labels = [label for batch in batched_labels for label in batch]

        entities_result = format_result(tokenized_text, labels)
        df = pd.DataFrame(entities_result)
        df.insert(0, 'article_id', [article_id] * len(entities_result))
        frames.append(df)

    # pd.concat raises ValueError on an empty list, so handle "nothing to
    # predict" explicitly.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [None]:
# Run inference over all loaded articles, then cast the id and position
# columns to integers.
ans = predict(model, for_predict)
int_columns = ['article_id', 'start_position', 'end_position']
ans[int_columns] = ans[int_columns].astype(int)

In [None]:
ans

In [None]:
# Write the predictions as a tab-separated file without the DataFrame index.
# The target directory is created first so the export does not fail with an
# OSError when ./output does not exist yet.
import os
os.makedirs('./output', exist_ok=True)
ans.to_csv('./output/output.tsv', sep='\t', index=False)