In [1]:
#Import Library
import numpy as np
import pandas as pd
import re
import string
from sklearn.metrics import f1_score,accuracy_score
from simpletransformers.classification import ClassificationModel
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import logging
import warnings
warnings.simplefilter("ignore")
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [2]:
# Load Dataset
train_df = pd.read_csv('DATA/v1/train.csv')
train_df['status'] = train_df['status'].apply(lambda x : str(x).strip())

val_df = pd.read_csv('DATA/v1/val.csv')
test_df = pd.read_csv('DATA/v1/test.csv')

In [3]:
train_df.head()

Unnamed: 0,instansi,reference,status
0,"Kementerian Koordinator Bidang Politik, Hukum,...",Kemenko Polhukam,yes
1,"Kementerian Koordinator Bidang Politik, Hukum,...",Polhukam,yes
2,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Politik,no
3,"Kementerian Koordinator Bidang Politik, Hukum,...",Koordinator Hukum,no
4,Kementerian Luar Negeri,Kemlu,yes


In [4]:
#Ubah nama kolom
train_df = train_df.rename({"instansi" : "sentence1", "reference" : "sentence2", "status" : "label"}, axis=1)
val_df = val_df.rename({"instansi" : "sentence1", "reference" : "sentence2", "status" : "label"}, axis=1)
test_df = test_df.rename({"instansi" : "sentence1", "reference" : "sentence2", "status" : "label"}, axis=1)

In [5]:
#label
train_df['label'] = train_df['label'].map({"no" : 0, "yes" : 1})
val_df['label'] = val_df['label'].map({"no" : 0, "yes" : 1})
test_df['label'] = test_df['label'].map({"no" : 0, "yes" : 1})

In [6]:
train_df[['sentence1', 'sentence2']] = train_df[['sentence1', 'sentence2']].applymap(lambda x : str(x).lower())
val_df[['sentence1', 'sentence2']] = val_df[['sentence1', 'sentence2']].applymap(lambda x : str(x).lower())
test_df[['sentence1', 'sentence2']] = test_df[['sentence1', 'sentence2']].applymap(lambda x : str(x).lower())

In [7]:
train_df.head()

Unnamed: 0,sentence1,sentence2,label
0,"kementerian koordinator bidang politik, hukum,...",kemenko polhukam,1
1,"kementerian koordinator bidang politik, hukum,...",polhukam,1
2,"kementerian koordinator bidang politik, hukum,...",koordinator politik,0
3,"kementerian koordinator bidang politik, hukum,...",koordinator hukum,0
4,kementerian luar negeri,kemlu,1


In [8]:
train_df = train_df.rename({'sentence1':'text_a', 'sentence2':'text_b', 'label' : 'labels'}, axis=1)
val_df = val_df.rename({'sentence1':'text_a', 'sentence2':'text_b', 'label' : 'labels'}, axis=1)

In [9]:
list_model = ['indobenchmark/indobert-base-p2', 'indobenchmark/indobert-base-p1']

In [10]:
#Logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# #Parse command line arguments
# parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
# parser.add_argument("--model_type", default="bert", help="model type")
# parser.add_argument("--model", default='bert_models', help=" Base Model Folder")
# parser.add_argument("--num_epoch" , default=1, help="number of train epoch")
# parser.add_argument("--num_gpu" , default=1, help="number of gpu use")
# parser.add_argument("--cuda_device" , default=2, help="cuda device")
# argss = vars(parser.parse_args())

TRAIN_BATCH = 32
# Create a ClassificationModel
model = ClassificationModel('bert', list_model[1], use_cuda=True, cuda_device=0,
                            args={
    'reprocess_input_data': True,
    "learning_rate": 2e-5,
    "train_batch_size" : TRAIN_BATCH,
    "best_model_dir" : "Models/{}/bestModel".format(list_model[1]),
    "output_dir" : "Models/checkpoints/{}".format(list_model[1]),
    "evaluate_during_training" : True,
    "evaluate_during_training_steps" : int(np.ceil(train_df.shape[0]/TRAIN_BATCH)),
    'overwrite_output_dir': True,
    'num_train_epochs': 10,    "save_eval_checkpoints": False, "save_model_every_epoch" : False, 
    "save_steps": -1,
    "use_multiprocessing": False, 
    "use_multiprocessing_for_evaluation": False, 
    "process_count": 1,
    "no_cache" : True,}
)

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
model.train_model(train_df, eval_df=val_df)

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Running Epoch 0 of 10:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Running Epoch 1 of 10:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Running Epoch 2 of 10:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Running Epoch 3 of 10:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Running Epoch 4 of 10:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Running Epoch 5 of 10:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Running Epoch 6 of 10:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Running Epoch 7 of 10:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Running Epoch 8 of 10:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Running Epoch 9 of 10:   0%|          | 0/53 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model: Training of bert model complete. Saved to Models/checkpoints/indobenchmark/indobert-base-p1.


(530,
 defaultdict(list,
             {'global_step': [53,
               53,
               106,
               106,
               159,
               159,
               212,
               212,
               265,
               265,
               318,
               318,
               371,
               371,
               424,
               424,
               477,
               477,
               530,
               530],
              'train_loss': [0.33553487062454224,
               0.33553487062454224,
               0.2543051838874817,
               0.2543051838874817,
               0.06867316365242004,
               0.06867316365242004,
               0.015202491544187069,
               0.015202491544187069,
               0.08229800313711166,
               0.08229800313711166,
               0.0024977345019578934,
               0.0024977345019578934,
               0.0017653742106631398,
               0.0017653742106631398,
               0.002242195885628462