In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import AutoTokenizer, BertForSequenceClassification




In [2]:
data = pd.read_csv("../../data/google_review_wisata_jabar/data_review_validated.csv")
data.drop("id", axis=1, inplace=True)
data.head()

Unnamed: 0,text,label
0,bagus air terjun renang luas ramah duduk,5
1,2 minggu abis bencana air bah bangun hancur,5
2,indah banget air terjun habis hujan deres bang...,5
3,salah curug indah jawa barat lokasi sukabumi s...,5
4,destinasi favorit nih klw sukabumi,5


In [3]:
data["label"] = data["label"].apply(lambda x: x-1)
data

Unnamed: 0,text,label
0,bagus air terjun renang luas ramah duduk,4
1,2 minggu abis bencana air bah bangun hancur,4
2,indah banget air terjun habis hujan deres bang...,4
3,salah curug indah jawa barat lokasi sukabumi s...,4
4,destinasi favorit nih klw sukabumi,4
...,...,...
2971,gambar tipu realitas indah gambar,2
2972,indah,4
2973,bagus asli nice,4
2974,seru pacu adrenalin medan keren,4


In [4]:
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "indolem/indobert-base-uncased", num_labels=5)


Some weights of the model checkpoint at indolem/indobert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indober

In [5]:
text = "jelek"
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
pred = torch.nn.functional.softmax(output.logits, dim=-1)
print(np.argmax(pred.detach().numpy()))

3


In [6]:
X = list(data["text"])
y = list(data["label"])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, return_tensors="pt", max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, return_tensors="pt", max_length=512)

In [7]:
class PariwisataDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = PariwisataDataset(X_train_tokenized, y_train)
val_dataset = PariwisataDataset(X_val_tokenized, y_val)

In [8]:
def comp_metrics(y_hat):
    pred, labels = y_hat
    pred = np.argmax(pred, axis=1)

    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"f1": f1}

In [9]:
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=10,
    label_names=[0, 1, 2, 3, 4, 5],
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=comp_metrics,
    
)


In [10]:
trainer.train()

***** Running training *****
  Num examples = 2380
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2980


  0%|          | 0/2980 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx])
Saving model checkpoint to output\checkpoint-500
Configuration saved in output\checkpoint-500\config.json


{'loss': 0.8456, 'learning_rate': 4.161073825503356e-05, 'epoch': 1.68}


Model weights saved in output\checkpoint-500\pytorch_model.bin
  item = {key: torch.tensor(val[idx])
Saving model checkpoint to output\checkpoint-1000
Configuration saved in output\checkpoint-1000\config.json


{'loss': 0.7744, 'learning_rate': 3.3221476510067115e-05, 'epoch': 3.36}


Model weights saved in output\checkpoint-1000\pytorch_model.bin
  item = {key: torch.tensor(val[idx])
Saving model checkpoint to output\checkpoint-1500
Configuration saved in output\checkpoint-1500\config.json


{'loss': 0.7156, 'learning_rate': 2.4832214765100674e-05, 'epoch': 5.03}


Model weights saved in output\checkpoint-1500\pytorch_model.bin
  item = {key: torch.tensor(val[idx])
Saving model checkpoint to output\checkpoint-2000
Configuration saved in output\checkpoint-2000\config.json


{'loss': 0.6343, 'learning_rate': 1.644295302013423e-05, 'epoch': 6.71}


Model weights saved in output\checkpoint-2000\pytorch_model.bin
  item = {key: torch.tensor(val[idx])
Saving model checkpoint to output\checkpoint-2500
Configuration saved in output\checkpoint-2500\config.json


{'loss': 0.5747, 'learning_rate': 8.053691275167785e-06, 'epoch': 8.39}


Model weights saved in output\checkpoint-2500\pytorch_model.bin
  item = {key: torch.tensor(val[idx])


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 390.6335, 'train_samples_per_second': 60.927, 'train_steps_per_second': 7.629, 'train_loss': 0.6781547290366768, 'epoch': 10.0}


TrainOutput(global_step=2980, training_loss=0.6781547290366768, metrics={'train_runtime': 390.6335, 'train_samples_per_second': 60.927, 'train_steps_per_second': 7.629, 'train_loss': 0.6781547290366768, 'epoch': 10.0})

In [12]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 596
  Batch size = 8
  item = {key: torch.tensor(val[idx])


  0%|          | 0/75 [00:00<?, ?it/s]

{'eval_runtime': 2.2501,
 'eval_samples_per_second': 264.877,
 'eval_steps_per_second': 33.332,
 'epoch': 10.0}

In [14]:
trainer.save_model("bcc-basudara-v1")

Saving model checkpoint to bcc-basudara-v1
Configuration saved in bcc-basudara-v1\config.json
Model weights saved in bcc-basudara-v1\pytorch_model.bin


In [18]:
model_basudara = BertForSequenceClassification.from_pretrained("bcc-basudara-v1", num_labels=5)
model_basudara = model_basudara.to("cpu")

loading configuration file bcc-basudara-v1\config.json
Model config BertConfig {
  "_name_or_path": "indolem/indobert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_ids": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": 

In [30]:
data["label_bert"] = data["text"].apply(lambda x: np.argmax(torch.nn.functional.softmax(model_basudara(**tokenizer([x], return_tensors='pt')).logits, dim=-1).detach().numpy()) + 1)
data

Unnamed: 0,text,label,label_bert
0,bagus air terjun renang luas ramah duduk,4,5
1,2 minggu abis bencana air bah bangun hancur,4,5
2,indah banget air terjun habis hujan deres bang...,4,5
3,salah curug indah jawa barat lokasi sukabumi s...,4,5
4,destinasi favorit nih klw sukabumi,4,5
...,...,...,...
2971,gambar tipu realitas indah gambar,2,3
2972,indah,4,5
2973,bagus asli nice,4,5
2974,seru pacu adrenalin medan keren,4,5


In [31]:
data["label"] = data["label"].apply(lambda x: x+1)
data["selisih"] = abs(data["label"] - data["label_bert"])
data

Unnamed: 0,text,label,label_bert,selisih
0,bagus air terjun renang luas ramah duduk,5,5,0
1,2 minggu abis bencana air bah bangun hancur,5,5,0
2,indah banget air terjun habis hujan deres bang...,5,5,0
3,salah curug indah jawa barat lokasi sukabumi s...,5,5,0
4,destinasi favorit nih klw sukabumi,5,5,0
...,...,...,...,...
2971,gambar tipu realitas indah gambar,3,3,0
2972,indah,5,5,0
2973,bagus asli nice,5,5,0
2974,seru pacu adrenalin medan keren,5,5,0


In [33]:
data[data["label_bert"] == 1]

Unnamed: 0,text,label,label_bert,selisih
152,jaga korupsi bayar 6orang retribusi kasih 2 to...,1,1,0
619,kecewa bersih ganggu nyaman curug biaya bersih,2,1,1
669,toilet awat yg tunggu pintu gaya nunggu bayar ...,3,1,2
675,mahal utk ukur wisata yg 1 dgn harga tiket mob...,1,1,0
681,kawah putih jalan jelek gelombang bawa mobil b...,2,1,1
...,...,...,...,...
2649,usul kang coba jenak pikir susah dar sholat fa...,1,1,0
2814,sdh terkoordinir organisir calo tawar perahu h...,5,1,4
2856,jalan jelek,3,1,2
2864,0 bintang kecewa bgt,1,1,0


In [37]:
tokenizer.save_pretrained("indobert-basudara-v1")

tokenizer config file saved in indobert-basudara-v1\tokenizer_config.json
Special tokens file saved in indobert-basudara-v1\special_tokens_map.json


('indobert-basudara-v1\\tokenizer_config.json',
 'indobert-basudara-v1\\special_tokens_map.json',
 'indobert-basudara-v1\\vocab.txt',
 'indobert-basudara-v1\\added_tokens.json',
 'indobert-basudara-v1\\tokenizer.json')

In [36]:
model_basudara.save_pretrained("indobert-basudara-v1")

Configuration saved in indobert-basudara-v1\config.json
Model weights saved in indobert-basudara-v1\pytorch_model.bin


In [39]:
dummy_model_input = tokenizer("bagus", return_tensors="pt")
torch.onnx.export(
    model_basudara,
    tuple(dummy_model_input.values()),
    f="model_basudara.onnx",
    input_names=['input_ids', 'attention_mask'],
    output_names=['logits'],
    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'},
                  'attention_mask': {0: 'batch_size', 1: 'sequence'},
                  'logits': {0: 'batch_size', 1: 'sequence'}},
    do_constant_folding=True,
    opset_version=13,
)


In [40]:
import onnxruntime as onnxrt
onnx_session= onnxrt.InferenceSession("model_basudara.onnx")
onnx_inputs= {onnx_session.get_inputs()[0].name:
to_numpy(img)}


ModuleNotFoundError: No module named 'onnxruntime'

In [42]:
data.to_csv("data_sentimen_analisis.csv", index=False)