# **Introduction To Transformer**

Kurniadi Ahmad Wijaya - 1301194024

### **Importing Dataset**

In [1]:
# !pip install transformers
# !pip install datasets
# !pip install indobenchmark-toolkit

In [106]:
from sklearn.model_selection import train_test_split
from transformers import Trainer
from datasets import load_metric

import pandas as pd
import numpy as np

In [107]:
df = pd.read_csv("https://raw.githubusercontent.com/ShinyQ/Final-IFest-2021_Analisis-Sentimen-Kebijkakan-PPKM-Pemerintah/main/Data/Unbalance/NonStopword_Stem_Min2_Clean/clean_train.csv")
df.sample(5)

Unnamed: 0,raw,processed,label
7839,@PIJATMAKASSAR05 Kak Cariin aku Top dong yg ma...,kak carikan saya top dong yang mau boking aku ...,0
8055,@jokowi Yang penting PPKM sampai tahun 2045 pak,yang penting ppkm sampai tahun pak,0
971,Day 3 mas satu2nya hari ini akad nikah tp g bi...,hari mas satu nya hari ini akad nikah tapi bis...,0
4089,@jokowi Pak hentikan PPKM banyak yg kena imbas...,pak hentikan ppkm banyak yang kena imbasnya sa...,0
7271,"@Mayangprsprnt Padahal kalo memang niat, ppkm ...",padahal jika memang niat ppkm bukan halangan u...,1


In [150]:
df_test = pd.read_csv("https://raw.githubusercontent.com/ShinyQ/Final-IFest-2021_Analisis-Sentimen-Kebijkakan-PPKM-Pemerintah/main/Data/Unbalance/NonStopword_Stem_Min2_Clean/clean_test.csv")
df_test.rename(columns={"processed": "text"}, inplace=True)
df_test.sample(5)

Unnamed: 0,raw,text,label
95,"@schfess @movntaine eh tp kan ppkm ny 3-20, pd...",eh tapi kan ppkm nya padahal masuk sama nya hm...,?
15,@NOTASLIMBOY Masih dong… kan proyek strategis ...,masih dong kan proyek strategis nasional di pp...,?
30,"Terbang sekarang pakai PCR, ini baik yang masu...",terbang sekarang pakai pcr ini baik yang masuk...,?
158,Meningkatnya angka covid-19 membuat kebijakan ...,meningkatnya angka covid membuat kebijakan ppk...,?
128,"Intinya, patuhi PPKM dgn serius sebaga upaya m...",intinya patuhi ppkm dengan serius sebaga upaya...,?


In [109]:
df = df.replace(r'^\s*$', np.NaN, regex=True)
df.isna().sum()

raw          0
processed    0
label        0
dtype: int64

## **Split Train Dan Validation**

In [110]:
df_train, df_val = train_test_split(df, shuffle=True, test_size=0.2)

## Preprocessing

In [111]:
df_val.drop("raw", axis=1, inplace=True)
df_train.drop("raw", axis=1, inplace=True)
df_test.drop("raw", axis=1, inplace=True)

df_test["label"] = df_test["label"].map({"?" : 0})

In [112]:
df_train = df_train.rename(columns={
    "processed" : "text",
    "labels" : "label"
})

df_val = df_val.rename(columns={
    "processed" : "text",
    "labels" : "label"
})

df_test = df_test.rename(columns={
    "processed" : "text",
    "labels" : "label"
})

In [113]:
df_train.to_csv("train.csv", index=False)
df_val.to_csv("eval.csv", index=False)
df_test.to_csv("test.csv", index=False)

In [None]:
from datasets import load_dataset

files = {
    "train": "train.csv", 
    "eval": "eval.csv", 
    "test": "test.csv"
}

dataset = load_dataset('csv', data_files=files)

## **Using Pretrained Model**

**Sumber :**

Hungging Face Indo Benchmark : [Dasar Repository Indo Benchmark](https://huggingface.co/indobenchmark).

### **IndoBERT Transformer**

In [115]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2")

In [116]:
def tokenize_function(text):
    return tokenizer(text["text"], padding='max_length', max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [117]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 7848
    })
    eval: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 1962
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'label', 'text', 'token_type_ids'],
        num_rows: 200
    })
})

In [118]:
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["eval"]

```
training_args = TrainingArguments(
     output_dir="test_trainer",
     do_train=True,
     evaluation_strategy="steps",
     learning_rate=1e-8,
     num_train_epochs=3,
     warmup_steps=100,
     save_strategy="epoch",
     save_steps=500,
)
```

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p2", num_labels=3)

In [120]:
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer")

In [121]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

train_history = trainer.train()
train_evaluate = trainer.evaluate()

Step,Training Loss
500,0.6479
1000,0.5541
1500,0.3869
2000,0.3635
2500,0.1927


In [122]:
train_evaluate

{'eval_loss': 0.9667941927909851,
 'eval_accuracy': 0.7976554536187563,
 'eval_runtime': 6.8601,
 'eval_samples_per_second': 286.002,
 'epoch': 3.0,
 'eval_mem_cpu_alloc_delta': 12107776,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_peaked_delta': 95265792}

In [123]:
trainer.save_model("IndoBERT")

### **IndoGPT Transformer**

In [124]:
from indobenchmark import IndoNLGTokenizer

In [None]:
tokenizer = IndoNLGTokenizer.from_pretrained("indobenchmark/indogpt")
tokenizer.pad_token = tokenizer.eos_token

In [126]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding='max_length', max_length=256)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [127]:
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["eval"]

In [None]:
from transformers import GPT2ForSequenceClassification

model = GPT2ForSequenceClassification.from_pretrained("indobenchmark/indogpt", num_labels=3)
model.config.pad_token_id = model.config.eos_token_id

In [129]:
training_args = TrainingArguments("test_trainer")

In [130]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

train_history = trainer.train()
train_evaluate = trainer.evaluate()

Step,Training Loss
500,0.6576
1000,0.531
1500,0.3402
2000,0.3105
2500,0.1323


In [131]:
train_evaluate

{'eval_loss': 1.2066888809204102,
 'eval_accuracy': 0.8037716615698267,
 'eval_runtime': 8.3893,
 'eval_samples_per_second': 233.87,
 'epoch': 3.0,
 'eval_mem_cpu_alloc_delta': 8101888,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_peaked_delta': 359492608}

In [132]:
trainer.save_model("IndoGPT")

### **IndoBERT Tweet Transformer**

In [133]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased", use_fast=True)

In [134]:
def tokenize_function(examples):
    return tokenizer(str(examples["text"]), padding='max_length', max_length=256, truncation=True)

tokenized_datasets = dataset.map(tokenize_function)

HBox(children=(FloatProgress(value=0.0, max=7848.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1962.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




In [135]:
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["eval"]
test_dataset = tokenized_datasets["test"]

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("indolem/indobertweet-base-uncased", num_labels=3)

In [137]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

training_history = trainer.train()
evaluation_history = trainer.evaluate()

Step,Training Loss
500,0.5608
1000,0.4702
1500,0.2883
2000,0.2794
2500,0.1351


In [138]:
evaluation_history

{'eval_loss': 0.8820319175720215,
 'eval_accuracy': 0.8363914373088684,
 'eval_runtime': 6.1631,
 'eval_samples_per_second': 318.346,
 'epoch': 3.0,
 'eval_mem_cpu_alloc_delta': 12058624,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_peaked_delta': 94479360}

In [139]:
trainer.save_model("IndoBERTTweet")

## **Prediksi Label**

In [149]:
prediction = trainer.predict(test_dataset)
prediction = prediction.predictions.argmax(1)

prediction

array([0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 1,
       2, 1, 1, 2, 0, 0, 0, 2, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 2, 1, 1,
       0, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 2, 0, 2, 0, 1, 0, 0, 2, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       2, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 2, 0, 1, 1, 2, 2, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 2, 0, 2, 1, 1, 0, 0, 2, 2, 0, 0, 0, 0,
       0, 0, 1, 1, 2, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 2, 2, 2, 1, 0, 1, 2,
       0, 0, 1, 1, 1, 1, 0, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 0, 0, 2, 0, 1,
       2, 0])

In [183]:
df_test['prediction'] = prediction
df_test[['text','prediction']].sample(20)

Unnamed: 0,text,prediction
111,karena wisata belanja bukan ritual ibadah jika...,0
180,hari ppkm,1
193,ppkm membunuh ekonomi,0
15,masih dong kan proyek strategis nasional di pp...,2
137,setuju banget pak adhie yang kita tidak punya ...,0
158,meningkatnya angka covid membuat kebijakan ppk...,2
78,ppkm darurat mensyaratkan setiap penumpang pes...,1
31,memang siy ppkm darurat ini menghambat pariwis...,0
26,stabesok iya ppkm darurat ala jokowi padahal a...,0
29,masuk ppkm kota aku sedih banget tapi semoga d...,2


In [184]:
df_test.to_csv('IndoBERTTweet_Predict.csv', index=False)