In [2]:
import pandas as pd

df = pd.read_csv('dataset/indonesian-hate-speech-superset/in_hf.csv')
print(df.head())
print(df['labels'].value_counts())

                                                text  labels     source  \
0  @USER.wood17 knp lo gak berani bersumpah dan b...       1  Instagram   
1  haha, somad somad. Muka dekil otak 0% , kok ya...       1  Instagram   
2  hahaha, kaum sableng 212 kl berita begini mrk ...       1  Instagram   
3  hahaha, makin stress aja  ni umat sableng, dlu...       1  Instagram   
4       HIDUP PSI = partai SAMPAH indonesia..... ...       1  Instagram   

        dataset  nb_annotators  
0  ID_instagram              3  
1  ID_instagram              3  
2  ID_instagram              3  
3  ID_instagram              3  
4  ID_instagram              3  
labels
0    8256
1    6050
Name: count, dtype: int64


In [3]:
# label_mapping = {
#     'Non_HS' : 0,
#     'HS': 1
# }

# df['Label'] = df['labels']
# print(df.head())

df.rename(columns={'text': 'text', 'labels': 'label'}, inplace=True)
df_final = df[['text', 'label']]

print(df_final.head())

                                                text  label
0  @USER.wood17 knp lo gak berani bersumpah dan b...      1
1  haha, somad somad. Muka dekil otak 0% , kok ya...      1
2  hahaha, kaum sableng 212 kl berita begini mrk ...      1
3  hahaha, makin stress aja  ni umat sableng, dlu...      1
4       HIDUP PSI = partai SAMPAH indonesia..... ...      1


In [6]:
import re
import nltk
from nltk.corpus import stopwords as _nltk_stopwords

nltk.download('stopwords', quiet=True)

stopwords = set(_nltk_stopwords.words('indonesian'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^A-Za-z0-9\s]+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = ' '.join([word for word in text.split() if word not in stopwords])
    return text

df_final['text'] = df_final['text'].apply(clean_text)
df_final.to_csv('dataset/in_hf_cleaned.csv', index=False)
print(df_final.head())


                                                text  label
0  wood17 knp lo gak berani bersumpah bertaruh kr...      1
1  haha somad somad muka dekil otak 0 ya g malu n...      1
2    hahaha kaum sableng 212 kl berita mrk buta tuli      1
3  hahaha stress aja ni umat sableng dlu raja sal...      1
4                  hidup psi partai sampah indonesia      1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['text'] = df_final['text'].apply(clean_text)


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "indobenchmark/indobert-base-p1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df_final, test_size=0.2, random_state=42, stratify=df_final['label'])

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")

Train size: 11444, Test size: 2862


In [9]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [10]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/11444 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2862 [00:00<?, ? examples/s]

In [11]:
from transformers import TrainingArguments

output_dir = "./results"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=50,
    load_best_model_at_end=True,
)




In [12]:
import torch

print(torch.cuda.is_available())

True


In [13]:
from transformers import Trainer, DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss,Validation Loss
50,0.6566,0.634098
100,0.6192,0.478592
150,0.4667,0.435579
200,0.395,0.424521
250,0.4092,0.445365
300,0.4614,0.422135
350,0.4553,0.396515
400,0.3725,0.514377
450,0.4655,0.392998
500,0.411,0.378754


TrainOutput(global_step=2864, training_loss=0.2315329417453652, metrics={'train_runtime': 962.3428, 'train_samples_per_second': 47.567, 'train_steps_per_second': 2.976, 'total_flos': 1186199193098400.0, 'train_loss': 0.2315329417453652, 'epoch': 4.0})

In [14]:
import numpy as np

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

predictions = trainer.predict(tokenized_test)
predicted_labels = np.argmax(predictions.predictions, axis=1)

true_labels = tokenized_test['label']

print(classification_report(true_labels, predicted_labels))

print("Accuracy:", accuracy_score(true_labels, predicted_labels))

print("Precision, Recall, F1-Score:", precision_recall_fscore_support(true_labels, predicted_labels, average='weighted'))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86      1652
           1       0.81      0.82      0.81      1210

    accuracy                           0.84      2862
   macro avg       0.84      0.84      0.84      2862
weighted avg       0.84      0.84      0.84      2862

Accuracy: 0.8413696715583509
Precision, Recall, F1-Score: (np.float64(0.8416041157241183), np.float64(0.8413696715583509), np.float64(0.8414719096013785), None)


In [19]:
trainer.save_model("./final_model")

final_model = AutoModelForSequenceClassification.from_pretrained("./final_model")
final_tokenizer = AutoTokenizer.from_pretrained("./final_model")

from transformers import pipeline

predictor = pipeline("text-classification", model=final_model, tokenizer=final_tokenizer)

sentences_test_1 = "Saya harap indonesia bisa menjadi lebih baik untuk kedepannya."
sentences_test_2 = "Orang-orang dari suku tertentu adalah pembelah bangsa emang."
sentences_test_3 = "Mereka emang cuman orang kurang berpendidikan, omong doang bisanya bodoh emang."
sentences_test_4 = "Memang seperti itu suku jawa, selalu jadi pusat sorotan. Memang tolol suku itu"
sentences_test_5 = "boro boro ingin maju, ijazah jelek luar biasa udah sok"
sentences_test_6 = "Siapa lagi kalau bukan yang dia yang melakukan, hutang tinggi karena siapa kalau bukan dia yang diagungkan" #sindiran

print(predictor(sentences_test_1))
print(predictor(sentences_test_2))
print(predictor(sentences_test_3))
print(predictor(sentences_test_4))
print(predictor(sentences_test_5))
print(predictor(sentences_test_6))

Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.9790983200073242}]
[{'label': 'LABEL_1', 'score': 0.6681391596794128}]
[{'label': 'LABEL_1', 'score': 0.6054683923721313}]
[{'label': 'LABEL_1', 'score': 0.5968902111053467}]
[{'label': 'LABEL_1', 'score': 0.9608490467071533}]
[{'label': 'LABEL_0', 'score': 0.7696002125740051}]


Sindiran tidak terbaca dengan benar (baru mendeteksi ketika kata kata frontal dilontarkan) kemungkinan karena kekurangan data yang memang berbau menyindir. Bagaimana caranya agar dia mendeteksi sindiran juga?