In [None]:
!pip install torch-summary
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch-summary
  Downloading torch_summary-1.4.5-py3-none-any.whl (16 kB)
Installing collected packages: torch-summary
Successfully installed torch-summary-1.4.5
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━

In [None]:
!pip install gdown

!gdown --id '11qGfrKx9lMbTzfCLwR6zpOFYBJ9o8jvy'
!gdown --id '19KABP5K_vh2iOTCsjut5AzWH6l9KD3S0'

!gdown --id '1YuAw8mDeNVEGonqWoZbaHVccMBzzFyxv'
!gdown --id '1Q4J4MjQie7I-fsIDUGJWO6YEJvWk-JTu'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading...
From: https://drive.google.com/uc?id=11qGfrKx9lMbTzfCLwR6zpOFYBJ9o8jvy
To: /content/CaseFolded_PunctRemoved_RTUserRemoved_StopwordRemovedTrain.csv
100% 499k/499k [00:00<00:00, 106MB/s]
Downloading...
From: https://drive.google.com/uc?id=19KABP5K_vh2iOTCsjut5AzWH6l9KD3S0
To: /content/CaseFolded_PunctRemoved_RTUserRemoved_StopwordRemoved.csv
100% 11.5k/11.5k [00:00<00:00, 39.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1YuAw8mDeNVEGonqWoZbaHVccMBzzFyxv
To: /content/CaseFolded_PunctRemoved_RTUserRemovedTrain.csv
100% 702k/702k [00:00<00:00, 124MB/s]
Downloading...
From: https://drive.google.com/uc?id=1Q4J4MjQie7I-fsIDUGJWO6YEJvWk-JTu
To: /content/CaseFolded_PunctRemoved_RTUserRemovedTest.csv
100% 16.8k/16.8k [00:00<00:00, 52.7MB/s]


In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

import random
import numpy as np

In [None]:
# Load the cleaned train/test CSVs; rows containing any missing value are dropped.
train_df = pd.read_csv('/content/CaseFolded_PunctRemoved_RTUserRemovedTrain.csv').dropna()
test_df = pd.read_csv('/content/CaseFolded_PunctRemoved_RTUserRemovedTest.csv').dropna()

# Model inputs (tweet text) and the ten label columns, one list of labels per tweet.
# Presumably each label is a 0/1 flag — confirm against the CSV.
tweets = train_df['Tweet'].tolist()
targets = train_df[['HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other',
                    'HS_Individual', 'HS_Group', 'HS_Weak', 'HS_Moderate', 'HS_Strong']].values.tolist()

# Only the tweet text is taken from the test file.
test_tweets = test_df['Tweet'].tolist()

In [None]:
# Tokenizer for the same pretrained checkpoint the classifier's encoder is loaded from.
tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased")
# Training hyperparameters.
max_length = 128  # every tweet is padded/truncated to this many tokens
lr = 0.00005  # AdamW learning rate for the first training run
epochs = 100  # upper bound on epochs for both training loops
batch_size = 32

In [None]:
# Dataset construction
class HateSpeechDataset(Dataset):
    """Tokenizes tweets on the fly and pairs them with their label vectors.

    Args:
        tweets: Sequence of tweet strings.
        targets: Sequence of per-tweet label lists aligned with ``tweets``.
            Pass ``None`` or an empty sequence for unlabeled data (e.g. the
            test set); items then carry no ``'target'`` entry.
        tokenizer: Tokenizer exposing ``encode_plus``.
        max_length: Length every tweet is padded/truncated to.

    Raises:
        ValueError: If ``targets`` is non-empty but its length differs from
            the number of tweets.
    """

    def __init__(self, tweets, targets, tokenizer, max_length):
        if targets is not None and len(targets) > 0 and len(targets) != len(tweets):
            raise ValueError(
                f"got {len(tweets)} tweets but {len(targets)} targets"
            )
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, index):
        """Return the encoded tweet at ``index`` as a dict of 1-D tensors.

        The dict always has ``'input_ids'`` and ``'attention_mask'``; it has
        ``'target'`` (float tensor) only when the dataset was given labels.
        """
        encoding = self.tokenizer.encode_plus(
            self.tweets[index],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch dimension of size 1; drop
        # exactly that dimension so the DataLoader can stack items itself.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

        # Unlabeled datasets (targets None/empty) used to fail here with an
        # IndexError; they now simply omit the 'target' entry.
        if self.targets is not None and len(self.targets) > 0:
            item['target'] = torch.tensor(self.targets[index], dtype=torch.float)

        return item

In [None]:
class HateSpeechClassifier(nn.Module):
    """Pretrained transformer encoder followed by an MLP head for multi-label output.

    The forward pass returns per-label probabilities (a sigmoid is already
    applied), so it is meant to be paired with ``nn.BCELoss`` rather than a
    loss that expects raw logits.

    Args:
        num_classes: Number of output labels.
        pretrained_name: Hugging Face checkpoint used for the encoder.
    """

    def __init__(self, num_classes, pretrained_name="indolem/indobertweet-base-uncased"):
        super().__init__()
        self.bert = AutoModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(0.2)
        self.relu = nn.ReLU()
        # Read the head's input width from the encoder config instead of
        # hard-coding 768, so other checkpoints work without editing the class.
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 256)
        self.fc2 = nn.Linear(256, 512)
        self.fc3 = nn.Linear(512, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, num_classes)
        # Independent per-label probabilities (multi-label), hence sigmoid and
        # not softmax. The layer has no parameters, so checkpoints are unaffected
        # by its attribute name.
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return a ``(batch, num_classes)`` tensor of probabilities in [0, 1]."""
        # Element 0 of the encoder output holds the per-token hidden states.
        hidden_states = self.bert(input_ids, attention_mask=attention_mask)[0]
        # Use the first token's hidden state as the summary of the whole tweet.
        features = self.dropout(hidden_states[:, 0, :])

        features = self.relu(self.fc1(features))
        features = self.relu(self.fc2(features))
        features = self.relu(self.fc3(features))
        features = self.relu(self.fc4(features))
        logits = self.fc5(features)

        return self.sigmoid(logits)

In [None]:
# Hold out 20% of the labeled tweets for validation; the fixed random_state keeps the split reproducible.
train_tweets, val_tweets, train_targets, val_targets = train_test_split(tweets, targets, test_size=0.2, random_state=42)

In [None]:
# Wrap each split in a Dataset. The test set has no labels, so it gets an empty
# target list here; test_dataset/test_dataloader are rebuilt with placeholder
# labels in the Evaluation section before they are iterated.
train_dataset = HateSpeechDataset(train_tweets, train_targets, tokenizer, max_length)
val_dataset = HateSpeechDataset(val_tweets, val_targets, tokenizer, max_length)
test_dataset = HateSpeechDataset(test_tweets, [], tokenizer, max_length)

# Only the training loader shuffles; validation and test are read in a fixed order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Build the classifier with one output per label column, plus its optimizer and loss.
model = HateSpeechClassifier(num_classes=len(train_targets[0]))
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# BCELoss expects probabilities in [0, 1]; the model's forward pass ends in a sigmoid.
criterion = torch.nn.BCELoss()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
criterion.to(device)
# Bare expression: displays which device was selected.
device

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at indolem/indobertweet-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


device(type='cuda')

In [None]:
def checkpoint(model, filename):
    """Write the model's ``state_dict`` (weights only) to ``filename``."""
    weights = model.state_dict()
    torch.save(weights, filename)

def resume(model, filename):
    """Load a ``state_dict`` from ``filename`` into ``model`` in place.

    Tensors are mapped onto the device the model currently lives on, so a
    checkpoint written on a GPU runtime can still be restored on a CPU-only
    one (and vice versa).

    Args:
        model: Module whose parameters are overwritten.
        filename: Path to a file written by ``torch.save(model.state_dict(), ...)``.
    """
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    # NOTE(review): torch.load unpickles the file. Only load checkpoints from a
    # trusted source; consider weights_only=True on PyTorch versions that support it.
    state_dict = torch.load(filename, map_location=target_device)
    model.load_state_dict(state_dict)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Turn the model's per-label scores into the final 0/1 prediction
def finalize_prediction(out):
    """Binarize a 10-element score vector in place; returns nothing.

    Layout of ``out``:
      * indices 0-4 (categories): each becomes 1 if its score is above 0.5, else 0;
      * indices 5-6 (individual/group): the larger score becomes 1 and the
        other 0, with a tie going to index 5;
      * indices 7-9 (level): the largest score becomes 1 and the rest 0, with
        a tie going to the lowest index.
    """
    # Categories: independent threshold per label.
    for idx in range(5):
        out[idx] = 1 if out[idx] > 0.5 else 0

    # Individual vs. group: exactly one of the two is switched on.
    if out[5] >= out[6]:
        out[5], out[6] = 1, 0
    elif out[5] < out[6]:
        out[5], out[6] = 0, 1

    # Level of hate: locate the strongest score first, then write the one-hot
    # result, so the comparisons never see already-overwritten values.
    strongest = 7
    for idx in (8, 9):
        if out[idx] > out[strongest]:
            strongest = idx
    for idx in (7, 8, 9):
        out[idx] = 1 if idx == strongest else 0

In [None]:
# Train the model, keeping the checkpoint with the best validation accuracy.
# Start below any reachable accuracy so the first epoch always writes a
# checkpoint and the final resume() never hits a missing or stale file.
best_accuracy = -1.0
for epoch in range(epochs):
    model.train()
    print(f"Epoch {epoch+1}: Training start")
    train_loss = 0.0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_targets so the global `targets` list is not overwritten.
        batch_targets = batch['target'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, batch_targets)
        # A fresh graph is built on every step, so there is nothing to retain.
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size so the
        # epoch figure is a per-sample mean even when the last batch is smaller.
        train_loss += loss.item() * input_ids.size(0)

    train_loss /= len(train_dataloader.dataset)
    print(f"Epoch {epoch+1}: Training Done")
    model.eval()
    val_loss = 0.0
    # Separate names so the `val_targets` split from train_test_split survives.
    epoch_preds = []
    epoch_labels = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_targets = batch['target'].to(device)

            probabilities = model(input_ids, attention_mask)
            # Measure the loss on the probabilities, before they are binarized:
            # BCE on hard 0/1 values is not comparable with the training loss.
            loss = criterion(probabilities, batch_targets)
            val_loss += loss.item() * input_ids.size(0)

            # Binarize a copy; each row is a view, so finalize_prediction
            # updates `predictions` in place.
            predictions = probabilities.clone()
            for row in predictions:
                finalize_prediction(row)

            epoch_preds.extend(predictions.cpu().numpy().tolist())
            epoch_labels.extend(batch_targets.cpu().numpy().tolist())
    # Subset accuracy: a tweet counts as correct only if all ten labels match.
    acc = accuracy_score(epoch_labels, epoch_preds) * 100
    val_loss /= len(val_dataloader.dataset)

    print(f'Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}')
    print(f"Epoch {epoch+1}: validation accuracy = {acc:.2f}%")
    if acc > best_accuracy:
        best_accuracy = acc
        checkpoint(model, "best_model.pth")
    print(f"Epoch {epoch+1}: Validation end")

# Restore the best-scoring weights rather than those of the final epoch.
resume(model, "best_model.pth")

Epoch 1: Training start
Epoch 1: Training Done
Epoch 1/25 - Train Loss: 0.5825 - Val Loss: 22.7813
Epoch 1: validation accuracy = 44.19%
Epoch 1: Validation end
Epoch 2: Training start
Epoch 2: Training Done
Epoch 2/25 - Train Loss: 0.4191 - Val Loss: 16.6427
Epoch 2: validation accuracy = 46.26%
Epoch 2: Validation end
Epoch 3: Training start
Epoch 3: Training Done
Epoch 3/25 - Train Loss: 0.3494 - Val Loss: 14.7255
Epoch 3: validation accuracy = 49.14%
Epoch 3: Validation end
Epoch 4: Training start
Epoch 4: Training Done
Epoch 4/25 - Train Loss: 0.2918 - Val Loss: 12.8803
Epoch 4: validation accuracy = 52.57%
Epoch 4: Validation end
Epoch 5: Training start
Epoch 5: Training Done
Epoch 5/25 - Train Loss: 0.2232 - Val Loss: 12.2232
Epoch 5: validation accuracy = 53.47%
Epoch 5: Validation end
Epoch 6: Training start
Epoch 6: Training Done
Epoch 6/25 - Train Loss: 0.1792 - Val Loss: 11.7732
Epoch 6: validation accuracy = 54.73%
Epoch 6: Validation end
Epoch 7: Training start
Epoch 7: T

## Fine-tune (further training)

In [None]:
# Fresh AdamW optimizer with a lower learning rate (3e-5) for the second training phase.
optimizer_finetune = torch.optim.AdamW(model.parameters(), lr=3e-5)

In [None]:
# Continue training at the lower learning rate, early-stopping on validation loss.
best_loss = float('inf')
num_epoch_worse = 0  # Consecutive epochs without a new best validation loss
patience = 5  # Stop after this many such epochs in a row (overfitting guard)
for epoch in range(epochs):
    model.train()
    print(f"Epoch {epoch+1}: Training start")
    train_loss = 0.0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named batch_targets so the global `targets` list is not overwritten.
        batch_targets = batch['target'].to(device)

        optimizer_finetune.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer_finetune.step()

        # The criterion returns a batch mean; weight it by the batch size so the
        # epoch figure is a per-sample mean even when the last batch is smaller.
        train_loss += loss.item() * input_ids.size(0)

    # Normalize by the number of samples. Dividing by batch_size, as before,
    # made the figure depend on how many batches the loader happened to yield.
    train_loss /= len(train_dataloader.dataset)
    print(f"Epoch {epoch+1}: Training Done")

    model.eval()
    val_loss = 0.0
    # Separate names so the `val_targets` split from train_test_split survives.
    epoch_preds = []
    epoch_labels = []
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_targets = batch['target'].to(device)

            probabilities = model(input_ids, attention_mask)
            # Early stopping is driven by this loss, so it must be measured on
            # the probabilities, before they are binarized.
            loss = criterion(probabilities, batch_targets)
            val_loss += loss.item() * input_ids.size(0)

            # Binarize a copy; each row is a view, so finalize_prediction
            # updates `predictions` in place.
            predictions = probabilities.clone()
            for row in predictions:
                finalize_prediction(row)

            epoch_preds.extend(predictions.cpu().numpy().tolist())
            epoch_labels.extend(batch_targets.cpu().numpy().tolist())
    # Subset accuracy: a tweet counts as correct only if all ten labels match.
    acc = accuracy_score(epoch_labels, epoch_preds) * 100
    val_loss /= len(val_dataloader.dataset)

    print(f'Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}')
    print(f"Epoch {epoch+1}: validation accuracy = {acc:.2f}%")

    if val_loss < best_loss:
        num_epoch_worse = 0
        best_loss = val_loss
        checkpoint(model, "best_model.pth")
    else:
        num_epoch_worse += 1

    if num_epoch_worse >= patience:
        break
    print(f"Epoch {epoch+1}: Validation end")
# Restore the weights with the lowest validation loss.
resume(model, "best_model.pth")

Epoch 1: Training start
Epoch 1: Training Done
Epoch 1/100 - Train Loss: 0.0730 - Val Loss: 10.2751
Epoch 1: validation accuracy = 68.77%
Epoch 1: Validation end
Epoch 2: Training start
Epoch 2: Training Done
Epoch 2/100 - Train Loss: 0.0538 - Val Loss: 10.0955
Epoch 2: validation accuracy = 68.95%
Epoch 2: Validation end
Epoch 3: Training start
Epoch 3: Training Done
Epoch 3/100 - Train Loss: 0.0504 - Val Loss: 10.4921
Epoch 3: validation accuracy = 67.42%
Epoch 3: Validation end
Epoch 4: Training start
Epoch 4: Training Done
Epoch 4/100 - Train Loss: 0.0530 - Val Loss: 10.0038
Epoch 4: validation accuracy = 68.95%
Epoch 4: Validation end
Epoch 5: Training start
Epoch 5: Training Done
Epoch 5/100 - Train Loss: 0.0450 - Val Loss: 10.7146
Epoch 5: validation accuracy = 67.33%
Epoch 5: Validation end
Epoch 6: Training start
Epoch 6: Training Done
Epoch 6/100 - Train Loss: 0.0454 - Val Loss: 10.5762
Epoch 6: validation accuracy = 68.23%
Epoch 6: Validation end
Epoch 7: Training start
Epoc

In [None]:
# Peek at what is presumably the raw (uncleaned) training data.
# NOTE(review): 'train.csv' is not downloaded by any cell visible in this notebook —
# confirm where it comes from, otherwise this cell fails on a fresh runtime.
unprocessed = pd.read_csv('train.csv')
unprocessed.head()

In [None]:
# Show how the tokenizer splits the first tweet of that file.
tokenizer.tokenize(unprocessed['Tweet'][0])

In [None]:
!gdown --id '1lsnSHeMC2w5wj31dT4U6yFtycNigYWRX'

Downloading...
From: https://drive.google.com/uc?id=1lsnSHeMC2w5wj31dT4U6yFtycNigYWRX
To: /content/model.pth
100% 444M/444M [00:06<00:00, 70.2MB/s]


## Evaluation

In [None]:
# Replace the current weights with the downloaded model.pth checkpoint.
resume(model, "model.pth")

In [None]:
# Preview the cleaned test tweets.
test_df.head()

Unnamed: 0,No,Tweet
0,Test-1,pemerintah sekarang pro asing sudah tidak bisa...
1,Test-2,cebong dungu picek sudah kalah malah gila
2,Test-3,namanya juga simpang susun bukan bundaran sema...
3,Test-4,yang tidak pakai jilbab komunis megawati sri m...
4,Test-5,ramos yang aku pandang idola dahulu sekarang s...


In [None]:
!gdown --id '1VPGogdg7OuBjktf5iMuEHN2pUSuJzvoE'

submission_sample = pd.read_csv('sample.csv')
submission_sample.head()

Downloading...
From: https://drive.google.com/uc?id=1VPGogdg7OuBjktf5iMuEHN2pUSuJzvoE
To: /content/sample.csv
100% 4.35k/4.35k [00:00<00:00, 23.6MB/s]


Unnamed: 0,No,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,Test-1,0,0,0,0,0,0,0,0,0,0
1,Test-2,0,0,0,0,0,0,0,0,0,0
2,Test-3,0,0,0,0,0,0,0,0,0,0
3,Test-4,0,0,0,0,0,0,0,0,0,0
4,Test-5,0,0,0,0,0,0,0,0,0,0


In [None]:
# Label columns, in the same order as the training targets. They are used below
# to pull the placeholder targets out of the sample submission, attach them to
# the cleaned test data, and name the columns of the model's predictions.
target_list = [
    'HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender',
    'HS_Other', 'HS_Individual', 'HS_Group', 'HS_Weak',
    'HS_Moderate', 'HS_Strong'
]

In [None]:
# Re-read the cleaned test set. Unlike test_df above, no dropna() is applied here;
# presumably so every row stays aligned with sample.csv for the column-wise
# concat below — confirm.
df_test = pd.read_csv('CaseFolded_PunctRemoved_RTUserRemovedTest.csv')
df_test.head()

Unnamed: 0,No,Tweet
0,Test-1,pemerintah sekarang pro asing sudah tidak bisa...
1,Test-2,cebong dungu picek sudah kalah malah gila
2,Test-3,namanya juga simpang susun bukan bundaran sema...
3,Test-4,yang tidak pakai jilbab komunis megawati sri m...
4,Test-5,ramos yang aku pandang idola dahulu sekarang s...


In [None]:
# Placeholder label columns copied from the sample submission. Rebuilding the
# frame from .values gives it a fresh default index.
dummy_target = pd.DataFrame(submission_sample[target_list].values, columns=target_list)
dummy_target

Unnamed: 0,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
145,0,0,0,0,0,0,0,0,0,0
146,0,0,0,0,0,0,0,0,0,0
147,0,0,0,0,0,0,0,0,0,0
148,0,0,0,0,0,0,0,0,0,0


In [None]:
# Attach the placeholder label columns to the test tweets. Label columns left
# over from a previous run of this cell are dropped first, so re-running it
# does not produce duplicated columns.
df_test = pd.concat([df_test.drop(columns=target_list, errors='ignore'), dummy_target], axis=1)
df_test.head()

Unnamed: 0,No,Tweet,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,Test-1,pemerintah sekarang pro asing sudah tidak bisa...,0,0,0,0,0,0,0,0,0,0
1,Test-2,cebong dungu picek sudah kalah malah gila,0,0,0,0,0,0,0,0,0,0
2,Test-3,namanya juga simpang susun bukan bundaran sema...,0,0,0,0,0,0,0,0,0,0
3,Test-4,yang tidak pakai jilbab komunis megawati sri m...,0,0,0,0,0,0,0,0,0,0
4,Test-5,ramos yang aku pandang idola dahulu sekarang s...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Spot-check one cleaned test tweet.
df_test['Tweet'][2]

'namanya juga simpang susun bukan bundaran semanggi om habib waras dasar kampungan kamu om'

In [None]:
# Rebuild the test dataset and loader, this time passing the placeholder labels
# as targets so every item has a 'target' entry.
test_tweets = df_test['Tweet'].tolist()
test_dataset = HateSpeechDataset(test_tweets, df_test[target_list].values.tolist(), tokenizer, max_length)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Run the model over the test set and collect the binarized predictions.
# Switch to eval mode explicitly: if the training cells were skipped and the
# weights came from model.pth, the model would otherwise still be in train
# mode and dropout would randomize the predictions.
model.eval()
output_labels = []
with torch.no_grad():
    for batch_data in test_dataloader:
        # Unpack batch_data
        ids = batch_data['input_ids'].to(device)
        mask = batch_data['attention_mask'].to(device)

        # Saving output
        outputs = model(ids, mask)
        # Each row is a view into `outputs`, so binarizing it updates the batch in place.
        for output in outputs:
            finalize_prediction(output)
        output_labels.extend(outputs.cpu().detach().numpy().tolist())

In [None]:
# Collect the predictions into a frame with one column per label.
model_preds = pd.DataFrame(output_labels, columns=target_list)
model_preds.head()

Unnamed: 0,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [None]:
# Save the raw predictions; the frame's index is written as the first, unnamed column.
model_preds.to_csv('prediction.csv')

In [None]:
# Build the submission table: test IDs next to their predicted labels.
# NOTE: this rebinds df_test, which no longer has the 'Tweet' column afterwards.
df_test = pd.concat([df_test['No'], model_preds], axis=1)
df_test.head()

Unnamed: 0,No,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Individual,HS_Group,HS_Weak,HS_Moderate,HS_Strong
0,Test-1,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1,Test-2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,Test-3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,Test-4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,Test-5,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [None]:
# Write the submission file without the index column.
df_test.to_csv('result.csv', index=False)