# Training

### Pytorch

In [2]:
!pip install -q transformers

[K     |████████████████████████████████| 4.2 MB 7.6 MB/s 
[K     |████████████████████████████████| 596 kB 69.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 55.8 MB/s 
[K     |████████████████████████████████| 84 kB 3.2 MB/s 
[?25h

In [3]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
df = pd.read_csv('Data_Kalimat_Encoded.csv')
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['comment_text', 'list']].copy()
new_df.head()

In [None]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')

In [6]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [None]:
train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

In [8]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased',return_dict=False)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

In [14]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [15]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

In [18]:
torch.save(model, '/content/drive/MyDrive/Proyek/Proyek PLN/bert.pt')

In [19]:
model.eval()

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [61]:
model = torch.load('/content/drive/MyDrive/Proyek/Proyek PLN/bert.pt')

In [None]:
test_comment = "You are such a loser! You'll regret everything you've done to me!"
encoding = tokenizer.encode_plus(
  test_comment,
  add_special_tokens=True,
  max_length=512,
  return_token_type_ids=True,
  padding="max_length",
  return_attention_mask=True,
  return_tensors='pt',
)

ids = torch.tensor(encoding['input_ids'], dtype=torch.long)
mask = torch.tensor(encoding['attention_mask'], dtype=torch.long)
token_type_ids = torch.tensor(encoding["token_type_ids"], dtype=torch.long)

In [None]:
ids = encoding['input_ids'].to(device, dtype = torch.long)
mask = encoding['attention_mask'].to(device, dtype = torch.long)
token_type_ids = encoding['token_type_ids'].to(device, dtype = torch.long)

In [64]:
test_prediction = model(ids, mask,token_type_ids)

In [65]:
test_prediction = torch.sigmoid(test_prediction).cpu().detach().numpy().tolist()

In [None]:
LABEL_COLUMNS = df.columns.tolist()[2:]
LABEL_COLUMNS

In [None]:
predictions = []
for result in test_prediction:
  for prediction in result:
    predictions.append(prediction)
predictions

In [None]:
for label, prediction in zip(LABEL_COLUMNS, predictions):
  print(f"{label}: {prediction}")

# Simple Transformers

In [None]:
import pandas as pd
import random, sys
import os
import re
import string
import nltk
import numpy as np
from simpletransformers.classification import ClassificationModel
import pandas as pd
import sklearn
import logging

In [None]:
df = pd.read_csv('convert_data.csv')
df = df.drop(columns = ['Unnamed: 0'],axis=1)
df = df.rename(columns = {'kalimat1' :'text_a','kalimat2' : 'text_b','label':'labels'})

In [None]:
from sklearn.model_selection import train_test_split
train,val = train_test_split(df, test_size=0.2, random_state=42)
train_new,test = train_test_split(train, test_size=0.1, random_state=42)

In [None]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Create a ClassificationModel
bertmodel = ClassificationModel('bert', 'indobenchmark/indobert-base-p1', num_labels=2, use_cuda=True, cuda_device=0, 
                            args={
    'reprocess_input_data': True,
    "learning_rate": 2e-5,
    "train_batch_size" : 32,
    "best_model_dir" : "Models/J-PT/IndoBERT/bestModel",
    "output_dir" : "Models/coba/IndoBERT",
    'overwrite_output_dir': True,
    'num_train_epochs': 5,    "save_eval_checkpoints": False,
    "save_steps": -1,}
)

In [None]:
bertmodel.train_model(train_new, eval_df=val)

In [None]:
string = []
strings = []
for i in test.index:
  string.append(test['text_a'][i])
  string.append(test['text_b'][i])
  strings.append(string)
  string = []

In [None]:
predictions, raw_outputs = bertmodel.predict(strings)