In [None]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# IndoBERT

## Non Handling

In [None]:
# Read the dataset CSV into a DataFrame
df = pd.read_csv(
    filepath_or_buffer="../TEMP/cleaned_datav2_translated_lemarized_stopwords.csv",
)

In [None]:
# Encode the class labels as integer ids for the classifier.
# The raw labels are stashed in 'label_raw' the first time this cell runs and are always
# re-read from there, so re-running the cell never fits the encoder on already-encoded
# integers (which would lose the original class names needed by inverse_transform).
if 'label_raw' not in df.columns:
    df['label_raw'] = df['label']

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label_raw'])

# Map each original class to its integer id; cast to plain int so the printout
# does not show NumPy scalar wrappers.
label_mapping = {
    cls: int(code)
    for cls, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}
print(label_mapping)

In [None]:
# Load the IndoBERT tokenizer and a sequence-classification model with one output per class.
# The checkpoint name is kept in a single constant so tokenizer and model cannot drift apart.
MODEL_NAME = 'indolem/indobert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(label_mapping))

# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Move the model to that device. The trailing semicolon stops the notebook from
# printing the full module repr as the cell output.
model.to(device);

In [None]:
# Encode every text in a single tokenizer call: sequences are padded to a common length,
# truncated at 512 tokens, and returned as PyTorch tensors
inputs = tokenizer(
    df['text'].tolist(),
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding=True,
)

# Encoded labels as a tensor, one entry per DataFrame row
labels = torch.tensor(df['label'].values)

# Dataset wrapper that serves one encoded example (plus its label) per index
class SimpleDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings and their labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to an indexable
            collection holding one entry per example (tensor, list, or array).
        labels: Indexable collection of integer class ids, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _copy_as_tensor(value, dtype=None):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a copy-construct ``UserWarning``;
        ``as_tensor(...).detach().clone()`` yields the same independent copy
        without the warning and still accepts lists, scalars, and arrays.
        """
        return torch.as_tensor(value, dtype=dtype).detach().clone()

    def __getitem__(self, idx):
        """Return a dict with every encoding feature at ``idx`` plus a ``labels`` entry (torch.long)."""
        item = {key: self._copy_as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._copy_as_tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the labels collection."""
        return len(self.labels)

# Wrap the tokenized texts and their labels in the dataset used for training
dataset = SimpleDataset(encodings=inputs, labels=labels)

In [None]:
# Training configuration: 10 epochs, batch size 2 per device, 500 warmup steps,
# weight decay 0.01; checkpoints go to ./results and logs to ./logs
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=10,
    per_device_train_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
)

# Trainer ties together the model, the configuration above, and the training dataset
trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

# Run fine-tuning
trainer.train()

In [None]:
# Classify a single example sentence with the fine-tuned model
text = "jokowi menerapkan kebiakan"
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

# Send the inputs to wherever the model currently lives. Trainer may have placed the
# model on a device other than the `device` chosen earlier, so ask the model itself
# to avoid a device-mismatch error.
inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}

# Inference only: evaluation mode, and no gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Index of the highest-scoring class for each input row
predictions = torch.argmax(outputs.logits, dim=1)

# scikit-learn works on array-likes, so hand it a NumPy array rather than a torch tensor,
# then map the class id back to the original label
predicted_label = label_encoder.inverse_transform(predictions.cpu().numpy())[0]
predicted_label