In [8]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [9]:
# Read the scraped review file and, in the same expression, discard every row
# that lacks either the review text ('content') or its 'score' -- both columns
# are required by the labelling and tokenisation steps further down.
df = pd.read_csv('../data/scrapped_spotify.csv', encoding='utf-8').dropna(
    subset=['content', 'score']
)

  df = pd.read_csv('../data/scrapped_spotify.csv', encoding='utf-8')


In [10]:
def _score_to_sentiment(score):
    """Translate a review score into a sentiment class id.

    Scores of 2 or lower map to 0, a score of exactly 3 maps to 1, and every
    other score maps to 2.
    """
    if score <= 2:
        return 0
    if score == 3:
        return 1
    return 2


# Derive the 3-class training label from the raw score column.
df['sentiment'] = df['score'].apply(_score_to_sentiment)

In [11]:
# First cut: hold out 30 % of the rows, keeping the sentiment class proportions
# equal on both sides of the split.
df_train, df_temp = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['sentiment'],
)

# Second cut: divide the held-out 30 % evenly into validation and test sets,
# again stratified on the sentiment label.
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.5,
    random_state=42,
    stratify=df_temp['sentiment'],
)

In [12]:
class SpotifyDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        reviews: Sized iterable of review texts (list, NumPy array, pandas
            Series, ...). Each element is converted with ``str()`` before it
            is handed to the tokenizer.
        targets: Sized iterable of integer class labels, aligned with
            ``reviews`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')``; its
            result must provide ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``reviews`` and ``targets`` differ in length.
    """

    def __init__(self, reviews, targets, tokenizer, max_length=128):
        # Materialise both inputs as plain lists so __getitem__ always indexes
        # by POSITION. A pandas Series would otherwise be indexed by label,
        # which returns the wrong row (or raises KeyError) once the index is
        # no longer 0..n-1, e.g. after a shuffled train/test split.
        self.reviews = list(reviews)
        self.targets = list(targets)
        if len(self.reviews) != len(self.targets):
            raise ValueError(
                f"reviews and targets must have the same length, "
                f"got {len(self.reviews)} and {len(self.targets)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews in the dataset."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Tokenize the review at position ``idx``.

        Returns:
            dict with the flattened ``'input_ids'`` and ``'attention_mask'``
            tensors produced by the tokenizer, plus ``'targets'``, the label
            as a ``torch.long`` tensor.
        """
        encoding = self.tokenizer(
            str(self.reviews[idx]),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer output is flattened to 1-D so that the DataLoader's
            # batching adds the only batch dimension.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(self.targets[idx], dtype=torch.long)
        }

In [13]:
tokenizer = AutoTokenizer.from_pretrained("indolem/indobert-base-uncased")


def _build_dataset(frame):
    """Wrap one split's review texts and sentiment labels in a SpotifyDataset."""
    return SpotifyDataset(frame['content'].values, frame['sentiment'].values, tokenizer)


# One dataset per split; all three share the same tokenizer.
train_dataset = _build_dataset(df_train)
val_dataset = _build_dataset(df_val)
test_dataset = _build_dataset(df_test)


In [14]:
batch_size = 16

# Only the training loader shuffles its samples; the validation and test
# loaders iterate their datasets in the default (sequential) order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size)
    for split in (val_dataset, test_dataset)
)

In [15]:
# Seed PyTorch's random number generators before the model is created. The
# classification head is not part of the checkpoint and is freshly initialised
# on load, and the training DataLoader shuffles its samples; without a fixed
# seed a restart-and-run-all of the notebook gives different results each time.
# 42 mirrors the random_state used for the train/validation/test split.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    "indolem/indobert-base-uncased",
    num_labels=3  # 3 classes: negative, neutral, positive
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Move the model first so the optimizer is built over the parameters that
# live on the selected device.
model = model.to(device)

epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
for epoch in range(epochs):
    # ---- Training phase: one optimizer update per mini-batch ----
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()

        # Move every tensor of the batch (ids, mask, labels) to the compute device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Passing `labels` lets the loss be read straight from the model output.
        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['targets'],
        )
        outputs.loss.backward()
        optimizer.step()

    # ---- Validation phase: evaluated once per epoch, without gradients ----
    model.eval()
    val_preds, val_labels = [], []

    with torch.no_grad():
        for batch in val_dataloader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            ).logits

            # The predicted class is the index of the largest logit in each row.
            val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            val_labels.extend(batch['targets'].cpu().numpy())

    accuracy = accuracy_score(val_labels, val_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        val_labels, val_preds, average='weighted'
    )

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Validation - Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")

In [None]:
# Final evaluation on the held-out test split (inference only, no gradients).
model.eval()
test_preds, test_labels = [], []

with torch.no_grad():
    for batch in test_dataloader:
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

        # Collect predictions and ground-truth labels as host-side NumPy values.
        test_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        test_labels.extend(batch['targets'].cpu().numpy())

# Compute and display the final evaluation metrics.
accuracy = accuracy_score(test_labels, test_preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='weighted'
)

print("\nHasil Evaluasi Final:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

In [None]:
# Persist the fine-tuned weights only (the state dict), not the model object.
checkpoint_path = "indobert_spotify_sentiment.pt"
torch.save(model.state_dict(), checkpoint_path)
print("Model berhasil disimpan!")

# 11. Prediction function
def predict_sentiment(text):
    """Classify the sentiment of a single text with the fine-tuned model.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        text: The text to classify.

    Returns:
        Tuple ``(label, confidence)``: the predicted label ("Negatif",
        "Netral" or "Positif") and the softmax probability of that label
        as a Python float.
    """
    label_names = ["Negatif", "Netral", "Positif"]

    model.eval()
    encoded_text = tokenizer(
        text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # Inference only: no gradient bookkeeping is needed for the forward pass.
    with torch.no_grad():
        logits = model(
            input_ids=encoded_text['input_ids'].to(device),
            attention_mask=encoded_text['attention_mask'].to(device),
        ).logits

    # Turn the logits into class probabilities and pick the most likely class.
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    class_id = torch.argmax(probabilities, dim=1).item()

    return label_names[class_id], probabilities[0][class_id].item()

# Example usage: run the classifier on a few hand-written reviews.
texts = [
    "Aplikasi ini sangat bagus, saya sangat menyukainya",
    "Spotify sering crash dan boros baterai",
    "Aplikasi musik ini lumayan bagus tapi masih ada bug"
]

# map() is lazy, so each text is classified right before its result is printed.
for text, (sentiment, confidence) in zip(texts, map(predict_sentiment, texts)):
    print(f"Text: {text}")
    print(f"Sentimen: {sentiment} (Confidence: {confidence:.4f})\n")