## ANALISIS SENTIMEN

#Modul

In [None]:
!pip install transformers datasets evaluate wordcloud scikit-learn nltk
!pip install Sastrawi

import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import evaluate
import transformers
print(transformers.__version__)

#Load data

In [None]:
# Load the training data (already cleaned and already labelled).
Train = "https://raw.githubusercontent.com/RakhaFS/TA-AMS/refs/heads/main/Train%20Shell.csv"
Test = "https://raw.githubusercontent.com/RakhaFS/TA-AMS/refs/heads/main/Shell%20Indonesia.csv"

Shell_tr = pd.read_csv(Train)  # Point `Train` at your own file if needed
# Take an explicit copy of the column subset: a later cell adds a 'label' column to
# `train`, and writing into a bare selection of `Shell_tr` triggers pandas'
# SettingWithCopyWarning.
train = Shell_tr[['favorite_count', 'full_text', 'clean_text', 'sentiment']].copy()  # These columns must exist in the CSV

# Load the test data (raw, not yet cleaned or labelled).
df = pd.read_csv(Test)
# Keep only the two columns used downstream and drop rows where either is missing.
df = df[['favorite_count', 'full_text']].dropna().reset_index(drop=True)

print('Data Train sebanyak ', len(train))
print('Data Test sebanyak ', len(df))

In [None]:
# Build one mask per class: rows whose 'sentiment' value contains the class name.
sentiment_column = train['sentiment']
po = sentiment_column.str.contains('positive')
nt = sentiment_column.str.contains('neutral')
ng = sentiment_column.str.contains('negative')

# Sum each mask to get the per-class totals.
jpo, jnt, jng = po.sum(), nt.sum(), ng.sum()

# Show the results.
print(f"Jumlah positive: {jpo}")
print(f"Jumlah neutral: {jnt}")
print(f"Jumlah negative: {jng}")

#Preprocessing

In [None]:
# Preprocessing: build Sastrawi's stop-word remover.
# NOTE(review): `stopword` is not referenced by any later cell visible in this file;
# the cleaning step filters against the NLTK-based `stop_words` set instead — confirm
# whether this remover is still needed.
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

def preprocess(text: object) -> str:
    """Normalise raw text before stop-word removal.

    The value is converted with ``str()`` and lower-cased; then URLs, @mentions,
    #hashtags, digits and punctuation are removed and every run of whitespace is
    collapsed to a single space.

    Args:
        text: Value to clean. Non-string values are stringified first.

    Returns:
        The cleaned, lower-cased string without leading or trailing whitespace.
    """
    text = str(text).lower()
    # The dot after "www" is escaped so that only real "www." links match. Unescaped,
    # it matched any character and deleted ordinary words such as "awwwwesome".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)   # @mentions
    text = re.sub(r'#\w+', '', text)   # #hashtags
    text = re.sub(r'\d+', '', text)    # digits
    # Drop everything that is neither a word character nor whitespace
    # (underscores count as word characters and are kept).
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Extra tokens filtered out on top of NLTK's Indonesian stop-word list.
custom_stopwords = {'yg', 'aja', 'dong', 'nih', 'gk', 'kok', 'sih', 'loh', 'lah', 'nya'}
# Final lookup set: the NLTK Indonesian stop words plus the custom additions.
stop_words = custom_stopwords.union(stopwords.words('indonesian'))

def remove_stopwords(text):
    """Return *text* with every whitespace-separated token found in ``stop_words`` removed.

    The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept_tokens)

# Clean every raw text first, then strip stop words, and store the result in a new column.
cleaned_text = df['full_text'].map(preprocess)
df['clean_text'] = cleaned_text.map(remove_stopwords)
df.head()

In [None]:
# Make sure the columns needed for fine-tuning are present. An explicit raise is used
# instead of `assert`, which is skipped when Python runs with -O.
for required_column in ('clean_text', 'sentiment'):
    if required_column not in train.columns:
        raise KeyError(f"train is missing the required column '{required_column}'")

# Encode the sentiment strings as integers. LabelEncoder numbers classes in sorted
# order, so if the column holds exactly negative/neutral/positive the mapping is
# negative = 0, neutral = 1, positive = 2.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
# assign() returns a new DataFrame, so the label column is not written into a
# selection of `Shell_tr` (which raises pandas' SettingWithCopyWarning).
train = train.assign(label=label_encoder.fit_transform(train['sentiment']))

# NOTE(review): `df_test` keeps only the rows of `df` whose index label is absent from
# `train.index`. `train` and `df` are read from different CSV files, so this match is
# purely positional — confirm it is intended. The later cells in this file tokenize and
# predict on `df`, not on `df_test`.
df_test = df[~df.index.isin(train.index)].reset_index(drop=True)
train.head()

#Load Model

In [None]:
# Checkpoint identifier; used here for the tokenizer and again when the classifier is loaded.
model_name = "indobenchmark/indobert-base-p1"
# Load the tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the 'clean_text' field of a batch, truncating over-long inputs."""
    texts = examples['clean_text']
    return tokenizer(texts, truncation=True)

# Wrap the model inputs in Hugging Face datasets and tokenize them batch-wise.
# The training set carries the text and its integer label; the test set carries text only.
train_dataset = Dataset.from_pandas(train[['clean_text', 'label']]).map(
    tokenize_function, batched=True
)
test_dataset = Dataset.from_pandas(df[['clean_text']]).map(
    tokenize_function, batched=True
)

In [None]:
# Derive the classification-head size and the label names from the fitted encoder
# instead of hard-coding 3, so the head always matches the classes present in the data
# and the model config carries readable label names.
class_names = [str(name) for name in label_encoder.classes_]
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(class_names),
    id2label=dict(enumerate(class_names)),
    label2id={name: idx for idx, name in enumerate(class_names)},
)
# Pads each batch to its longest sequence at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric objects used by compute_metrics.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    preds = np.argmax(logits, axis=-1)
    accuracy_result = accuracy.compute(predictions=preds, references=labels)
    f1_result = f1.compute(predictions=preds, references=labels, average="macro")
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

# Hold out a stratified 20% of the labelled data for evaluation. Evaluating on the very
# rows the model is trained on reports inflated accuracy/F1 that say nothing about how
# the model behaves on unseen text.
train_indices, eval_indices = train_test_split(
    np.arange(len(train_dataset)),
    test_size=0.2,
    stratify=list(train_dataset['label']),  # keep the class proportions in both splits
    random_state=42,                        # reproducible split
)
train_split = train_dataset.select(train_indices)
eval_split = train_dataset.select(eval_indices)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # evaluate after every epoch (was `evaluation_strategy` in older releases)
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=7,
    weight_decay=0.01,
    save_total_limit=1,     # keep only the most recent checkpoint on disk
    logging_dir="./logs",
    report_to="none",       # no external experiment tracker
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

#Sentiment Analysis dengan BERT

In [None]:
# Predict a class id for every test row and map the ids back to the sentiment strings.
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=-1)
df['sentiment'] = label_encoder.inverse_transform(pred_labels)

# Export the labelled result to CSV.
export_columns = ['favorite_count', 'full_text', 'clean_text', 'sentiment']
df[export_columns].to_csv("hasil_finetune_Shell.csv", index=False)
print("✅ Hasil disimpan ke 'hasil_finetune_Shell.csv'")

In [None]:
# Preview the raw text, the cleaned text and the predicted sentiment side by side.
look = df.loc[:, ['full_text', 'clean_text', 'sentiment']]
look.head()