In [None]:
# Environment setup: removes the Hugging Face packages already present in the runtime
# and installs the versions listed below.

# 1. Uninstall leftovers of previously installed versions
!pip uninstall -y -q transformers tokenizers huggingface-hub
# NOTE(review): peft is installed here but is not imported by any cell visible in this
# notebook — confirm it is still needed.
!pip install -q peft==0.10.0


# 2. Reinstall the core dependencies (without pinning huggingface-hub)
# accelerate and sentencepiece carry no version pin in this command.
!pip install -q --no-cache-dir \
    protobuf==3.20.3 \
    transformers==4.41.1 \
    datasets==2.14.7 \
    accelerate \
    sentencepiece

In [None]:
import transformers
import inspect
from google import protobuf

# Report the installed library versions and whether TrainingArguments in this
# transformers build still accepts the `evaluation_strategy` keyword.
training_args_params = inspect.signature(transformers.TrainingArguments).parameters
has_evaluation_strategy = "evaluation_strategy" in training_args_params

print("Transformers :", transformers.__version__)
print("Protobuf    :", protobuf.__version__)
print("evaluation_strategy present? ", has_evaluation_strategy)


In [None]:
# Mount Google Drive so the dataset CSV stored there is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Location of the labelled review CSV on the mounted drive.
DATA_PATH = "/content/drive/MyDrive/Kuliah/pembelajaran mesin/dataset_review_tokped_labelled.csv"
import os, pandas as pd
# Stop here with a clear message when nothing exists at DATA_PATH.
assert os.path.exists(DATA_PATH), "Dataset tidak ditemukan, cek path!"
# Last expression of the cell: the first rows are shown as the cell output.
pd.read_csv(DATA_PATH).head()


In [None]:
import pandas as pd, re

def norm(txt:str)->str:
    """Return ``txt`` as lowercase ASCII letters/digits separated by single spaces.

    The value is converted with ``str()`` first, so non-string input is accepted.
    Every character other than ``a-z``, ``0-9`` or whitespace is replaced by a
    space; whitespace runs are then collapsed and both ends trimmed.
    """
    lowered = str(txt).lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    # split() without arguments cuts on whitespace runs and ignores leading/trailing
    # whitespace, so re-joining with one space collapses and trims in one step.
    return " ".join(alnum_only.split())

# Build the cleaned dataset used for training:
#   1. drop rows with a missing review or sentiment,
#   2. normalize the review text with norm(),
#   3. drop reviews that became empty after normalization,
#   4. de-duplicate AFTER normalization, so reviews that differed only in case or
#      punctuation do not survive as duplicates (they could otherwise end up on both
#      sides of a later train/test split).
df = (
    pd.read_csv(DATA_PATH)
      .dropna(subset=["Review", "Sentiment"])
      .assign(Review=lambda frame: frame["Review"].map(norm))
)
df = df[df["Review"] != ""]
# Rows that are identical in every column are a subset of these duplicates.
df = df.drop_duplicates(subset=["Review", "Sentiment"])

df.to_csv("tokped_clean_ready.csv", index=False)
print("Rows after cleaning:", len(df))


In [None]:
# ==============================================================
#  SEL 3 — TRAINING (ADAPTIVE, ANTI-ERROR)
# ==============================================================

# 1.  Import library yang sudah ter-install pada Sel 0
from datasets import Dataset
from transformers import (AutoTokenizer,
                          AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)
from sklearn.model_selection import train_test_split
import pandas as pd, inspect

# 2.  Read the cleaned dataset  ➔  map sentiment names to integer labels
LABEL2ID = {"positive": 0, "neutral": 1, "negative": 2}

df = pd.read_csv("tokped_clean_ready.csv")[["Review", "Sentiment"]]

# Drop reviews that are empty or are the string form of a missing value.
is_blank = df["Review"].astype(str).str.strip().isin(["", "nan", "none"])
# .copy() makes the filtered frame independent, so the column assignment below
# does not raise pandas' SettingWithCopyWarning.
df = df.loc[~is_blank].copy()

# Match sentiment names regardless of letter case or surrounding spaces, and fail
# loudly on any value that has no entry in LABEL2ID instead of carrying NaN labels
# into the split.
sentiment_key = df["Sentiment"].astype(str).str.strip().str.lower()
df["label"] = sentiment_key.map(LABEL2ID)
unmapped = df.loc[df["label"].isna(), "Sentiment"].unique().tolist()
if unmapped:
    raise ValueError(f"Sentiment values without a label id: {unmapped}")
df["label"] = df["label"].astype(int)

# 3.  80/20 stratified split  ➔  Hugging Face Dataset objects
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df["label"], random_state=42)

train_ds = Dataset.from_pandas(train_df.reset_index(drop=True))
test_ds  = Dataset.from_pandas(test_df.reset_index(drop=True))

# 4.  IndoBERT tokenizer  ➔  every input is coerced to a string before tokenizing
tok = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

def tok_fn(batch):
    """Tokenize the "Review" entries of ``batch``.

    Each entry is passed through ``str()``; sequences are truncated and padded
    to a fixed length of 128 tokens.
    """
    reviews = list(map(str, batch["Review"]))
    return tok(reviews, truncation=True, padding="max_length", max_length=128)

# Tokenize both splits the same way, then drop the raw text columns.
train_ds, test_ds = (
    split.map(tok_fn, batched=True).remove_columns(["Review", "Sentiment"])
    for split in (train_ds, test_ds)
)

# 5.  Load IndoBERT-Base-P1 with a 3-class classification head.
#     The class names are written into the model config here, so the checkpoint saved
#     after training carries "Positive"/"Neutral"/"Negative" instead of the generic
#     LABEL_0/LABEL_1/LABEL_2 placeholders.
ID2LABEL = {idx: name.capitalize() for name, idx in LABEL2ID.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "indobenchmark/indobert-base-p1",
    num_labels=len(ID2LABEL),
    id2label=ID2LABEL,
    label2id={name: idx for idx, name in ID2LABEL.items()},
)

# 6.  TrainingArguments — request per-epoch evaluation under whichever keyword the
#     installed transformers version accepts.
args_kwargs = dict(
    output_dir="indoBERT_tokped",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
)

# `eval_strategy` is the newer spelling and `evaluation_strategy` the older one.
# Prefer the newer name when both exist; if neither is accepted, say so instead of
# silently training without evaluation.
_training_args_params = inspect.signature(TrainingArguments).parameters
for _strategy_kw in ("eval_strategy", "evaluation_strategy"):
    if _strategy_kw in _training_args_params:
        args_kwargs[_strategy_kw] = "epoch"
        break
else:
    print("[!] TrainingArguments accepts neither eval_strategy nor "
          "evaluation_strategy; per-epoch evaluation is disabled.")

args = TrainingArguments(**args_kwargs)

# 7.  Trainer API: fine-tune on train_ds and use test_ds as the evaluation set
trainer = Trainer(model=model, args=args,
                  train_dataset=train_ds, eval_dataset=test_ds)
trainer.train()

# 8.  Store model and tokenizer in the same directory for offline use
MODEL_DIR = "indoBERT_tokped"
trainer.save_model(MODEL_DIR)
tok.save_pretrained(MODEL_DIR)
print(f"[✓] Model & tokenizer tersimpan di {MODEL_DIR}/")


In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import seaborn as sns, matplotlib.pyplot as plt, torch

# Reload the fine-tuned checkpoint from the local directory (no hub access) and switch
# it to inference mode.
tok   = AutoTokenizer.from_pretrained("indoBERT_tokped", local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained("indoBERT_tokped", local_files_only=True)
model.eval()

# Position in this list = class id (same order as LABEL2ID in the training cell).
LABELS = ["positive","neutral","negative"]
EVAL_BATCH_SIZE = 32

# Predict the test split in batches instead of one forward pass per row.
# Reviews are coerced to str, as tok_fn does for the training data.
eval_texts = test_df["Review"].astype(str).tolist()
y_true = test_df["label"].astype(int).tolist()
y_pred = []
for start in range(0, len(eval_texts), EVAL_BATCH_SIZE):
    inp = tok(eval_texts[start:start + EVAL_BATCH_SIZE], return_tensors="pt",
              truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        logits = model(**inp).logits
    # one predicted class id per row of the batch
    y_pred.extend(logits.argmax(dim=-1).tolist())

# Per-class precision / recall / F1 as text
report = classification_report(y_true, y_pred, target_names=LABELS)
print(report)

# Confusion matrix as an annotated heatmap (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_true, y_pred)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=LABELS, yticklabels=LABELS, ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix — IndoBERT Tokopedia")
plt.show()


In [None]:
import torch

def predict_sentiment(text: str):
    """Classify one review with the globally loaded ``tok`` / ``model``.

    The text is cleaned with ``norm`` — the same normalization applied to the
    training reviews — so the model sees input in the form it was fine-tuned on.
    ``norm`` converts its argument with ``str()``, so non-string input is accepted.

    Returns:
        tuple: ``(label, probabilities)`` where ``label`` is the capitalized name of
        the highest-scoring class and ``probabilities`` maps every capitalized class
        name in ``LABELS`` to its probability formatted as a percentage string.
    """
    cleaned = norm(text)
    inp = tok(cleaned, return_tensors="pt",
              truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        logits = model(**inp).logits                 # one row of class scores
    probs_tensor = torch.softmax(logits, dim=1)[0]   # probabilities for that row

    # Pick the best class from the tensor, then convert to a list for formatting.
    idx = int(torch.argmax(probs_tensor))
    probs = probs_tensor.tolist()

    return (
        LABELS[idx].capitalize(),
        {
            l.capitalize(): f"{p:.2%}"
            for l, p in zip(LABELS, probs)
        }
    )

# Example: classify one sample review and print the (label, probabilities) result
print(predict_sentiment("Packaging oke, tapi pengiriman lama sekali"))

In [None]:
# Pack the saved model directory into a zip archive, then hand the archive to the
# Colab file-download helper.
!zip -r indoBERT_tokped.zip indoBERT_tokped
from google.colab import files
files.download("indoBERT_tokped.zip")


In [None]:
# ============ Demo (Gradio / Streamlit) =============
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, gradio as gr, os

MODEL_DIR = "/content/indoBERT_tokped"

# Load tokenizer and model from the local directory only (no hub access); .eval()
# switches the model to inference mode.
tok   = AutoTokenizer.from_pretrained(MODEL_DIR, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(
          MODEL_DIR, local_files_only=True).eval()

# Read the label order from the model config.
LABELS = [model.config.id2label[i] for i in sorted(model.config.id2label)]
# A checkpoint saved without explicit label names only has LABEL_0/1/2 placeholders.
# In that case fall back to the class order used for training
# (0 = positive, 1 = neutral, 2 = negative) so the demo shows real names.
if len(LABELS) == 3 and all(str(name).startswith("LABEL_") for name in LABELS):
    LABELS = ["Positive", "Neutral", "Negative"]

def predict(text):
    """Gradio callback: classify ``text`` with the globally loaded ``tok`` / ``model``.

    The input gets the same cleaning the training reviews received (lowercase,
    everything except a-z / 0-9 / whitespace replaced by a space, whitespace
    collapsed), so the model sees text in the form it was fine-tuned on.

    Returns:
        tuple: ``(scores, label)`` — ``scores`` maps each name in ``LABELS`` to its
        probability as a float, ``label`` is the name of the highest-scoring class.
    """
    import re  # local import: this demo cell does not import re itself

    cleaned = re.sub(r"[^a-z0-9\s]", " ", str(text).lower())
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    inputs = tok(cleaned, return_tensors="pt",
                 truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        probs = torch.softmax(model(**inputs).logits, dim=1)[0]
    # argmax is taken on the tensor directly, before converting to Python floats
    idx = int(probs.argmax())
    return {LABELS[i]: float(p) for i, p in enumerate(probs.tolist())}, LABELS[idx]

# Build and start the demo UI: a 3-line text box as input; the outputs are a label
# widget fed by the score dict returned from predict() and a text box fed by the
# predicted label. launch(share=True) asks Gradio for a shareable link.
gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=3, placeholder="Tulis ulasan Tokopedia…"),
    outputs=[gr.Label(num_top_classes=3), gr.Textbox(label="Label")],
    title="Analisis Sentimen Tokopedia – IndoBERT"
).launch(share=True)


In [None]:
from transformers import AutoModelForSequenceClassification

MODEL_DIR = "/content/indoBERT_tokped"

# 1️⃣ load the model that was already trained
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_DIR, local_files_only=True)

# 2️⃣ define the correct label mapping
id2label = {0: "Positive", 1: "Neutral", 2: "Negative"}
label2id = {v: k for k, v in id2label.items()}

model.config.id2label = id2label
model.config.label2id = label2id

# 3️⃣ re-save the configuration only: saving through the config object writes just the
#    config file, whereas model.save_pretrained() would write the full weights again
model.config.save_pretrained(MODEL_DIR)

print("Mapping baru tersimpan:", model.config.id2label)


In [None]:
# Set the global git author name and email for this runtime.
!git config --global user.name  "Rangga Egha Permana"
!git config --global user.email "ranggaegha25022003@gmail.com"  # replace with your GitHub email


In [None]:
from getpass import getpass
import os, stat

GITHUB_USER = "RanggaEghaPermana"
# Strip stray whitespace/newlines from a pasted token; refuse to write an empty one.
TOKEN = getpass("<REDACTED>").strip()
if not TOKEN:
    raise ValueError("Empty token: .netrc was not written.")

# Store the credentials in .netrc so git can authenticate without the token ever
# appearing in a command line.
NETRC_PATH = "/root/.netrc"
# os.open creates the file with owner-only permissions from the start, so the token
# is never on disk under the default (wider) mode.
netrc_fd = os.open(NETRC_PATH, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(netrc_fd, "w") as f:
    f.write(f"machine github.com\nlogin {GITHUB_USER}\npassword {TOKEN}\n")
# The mode passed to os.open only applies to a newly created file; chmod also
# tightens a .netrc that already existed.
os.chmod(NETRC_PATH, 0o600)
print("✅ Auth siap. Lanjutkan git clone / git push.")


In [None]:
# Clone the project repository into /content and make the clone the working directory.
%cd /content
!git clone https://github.com/RanggaEghaPermana/Tokopedia_Analis_Sentimen.git
%cd Tokopedia_Analis_Sentimen

In [None]:
SOURCE_DIR = "/content/drive/MyDrive/pembelajaran mesin"

!mkdir -p notebooks data
# notebook (kalau namanya tokopedia.ipynb)
!cp -n "$SOURCE_DIR/tokopedia.ipynb" notebooks/ 2>/dev/null || true
# semua CSV yang diawali dataset_review_tokped...
!cp -n "$SOURCE_DIR"/dataset_review_tokped*.csv data/ 2>/dev/null || true


In [None]:
import glob, shutil, os

# Drive folder that holds the notebook and the dataset CSVs. The path includes the
# 'Kuliah' component, matching the folder DATA_PATH points into.
SOURCE_DIR = "/content/drive/MyDrive/Kuliah/pembelajaran mesin"
os.makedirs("notebooks", exist_ok=True)
os.makedirs("data", exist_ok=True)

# Look for the notebook anywhere below SOURCE_DIR and copy the first match only.
notebook_hits = glob.glob(SOURCE_DIR + "/**/tokopedia.ipynb", recursive=True)
if notebook_hits:
    shutil.copy2(notebook_hits[0], "notebooks/")
else:
    print(f"[!] tokopedia.ipynb not found under {SOURCE_DIR}")

# Copy every CSV below SOURCE_DIR whose name matches the dataset pattern.
csv_hits = glob.glob(SOURCE_DIR + "/**/dataset_review_tokped*.csv", recursive=True)
for p in csv_hits:
    shutil.copy2(p, "data/")
if not csv_hits:
    # Report an empty result instead of finishing silently with nothing copied.
    print(f"[!] no dataset_review_tokped*.csv found under {SOURCE_DIR}")


In [None]:
# Install Git LFS, register CSV files with it, then commit and push to origin/main.
!apt-get -y install git-lfs
!git lfs install
!git lfs track "*.csv"
# Stage .gitattributes explicitly first, then everything else in the working tree.
!git add .gitattributes
!git add .
!git commit -m "Track large CSV with Git LFS"
!git push -u origin main


In [None]:
# Mount Google Drive and list the source folder to check what it contains.
from google.colab import drive
drive.mount('/content/drive')

SOURCE_DIR = "/content/drive/MyDrive/Kuliah/pembelajaran mesin"  # ← note the 'Kuliah' path component
!ls -lah "$SOURCE_DIR"  # should show 4 files: 3 CSVs + tokopedia.ipynb
