In [None]:
# Recursively archive the saved_models/ directory into saved_models.zip.
!zip -r saved_models.zip saved_models

  adding: saved_models/ (stored 0%)
  adding: saved_models/bert-base-multilingual-cased/ (stored 0%)
  adding: saved_models/bert-base-multilingual-cased/vocab.txt (deflated 45%)
  adding: saved_models/bert-base-multilingual-cased/special_tokens_map.json (deflated 42%)
  adding: saved_models/bert-base-multilingual-cased/metrics.json (deflated 25%)
  adding: saved_models/bert-base-multilingual-cased/tokenizer_config.json (deflated 75%)
  adding: saved_models/bert-base-multilingual-cased/tokenizer.json (deflated 67%)
  adding: saved_models/bert-base-multilingual-cased/config.json (deflated 55%)
  adding: saved_models/bert-base-multilingual-cased/model.safetensors (deflated 7%)


In [None]:
!pip install -q datasets transformers psutil matplotlib nltk
!pip install --upgrade transformers

# Imports
import pandas as pd
import numpy as np
import re, time, os, psutil, json
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    TrainerCallback
)
from datasets import Dataset
import torch
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus and keep the Indonesian list as a set
nltk.download("stopwords")
stop_words = set(stopwords.words("indonesian"))

# Load dataset
DATA_PATH = "/content/labeled_data (2).csv"
df = pd.read_csv(DATA_PATH)
# Column headers may carry stray whitespace; strip it before the checks below
df.columns = df.columns.str.strip()

# Both the text column and the label column must be present
for required_column in ("review", "sentiment"):
    assert required_column in df.columns, f"Missing '{required_column}' column"

# Preprocessing function
def preprocess_text(text, stopword_set=None):
    """Lowercase ``text``, strip punctuation and drop stop words.

    Args:
        text: Review text as a string.
        stopword_set: Optional collection of lowercase words to remove. When
            omitted, the module-level ``stop_words`` set is used, which keeps
            existing single-argument calls working unchanged.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Remove every character that is neither a word character nor whitespace
    normalized = re.sub(r"[^\w\s]", "", text.lower())
    return " ".join(token for token in normalized.split() if token not in stopword_set)

# Drop rows without a review or a label. `astype(str)` would otherwise turn a
# missing value into the literal text/class "nan".
df = df.dropna(subset=["review", "sentiment"]).reset_index(drop=True)

# Apply preprocessing
df["clean_text"] = df["review"].astype(str).apply(preprocess_text)

# Label Encoder
# Labels are handled as strings both when building the mapping and when applying
# it, so the keys of `label2id` always match the values being mapped (this also
# holds when the sentiment column stores numeric codes).
labels = df["sentiment"].astype(str).tolist()
label2id = {label: i for i, label in enumerate(sorted(set(labels)))}
id2label = {i: label for label, i in label2id.items()}
df["label"] = df["sentiment"].astype(str).map(label2id)

assert df["label"].notna().all(), "Every sentiment value must map to a label id"

# DATA SPLIT: 70% Train, 20% Test, 10% Validation
SPLIT_SEED = 42
HOLDOUT_FRACTION = 0.3         # share of all rows kept out of training (test + validation)
TEST_SHARE_OF_HOLDOUT = 2 / 3  # of that 30%: two thirds -> 20% test, one third -> 10% validation

# Stage 1: separate the training rows from the hold-out pool, stratified by label
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["clean_text"].tolist(), df["label"].tolist(),
    test_size=HOLDOUT_FRACTION, stratify=df["label"], random_state=SPLIT_SEED,
)

# Stage 2: divide the hold-out pool into validation and test, again stratified
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels,
    test_size=TEST_SHARE_OF_HOLDOUT, stratify=temp_labels, random_state=SPLIT_SEED,
)

# Wrap parallel text/label lists in a HuggingFace Dataset
def preprocess_data(texts, labels):
    """Build a ``Dataset`` with a "text" column and a "label" column."""
    columns = {"text": texts, "label": labels}
    return Dataset.from_dict(columns)

# One Dataset per split, built in train / validation / test order
train_dataset, val_dataset, test_dataset = (
    preprocess_data(split_texts, split_labels)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

# Disable W&B logging to save memory
os.environ.update(WANDB_DISABLED="true")
# Output directory for the fine-tuned models; no error if it already exists
os.makedirs("saved_models", exist_ok=True)


# CUSTOM CALLBACK: RECORD THE VALIDATION LOSS WHENEVER THE TRAINER REPORTS ONE
class LogValidationLossCallback(TrainerCallback):
    """Print each reported ``eval_loss`` and append it to ``validation_loss_log.txt``."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Handle a log event.

        ``logs`` defaults to ``None``, so it is checked before the membership
        test. Log events that carry no ``eval_loss`` entry are ignored.
        """
        if not logs or "eval_loss" not in logs:
            return
        message = f"Step {state.global_step}: Validation Loss = {logs['eval_loss']}"
        print(message)
        # Store the validation loss in a file (append mode keeps earlier entries)
        with open("validation_loss_log.txt", "a") as log_file:
            log_file.write(message + "\n")

# TRAINING FUNCTION
# Per-model results keyed by model name; filled in by train_and_evaluate().
model_metrics = {}

def train_and_evaluate(model_name, label2id, id2label):
    """Fine-tune ``model_name`` for sequence classification and score it on the test split.

    Reads the module-level ``train_dataset``, ``val_dataset``, ``test_dataset`` and
    ``test_labels``. Side effects: prints a classification report, shows a
    confusion matrix, stores the metrics in ``model_metrics[model_name]`` and
    writes the model, tokenizer and a ``metrics.json`` file to
    ``saved_models/<model_name with '/' replaced by '_'>``.

    Args:
        model_name: Hugging Face model identifier to fine-tune.
        label2id: Mapping from label string to integer class id.
        id2label: Mapping from integer class id to label string.
    """
    print(f"\n🔧 Training model: {model_name}")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenization. No padding here: DataCollatorWithPadding (below) pads every
    # batch to its own longest sequence, so padding whole map() chunks up front
    # would only add wasted tokens.
    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True)

    encoded_train = train_dataset.map(tokenize_function, batched=True)
    encoded_val = val_dataset.map(tokenize_function, batched=True)
    encoded_test = test_dataset.map(tokenize_function, batched=True)

    # Load model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )

    # Training configuration
    training_args = TrainingArguments(
        output_dir=f"./results/{model_name.replace('/', '_')}",
        num_train_epochs=2,
        per_device_train_batch_size=8,  # small batches for memory efficiency
        per_device_eval_batch_size=8,
        save_strategy="no",
        logging_dir="./logs",
        seed=42,
        report_to="none",
        eval_strategy="steps",  # required: with the default ("no") eval_steps is ignored
        eval_steps=500,  # Evaluate every 500 steps
        logging_steps=500,  # Log every 500 steps
    )

    # Data collator for dynamic padding
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Initialize Trainer with the custom callback
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_train,
        eval_dataset=encoded_val,
        processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument
        data_collator=data_collator,
        callbacks=[LogValidationLossCallback]  # Add the custom callback here
    )

    # Track RAM and GPU usage.
    # Resetting the peak statistic makes the GPU figure specific to this run
    # instead of carrying over the peak of a previously trained model.
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.cuda.reset_peak_memory_stats()

    # Initial memory check before training
    initial_ram_usage = psutil.virtual_memory().used / 1e6
    initial_gpu_mem = torch.cuda.max_memory_allocated() / 1e6 if cuda_available else 0
    start_time = time.time()

    trainer.train()

    # Final memory check after training
    end_time = time.time()
    final_ram_usage = psutil.virtual_memory().used / 1e6
    final_gpu_mem = torch.cuda.max_memory_allocated() / 1e6 if cuda_available else 0

    # Evaluation
    preds = trainer.predict(encoded_test)

    # Get prediction labels (some models return a tuple whose first item is the logits)
    logits = preds.predictions[0] if isinstance(preds.predictions, tuple) else preds.predictions
    pred_labels = np.argmax(logits, axis=1)

    acc = accuracy_score(test_labels, pred_labels)
    f1 = f1_score(test_labels, pred_labels, average="weighted")
    runtime = end_time - start_time

    # Calculate RAM and GPU usage during training
    ram_usage = final_ram_usage - initial_ram_usage
    gpu_mem = final_gpu_mem - initial_gpu_mem

    # Class names ordered by their id, so report rows and matrix axes line up
    # with the integer labels regardless of dict ordering.
    class_names = sorted(label2id, key=label2id.get)
    class_ids = [label2id[name] for name in class_names]

    # Print report
    print("📊 Classification Report:")
    print(classification_report(test_labels, pred_labels, labels=class_ids, target_names=class_names))

    # Confusion Matrix
    cm = confusion_matrix(test_labels, pred_labels, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap="Blues")
    plt.title(f"Confusion Matrix: {model_name}")
    plt.show()

    # Save metrics (cast to plain floats so they serialize cleanly to JSON)
    model_metrics[model_name] = {
        "accuracy": float(acc),
        "f1_score": float(f1),
        "runtime_sec": runtime,
        "ram_MB": ram_usage,
        "gpu_MB": gpu_mem
    }

    # Save model and tokenizer
    save_path = f"saved_models/{model_name.replace('/', '_')}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"✅ Saved model and tokenizer to {save_path}")

    # Save metrics to JSON
    metrics_path = os.path.join(save_path, "metrics.json")
    with open(metrics_path, "w") as f:
        json.dump(model_metrics[model_name], f, indent=4)
    print(f"✅ Saved metrics to {metrics_path}")

# TRAIN MODELS ONE AT A TIME
# Hugging Face model identifiers to fine-tune; the commented-out entry is skipped.
model_names_to_train = [
    #"cahya/distilbert-base-indonesian",
    "bert-base-multilingual-cased"
]

# Run one model at a time
# Each call stores its results in `model_metrics` and saves the model under saved_models/.
for model_name in model_names_to_train:
    train_and_evaluate(model_name, label2id, id2label)

# PLOTTING RESULTS
# Extract metrics for plotting
model_names = list(model_metrics.keys())
accuracy = [model_metrics[m]["accuracy"] for m in model_names]
f1_scores = [model_metrics[m]["f1_score"] for m in model_names]
runtime = [model_metrics[m]["runtime_sec"] for m in model_names]

# Bar plot
x = np.arange(len(model_names))
width = 0.25

fig, score_ax = plt.subplots(figsize=(10, 5))
# Accuracy and F1 lie in [0, 1] while runtime is measured in seconds. On a shared
# axis the score bars would be squashed flat, so runtime gets its own right-hand axis.
time_ax = score_ax.twinx()

# Colors are set explicitly because each axis would otherwise restart the color cycle
score_ax.bar(x - width, accuracy, width, label="Accuracy", color="tab:blue")
score_ax.bar(x, f1_scores, width, label="F1 Score", color="tab:orange")
time_ax.bar(x + width, runtime, width, label="Runtime (s)", color="tab:green")

score_ax.set_xticks(x)
score_ax.set_xticklabels(model_names, rotation=15)
score_ax.set_ylim(0, 1)
score_ax.set_ylabel("Score")
time_ax.set_ylabel("Runtime (s)")
score_ax.set_title("Model Comparison: Accuracy, F1, Runtime")

# Merge the legend entries of both axes into a single legend
score_handles, score_legend_labels = score_ax.get_legend_handles_labels()
time_handles, time_legend_labels = time_ax.get_legend_handles_labels()
score_ax.legend(score_handles + time_handles, score_legend_labels + time_legend_labels)
fig.tight_layout()
plt.show()




KeyboardInterrupt: 

In [None]:
# Extract the exported model archive into the current directory.
# NOTE(review): the export cell writes saved_models.zip, while this reads
# saved_models_mBERT.zip — presumably the archive was renamed before upload; confirm.
!unzip saved_models_mBERT.zip -d .

Archive:  saved_models_mBERT.zip
   creating: ./saved_models/
   creating: ./saved_models/bert-base-multilingual-cased/
  inflating: ./saved_models/bert-base-multilingual-cased/vocab.txt  
  inflating: ./saved_models/bert-base-multilingual-cased/special_tokens_map.json  
  inflating: ./saved_models/bert-base-multilingual-cased/metrics.json  
  inflating: ./saved_models/bert-base-multilingual-cased/tokenizer_config.json  
  inflating: ./saved_models/bert-base-multilingual-cased/tokenizer.json  
  inflating: ./saved_models/bert-base-multilingual-cased/config.json  
  inflating: ./saved_models/bert-base-multilingual-cased/model.safetensors  


In [None]:
import os

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Directly specify the path
model_path = "saved_models/bert-base-multilingual-cased"

# Fail early with an actionable message. If the directory is missing,
# from_pretrained() treats the string as a Hub model id and reports a confusing
# "not a valid model identifier" error instead.
if not os.path.isdir(model_path):
    raise FileNotFoundError(
        f"Model directory '{model_path}' not found relative to '{os.getcwd()}'. "
        "Extract the saved model archive before running this cell."
    )

# Load tokenizer and model
# local_files_only=True restricts loading to the files on disk (no Hub lookup)
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModelForSequenceClassification.from_pretrained(model_path, local_files_only=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: saved_models/bert-base-multilingual-cased is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
import pandas as pd
# Replace with your actual CSV file path
# NOTE(review): this path differs from the "/content/labeled_data (2).csv" file read in
# the training cell — confirm both refer to the same data. This also rebinds `df`.
df = pd.read_csv("/content/labeled_data (2) (1).csv")  # Make sure it's uploaded to Colab
# Preview the first rows
df.head()

In [None]:
from transformers import pipeline

# Create a pipeline for text classification.
# - top_k=None returns the score of every label; it replaces the deprecated
#   return_all_scores=True.
# - truncation=True cuts reviews that exceed the model's maximum sequence length
#   instead of letting them raise at inference time.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=None,
    truncation=True,
)

# Apply the classifier to your DataFrame.
# astype(str) guards against non-string cells (e.g. NaN), which the pipeline rejects.
# NOTE(review): training used the lowercased, stop-word-filtered `clean_text` column,
# while inference here runs on the raw `review` text — confirm this is intended.
df["predictions"] = df["review"].astype(str).apply(lambda review: classifier(review))

Device set to use cpu


In [None]:
def extract_label(prediction):
    """Return the highest-scoring label of one pipeline prediction, or ``None``.

    Accepts both result shapes that carry per-label scores:
    a nested list ``[[{"label": ..., "score": ...}, ...]]`` and a flat list
    ``[{"label": ..., "score": ...}, ...]``.

    Args:
        prediction: The value stored for one row of the predictions column.

    Returns:
        The ``"label"`` of the entry with the largest ``"score"``; ``None`` when
        ``prediction`` is empty, is not a list/tuple, or holds no scored entries.
    """
    if not isinstance(prediction, (list, tuple)) or not prediction:
        return None

    # Unwrap the outer list when the scores are nested one level deep
    candidates = prediction[0] if isinstance(prediction[0], list) else prediction

    # Keep only well-formed entries so max() never sees an empty or malformed list
    scored = [
        entry for entry in candidates
        if isinstance(entry, dict) and "label" in entry and "score" in entry
    ]
    if not scored:
        return None
    return max(scored, key=lambda entry: entry["score"])["label"]

# Derive the top-scoring label for every row of predictions
df["predicted_label"] = df["predictions"].map(extract_label)

In [None]:
# Filter false negatives: true = negative, predicted = positive
# NOTE(review): rows labelled "negative" but predicted "positive" are false positives
# if "positive" is treated as the positive class — confirm the intended naming.
# NOTE(review): `df` is the full CSV here, which presumably includes rows the model
# was trained on — confirm whether this count should be restricted to the test split.
false_negatives = df[(df["sentiment"] == "negative") & (df["predicted_label"] == "positive")]

print("Total false negatives:", len(false_negatives))
# Display at most the first 50 mismatched reviews
false_negatives[["review", "sentiment", "predicted_label"]].head(50)


Total false negatives: 651


Unnamed: 0,review,sentiment,predicted_label
5,𝙪𝙙𝙖𝙝 𝙗𝙖𝙜𝙪𝙨 𝙖𝙥𝙠 𝙣𝙮𝙖 𝙗𝙖𝙧𝙪 𝙘𝙤𝙗𝙖 𝙠𝙞𝙧𝙖𝙞𝙣 𝙩𝙖𝙣𝙮𝙖 𝙩𝙖𝙣𝙮...,negative,positive
9,Good job,negative,positive
18,Ok membantu sekali ilmu medis,negative,positive
27,Terbantu,negative,positive
42,Aplikasi cukup bagus membantu banget,negative,positive
50,Respon shop/chate dokter kurang cepat,negative,positive
51,Biasalah,negative,positive
54,Aplikasi tidak bagus,negative,positive
89,terimakasih dok,negative,positive
99,"Sudah membantu, tapi sayang untuk surat sakitn...",negative,positive
