In [15]:
import os
import time
import pandas as pd
import google.genai as genai

In [None]:
def init_client():
    """Create a Gemini client using the key stored in ``GEMINI_API_KEY``.

    Returns:
        The object returned by ``genai.Client(api_key=...)``.

    Raises:
        RuntimeError: If ``GEMINI_API_KEY`` is unset or empty.
    """
    key = os.environ.get("GEMINI_API_KEY")
    if key:
        return genai.Client(api_key=key)
    # Fail early with an explicit message instead of passing an empty key on.
    raise RuntimeError("Missing GEMINI_API_KEY in environment variables")


def build_batch_prompt(comments):
    """Build the sentiment-labelling prompt (in Indonesian) for one batch.

    Args:
        comments: Iterable of comments; each is formatted with ``str`` rules
            into a 1-based numbered list, one comment per line.

    Returns:
        The full prompt string. It asks for exactly one of ``positive``,
        ``neutral`` or ``negative`` per line, one line per comment.
    """
    listing = "\n".join(
        f"{position}. {text}" for position, text in enumerate(comments, start=1)
    )
    return f"""
Konteks: komentar berikut berasal dari video yang membahas tentang utang proyek kereta cepat Whoosh di Indonesia.

Tugas: tentukan sentimen dari setiap komentar. Untuk setiap komentar, jawab hanya salah satu label berikut:
positive, neutral, negative

Daftar komentar:
{listing}

Format jawaban:
- Hanya 1 label per baris
- Total baris harus sama dengan jumlah komentar
- Tidak boleh ada nomor, penjelasan, alasan, atau kata lain
- Semua huruf lowercase
- jawaban hanyan negative, positive, neutral (tidak di ikutin dengan apapun dan jangan ada daftar terbutir atau daftar ternomor pada jawaban)

Contoh output yang benar (jika ada 5 komentar):
negative
neutral
positive
neutral
negative

Sekarang berikan 1 label untuk setiap komentar di atas.
"""


def label_batch(client, comments, model="gemini-2.0-flash"):
    """Ask the model for one sentiment label per comment and parse the reply.

    Args:
        client: Object exposing ``models.generate_content(model=, contents=)``.
        comments: Comments for this batch; passed to ``build_batch_prompt``.
        model: Model name forwarded to ``generate_content``.

    Returns:
        A list of cleaned, non-empty reply lines in the order received. Each
        line is lower-cased and stripped of surrounding whitespace and of any
        leading list marker (digits, ``.``, ``)``, ``-``, ``*``, bullet).
        The list may be shorter or longer than ``comments`` if the model did
        not follow the requested format; callers must check the length. An
        empty list is returned when the response carries no text.
    """
    prompt = build_batch_prompt(comments)
    response = client.models.generate_content(
        model=model,
        contents=prompt,
    )

    # The response may carry no text at all (e.g. an empty or blocked reply);
    # treat that as "no labels" instead of failing on None.splitlines().
    raw_text = response.text or ""

    # The prompt forbids numbering/bullets and asks for lowercase, but the
    # model does not always comply; normalise so "1. Negative" -> "negative".
    list_marker_chars = "0123456789.)-*\u2022 \t"
    labels = []
    for line in raw_text.splitlines():
        cleaned = line.strip().lstrip(list_marker_chars).lower()
        if cleaned:
            labels.append(cleaned)
    return labels


def process_csv(input_path, output_path, client, batch_size=50, delay=10):
    """Label every comment in a CSV in batches and write the result to a new CSV.

    The comments are read from the first column of ``input_path``. They are
    sent to ``label_batch`` in slices of ``batch_size`` rows, and the collected
    labels are stored in a new ``label`` column written to ``output_path``.

    Args:
        input_path: Path of the UTF-8 CSV to read.
        output_path: Path of the UTF-8 CSV to write (without the index).
        client: Client object forwarded to ``label_batch``.
        batch_size: Number of comments sent per request.
        delay: Seconds to sleep between consecutive batches.

    Notes:
        * A batch whose request raises is labelled ``"ERROR"`` for every row,
          so one failed call does not abort the whole run.
        * If the model returns too few labels the batch is padded with
          ``"ERROR"``; if it returns too many the surplus is dropped. Either
          way exactly one label per row is kept, so the final column
          assignment cannot fail on a length mismatch. When a mismatch is
          reported, the labels of that batch may be misaligned with their
          comments and should be reviewed.
    """
    df = pd.read_csv(input_path, encoding="utf-8")
    col = df.columns[0]

    all_labels = []
    total = len(df)

    for i in range(0, total, batch_size):
        batch = df[col].iloc[i:i+batch_size].astype(str).tolist()
        print(f"\nüîÑ Processing batch {i//batch_size + 1} ({len(batch)} comments)...")

        try:
            labels = list(label_batch(client, batch))
        except Exception as e:
            # Best effort: keep going and mark the whole batch as failed.
            labels = ["ERROR"] * len(batch)
            print(f"ERROR in batch: {e}")

        # Force exactly one label per comment in this batch.
        if len(labels) != len(batch):
            print(f"‚ö†Ô∏è Warning: expected {len(batch)} labels, got {len(labels)}")
            if len(labels) < len(batch):
                labels = labels + ["ERROR"] * (len(batch) - len(labels))
            else:
                # Surplus labels would otherwise make the final column longer
                # than the frame and lose the whole run at assignment time.
                labels = labels[:len(batch)]

        all_labels.extend(labels)

        # No need to wait after the last batch.
        if i + batch_size < total:
            print(f"‚è≥ Sleeping {delay} seconds before next batch...")
            time.sleep(delay)

    df["label"] = all_labels
    df.to_csv(output_path, index=False, encoding="utf-8")
    print(f"\nSaved labeled file to: {output_path}")

In [17]:
if __name__ == "__main__":
    client = init_client()

    # Input file: a CSV whose single column contains the comments.
    input_csv = "comments_preprocessed.csv"
    # Output file: the same data after the label column has been added.
    output_csv = "comments_labeled.csv"

    # batch_size: number of comments per prompt.
    # delay: pause between batches, in seconds.
    process_csv(input_csv, output_csv, client, batch_size=50, delay=20)


üîÑ Processing batch 1 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 2 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 3 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 4 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 5 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 6 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 7 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 8 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 9 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 10 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 11 (50 comments)...
‚è≥ Sleeping 20 seconds before next batch...

üîÑ Processing batch 12 (50 comments)..