In [None]:
# Mount Google Drive at /content/drive; the input/output CSV paths used later in this
# notebook point into /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install groq pandas

Collecting groq
  Downloading groq-1.0.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-1.0.0-py3-none-any.whl (138 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.3/138.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-1.0.0


In [None]:
import pandas as pd
import json
import time
import os
import csv
from groq import Groq
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# ================= CONFIGURATION =================
# The API key is taken from the environment; the run aborts early if it is missing.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

if not GROQ_API_KEY:
    raise ValueError("❌ GROQ_API_KEY not found. Check your .env file")

# Input dataset and the CSV that labeled rows are appended to (both under the Drive mount).
INPUT_FILE_PATH = "/content/drive/MyDrive/Word_label_LLM/final_dataset_cleaned_v3_new_part3.csv"
OUTPUT_FILE_PATH = "/content/drive/MyDrive/Word_label_LLM/output_labeling_part3.csv"


# ================= MODEL POOL =================
# Model ids to rotate through; switch_model() advances through this list and wraps around.
MODEL_LIST = [
    "llama-3.1-8b-instant",
    "llama-3.3-70b-versatile",
    "openai/gpt-oss-120b",
    "openai/gpt-oss-20b",
    "meta-llama/llama-4-maverick-17b-128e-instruct",
    "meta-llama/llama-4-scout-17b-16e-instruct",
    "qwen/qwen3-32b"
]

# Position and name of the model currently in use; both are rewritten by switch_model().
CURRENT_MODEL_INDEX = 0
CURRENT_MODEL = MODEL_LIST[CURRENT_MODEL_INDEX]
# ============================================

# Groq API client used by ask_llama_label() for every chat-completion request.
client = Groq(api_key=GROQ_API_KEY)

def switch_model():
    """Rotate to the next model in MODEL_LIST (wrapping to the first after the last).

    Updates the module-level CURRENT_MODEL_INDEX / CURRENT_MODEL and logs the new model.
    """
    global CURRENT_MODEL_INDEX, CURRENT_MODEL

    pool_size = len(MODEL_LIST)
    next_position = (CURRENT_MODEL_INDEX + 1) % pool_size

    CURRENT_MODEL_INDEX = next_position
    CURRENT_MODEL = MODEL_LIST[next_position]
    print(f"\n[⚠️ SWITCH MODEL] → {CURRENT_MODEL}")

def ask_llama_label(lyrics_text):
    """Ask the current model to extract labeled vocabulary from one lyrics text.

    The model is only switched when the request fails because of rate limiting or
    exhausted quota; every other failure is reported and treated as "no result".

    Args:
        lyrics_text: Lyrics to analyze; interpolated verbatim into the user prompt.

    Returns:
        dict: The JSON object produced by the model on success.
        "RETRY_WITH_NEW_MODEL": The request was rate-limited / out of quota.
            ``switch_model()`` has already been called, so the caller should retry.
        None: Any other failure (API error, malformed JSON, or JSON that is not
            an object).
    """
    system_prompt = "Bạn là chuyên gia ngôn ngữ. Nhiệm vụ: Trích xuất tên riêng, tiếng Anh và từ phiên âm."

    user_prompt = f"""Phân tích đoạn lyrics dưới đây và trích xuất từ vựng vào JSON.

MỤC TIÊU: Tập trung vào:
- Tiếng Anh & Slang
- Phiên âm tiếng nước ngoài
- Tên riêng

CẤU TRÚC JSON:
1. "eng"
2. "nuoc_ngoai_phien_am"
3. "ten_rieng"

Lyrics:
---------------------
{lyrics_text}
---------------------
"""

    # Phrases that unambiguously indicate rate limiting / quota exhaustion.
    # Bare substrings such as "rate", "limit" or "token" must NOT be used: they also
    # match unrelated errors ("Failed to generate JSON", "Expecting ',' delimiter",
    # "temperature", "invalid token") and would cause pointless model switches.
    rate_limit_markers = (
        "rate limit",
        "rate_limit",
        "ratelimit",
        "quota",
        "too many requests",
        "tokens per minute",
        "tokens per day",
    )

    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            model=CURRENT_MODEL,
            temperature=0.2,
            response_format={"type": "json_object"}
        )
        parsed = json.loads(chat_completion.choices[0].message.content)

    except Exception as e:
        err = str(e).lower()

        # === SWITCH ONLY WHEN OUT OF TOKENS / RATE LIMITED ===
        # HTTP 429 is checked via duck typing so no extra exception class is imported.
        is_rate_limited = (
            getattr(e, "status_code", None) == 429
            or any(marker in err for marker in rate_limit_markers)
        )
        if is_rate_limited:
            switch_model()
            return "RETRY_WITH_NEW_MODEL"

        print(f"\n[LỖI API KHÁC] {e}")
        return None

    # The caller reads the result with .get(); anything but a JSON object is unusable.
    if not isinstance(parsed, dict):
        print(f"\n[LỖI API KHÁC] Unexpected JSON type: {type(parsed).__name__}")
        return None

    return parsed

def main():
    """Label every lyrics row of the input CSV and append the results to the output CSV.

    The run is resumable: rows already present in the output file are skipped, so the
    function can be re-run after an interruption. Each input row produces exactly one
    output row (a "SKIP" row for empty lyrics, empty labels when labeling failed).

    If every model in the pool stays rate-limited for several consecutive rounds the
    run stops early without writing the current row, so a later run resumes from it.
    """
    retry_pause_s = 2        # pause between attempts and between rows
    max_pool_cycles = 5      # full passes over MODEL_LIST before giving up
    max_backoff_s = 300      # upper bound for the pause after a fully failed pass

    if not os.path.exists(INPUT_FILE_PATH):
        print("[LỖI] Không tìm thấy file input")
        return

    try:
        df = pd.read_csv(INPUT_FILE_PATH, encoding="utf-8")
    except (UnicodeError, pd.errors.ParserError):
        # Fall back only for decoding/parsing problems; other errors should surface.
        df = pd.read_csv(INPUT_FILE_PATH, encoding="utf-16")

    df.columns = df.columns.str.strip().str.lower()
    target_col = next((c for c in df.columns if "lyrics" in c or "content" in c), None)

    if not target_col:
        print("[LỖI] Không tìm thấy cột lyrics")
        return

    lyrics_column = df[target_col]
    total_rows = len(df)
    fieldnames = ["index_goc", "lyrics", "eng", "nuoc_ngoai_phien_am", "ten_rieng"]

    # Resume point: count CSV *records*, not physical lines. Lyrics fields may contain
    # newlines, so a raw line count would overshoot and silently skip input rows.
    existing_records = 0
    if os.path.exists(OUTPUT_FILE_PATH):
        with open(OUTPUT_FILE_PATH, "r", encoding="utf-8-sig", newline="") as f:
            existing_records = sum(1 for record in csv.reader(f) if record)
    start_row = max(0, existing_records - 1)  # first record is the header

    print(f"[*] Start từ dòng {start_row}/{total_rows}")
    print(f"[*] Model ban đầu: {CURRENT_MODEL}")

    pool_size = max(1, len(MODEL_LIST))

    with open(OUTPUT_FILE_PATH, "a", encoding="utf-8-sig", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)

        # Write the header only into an empty/new file; a header-only file must not
        # receive a second header.
        if existing_records == 0:
            writer.writeheader()

        for i in range(start_row, total_rows):
            lyrics_text = str(lyrics_column.iloc[i])

            print(f"\n[*] Dòng {i}/{total_rows} | MODEL: {CURRENT_MODEL}")

            if len(lyrics_text) < 3 or lyrics_text.lower() == "nan":
                writer.writerow({
                    "index_goc": i,
                    "lyrics": "SKIP",
                    "eng": "",
                    "nuoc_ngoai_phien_am": "",
                    "ten_rieng": ""
                })
                f.flush()
                continue

            attempts = 0
            while True:
                res = ask_llama_label(lyrics_text)
                if res != "RETRY_WITH_NEW_MODEL":
                    break

                attempts += 1
                failed_cycles, position_in_cycle = divmod(attempts, pool_size)

                if failed_cycles >= max_pool_cycles:
                    # Leave this row unwritten so the next run resumes from it.
                    print(f"\n[DỪNG] Tất cả model đều bị giới hạn. Chạy lại sau để tiếp tục từ dòng {i}.")
                    return

                if position_in_cycle == 0:
                    # Every model in the pool was just refused: back off for longer
                    # instead of hammering the API every couple of seconds.
                    time.sleep(min(60 * failed_cycles, max_backoff_s))
                else:
                    time.sleep(retry_pause_s)

            # Anything that is not a JSON object is treated as "no labels".
            labels = res if isinstance(res, dict) else {}
            writer.writerow({
                "index_goc": i,
                "lyrics": lyrics_text,
                "eng": labels.get("eng", ""),
                "nuoc_ngoai_phien_am": labels.get("nuoc_ngoai_phien_am", ""),
                "ten_rieng": labels.get("ten_rieng", "")
            })
            # Flush after every row so an interrupted session loses no finished work.
            f.flush()

            time.sleep(retry_pause_s)

    print("\n=== HOÀN THÀNH ===")

# Entry point: start the labeling run when this cell/script is executed directly.
if __name__ == "__main__":
    main()


[*] Start từ dòng 4802/5253
[*] Model ban đầu: llama-3.1-8b-instant

[*] Dòng 4802/5253 | MODEL: llama-3.1-8b-instant

[*] Dòng 4803/5253 | MODEL: llama-3.1-8b-instant

[*] Dòng 4804/5253 | MODEL: llama-3.1-8b-instant

[*] Dòng 4805/5253 | MODEL: llama-3.1-8b-instant

[*] Dòng 4806/5253 | MODEL: llama-3.1-8b-instant

[*] Dòng 4807/5253 | MODEL: llama-3.1-8b-instant

[⚠️ SWITCH MODEL] → llama-3.3-70b-versatile

[*] Dòng 4808/5253 | MODEL: llama-3.3-70b-versatile

[*] Dòng 4809/5253 | MODEL: llama-3.3-70b-versatile

[*] Dòng 4810/5253 | MODEL: llama-3.3-70b-versatile

[*] Dòng 4811/5253 | MODEL: llama-3.3-70b-versatile

[⚠️ SWITCH MODEL] → openai/gpt-oss-120b

[*] Dòng 4812/5253 | MODEL: openai/gpt-oss-120b

[*] Dòng 4813/5253 | MODEL: openai/gpt-oss-120b

[*] Dòng 4814/5253 | MODEL: openai/gpt-oss-120b

[*] Dòng 4815/5253 | MODEL: openai/gpt-oss-120b

[⚠️ SWITCH MODEL] → openai/gpt-oss-20b

[*] Dòng 4816/5253 | MODEL: openai/gpt-oss-20b

[⚠️ SWITCH MODEL] → meta-llama/llama-4-maverick-1