In [None]:
import json
import regex
import random

from pathlib import Path
from llama_cpp import Llama
from statistics import mean
from datasets import load_dataset
from huggingface_hub import hf_hub_download

from Libraries import Common_Utils as UTL

In [None]:
# Output file for the per-round history, and the two JSON files read at start-up.
HISTORIES = "Data/Histories.json"
TOKENS_PATH = Path("Config/keys.json")
CONFIG_PATH = Path("Config/config.json")

# Module-level state. LLM_CLIENT and HF_DATASET are filled in later by
# initialize_llm_client() and initialize_dataset(); the others are placeholders
# that are overwritten just below.
LLM_CLIENT = None
HF_DATASET = None
REASON_PROMPT = ""
CRITIC_PROMPT = ""
CONFIG = {}

TOKENS = UTL.read_json(TOKENS_PATH)
CONFIG = UTL.read_json(CONFIG_PATH)

# Validate the config BEFORE indexing into it. Previously this check ran only
# after CONFIG["models"] / CONFIG["paths"] had been read, so a failed load
# crashed with a TypeError/KeyError instead of printing this message.
# NOTE(review): assumes UTL.read_json signals failure with a falsy value
# (the original check compared against False) - confirm against Common_Utils.
if not CONFIG:
    print("‚õî D·ª´ng ch∆∞∆°ng tr√¨nh do kh√¥ng t·∫£i ƒë∆∞·ª£c file config.")
    exit()

# Model identifiers taken from the config file.
HF_REPO_ID = CONFIG["models"]["local_model"]["hf_repo_id"]
HF_FILENAME = CONFIG["models"]["local_model"]["hf_filename"]
CRITICAL_MODEL = CONFIG["models"]["global_model"]["model_name"]

# Filesystem locations taken from the config file.
PROMPT_REASON_PATH = Path(CONFIG["paths"]["prompt_reason"])
PROMPT_CRITIC_PATH = Path(CONFIG["paths"]["prompt_critic"])
LOCAL_MODEL_DIR = Path(CONFIG["paths"]["local_model_dir"])
LOCAL_MODEL_PATH = LOCAL_MODEL_DIR / HF_FILENAME

REASON_PROMPT = UTL.read_text(PROMPT_REASON_PATH)
CRITIC_PROMPT = UTL.read_text(PROMPT_CRITIC_PATH)

# Same condition as the original check: a prompt counts as "not loaded" only
# when the reader returned the False sentinel (an empty prompt is accepted).
if REASON_PROMPT is False or CRITIC_PROMPT is False:
    print("‚õî D·ª´ng ch∆∞∆°ng tr√¨nh do kh√¥ng t·∫£i ƒë∆∞·ª£c file prompt.")
    exit()

print("‚úÖ T·∫•t c·∫£ file ƒë√£ t·∫£i th√†nh c√¥ng.")
        


### HELPERS

In [None]:
def initialize_dataset():
    """Load the configured Hugging Face dataset into the HF_DATASET global.

    The dataset name is read from CONFIG["dataset"]["name"]; the split falls
    back to "train" when CONFIG["dataset"] has no "split" entry. Any exception
    raised while loading is printed instead of being propagated.
    """
    global HF_DATASET
    name = CONFIG["dataset"]["name"]
    split = CONFIG["dataset"].get("split", "train")

    print(f"üìò ƒêang t·∫£i dataset '{name}' (split: {split})...")
    try:
        HF_DATASET = load_dataset(name, split=split)
        print(f"‚úÖ T·∫£i dataset th√†nh c√¥ng. (S·ªë l∆∞·ª£ng: {len(HF_DATASET)} m·∫´u)")
    except Exception as exc:
        print(f"‚ùå L·ªói khi t·∫£i dataset: {exc}")

In [None]:
def jsonParse(text: str):
    """Parse JSON from a model response, tolerating surrounding non-JSON text.

    The stripped text is first parsed as a whole. If that fails, every ``{`` in
    the text is tried in order as the start of a JSON object and the first one
    that decodes is returned. Decoding is done by the JSON decoder itself, so
    braces inside string values (e.g. ``{"a": "}"}``) are handled correctly and
    a non-JSON ``{...}`` fragment earlier in the text does not block a valid
    object that follows it.

    Args:
        text: Raw response text.

    Returns:
        The decoded JSON value (whatever ``json.loads`` yields when the whole
        text is valid JSON, otherwise the first embedded JSON object as a dict).

    Raises:
        ValueError: If no JSON could be decoded from the text.
    """
    text = text.strip()
    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        # Not pure JSON; fall through and look for an embedded object.
        pass

    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value starting at `start` and ignores
            # whatever trails it, which is exactly what is needed here.
            obj, _ = decoder.raw_decode(text, start)
        except (ValueError, RecursionError):
            start = text.find("{", start + 1)
        else:
            return obj

    raise ValueError(f"‚ùå Kh√¥ng parse ƒë∆∞·ª£c JSON t·ª´ ph·∫£n h·ªìi:\n{text}")
    
def randomContent(max_attempts: int = 100) -> str | None:
    """Pick a random sample from HF_DATASET and return its text content.

    The content is taken from the sample's "article" field, falling back to
    "text". Samples with neither are skipped and another index is drawn, up to
    ``max_attempts`` draws in total (the previous implementation retried by
    unbounded recursion).

    Args:
        max_attempts: Maximum number of random samples to try.

    Returns:
        The content of the chosen sample, or None when the dataset is not
        loaded, when sampling raises, or when no usable sample was found.
    """
    if HF_DATASET is None:
        print("‚ùå L·ªói: Dataset ch∆∞a ƒë∆∞·ª£c t·∫£i. (HF_DATASET is None)")
        return None
    try:
        for _ in range(max_attempts):
            idx = random.randint(0, len(HF_DATASET) - 1)
            sample = HF_DATASET[idx]
            content = sample.get("article") or sample.get("text")
            if content:
                title = sample.get("headlines", "Kh√¥ng c√≥ ti√™u ƒë·ªÅ")
                print(f"üìò Ch·ªçn ng·∫´u nhi√™n m·ª•c: {title}")
                return content
            print(f"‚ö†Ô∏è M·∫´u ng·∫´u nhi√™n (index {idx}) kh√¥ng c√≥ c·ªôt 'article' ho·∫∑c 'text', th·ª≠ l·∫°i...")
    except Exception as e:
        # Includes the ValueError random.randint raises for an empty dataset.
        print(f"‚ùå L·ªói khi l·∫•y m·∫´u ng·∫´u nhi√™n t·ª´ dataset: {e}")
        return None

    print(f"No sample with 'article' or 'text' content found after {max_attempts} attempts.")
    return None

def averageScore(critical_output: dict) -> float:
    """Return the mean of the numeric scores nested under ``"scoring"``.

    Only the six known criteria are considered, and only values that are real
    numbers: booleans are excluded explicitly because ``bool`` is a subclass of
    ``int`` and a JSON ``true`` would otherwise be averaged in as 1.

    Args:
        critical_output: Parsed critic response; expected to hold a "scoring"
            dict mapping criterion names to numbers.

    Returns:
        The mean as a float, or 0.0 when "scoring" is missing, empty, not a
        dict, or contains no numeric criterion. A warning is printed for a
        missing "scoring" unless the output already carries an "error" key.
    """
    scoring_dict = critical_output.get("scoring")
    if not scoring_dict or not isinstance(scoring_dict, dict):
        if "error" not in critical_output:
            print("‚ö†Ô∏è L·ªói: Kh√¥ng t√¨m th·∫•y key 'scoring' trong output c·ªßa Critical.")
        return 0.0

    keys = ("factuality", "clarity", "logical_coherence", "coverage", "utility", "consistency")
    vals = [
        value
        for value in (scoring_dict.get(k) for k in keys)
        if isinstance(value, (int, float)) and not isinstance(value, bool)
    ]
    # float() keeps the return type stable: mean() of ints can return an int.
    return float(mean(vals)) if vals else 0.0

In [None]:
def initialize_llm_client():
    """Make sure the local model file exists, then load it into LLM_CLIENT.

    When LOCAL_MODEL_PATH does not exist, HF_FILENAME is downloaded from
    HF_REPO_ID into LOCAL_MODEL_DIR first; a failed download is printed and the
    function returns without loading anything. The model is then opened with
    llama_cpp using n_gpu_layers and n_ctx from CONFIG["llama_cpp_params"].
    A failure while loading is printed rather than raised.
    """
    global LLM_CLIENT
    print(f"üìò Ki·ªÉm tra model local t·∫°i: {LOCAL_MODEL_PATH}")

    if LOCAL_MODEL_PATH.exists():
        print("üëç Model ƒë√£ c√≥ s·∫µn.")
    else:
        print(f"‚ö†Ô∏è Model ch∆∞a t·ªìn t·∫°i. B·∫Øt ƒë·∫ßu t·∫£i '{HF_FILENAME}' t·ª´ '{HF_REPO_ID}'...")
        LOCAL_MODEL_DIR.mkdir(parents=True, exist_ok=True)
        download_kwargs = {
            "repo_id": HF_REPO_ID,
            "filename": HF_FILENAME,
            "local_dir": LOCAL_MODEL_DIR,
            "local_dir_use_symlinks": False,
            "resume_download": True,
        }
        try:
            hf_hub_download(**download_kwargs)
            print("‚úÖ T·∫£i model th√†nh c√¥ng.")
        except Exception as download_error:
            print(f"‚ùå L·ªói khi t·∫£i model: {download_error}")
            return

    print(f"üìò ƒêang t·∫£i model local t·ª´: {LOCAL_MODEL_PATH}")
    try:
        # The config lookup stays inside the try so a missing key is reported
        # through the same message as a load failure.
        llama_params = CONFIG["llama_cpp_params"]
        LLM_CLIENT = Llama(
            model_path=str(LOCAL_MODEL_PATH),
            n_gpu_layers=llama_params["n_gpu_layers"],
            n_ctx=llama_params["n_ctx"],
            verbose=False,
        )
        print("‚úÖ T·∫£i model local (LLM_CLIENT) th√†nh c√¥ng.")
    except Exception as load_error:
        print(f"‚ùå L·ªói khi t·∫£i model local: {load_error}")

### RUNNER

In [None]:
def reasoningRun(text: str, feedback: str | None = None) -> str:
    """Generate a reasoning trace for *text* with the local model.

    Without feedback the user turn is REASON_PROMPT followed by the text. With
    non-empty feedback a "refine" turn is built instead: it quotes the feedback
    and asks for a regenerated trace and summary over the same source text.

    Returns:
        The stripped text of the first completion choice; the stringified
        completion when it has no choices; "" when LLM_CLIENT is not loaded or
        the model call raises.
    """
    if LLM_CLIENT is None:
        print("‚ùå L·ªói: reasoningRun kh√¥ng th·ªÉ ch·∫°y v√¨ LLM_CLIENT ch∆∞a ƒë∆∞·ª£c t·∫£i.")
        return ""

    if not feedback:
        user_turn = f"{REASON_PROMPT}\n\n{text}"
    else:
        user_turn = (
            "B·∫°n ƒë√£ t·∫°o ra m·ªôt b·∫£n t√≥m t·∫Øt tr∆∞·ªõc ƒë√≥. "
            "B·∫£n t√≥m t·∫Øt ƒë√≥ ƒë√£ ƒë∆∞·ª£c ƒë√°nh gi√° v·ªõi ph·∫£n h·ªìi sau:\n"
            f"---BEGIN FEEDBACK---\n{feedback}\n---END FEEDBACK---\n\n"
            "Vui l√≤ng t·∫°o l·∫°i reasoning trace v√† t√≥m t·∫Øt, c√≥ t√≠nh ƒë·∫øn ph·∫£n h·ªìi n√†y. "
            "S·ª≠ d·ª•ng l·∫°i vƒÉn b·∫£n g·ªëc d∆∞·ªõi ƒë√¢y.\n\n"
            f"VƒÉn b·∫£n g·ªëc:\n{text}"
        )
    # Both variants share the same chat-template wrapper.
    prompt = f"<|user|>\n{user_turn}<|end|>\n<|assistant|>"

    print(f"prompt (local): {prompt}...")

    try:
        gen_params = CONFIG["generation_params"]
        completion = LLM_CLIENT(
            prompt,
            max_tokens=gen_params["max_new_tokens"],
            temperature=gen_params["temperature"],
            top_p=gen_params["top_p"],
            stop=["<|end|>", "<|endoftext|>", "</s>"],
        )

        has_choice = "choices" in completion and len(completion["choices"]) > 0
        if not has_choice:
            return str(completion).strip()
        return completion["choices"][0]["text"].strip()
    except Exception as run_error:
        print(f"‚ùå L·ªói khi ch·∫°y reasoningRun (local): {run_error}")
        return ""

In [None]:
def criticalRun(source_text: str, reasoning_output: str) -> dict:
    """Evaluate a reasoning trace with the local model and return a JSON dict.

    The prompt is CRITIC_PROMPT followed by the reasoning trace and the source
    text. The completion is parsed with jsonParse.

    Args:
        source_text: The original text the reasoning was produced from.
        reasoning_output: The reasoning trace to evaluate.

    Returns:
        The parsed JSON object on success. On any failure a dict with an
        "error" key is returned instead of raising: client not initialised,
        no/empty completion, unparseable output, or output that is valid JSON
        but not an object (the last two also carry "raw_response").
    """
    if LLM_CLIENT is None:
        return {"error": "LLM client not initialized"}
    
    # The leading indentation inside this literal is part of the prompt text
    # sent to the model and is kept as-is.
    user_prompt = f"""
    {CRITIC_PROMPT}

    [Reasoning trace]:
    {reasoning_output}

    [VƒÉn b·∫£n g·ªëc]:
    {source_text}
    """
    
    final_prompt = f"<|user|>\n{user_prompt}<|end|>\n<|assistant|>"

    try:
        # NOTE(review): "```" as a stop sequence also halts generation at an
        # OPENING code fence, which would yield an empty response - confirm
        # this is intended for the model in use.
        completion = LLM_CLIENT(
            final_prompt,
            max_tokens=1024,
            temperature=0.2,
            top_p=0.9,
            stop=["<|end|>", "```"]
        )

        if "choices" not in completion or len(completion["choices"]) == 0:
            raise ValueError("M√¥ h√¨nh local (critical) kh√¥ng tr·∫£ v·ªÅ n·ªôi dung.")

        raw_output = completion["choices"][0]["text"].strip()
        if not raw_output:
            raise ValueError("M√¥ h√¨nh local (critical) tr·∫£ v·ªÅ ph·∫£n h·ªìi r·ªóng.")

        try:
            parsed_json = jsonParse(raw_output)
        except ValueError as e:
            print(f"‚ö†Ô∏è L·ªói parse JSON t·ª´ m√¥ h√¨nh local: {e}")
            print("--- RAW OUTPUT T·ª™ PHI-3 (CRITICAL) ---")
            print(raw_output)
            print("----------------------------------------")
            return {"error": "Failed to parse JSON from local model", "raw_response": raw_output}

        # jsonParse can return any JSON value (list, string, number) when the
        # whole output is valid JSON. Callers use dict methods on the result,
        # so anything that is not an object is reported as an error.
        if not isinstance(parsed_json, dict):
            return {
                "error": "Local model returned JSON that is not an object",
                "raw_response": raw_output,
            }
        return parsed_json

    except Exception as e:
        error_message = f"‚ùå L·ªói khi g·ªçi LLM (critical): {str(e)}"
        print(error_message)
        return {"error": error_message}

### MAIN FUNC

In [None]:
def mainFlow(source_text: str,
             max_iters: int | None = None,
             min_improve: float | None = None,
             target_score: float = 4.8) -> dict:
    """Run the reason -> critique -> refine loop on one source text.

    Each round calls reasoningRun (passing the previous round's feedback, if
    any), cuts the output at the "CV=" artifact marker, scores it with
    criticalRun/averageScore and records the round in the history. The loop
    stops when reasoning is empty, the critic reports an error, the score does
    not beat the best score by more than ``min_improve``, an improved score
    reaches ``target_score``, or ``max_iters`` rounds have run.

    Args:
        source_text: Text to reason about.
        max_iters: Maximum number of rounds. None (default) reads
            CONFIG["flow_params"]["max_iters"], falling back to 3.
        min_improve: Minimum score gain over the best score for a round to
            count as an improvement. None (default) reads
            CONFIG["flow_params"]["min_improve"], falling back to 0.05.
        target_score: Score at which an improving round ends the loop early.

    Returns:
        A dict with "best_reasoning" (None if no round improved),
        "best_score" and "history" (one entry per fully scored round).
    """
    # Resolve defaults at call time. Evaluating CONFIG["flow_params"][...] in
    # the signature froze the values at definition time and raised KeyError
    # there when the section was missing.
    flow_params = CONFIG.get("flow_params", {})
    if max_iters is None:
        max_iters = flow_params.get("max_iters", 3)
    if min_improve is None:
        min_improve = flow_params.get("min_improve", 0.05)

    artifact_marker = "CV="
    best_reason, best_score = None, 0
    history = []
    current_feedback = None

    for step in range(1, max_iters + 1):
        print(f"\nüîÑ V√≤ng {step} ...")

        current_reasoning = reasoningRun(source_text, current_feedback)

        # Drop everything from the first artifact marker onwards.
        if artifact_marker in current_reasoning:
            print(f"‚ö†Ô∏è Ph√°t hi·ªán artifact '{artifact_marker}', ƒëang c·∫Øt b·ªè...")
            artifact_index = current_reasoning.find(artifact_marker)
            current_reasoning = current_reasoning[:artifact_index].strip()

        if not current_reasoning:
            print("‚õî Reasoning tr·∫£ v·ªÅ r·ªóng, d·ª´ng v√≤ng l·∫∑p.")
            break

        print(f"\nüîÑ Reaoning Result: {current_reasoning}")

        critical_output = criticalRun(source_text, current_reasoning)

        if critical_output.get("error"):
            print(f"‚õî L·ªói t·ª´ Critical: {critical_output['error']}, d·ª´ng v√≤ng l·∫∑p.")
            break

        average_score = averageScore(critical_output)

        # Feedback for the next round; an empty string makes reasoningRun use
        # its regular (non-refine) prompt again.
        current_feedback = critical_output.get("feedback_text", "")

        print(f"üìä ƒêi·ªÉm TBC: {average_score:.2f}")
        print(f"üìù Nh·∫≠n x√©t (To√†n b·ªô JSON): {json.dumps(critical_output, ensure_ascii=False, indent=2)}\n")

        history.append({
            "round": step,
            "score": average_score,
            "reasoning": current_reasoning,
            "evaluation": critical_output.get("scoring", {}),
            "feedback": current_feedback
        })

        if average_score > best_score + min_improve:
            best_reason, best_score = current_reasoning, average_score
            print(f"üìà C·∫£i thi·ªán t·ªët, ƒëi·ªÉm m·ªõi: {best_score:.2f}")
            if average_score >= target_score:
                print("üéâ ƒê·∫°t ƒëi·ªÉm cao, d·ª´ng s·ªõm.")
                break
        else:
            print("‚õî Kh√¥ng c·∫£i thi·ªán ƒë√°ng k·ªÉ, d·ª´ng.")
            break

    return {"best_reasoning": best_reason, "best_score": best_score, "history": history}

### RUN MAIN

In [None]:
# Script entry point: load the model and dataset, run mainFlow on one random
# dataset sample, print the best result and store the round history.
if __name__ == "__main__":
    
    # --- CALL THE INITIALISATION FUNCTIONS ---
    initialize_llm_client()
    initialize_dataset()
    
    # Only these two components need to be checked: both initialisers report
    # failure by leaving their global set to None.
    if LLM_CLIENT is None or HF_DATASET is None: 
        print("‚õî Kh√¥ng th·ªÉ b·∫Øt ƒë·∫ßu mainFlow (model ho·∫∑c dataset ch∆∞a ƒë∆∞·ª£c t·∫£i).")
        print("Vui l√≤ng ki·ªÉm tra l·∫°i l·ªói ·ªü tr√™n.")
        exit()
        
    try:
        # randomContent() returns None when no sample could be drawn.
        inputPara = randomContent()
        if inputPara is None:
            print("‚õî Kh√¥ng t·∫£i ƒë∆∞·ª£c n·ªôi dung test data, d·ª´ng ch∆∞∆°ng tr√¨nh.")
            exit()
            
        # Loop settings come from the config, with fallbacks when absent.
        flow_params = CONFIG.get("flow_params", {})
        result = mainFlow(
            inputPara,
            max_iters=flow_params.get("max_iters", 3),
            min_improve=flow_params.get("min_improve", 0.05)
        )

        print("\n" + "="*50)
        print("üéØ K·∫æT QU·∫¢ CU·ªêI C√ôNG üéØ")
        print("="*50)
        
        # Only the best score and reasoning are printed; the full per-round
        # history is passed to UTL.insert_json below.
        final_output = {
            "best_score": result["best_score"],
            "best_reasoning": result["best_reasoning"]
        }
        
        print(json.dumps(final_output, ensure_ascii=False, indent=2))
        
        # NOTE(review): presumably appends the history to the HISTORIES file -
        # confirm against Common_Utils.insert_json.
        UTL.insert_json(result["history"], HISTORIES, indent=2)
        print(f"\nüìù L·ªãch s·ª≠ ch·∫°y ƒë·∫ßy ƒë·ªß ƒë√£ ƒë∆∞·ª£c l∆∞u v√†o '{HISTORIES}'")

    # exit() raises SystemExit, which is not an Exception subclass, so the
    # early exits above are not swallowed by this handler.
    except Exception as e:
        print(f"\n‚ùå ƒê√£ x·∫£y ra l·ªói kh√¥ng mong mu·ªën trong main: {e}")