In [166]:
import pandas as pd
import concurrent.futures
import re
import json
from langchain_community.chat_models import ChatOllama 
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
import chardet
import os
from pathlib import Path
from typing import List
from langchain.docstore.document import Document as LangDocument

MASTER_JS = r"C:\users\tuf\documents\cloudysys_nickfury\dcmsln\dcmsln_201812\Master\PageJS\Query"
BUSINESSRULE_DIR = r"C:\Users\TUF\Documents\cloudysys_nickfury\dcmsln\BusinessRule\Commons"
VECTOR_STORE_PATH = "./common/vectorstore"
EMBEDDING_MODEL = "nomic-embed-text"


In [167]:
# RAG function
def remove_simplified_chinese(text: str) -> str:
    return re.sub(r"[^\x00-\x7F\u4e00-\u9fff\n\r\t\w\s.,:;!?(){}[\]\"'@#$%^&*\-+=\\/]", "", text)

def read_file_with_detected_encoding(file_path: str) -> str:
    print(file_path)

    with open(file_path, 'rb') as f:
        raw = f.read()
    return raw.decode('GB2312', errors='ignore')

def extract_vb_functions(file_path: str) -> List[LangDocument]:
    raw_code = read_file_with_detected_encoding(file_path)

    # 移除簡體中文註解
    cleaned_code = remove_simplified_chinese(raw_code)

    pattern = r"(Public\s+(Sub|Function)|Private\s+(Sub|Function)|Sub|Function)\s+\w+\s*\(.*?\)[\s\S]+?End\s+(Sub|Function)"
    matches = re.finditer(pattern, cleaned_code, re.MULTILINE | re.IGNORECASE)

    docs = []
    file_name = Path(file_path).name

    for match in matches:
        full_func = match.group(0).strip()

        # 擷取函式名稱
        func_name_match = re.search(r"(Sub|Function)\s+(\w+)", full_func)
        func_name = func_name_match.group(2) if func_name_match else "unknown"

        doc = LangDocument(
            page_content=full_func,
            metadata={
                "source": file_name,
                "function": func_name
            }
        )
        docs.append(doc)

    return docs
def collect_all_vb_chunks(directory: str) -> List[LangDocument]:
    all_docs = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".vb"):
                file_path = os.path.join(root, file)
                chunks = extract_vb_functions(file_path)
                all_docs.extend(chunks)
    return all_docs

def build_vector_store(docs: List[LangDocument]):
    print(f"[info] Building vectorstore with {len(docs)} code chunks...")
    embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)
    db = FAISS.from_documents(docs, embeddings)
    os.makedirs(VECTOR_STORE_PATH, exist_ok=True)
    db.save_local(VECTOR_STORE_PATH)
    print(f"[success] Vectorstore saved to: {VECTOR_STORE_PATH}")


In [168]:
# 建立RAG
# all_chunks = collect_all_vb_chunks(SOURCE_DIR)
# build_vector_store(all_chunks)

In [169]:
class MessageClassification(BaseModel):
    line: str = Field(...)
    original: str = Field(...)
    issue: str = Field(...)
    replacement: str = Field(...)
    reason: str = Field(...)

# Json 輸出格式解析器
parser = JsonOutputParser()
format_instructions = parser.get_format_instructions()

# 使用本地 LLM 模型
embed_model = OllamaEmbeddings(model="nomic-embed-text") 
fst_llm = ChatOllama(model="llama3:8B") #提出
sec_llm = ChatOllama(model="mistral:7B") #審查
third_llm = ChatOllama(model="phi3:3.8B") #檢查改動是否符合原始邏輯，若符合則發動投票


In [170]:
# Agent 1：提出修改方案
fst_prompt = PromptTemplate.from_template("""
You are a senior JavaScript expert. Your job is to analyze outdated or deprecated JavaScript code and refactor it into modern, browser-compatible syntax.

Let's analyze the following JavaScript code snippet **line by line**. Identify any problematic usage, deprecated patterns, or bad practices. For each issue, provide:
- The **original line** (or range, e.g., "3-7")
- A **modernized replacement**
- A brief **explanation**
- The **line number** (or range, starting from 1)

=== Original Code ===
{code}
===

### Output Format (in JSON array)

- If **no issues are found**, return an **empty array**: `[]`
- Use **double quotes** for all string values to ensure JSON validity.
- Output should be **strictly machine-parseable** JSON array.
- Separate each object with a comma.
- Each object must contain keys: "line", "original", "issue", "replacement", "reason".
- Ensure the JSON is well-formed and valid, with no dangling commas or unexpected characters.

### Example:
[
  {{
    "line": "3-7",
    "original": "var x = document.all;",
    "issue": "Usage of 'document.all' is deprecated.",
    "replacement": "var x = document.getElementById('someId');",
    "reason": "'document.all' is non-standard and not supported in modern browsers. Use standard DOM methods like 'getElementById' instead."
  }},
  {{
    "line": "13",
    "original": "eval('oCtl = document.webForm.' + someId +';');",
    "issue": "'eval' usage is discouraged due to security risks.",
    "replacement": "oCtl = document.getElementById(someId);",
    "reason": "'eval' can lead to code injection vulnerabilities. Use safer DOM access methods."
  }},
  {{
    "line": "92",
    "original": "element.attachEvent('onmouseover', onMouseOver);",
    "issue": "'attachEvent' is deprecated in favor of 'addEventListener'.",
    "replacement": "element.addEventListener('mouseover', onMouseOver);",
    "reason": "'attachEvent' is not supported in modern browsers. Use 'addEventListener' for event binding."
  }}
]

""")

# Agent 2：語法審查
sec_prompt = PromptTemplate.from_template("""
You are an expert in JavaScript Linter tools.

Carefully review the following proposed code modifications described in JSON format.
For each item, think step-by-step about the following:

1. **Syntax Validity** – Is the proposed replacement syntactically correct?
2. **Modern Browser Compatibility** – Is the suggested code aligned with current JavaScript standards and compatible with modern browsers?
3. **Practical Feasibility** – Would this change work reliably in real-world JavaScript environments?

Take your time to reason through each point before making your decision.

=== Proposal ===
{proposal}
===

Please return a JSON array. Each item should include your judgment and reasoning in the following format:
[
  {{
    "line": <line number of the proposal>,
    "vote": "approve" | "reject",
    "comment": "<Step-by-step explanation of your reasoning and conclusion>"
  }}
]
Ensure the JSON is well-formed and valid, with no dangling commas or unexpected characters.
""")

# Agent 3：語意一致性
third_prompt = PromptTemplate.from_template("""
You are an expert in semantic consistency analysis.

Your task is to verify whether the rewritten JavaScript code preserves the **same logic and behavior** as the original.

Follow these strict steps:
1. Identify and summarize the **intent and behavior** of the ORIGINAL code.
2. Analyze the REWRITTEN code line by line.
3. Compare both codes to ensure that all conditions, side effects, control flows, and outcomes are preserved.
4. If any functionality is missing, logic altered, or behavior changed in the rewritten code, REJECT it.
5. If the rewritten code is **functionally identical or better in quality**, APPROVE it — but only if the core logic is unchanged.
6. Make sure you DO NOT confuse the roles of "Original" and "Rewritten".

=== ORIGINAL CODE ===
{original}

=== REWRITTEN CODE ===
{rewritten}

Please return a JSON array, returning a list with this structure:
[
  {
    "vote": "approve" | "reject",
    "comment": "<Step-by-step explanation of the differences or confirmations, explicitly naming 'original' and 'rewritten' code in each step>"
  }
]
Ensure the JSON is well-formed and valid, with no dangling commas or unexpected characters.
""")

In [171]:
from json_repair import repair_json

# 生成.js
def write_modified_js(original_path: str, original_code: str, approved_changes: List[dict]):
    modified_code = original_code
    for item in approved_changes:
        modified_code = modified_code.replace(item["original"], item["replacement"])
    
    p = Path(original_path)
    output_path = p.with_name(f"{p.stem}_modified{p.suffix}")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(modified_code)

# 審核json
def extract_json(text: str) -> list:
    try:
        # 找第一個 "[" 與最後一個 "]"
        start = text.find('[')
        end = text.rfind(']') + 1

        if start == -1 or end <= start:
            print("找不到 JSON 陣列")
            return []

        json_str = text[start:end]
        json_str = json_str.encode('utf-8').decode('unicode_escape')
        repaired_string = repair_json(json_str)

        try:
            return json.loads(repaired_string)
        except json.JSONDecodeError:
            # 嘗試修剪後再次解析
            print(f"確認是否為JSON格式失敗, 此時的json: {repaired_string}")
            print("嘗試修剪")
            cleaned = re.sub(r',\s*([\]}])', r'\1', repaired_string)
            cleaned_repaired_string = repair_json(cleaned)
            print(f"修剪後json: {cleaned_repaired_string}")
            return json.loads(cleaned_repaired_string)

    except json.JSONDecodeError as e:
        print(f"JSON 解析失敗：{e}")
        return []

# 呼叫LLM
def call_llm(llm, prompt):
    response = llm.invoke(prompt)
    return response.content if hasattr(response, "content") else str(response) # Ensure it's a string

# ====== Main LLM Logic ======
def analyze_message_with_multi_llm(code: str):
    # Agent 1：提案改寫
    fst_input = fst_prompt.format(code=code)
    fst_response = call_llm(fst_llm, fst_input)
    proposals = extract_json(fst_response)

    # 印出反饋
    print("fst_output:", proposals)

    if not proposals:
        return {"status": "no-change", "original": code}

    approved_results = []

    for i, p in enumerate(proposals):
        print(f"\nProcessing proposal {i+1}: {p}")
        # 檢查關鍵鍵是否存在
        if not all(k in p for k in ["line", "original", "issue", "replacement", "reason"]):
            print(f"Skipping proposal {i+1} due to missing required keys: {p}")
            continue

        # Agent 2：語法審查
        sec_input = sec_prompt.format(proposal=json.dumps([p], ensure_ascii=False))
        sec_result = extract_json(call_llm(sec_llm, sec_input))
        print("sec_result:", sec_result)
        

        # Agent 3：語意一致性
        third_input = third_prompt.format(
            original=p["original"],
            rewritten=p["replacement"]
        )
        third_result = extract_json(call_llm(third_llm, third_input))
        print("third_result:", third_result)

        # 投票機制
        vote = (
            (sec_result and sec_result[0]["vote"] == "approve")
            and (third_result and third_result[0]["vote"] == "approve")
        )

        if vote:
            approved_results.append(p)

    return {"status": "done", "approved": approved_results}

# ====== Folder Runner ======
def run_folder_review(folder_path: str):
    js_files = list(Path(folder_path).rglob("*.js"))
    print(f"共發現 {len(js_files)} 個 JS 檔案")

    for file_path in js_files:
        print(f"\n 分析：{file_path}")
        with open(file_path, "r", encoding="utf-8") as f:
            code = f.read()

        result = analyze_message_with_multi_llm(code)

        if result["status"] == "done":
            write_modified_js(file_path, result["original"], result["approved"])
        else:
            print(f"無需修改：{file_path}")

# ====== decoder ======
def read_text_file_safely(path: str) -> str:
    with open(path, "rb") as f:
        raw_data = f.read()
        detected = chardet.detect(raw_data)
        encoding = detected["encoding"] or "utf-8"

    fallback_encodings = []

    if encoding: # Add the detected encoding first
        fallback_encodings.append(encoding)
    if 'GB2312' not in fallback_encodings: # Add GB2312 if not already present
        fallback_encodings.append("GB2312")

    # Ensure common encodings are covered
    additional_encodings = ["utf-8", "big5", "cp950", "gbk", "gb18030", "utf-16", "windows-1252"]
    for enc in additional_encodings:
        if enc not in fallback_encodings:
            fallback_encodings.append(enc)

    for enc in fallback_encodings:
        try:
            print(f"嘗試使用編碼: {enc}")
            return raw_data.decode(enc)
        except UnicodeDecodeError:
            continue

    raise UnicodeDecodeError(f"所有常見編碼皆無法解碼檔案: {path}")


In [172]:
# 單一個js FOR測試
def run_first_js_review(folder_path: str):
    js_files = list(Path(folder_path).rglob("*.js"))
    if not js_files:
        print("找不到任何 JS 檔案")
        return

    # first_file = js_files[0]
    first_file = r"C:\users\tuf\documents\cloudysys_nickfury\dcmsln\dcmsln_201812\Master\PageJS\Query\ME0100S.js"
    print(f"正在分析 ME0100S.js ：{first_file}")

    code = read_text_file_safely(first_file)
    result = analyze_message_with_multi_llm(code)

    if result["status"] == "done":
        # write_modified_js(first_file, result["original"], result["approved"])
        print(f"修改完畢如下")
        print(result["approved"])
    else:
        print(f"無需修改")

run_first_js_review(MASTER_JS)

正在分析 ME0100S.js ：C:\users\tuf\documents\cloudysys_nickfury\dcmsln\dcmsln_201812\Master\PageJS\Query\ME0100S.js
嘗試使用編碼: GB2312
嘗試使用編碼: utf-8
嘗試使用編碼: big5
嘗試使用編碼: cp950
嘗試使用編碼: gbk
fst_output: [{'line': '1-2', 'original': '//æ\x9c¬ç\x94»é\x9d¢å\x88\x9då§\x8bå\x8c\x96ç\x9a\x84ç\x89¹å\x88«å\x87½æ\x95°,è¢«fInit()è°\x83ç\x94¨\nfunction PageSpecialInit(){\n    fInit();}', 'issue': 'Function declaration with a comment is not recommended.', 'replacement': 'function pageSpecialInit() { fInit(); }', 'reason': "It's better to use a single line for the function declaration and keep comments separate."}, {'line': '14-22', 'original': 'switch (sSubmitCtl) {\n    ...}', 'issue': 'Use a switch statement with a more modern syntax.', 'replacement': 'const submitControl = sSubmitCtl;\nswitch (submitControl) { ... }', 'reason': 'The old syntax for the `switch` statement is outdated and not supported in modern browsers.'}, {'line': '26-29', 'original': 'if (doSelExist(RowCountStart, RowCountEnd) == false){\

KeyError: '\n    "vote"'