<a href="https://colab.research.google.com/github/yourusername/vaca-app/blob/main/colab/vaca_llm_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎯 VACA App - LLM 單字卡生成器

使用 Qwen2.5 7B 模型為 VACA 背單字 App 生成高品質單字卡片。

## 功能特色：
- 🧠 使用 Qwen2.5 7B Instruct 模型
- 📚 支援考試標籤（IELTS、TOEFL、GRE 等）
- 🎯 避免已背單字重複
- 📄 標準 JSON 格式輸出
- 💾 Google Drive 檔案交換


## 📋 環境設定


In [None]:
# Install the packages this notebook needs (IPython shell escapes).
!pip install transformers torch accelerate bitsandbytes
# NOTE(review): the import cell of this notebook only imports `google.colab`;
# nothing imports a module from google-colab-utils — confirm this install is
# still required.
!pip install google-colab-utils
# NOTE(review): this message is printed without checking whether the installs
# above succeeded.
print("✅ 套件安裝完成")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import json
import os
import time
from datetime import datetime
from google.colab import drive, files

# Report the PyTorch version and whether CUDA is available.
print(f"🚀 PyTorch Version: {torch.__version__}")
print(f"🔥 CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only device index 0 is reported; total_memory is divided by 1024**3
    # and labelled as GB.
    print(f"💻 GPU: {torch.cuda.get_device_name(0)}")
    print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

## 🔗 Google Drive 連接


In [None]:
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Working directory on Drive, with one sub-folder for incoming request files
# and one for generated responses. All three are created if missing.
DRIVE_BASE = '/content/drive/MyDrive/VACA_LLM'
os.makedirs(DRIVE_BASE, exist_ok=True)
os.makedirs(f'{DRIVE_BASE}/requests', exist_ok=True)
os.makedirs(f'{DRIVE_BASE}/responses', exist_ok=True)

print(f"📁 工作目錄：{DRIVE_BASE}")
print("✅ Google Drive 連接完成")

## 🧠 模型載入


In [None]:
# 4-bit quantization settings (NF4, double quantization, float16 compute)
# to reduce memory use.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

print("🔄 正在載入 Qwen2.5-7B-Instruct 模型...")
print("⏳ 預計需要 3-5 分鐘，請耐心等候...")

model_name = "Qwen/Qwen2.5-7B-Instruct"

# Load the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Make sure a pad token exists. This branch only runs when the tokenizer
# ships without one.
if tokenizer.pad_token is None:
    # Use a token other than eos_token as the pad token, so the two do not
    # collide.
    if tokenizer.unk_token is not None:
        tokenizer.pad_token = tokenizer.unk_token
        print(f"✅ 設定 pad_token 為 unk_token: {tokenizer.unk_token}")
    else:
        # No unk_token either: register a new "<pad>" special token. The
        # model's token embeddings are resized for it after the model loads.
        tokenizer.add_special_tokens({"pad_token": "<pad>"})
        print("✅ 添加新的 pad_token: <pad>")

# Padding configuration.
tokenizer.padding_side = "left"  # pad on the left side
print("✅ 設定 padding_side 為 left")

# Print the resulting tokenizer settings so they can be checked by eye.
print(f"📊 tokenizer 資訊:")
print(f"   - pad_token: {tokenizer.pad_token} (id: {tokenizer.pad_token_id})")
print(f"   - eos_token: {tokenizer.eos_token} (id: {tokenizer.eos_token_id})")
print(f"   - unk_token: {tokenizer.unk_token} (id: {tokenizer.unk_token_id if tokenizer.unk_token else 'None'})")
print(f"   - vocab_size: {len(tokenizer)}")

# Load the quantized model; no flash-attention implementation is requested.
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        # NOTE(review): trust_remote_code=True presumably allows Python code
        # from the model repository to run locally — confirm it is needed
        # for this model.
        trust_remote_code=True,
        torch_dtype=torch.float16,
        # flash_attention_2 is deliberately not requested, to avoid the
        # extra dependency.
    )
    print("✅ 模型載入成功（使用標準 attention）")
except Exception as e:
    print(f"❌ 模型載入失敗: {e}")
    print("🔄 嘗試使用備用配置...")

    # Fallback load.
    # NOTE(review): this call differs from the first attempt only by
    # low_cpu_mem_usage=True, so it may fail for the same reason — confirm
    # this is the intended fallback. A failure here is not caught.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    print("✅ 模型載入成功（備用配置）")

# When the pad token is "<pad>" (the token added above when none existed),
# resize the model's token embeddings to the tokenizer's length.
if tokenizer.pad_token == "<pad>":
    model.resize_token_embeddings(len(tokenizer))
    print("✅ 調整模型 token embeddings 大小")

print("✅ 模型載入成功！")
print(f"📊 模型記憶體使用: {model.get_memory_footprint() / 1024**3:.2f} GB")

## 📝 Prompt Template 設計


In [None]:
def create_vocabulary_prompt(request_data):
    """Build the English-only prompt that asks the LLM for vocabulary cards.

    Args:
        request_data: Mapping describing the request. All keys are optional:

            - ``count``: number of cards to ask for (default ``5``).
            - ``tags``: list of tag names, or a single tag string. A
              missing, ``None`` or empty value falls back to
              ``['general']``.
            - ``difficulty``: free-form difficulty label
              (default ``'intermediate'``).
            - ``learned_words_summary``: free-form description of what the
              learner already knows (default ``'beginner level'``).
            - ``avoid_words``: recently studied words, as a list or a single
              string. Only the first 20 are put into the prompt; a missing,
              ``None`` or empty value adds no "avoid" line.

    Returns:
        The prompt text: the user requirements, the generation rules and an
        example of the expected JSON output, whose ``tags`` entry shows the
        first requested tag.
    """
    count = request_data.get('count', 5)
    difficulty = request_data.get('difficulty', 'intermediate')
    learned_summary = request_data.get('learned_words_summary', 'beginner level')

    # Request files are JSON, so a key may be present but null; treat that
    # (and an empty list) like a missing key instead of crashing in join().
    tags = request_data.get('tags') or ['general']
    if isinstance(tags, str):
        # A bare string would otherwise be joined character by character.
        tags = [tags]
    tags = [str(tag) for tag in tags]
    tags_text = ', '.join(tags)
    # Take the first tag directly; re-splitting tags_text would cut a tag
    # that itself contains ", ".
    example_tag = tags[0]

    avoid_words = request_data.get('avoid_words') or []
    if isinstance(avoid_words, str):
        avoid_words = [avoid_words]

    avoid_text = ""
    if avoid_words:
        # Cap the list so a long study history cannot bloat the prompt.
        recent_words = ', '.join(str(word) for word in avoid_words[:20])
        avoid_text = f"\n- Avoid these recently studied words: {recent_words}"

    # The output is requested in English only, to avoid encoding problems.
    prompt = f"""You are a professional English vocabulary card generator. Please generate {count} high-quality English vocabulary cards according to the following requirements.

## User Requirements:
- Tag categories: {tags_text}
- Difficulty level: {difficulty}
- Learning background: {learned_summary}{avoid_text}

## Generation Rules:
1. Choose words suitable for the specified difficulty level
2. Avoid repeating already learned words
3. Provide accurate English definitions and meanings
4. Include practical example sentences
5. Mark parts of speech

## Output Format (Strict JSON, ALL ENGLISH):
```json
{{
  "cards": [
    {{
      "word": {{
        "base": "example",
        "phonetic": "/ɪɡˈzæmpəl/",
        "forms": [
          {{"pos": "n.", "form": "examples"}}
        ]
      }},
      "posPrimary": "n.",
      "meaning": "a thing characteristic of its kind or illustrating a general rule",
      "definition": "an instance serving for illustration",
      "synonyms": ["instance", "case", "illustration"],
      "antonyms": ["exception"],
      "example": "This is a good example of effective communication.",
      "tags": ["{example_tag}"],
      "anchors": [],
      "translation_key": "example"
    }}
  ]
}}
```

IMPORTANT: 
- ALL text content MUST be in English only
- No Chinese characters in the JSON output
- Use 'translation_key' field for frontend translation lookup
- Meaning and definition should be clear English explanations

Please generate {count} vocabulary cards that meet the requirements:"""

    return prompt

# Sample request used to preview the prompt. The startup cell of this
# notebook prints it again as the request-file format reference.
test_request = {
    "count": 3,
    "tags": ["IELTS", "academic"], 
    "difficulty": "6.5-7.5",
    "learned_words_summary": "Have learned 2000 basic words",
    "avoid_words": ["abandon", "ability", "abstract"]
}

# Show only the first 500 characters of the generated prompt.
print("📝 測試 Prompt (英文版):")
print(create_vocabulary_prompt(test_request)[:500] + "...")

## 🎯 單字生成核心函數


In [None]:
def _extract_cards_payload(response):
    """Return the first JSON object in ``response`` that has a ``cards`` key.

    Every ``{`` in the text is tried as the start of a JSON document, so the
    object is found whether or not it sits inside a Markdown code fence or
    is surrounded by commentary, at any nesting depth, and with braces
    inside string values handled by the JSON parser itself.

    Args:
        response: Decoded model output.

    Returns:
        The parsed ``dict``, or ``None`` when no such object can be parsed.
    """
    decoder = json.JSONDecoder()
    brace_at = response.find('{')
    while brace_at != -1:
        try:
            candidate, _ = decoder.raw_decode(response, brace_at)
        except ValueError:
            # json.JSONDecodeError is a ValueError: no valid JSON document
            # starts at this brace, so move on to the next one.
            candidate = None
        if isinstance(candidate, dict) and 'cards' in candidate:
            return candidate
        brace_at = response.find('{', brace_at + 1)
    return None


def _select_valid_cards(payload, required_fields=('word', 'meaning', 'example')):
    """Return the cards in ``payload['cards']`` that are dicts with all required fields."""
    cards = payload.get('cards')
    if not isinstance(cards, list):
        return []
    return [
        card for card in cards
        if isinstance(card, dict) and all(field in card for field in required_fields)
    ]


def _max_new_tokens_for(request_data, floor=500, per_card=200, ceiling=2000):
    """Return a generation budget that grows with the requested card count.

    The budget never drops below ``floor`` (the previous fixed value).
    ``per_card`` is a rough estimate of the size of one card in the format
    shown in the prompt — NOTE(review): tune against real outputs.
    """
    try:
        count = int(request_data.get('count', 5))
    except (TypeError, ValueError):
        count = 5
    return min(ceiling, max(floor, per_card * count))


def generate_vocabulary_cards(request_data, max_retries=3):
    """Generate vocabulary cards with the loaded model, retrying on failure.

    Uses the module-level ``tokenizer`` and ``model`` and the prompt built
    by ``create_vocabulary_prompt``. Each attempt tokenizes the prompt,
    samples a completion, extracts the JSON object carrying a ``cards`` key
    and keeps the cards that have ``word``, ``meaning`` and ``example``.

    Args:
        request_data: Request mapping; see ``create_vocabulary_prompt``.
        max_retries: Maximum number of generation attempts.

    Returns:
        The parsed result ``dict`` with ``cards`` reduced to the valid
        cards, or ``None`` when no attempt produced at least one valid card.
    """
    import traceback

    prompt = create_vocabulary_prompt(request_data)

    for attempt in range(max_retries):
        try:
            print(f"🔄 生成嘗試 {attempt + 1}/{max_retries}...")

            # Tokenize the prompt and move the tensors to the model's device.
            try:
                inputs = tokenizer(
                    prompt,
                    return_tensors="pt",
                    truncation=True,
                    max_length=1800,
                    padding=False,  # a single sequence needs no padding
                    add_special_tokens=True
                )
                inputs = {k: v.to(model.device) for k, v in inputs.items()}
                print(f"📊 輸入 tokens 數量: {inputs['input_ids'].shape[-1]}")
            except Exception as tokenizer_error:
                print(f"❌ Tokenizer 錯誤: {tokenizer_error}")
                continue

            # Sampling configuration. length_penalty and early_stopping are
            # not passed: they only apply to beam-based generation, and
            # num_beams is 1 here.
            generation_config = {
                "max_new_tokens": _max_new_tokens_for(request_data),
                "min_new_tokens": 80,
                "temperature": 0.2,
                "top_p": 0.9,
                "do_sample": True,
                "pad_token_id": tokenizer.pad_token_id,
                "eos_token_id": tokenizer.eos_token_id,
                "repetition_penalty": 1.05,
                "use_cache": True,
                "num_beams": 1,
            }

            start_time = time.time()
            try:
                with torch.no_grad():
                    outputs = model.generate(
                        **inputs,
                        **generation_config
                    )
                generation_time = time.time() - start_time
                print(f"⏱️  生成時間: {generation_time:.2f}秒")
            except Exception as generation_error:
                print(f"❌ 生成錯誤: {generation_error}")
                # Generation is where an out-of-memory error is most likely,
                # so release cached GPU memory before the next attempt.
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                continue

            # Decode only the newly generated tokens.
            try:
                # generate() may return a tuple or an object with .sequences
                # instead of a plain tensor.
                if isinstance(outputs, tuple):
                    outputs = outputs[0]
                elif hasattr(outputs, 'sequences'):
                    outputs = outputs.sequences

                if len(outputs.shape) != 2:
                    print(f"❌ 異常的 outputs 形狀: {outputs.shape}")
                    continue

                input_length = inputs['input_ids'].shape[-1]
                if outputs.shape[-1] <= input_length:
                    print(f"❌ 生成長度異常: {outputs.shape[-1]} <= {input_length}")
                    continue

                new_tokens = outputs[0][input_length:]
                response = tokenizer.decode(new_tokens, skip_special_tokens=True)

                print(f"📊 新生成 tokens: {len(new_tokens)}")
                print(f"📄 回應預覽: {response[:150]}...")
            except Exception as decode_error:
                print(f"❌ 解碼錯誤: {decode_error}")
                traceback.print_exc()
                continue

            # Parse the response and keep only well-formed cards.
            result = _extract_cards_payload(response)
            if result is not None:
                valid_cards = _select_valid_cards(result)
                if valid_cards:
                    result['cards'] = valid_cards
                    print(f"✅ 成功生成 {len(valid_cards)} 個有效單字卡")
                    return result

            print("❌ JSON 解析或驗證失敗")
            print(f"📄 完整回應: {response[:800]}...")

        except Exception as e:
            print(f"❌ 嘗試 {attempt + 1} 發生錯誤: {e}")
            traceback.print_exc()

            # In case this was a memory problem, release cached GPU memory.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    print("❌ 達到最大重試次數，生成失敗")
    return None

# Printed once the generation function defined above has been created.
print("✅ 終極修復版單字生成函數準備完成")

## 🔄 Google Drive 檔案監控系統


In [None]:
def _reject_invalid_request(filepath, reason):
    """Answer an unusable request with an error file and take it out of the queue.

    The request file is renamed with an ``invalid_`` prefix rather than
    deleted: it no longer matches the ``request_*.json`` pattern, so it is
    not picked up again, but it stays available for inspection.

    Args:
        filepath: Path of the request file.
        reason: The error describing why the request could not be used.
    """
    request_name = os.path.basename(filepath)
    error_filename = request_name.replace('request_', 'error_', 1)
    error_path = f"{DRIVE_BASE}/responses/{error_filename}"

    error_data = {
        "status": "error",
        "timestamp": datetime.now().isoformat(),
        "request": None,
        "error": f"Invalid request file: {reason}",
        "error_code": "INVALID_REQUEST"
    }
    with open(error_path, 'w', encoding='utf-8') as f:
        json.dump(error_data, f, ensure_ascii=False, indent=2)
    print(f"❌ 請求檔案無效: {reason}")
    print(f"❌ 錯誤回應已保存: {error_path}")

    quarantine_path = os.path.join(os.path.dirname(filepath), f"invalid_{request_name}")
    os.replace(filepath, quarantine_path)
    print(f"⚠️ 無效的請求檔案已移至: {quarantine_path}")


def process_request_file(filepath):
    """Handle one request file: generate cards, write the response, dequeue the request.

    On success a ``response_*`` file is written to ``{DRIVE_BASE}/responses``;
    when generation fails an ``error_*`` file is written instead. In both
    cases the request file is then deleted. A request that is not valid
    UTF-8 JSON, or whose top level is not an object, gets an ``error_*``
    response and is renamed out of the queue, so it is not retried on every
    polling cycle.

    Args:
        filepath: Path of a request file in the requests folder.

    Returns:
        None. Errors are printed with a traceback rather than raised.
    """
    import traceback

    try:
        # Read the request as UTF-8 JSON.
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                request_data = json.load(f)
            if not isinstance(request_data, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(request_data).__name__}"
                )
        except ValueError as invalid_request:
            # json.JSONDecodeError and UnicodeDecodeError are both
            # ValueErrors. Leaving the file in place would make the monitor
            # re-process it forever.
            _reject_invalid_request(filepath, invalid_request)
            return

        print(f"📥 收到請求: {request_data}")

        result = generate_vocabulary_cards(request_data)
        request_name = os.path.basename(filepath)

        if result:
            # Only the leading "request_" prefix is rewritten.
            response_filename = request_name.replace('request_', 'response_', 1)
            response_path = f"{DRIVE_BASE}/responses/{response_filename}"

            # Wrap the result with status, timestamp and the original request.
            response_data = {
                "status": "success",
                "timestamp": datetime.now().isoformat(),
                "request": request_data,
                "result": result,
                "encoding_info": {
                    "format": "english_json",
                    "requires_translation": True,
                    "version": "1.1.0"
                }
            }

            try:
                # Preferred: UTF-8 text with non-ASCII characters kept as-is.
                with open(response_path, 'w', encoding='utf-8', newline='') as f:
                    json.dump(response_data, f, ensure_ascii=False, indent=2, separators=(',', ': '))
                print(f"✅ 回應已保存（UTF-8）: {response_path}")

                # Read the file back to confirm it parses.
                with open(response_path, 'r', encoding='utf-8') as f:
                    verification = json.load(f)
                    print(f"📊 驗證成功: {len(verification.get('result', {}).get('cards', []))} 張卡片")

            except UnicodeEncodeError as encoding_error:
                print(f"⚠️ UTF-8 編碼失敗，嘗試備用方案: {encoding_error}")

                # Fallback: escape every non-ASCII character. Re-opening
                # with 'w' discards the partial output of the first attempt.
                with open(response_path, 'w', encoding='utf-8') as f:
                    json.dump(response_data, f, ensure_ascii=True, indent=2)
                print(f"✅ 回應已保存（ASCII 安全）: {response_path}")

        else:
            # Generation failed: write an error response instead.
            response_filename = request_name.replace('request_', 'error_', 1)
            response_path = f"{DRIVE_BASE}/responses/{response_filename}"

            error_data = {
                "status": "error",
                "timestamp": datetime.now().isoformat(),
                "request": request_data,
                "error": "Failed to generate vocabulary cards",
                "error_code": "GENERATION_FAILED"
            }

            with open(response_path, 'w', encoding='utf-8') as f:
                json.dump(error_data, f, ensure_ascii=False, indent=2)

            print(f"❌ 錯誤回應已保存: {response_path}")

        # The request has been answered; remove it from the queue.
        os.remove(filepath)
        print(f"🗑️  已刪除請求檔案: {filepath}")

    except Exception as e:
        print(f"❌ 處理請求檔案錯誤: {e}")
        traceback.print_exc()

def monitor_requests(check_interval=10):
    """Poll the requests folder and process each request file until interrupted.

    Files named ``request_*.json`` in ``{DRIVE_BASE}/requests`` are handed
    to ``process_request_file`` in sorted name order, so timestamp-named
    requests are handled oldest first. A filesystem error while scanning
    the folder is reported and the scan is retried on the next cycle.

    Args:
        check_interval: Seconds to sleep between scans.

    Returns:
        None. The loop ends on ``KeyboardInterrupt`` or on an unexpected
        error, which is printed with a traceback.
    """
    import traceback

    requests_dir = f"{DRIVE_BASE}/requests"

    print(f"🔍 開始監控請求檔案 (每 {check_interval} 秒檢查一次)")
    print(f"📁 監控目錄: {DRIVE_BASE}/requests/")
    print("🌍 生成格式: 英文 JSON (避免編碼問題)")
    print("💡 提示：按 Ctrl+C 停止監控")

    try:
        while True:
            try:
                if os.path.exists(requests_dir):
                    # os.listdir returns names in arbitrary order; sort so
                    # requests are processed in a stable order.
                    request_files = sorted(
                        name for name in os.listdir(requests_dir)
                        if name.startswith('request_') and name.endswith('.json')
                    )

                    for filename in request_files:
                        filepath = os.path.join(requests_dir, filename)
                        print(f"\n🎯 發現新請求: {filename}")
                        process_request_file(filepath)

                    if not request_files:
                        print(".", end="", flush=True)  # activity indicator
                else:
                    print(f"⚠️ 請求目錄不存在: {requests_dir}")
                    time.sleep(5)
            except OSError as scan_error:
                # The folder can disappear between the existence check and
                # the listing; keep the monitor alive and retry next cycle.
                print(f"\n⚠️ 讀取請求目錄失敗，稍後重試: {scan_error}")

            time.sleep(check_interval)

    except KeyboardInterrupt:
        print("\n🛑 監控已停止")
    except Exception as e:
        print(f"\n❌ 監控錯誤: {e}")
        traceback.print_exc()

# Printed once the request-processing and monitoring functions above are defined.
print("✅ 檔案監控系統準備完成（編碼修復版）")

## 🧪 測試生成功能


In [None]:
# Diagnostic self-test for the notebook.
def run_comprehensive_test():
    """Run the four diagnostic stages and report whether all of them passed.

    Stages, in order: the model and tokenizer are loaded, the tokenizer
    encodes a sample sentence, the model completes a short prompt, and
    ``generate_vocabulary_cards`` returns at least one card for a minimal
    request. The first failing stage prints its error and ends the run.

    Returns:
        ``True`` when every stage succeeded, otherwise ``False``.
    """
    print("🔍 === VACA LLM 診斷測試開始 ===")

    # Stage 1: the model and tokenizer must exist as module globals.
    print("\n1️⃣ 檢查模型狀態:")
    if 'model' not in globals() or 'tokenizer' not in globals():
        print("❌ 模型或 tokenizer 未載入")
        return False

    print("✅ 模型和 tokenizer 已載入")
    print(f"   - 模型類型: {type(model).__name__}")
    print(f"   - Tokenizer 類型: {type(tokenizer).__name__}")
    print(f"   - pad_token: {tokenizer.pad_token} (id: {tokenizer.pad_token_id})")
    print(f"   - eos_token: {tokenizer.eos_token} (id: {tokenizer.eos_token_id})")
    if torch.cuda.is_available():
        print(f"   - GPU 記憶體: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

    # Stage 2: encode a fixed sentence.
    print("\n2️⃣ 測試 Tokenizer:")
    try:
        sample_text = "Hello, this is a test."
        encoded = tokenizer(sample_text, return_tensors="pt", padding=False)
        print("✅ Tokenizer 正常工作")
        print(f"   - 測試文本: {sample_text}")
        print(f"   - Token 數量: {encoded['input_ids'].shape[-1]}")
    except Exception as e:
        print(f"❌ Tokenizer 錯誤: {e}")
        return False

    # Stage 3: a short sampled completion.
    print("\n3️⃣ 測試簡單生成:")
    try:
        simple_prompt = "Generate a simple English word: "
        model_inputs = tokenizer(simple_prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            generated = model.generate(
                **model_inputs,
                max_new_tokens=10,
                temperature=0.7,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=True
            )

        # Drop the prompt tokens so only the continuation is decoded.
        prompt_length = model_inputs['input_ids'].shape[-1]
        completion = tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)
        print("✅ 簡單生成測試成功")
        print(f"   - 輸入: {simple_prompt}")
        print(f"   - 輸出: {completion}")
    except Exception as e:
        print(f"❌ 簡單生成測試失敗: {e}")
        return False

    # Stage 4: full card generation with the smallest possible request.
    print("\n4️⃣ 測試單字卡生成:")
    minimal_request = {
        "count": 1,
        "tags": ["basic"],
        "difficulty": "easy",
        "learned_words_summary": "beginner level",
        "avoid_words": []
    }
    print(f"📝 測試請求: {json.dumps(minimal_request, indent=2)}")

    try:
        result = generate_vocabulary_cards(minimal_request, max_retries=2)
        if not result:
            print("❌ 單字卡生成失敗")
            return False

        print("🎉 單字卡生成測試成功！")
        print("📄 生成結果:")
        print(json.dumps(result, ensure_ascii=False, indent=2))

        cards = result.get('cards', [])
        if not cards:
            print("❌ 生成的卡片陣列為空")
            return False

        # Report the required fields of the first card.
        first_card = cards[0]
        print("\n🔍 卡片驗證:")
        for field in ('word', 'meaning', 'example'):
            if field not in first_card:
                print(f"   ❌ 缺少 {field}")
            else:
                print(f"   ✅ {field}: {first_card[field]}")

        print("✅ 全部測試通過！系統已準備就緒。")
        return True

    except Exception as e:
        print(f"❌ 單字卡生成測試錯誤: {e}")
        import traceback
        traceback.print_exc()
        return False

# Run the full diagnostic and summarize the outcome.
print("🚀 開始執行完整診斷...")
success = run_comprehensive_test()

if success:
    print("\n🎯 === 測試結果：系統正常！===")
    print("✅ 可以開始監控 Google Drive 檔案")
    print("✅ 準備處理前端請求")
else:
    print("\n⚠️ === 測試結果：發現問題 ===")
    print("🔧 請檢查上面的錯誤訊息並重新執行相關設定")

## 🚀 啟動監控服務


In [None]:
# Print usage instructions: where request files go, where responses are
# written, and the request format (using the sample test_request).
print("🎯 VACA LLM 單字生成器已準備就緒！")
print()
print("📋 使用說明：")
print(f"1. 前端應用將請求寫入: {DRIVE_BASE}/requests/request_[timestamp].json")
print(f"2. 此服務將回應寫入: {DRIVE_BASE}/responses/response_[timestamp].json")
print("3. 請求格式參考上方的 test_request")
print()
print("🔧 請求檔案格式:")
print(json.dumps(test_request, ensure_ascii=False, indent=2))
print()

# Start monitoring, polling every 5 seconds. monitor_requests loops until
# it is interrupted or an error stops it.
monitor_requests(check_interval=5)

## 🛠️ 手動測試區域


In [None]:
# Build a test request by hand.
manual_request = {
    "count": 3,
    "tags": ["TOEFL", "academic"],
    "difficulty": "7.0-8.0",
    "learned_words_summary": "已掌握基礎詞彙，正在準備托福考試",
    "avoid_words": ["achieve", "analysis", "approach", "area", "assessment"]
}

# Write it into the requests folder under a timestamped request_*.json name,
# the pattern monitor_requests looks for.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
request_filename = f"request_{timestamp}.json"
request_path = f"{DRIVE_BASE}/requests/{request_filename}"

with open(request_path, 'w', encoding='utf-8') as f:
    json.dump(manual_request, f, ensure_ascii=False, indent=2)

print(f"✅ 已建立測試請求檔案: {request_path}")
# NOTE(review): the file is only picked up while monitor_requests is
# running — confirm how this cell is meant to be run alongside the
# monitoring cell.
print("🔍 監控系統將會自動處理此請求")

## 📊 系統狀態檢查


In [None]:
def check_system_status():
    """Print a snapshot of the request/response folders and GPU memory use.

    Pending requests are the ``request_*.json`` files in the requests
    folder, the same pattern ``monitor_requests`` processes, so other files
    in that folder are not reported as waiting work. Up to five pending
    names and the last three response names (in sorted name order) are
    listed.

    Returns:
        None. Everything is reported through ``print``.
    """
    print("🔍 系統狀態檢查:")
    print(f"📁 工作目錄: {DRIVE_BASE}")

    requests_dir = f"{DRIVE_BASE}/requests"
    responses_dir = f"{DRIVE_BASE}/responses"

    print(f"📥 請求目錄: {requests_dir}")
    if os.path.exists(requests_dir):
        # Count only files the monitor would actually process, in a stable
        # order.
        pending_requests = sorted(
            name for name in os.listdir(requests_dir)
            if name.startswith('request_') and name.endswith('.json')
        )
        print(f"   待處理請求: {len(pending_requests)} 個")
        for name in pending_requests[:5]:  # show at most the first five
            print(f"   - {name}")

    print(f"📤 回應目錄: {responses_dir}")
    if os.path.exists(responses_dir):
        response_files = os.listdir(responses_dir)
        print(f"   已完成回應: {len(response_files)} 個")
        for name in sorted(response_files)[-3:]:  # last three by name
            print(f"   - {name}")

    # GPU memory currently allocated and reserved by PyTorch.
    if torch.cuda.is_available():
        memory_used = torch.cuda.memory_allocated() / 1024**3
        memory_cached = torch.cuda.memory_reserved() / 1024**3
        print(f"💾 GPU 記憶體使用: {memory_used:.2f} GB / {memory_cached:.2f} GB (allocated/cached)")

    print("✅ 系統狀態檢查完成")

# Print the current status once.
check_system_status()