In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import joblib
import os
import pandas as pd


# Load the tab-separated corpus. Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
# NOTE(review): latin-1 never raises on decode, but it will garble non-ASCII
# text if the file is actually UTF-8 — confirm the file's real encoding.
data = pd.read_csv('sentences.csv', sep='\t', names=['id', 'language', 'text'], encoding='latin-1')

# Drop rows that are missing either the label or the sentence itself.
# 'text' must be checked as well: TfidfVectorizer raises ValueError when it
# encounters a NaN document, so a single empty text field would abort training.
print(f"原始資料筆數: {len(data)}")
data = data.dropna(subset=['language', 'text'])
print(f"移除 NaN 後資料筆數: {len(data)}")

def train_model():
    """Train the language-identification model and save it to disk.

    Reads the module-level ``data`` DataFrame (columns ``text`` and
    ``language``), holds out 20% of the rows for evaluation, fits a TF-IDF
    vectorizer and a multinomial Naive Bayes classifier on the remaining
    rows, prints a classification report for the held-out rows, and writes
    both fitted objects into ``./models``.

    Returns:
        tuple: ``(model, vectorizer)`` — the fitted ``MultinomialNB`` and the
        fitted ``TfidfVectorizer``.
    """
    print("🚀 開始訓練模型...")
    print("📊 準備資料進行訓練...")

    # Split the raw text BEFORE vectorizing. Fitting the vectorizer on the
    # full corpus would let test-set vocabulary and IDF statistics leak into
    # training and make the evaluation below overly optimistic.
    text_train, text_test, y_train, y_test = train_test_split(
        data['text'], data['language'], test_size=0.2, random_state=42
    )

    # Feature extraction: word uni/bi-gram TF-IDF, learned from the training
    # portion only and then applied unchanged to the held-out portion.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=500000, max_df=0.95)
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)
    print(f"訓練集大小: {X_train.shape[0]} 筆")
    print(f"測試集大小: {X_test.shape[0]} 筆")

    # Fit the classifier.
    print("🧠 訓練模型中...")
    model = MultinomialNB()
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows.
    print("\n📈 模型評估結果:")
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    # Create the output directory if it does not exist yet.
    os.makedirs('./models', exist_ok=True)

    # Persist the model together with its vectorizer; both are needed to
    # classify new text later.
    joblib.dump(model, './models/language_model.pkl')
    joblib.dump(vectorizer, './models/vectorizer.pkl')
    print("✅ 模型訓練完成並已儲存！")

    return model, vectorizer

# Run training only when this file is executed directly (this is also the
# case inside a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    train_model()

原始資料筆數: 190247
移除 NaN 後資料筆數: 190247
🚀 開始訓練模型...
📊 準備資料進行訓練...
訓練集大小: 152197 筆
測試集大小: 38050 筆
🧠 訓練模型中...


KeyboardInterrupt: 