In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import resample
from scipy.sparse import hstack
import joblib
import os

def preprocess_and_balance(data_path, target_count=10000, encoding='latin-1'):
    """Load a tab-separated sentence corpus and balance it per language.

    The file is read as three header-less, tab-separated columns
    (``id``, ``language``, ``text``). Rows missing a language or text are
    dropped, languages with fewer than ``target_count`` rows are discarded,
    and every remaining language is down-sampled (without replacement) to
    exactly ``target_count`` rows. The result is shuffled with a fixed seed.

    Args:
        data_path: Path of the tab-separated input file.
        target_count: Number of rows to keep for each language; languages
            with fewer rows are excluded entirely.
        encoding: Text encoding passed to ``pandas.read_csv``. Defaults to
            ``'latin-1'`` for backward compatibility.
            NOTE(review): latin-1 decodes any byte sequence without error,
            so a UTF-8 file would load "successfully" as mojibake. If the
            corpus is UTF-8 (presumably, for a multilingual corpus --
            confirm), pass ``encoding='utf-8'`` and decode inference input
            the same way.

    Returns:
        A ``pandas.DataFrame`` with columns ``id``, ``language`` and ``text``
        holding ``target_count`` rows per retained language, in shuffled
        order with a fresh 0..n-1 index.

    Raises:
        ValueError: If no language has at least ``target_count`` rows.
    """
    data = pd.read_csv(data_path, sep='\t', names=['id', 'language', 'text'], encoding=encoding)
    print(f"Original number of records: {len(data)}")

    data = data.dropna(subset=['language', 'text'])
    print(f"Number of records after removing NaNs: {len(data)}")

    lang_counts = data['language'].value_counts()
    sufficient_langs = lang_counts[lang_counts >= target_count].index
    data = data[data['language'].isin(sufficient_langs)]

    print(f"Number of language classes (samples ≥ {target_count}): {len(sufficient_langs)}")

    if len(sufficient_langs) == 0:
        # Fail with an actionable message instead of the opaque
        # "No objects to concatenate" that pd.concat([]) would raise.
        raise ValueError(
            f"No language has at least target_count={target_count} samples; "
            "lower target_count or provide more data."
        )

    # One grouping pass instead of a full boolean-mask scan per language.
    # sort=False keeps groups in first-appearance order (the same order as
    # Series.unique()), so the sampled rows are unchanged.
    # Every surviving group has >= target_count rows, hence replace=False.
    balanced_list = [
        resample(subset, replace=False, n_samples=target_count, random_state=42)
        for _, subset in data.groupby('language', sort=False)
    ]

    balanced_data = pd.concat(balanced_list).sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"\n✅ Each language class has {target_count} samples. Total samples: {len(balanced_data)}")
    print(balanced_data['language'].value_counts())

    return balanced_data

def train_model(balanced_data, model_dir='./models'):
    """Train and persist a TF-IDF (word + char n-gram) language classifier.

    The texts are split 80/20 (stratified by language, fixed seed) *before*
    any feature extraction. Both TF-IDF vectorizers are fitted on the
    training texts only and then applied to the held-out texts, so the
    vocabulary and IDF weights never see test data and the printed report is
    an honest held-out estimate.

    Args:
        balanced_data: DataFrame with a ``text`` column (documents) and a
            ``language`` column (class labels).
        model_dir: Directory in which the fitted model and vectorizers are
            saved; created if missing. Defaults to ``'./models'``.

    Returns:
        A tuple ``(model, (word_vectorizer, char_vectorizer))`` with the
        fitted ``LogisticRegression`` and the two fitted ``TfidfVectorizer``
        objects.

    Side effects:
        Prints progress and a classification report, and writes
        ``language_model_wordchar.pkl`` and ``vectorizer_wordchar.pkl`` into
        ``model_dir`` via ``joblib.dump``.
    """
    print("🚀 Starting model training (word + char n-gram + Logistic Regression)")

    word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), max_features=50000, max_df=0.95, min_df=2)
    char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 6), max_features=50000, max_df=0.95, min_df=2)

    # Split the raw texts first; fitting the vectorizers on the full corpus
    # would leak test-set vocabulary/IDF statistics into training.
    text_train, text_test, y_train, y_test = train_test_split(
        balanced_data['text'],
        balanced_data['language'],
        test_size=0.2,
        random_state=42,
        stratify=balanced_data['language'],
    )

    # CSR is requested explicitly because it supports the row access the
    # estimator needs; hstack would otherwise return COO.
    X_train = hstack(
        [word_vectorizer.fit_transform(text_train), char_vectorizer.fit_transform(text_train)],
        format='csr',
    )
    X_test = hstack(
        [word_vectorizer.transform(text_test), char_vectorizer.transform(text_test)],
        format='csr',
    )
    print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

    # multi_class is intentionally not passed: 'auto' was already the
    # default, and the parameter is deprecated in recent scikit-learn.
    model = LogisticRegression(max_iter=1000, solver='lbfgs')
    model.fit(X_train, y_train)

    print("\n📈 Model evaluation:")
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, 'language_model_wordchar.pkl'))
    joblib.dump((word_vectorizer, char_vectorizer), os.path.join(model_dir, 'vectorizer_wordchar.pkl'))
    print("✅ Model training completed and saved!")

    return model, (word_vectorizer, char_vectorizer)

if __name__ == "__main__":
    # Script entry point: build a class-balanced corpus from "sentences.csv"
    # (10000 rows per language), then train, evaluate and save the classifier.
    # `data` is bound at module level, so it remains available after the run.
    data = preprocess_and_balance("sentences.csv", target_count=10000)
    train_model(data)


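Original number of records: 12731979
Number of records after removing NaNs: 12731962
Number of language classes (samples ≥ 10000): 67

✅ Each language class has 10000 samples. Total samples: 670000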
language
hun    10000
slk    10000
fin    10000
srp    10000
ind    10000
       ...  
tgl    10000
ita    10000
fra    10000
mar    10000
lfn    10000
Name: count, Length: 67, dtype: int64
🚀 Starting model training (word + char n-gram + Logistic Regression)
Training set size: 536000, Test set size: 134000





📈 Model evaluation:
              precision    recall  f1-score   support

         ara       0.99      1.00      0.99      2000
         asm       0.99      0.98      0.99      2000
         bel       0.97      0.98      0.98      2000
         ben       0.99      0.99      0.99      2000
         ber       0.83      0.80      0.82      2000
         bul       0.93      0.93      0.93      2000
         ces       0.97      0.95      0.96      2000
         ckb       1.00      0.99      1.00      2000
         cmn       0.98      0.99      0.98      2000
         dan       0.90      0.90      0.90      2000
         deu       0.99      0.99      0.99      2000
         ell       1.00      1.00      1.00      2000
         eng       0.99      0.99      0.99      2000
         epo       0.98      0.98      0.98      2000
         fin       0.99      0.99      0.99      2000
         fra       0.99      0.99      0.99      2000
         gos       0.97      0.97      0.97      2000
         hau    