# In [None]:  (Jupyter notebook cell marker)
'''
============================================

---------------  Claude hyperparameter-tuning walkthrough  ---------------

============================================
'''

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, 
                            precision_recall_curve, precision_score,recall_score,f1_score,
                            mean_squared_error,mean_absolute_error)
import pyvizml
from pyvizml import ClfMetrics
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
plt.rcParams['font.family'] = 'Heiti TC' # Display Chinese characters in plots (Mac OS)

#import warnings
#warnings.filterwarnings("ignore", category=FutureWarning)

class XGBoostHawkeyeModel:
    def __init__(self):
        self.model = None
        self.feature_names = None
        self.optimal_threshold = 0.5
        self.threshold_metrics = {}

    # ======================
    # 1. Data preprocessing
    # ======================
    def preprocess_data(self, df, target_col):
        processed_df = df.copy()
        label_encoders = {}
        categorical_cols = processed_df.select_dtypes(include=['object']).columns

        for col in categorical_cols: # 在變數中
            if col != target_col:    # 若不是目標變數
                le = LabelEncoder()  
                processed_df[col] = le.fit_transform(processed_df[col].astype(str))
                label_encoders[col] = le

        return processed_df, label_encoders

    # ======================
    # 2. scale_pos_weight baseline test
    # ======================
    def find_best_weight(self, X_train, y_train, X_val, y_val):
        pos_count = (y_train == 1).sum()
        neg_count = (y_train == 0).sum()
        scale_pos_weight = neg_count / pos_count

        print(f"\n正樣本數: {pos_count}, 負樣本數: {neg_count}, 建議初始 scale_pos_weight = {scale_pos_weight:.2f}")

        test_weights = [1, scale_pos_weight*0.5, scale_pos_weight, scale_pos_weight*1.5]
        best_auc, best_weight = 0, scale_pos_weight

        for w in test_weights:
            temp_model = xgb.XGBClassifier(
                objective="binary:logistic",
                eval_metric="auc",
                scale_pos_weight=w,
                #use_label_encoder=False,
                random_state=42,
                n_jobs=-1
            )
            temp_model.fit(X_train, y_train)
            y_pred_prob = temp_model.predict_proba(X_val)[:, 1]
            auc = roc_auc_score(y_val, y_pred_prob)
            print(f"scale_pos_weight={w:.2f}, AUC={auc:.4f}")

            if auc > best_auc:
                best_auc, best_weight = auc, w

        print(f"最佳 scale_pos_weight = {best_weight:.2f}, AUC={best_auc:.4f}")
        return best_weight

    # ======================
    # 3. Threshold optimization
    # ======================
    def find_optimal_threshold(self, y_true, y_prob, metric = "f1"):
        precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)

        if metric == "f1":
            optimal_idx = np.argmax(f1_scores)
        elif metric == "precision":
            optimal_idx = np.argmax(precision)
        elif metric == "recall":
            optimal_idx = np.argmax(recall)

        self.optimal_threshold = thresholds[optimal_idx]
        self.threshold_metrics = {
            "optimal_threshold": self.optimal_threshold,
            "precision": precision[optimal_idx],
            "recall": recall[optimal_idx],
            "f1": f1_scores[optimal_idx],
        }

        print("\n=== 最佳閾值分析 ===")
        print(f"最佳閾值: {self.optimal_threshold:.4f}")
        print(f"Precision: {precision[optimal_idx]:.4f}")
        print(f"Recall: {recall[optimal_idx]:.4f}")
        print(f"F1: {f1_scores[optimal_idx]:.4f}")

        return self.optimal_threshold

    # ======================
    # 4. Hyperparameter tuning
    # ======================
    def tune_hyperparameters(self, X_train, y_train, best_weight=None, search_type="grid", cv=3, n_iter=10):
        """Search XGBoost hyperparameters by cross-validated ROC AUC.

        The best estimator found is stored on ``self.model``.

        Args:
            X_train: Training features.
            y_train: Training labels.
            best_weight: ``scale_pos_weight`` for the base model; a falsy
                value (``None`` or 0) falls back to 1.
            search_type: ``"grid"`` runs ``GridSearchCV``; any other value
                runs ``RandomizedSearchCV``.
            cv: Cross-validation setting passed to the searcher.
            n_iter: Number of sampled settings (randomized search only).

        Returns:
            The best estimator found by the search.
        """
        search_space = {
            "n_estimators": [100, 200, 500],
            "max_depth": [3, 5, 7],
            "learning_rate": np.linspace(0.01, 0.2, 3),  # only 3 values to keep the search small
            "subsample": [0.6, 0.8, 1.0],
            "colsample_bytree": [0.6, 0.8, 1.0],
            "reg_alpha": [0, 0.1, 1],
            "reg_lambda": [0.1, 1, 10],
        }

        estimator = xgb.XGBClassifier(
            objective="binary:logistic",
            eval_metric="auc",
            scale_pos_weight=best_weight or 1,
            random_state=42,
            n_jobs=-1,
        )

        # Options common to both search strategies.
        shared_options = dict(scoring="roc_auc", cv=cv, verbose=1, n_jobs=-1)
        if search_type != "grid":
            search = RandomizedSearchCV(
                estimator, search_space, n_iter=n_iter, random_state=42, **shared_options
            )
        else:
            search = GridSearchCV(estimator, search_space, **shared_options)

        search.fit(X_train, y_train)

        print("\n=== 超參數調優結果 ===")
        print("最佳參數:", search.best_params_,'\n')
        print("最佳分數 (AUC):", search.best_score_)

        self.model = search.best_estimator_
        return self.model

    # ======================
    # 5. Model training
    # ======================
    def train(self, X_train, y_train, best_weight=None):
        """Fit a default-configured XGBoost classifier and keep it on ``self.model``.

        Args:
            X_train: Training features.
            y_train: Training labels.
            best_weight: ``scale_pos_weight`` to use; a falsy value
                (``None`` or 0) falls back to 1.
        """
        # Assign before fitting so self.model refers to the new estimator
        # even if fit() raises.
        self.model = xgb.XGBClassifier(
            objective="binary:logistic",
            eval_metric="auc",
            scale_pos_weight=best_weight or 1,
            random_state=42,
            n_jobs=-1,
        )
        self.model.fit(X_train, y_train)

    # ======================
    # 6. Evaluation
    # ======================
    def evaluate(self, X_test, y_test, use_optimal_threshold=True):
        y_prob = self.model.predict_proba(X_test)[:, 1]
        threshold = self.optimal_threshold if use_optimal_threshold else 0.5
        y_pred = (y_prob >= threshold).astype(int)

        print("\n=== 模型評估 ===")
        print("AUC:", roc_auc_score(y_test, y_prob))
        print("F1:", f1_score(y_test, y_pred))
        print("\n分類報告:\n", classification_report(y_test, y_pred))

        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
        plt.title("混淆矩陣")
        plt.show()

        return {"auc": roc_auc_score(y_test, y_prob), "f1": f1_score(y_test, y_pred)}