In [3]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from scipy.stats import uniform
import sklearn

# Report the installed scikit-learn version so a run can be reproduced later;
# NOTE(review): the AdaBoost keyword arguments used further down
# (`estimator=`, `algorithm='SAMME'`) are presumably version-sensitive — confirm
# against the version printed here before upgrading.
print(f"scikit-learn 版本: {sklearn.__version__}")

# ====================== 计算防守统计数据 ======================
def _append_ratio(bucket, numerator, denominator, epsilon=1e-8):
    """Append ``numerator / denominator`` to ``bucket`` if the denominator is positive.

    Non-positive and NaN denominators are skipped so that a missing or zero
    statistic cannot inject a huge (``x / epsilon``) or NaN value into a mean.
    """
    if denominator > 0:
        bucket.append(numerator / (denominator + epsilon))


def compute_defensive_stats(match_df, team_positions_df):
    """Compute per-team defensive statistics from match results.

    For each team listed in ``team_positions_df['team_name']`` six ratios are
    averaged over the team's *home* matches, each being "goals conceded"
    (the away side's goals) divided by one away/home statistic:

    * ratio1 - away side's pre-match points per game
    * ratio2 - away side's corner count
    * ratio3 - home side's yellow cards + red cards + fouls
    * ratio4 - away side's xG (column ``away_xg``; skipped if absent)
    * ratio5 - away side's shots on target + shots off target
    * ratio6 - away side's possession

    Total and average goals conceded are accumulated over home and away
    matches.

    Args:
        match_df: One row per match, with the columns read below.
        team_positions_df: Frame with a ``team_name`` column.

    Returns:
        DataFrame with columns ``team_name``, ``ratio1``..``ratio6``,
        ``total_goals_conceded`` and ``average_goals_conceded``; a ratio with
        no valid observation is reported as 0.
    """
    epsilon = 1e-8  # keeps divisions numerically stable
    ratio_names = [f'ratio{i}' for i in range(1, 7)]

    team_stats = {}
    for team in team_positions_df['team_name'].unique():
        stats = {f'{name}_list': [] for name in ratio_names}
        stats['total_goals_conceded'] = 0
        stats['num_matches'] = 0
        team_stats[team] = stats

    # Column presence does not change per row, so test it once.
    has_away_xg = 'away_xg' in match_df.columns

    # Home matches: ratios plus goals conceded by the home side.
    for idx, row in match_df.iterrows():
        home_team, away_team = row['home_team_name'], row['away_team_name']
        if home_team not in team_stats or away_team not in team_stats:
            print(f"警告: 主队 {home_team} 或客队 {away_team} 未找到，跳过比赛 {idx}")
            continue

        home_stats = team_stats[home_team]
        conceded = row['away_team_goal_count']

        _append_ratio(home_stats['ratio1_list'], conceded, row['Pre-Match PPG (Away)'], epsilon)
        _append_ratio(home_stats['ratio2_list'], conceded, row['away_team_corner_count'], epsilon)

        # ratio3 and ratio5 use the same positive-denominator guard as the
        # other ratios: a goal conceded with zero fouls/cards or zero recorded
        # shots would otherwise contribute roughly 1e8 to the team mean.
        discipline = row['home_team_yellow_cards'] + row['home_team_red_cards'] + row['home_team_fouls']
        _append_ratio(home_stats['ratio3_list'], conceded, discipline, epsilon)

        if has_away_xg:
            _append_ratio(home_stats['ratio4_list'], conceded, row['away_xg'], epsilon)

        shots_total = row['away_team_shots_on_target'] + row['away_team_shots_off_target']
        _append_ratio(home_stats['ratio5_list'], conceded, shots_total, epsilon)

        _append_ratio(home_stats['ratio6_list'], conceded, row['away_team_possession'], epsilon)

        home_stats['total_goals_conceded'] += conceded
        home_stats['num_matches'] += 1

    # Away matches: only goals conceded and the match count are accumulated.
    for idx, row in match_df.iterrows():
        away_team = row['away_team_name']
        if away_team not in team_stats:
            print(f"警告: 客队 {away_team} 未找到，跳过比赛 {idx}")
            continue
        team_stats[away_team]['total_goals_conceded'] += row['home_team_goal_count']
        team_stats[away_team]['num_matches'] += 1

    # Collapse the per-match lists into one summary row per team.
    data = []
    for team, stats in team_stats.items():
        num_matches = stats['num_matches']
        summary = {'team_name': team}
        for name in ratio_names:
            values = stats[f'{name}_list']
            summary[name] = np.mean(values) if values else 0
        summary['total_goals_conceded'] = stats['total_goals_conceded']
        summary['average_goals_conceded'] = (
            stats['total_goals_conceded'] / num_matches if num_matches > 0 else 0)
        data.append(summary)

    return pd.DataFrame(data)

# ====================== ELO评分算法实现 ======================
def initialize_elo_scores(team_positions_df):
    """Seed an Elo rating for every team from its points per game.

    Each team starts at 1500 and receives ``20 * (n_teams - rank)`` extra
    points, where ``rank`` is the team's position when teams are ordered by
    mean ``points_per_game`` from best (rank 1) to worst (rank ``n_teams``).
    Ties share their average rank.

    The ranking is computed across teams. Ranking within a single team's own
    rows would give every one-row team rank 1 and therefore the same rating.

    Args:
        team_positions_df: Frame with ``team_name`` and ``points_per_game``
            columns; a team may appear in several rows (e.g. several seasons),
            in which case its mean points per game is used.

    Returns:
        dict mapping team name to its initial Elo rating (float). Teams whose
        points per game is missing keep the 1500 baseline.
    """
    teams = team_positions_df['team_name'].unique().tolist()
    n_teams = len(teams)

    mean_ppg = team_positions_df.groupby('team_name')['points_per_game'].mean()
    # ascending=False: the strongest team gets rank 1 and the largest bonus.
    ranks = mean_ppg.rank(ascending=False)

    team_elo = {}
    for team in teams:
        rank = ranks.get(team)
        bonus = 0.0 if rank is None or pd.isna(rank) else 20 * (n_teams - rank)
        team_elo[team] = float(1500 + bonus)
    return team_elo

def update_elo_scores(elo_scores, home_team, away_team, home_score, away_score, K=30):
    """Apply one match result to the Elo table and return the same table.

    Args:
        elo_scores: dict of team -> rating; updated in place.
        home_team: Key of the home side in ``elo_scores``.
        away_team: Key of the away side in ``elo_scores``.
        home_score: Goals scored by the home side.
        away_score: Goals scored by the away side.
        K: Update step size.

    Returns:
        The ``elo_scores`` dict that was passed in.
    """
    rating_home = elo_scores[home_team]
    rating_away = elo_scores[away_team]

    # Logistic expectation on the rating gap (400-point scale).
    win_prob_home = 1 / (1 + 10 ** ((rating_away - rating_home) / 400))
    win_prob_away = 1 / (1 + 10 ** ((rating_home - rating_away) / 400))

    # Actual outcome: 1 for a win, 0 for a loss, 0.5 each for anything else.
    if home_score > away_score:
        result_home, result_away = 1, 0
    elif home_score < away_score:
        result_home, result_away = 0, 1
    else:
        result_home, result_away = 0.5, 0.5

    elo_scores[home_team] += K * (result_home - win_prob_home)
    elo_scores[away_team] += K * (result_away - win_prob_away)
    return elo_scores

# **修改后的函数：计算主客场表现差异比值**
def compute_home_away_diff(match_df, team_positions_df, elo_scores):
    """Compute a home/away performance ratio for every team and season.

    For each opponent a team met both at home and away in a season, two
    ratios are formed from the first such home match and first such away
    match:

    * attacking: the team's xG at home / the team's xG away
    * defensive: xG conceded at home / xG conceded away

    Their mean is the per-opponent ratio. The per-opponent ratios are then
    averaged with the opponents' Elo ratings as weights.

    Args:
        match_df: Matches with ``home_team_name``, ``away_team_name`` and
            ``season`` columns, and optionally ``home_xg`` / ``away_xg``.
        team_positions_df: Frame with a ``team_name`` column.
        elo_scores: dict of team -> Elo rating; opponents not present (or a
            ``None`` table) fall back to 1500.

    Returns:
        dict keyed by ``"{team}_{season}"``. The value is 1.0 (no home/away
        difference) whenever no usable xG pair exists, including when the xG
        columns are absent from ``match_df``.
    """
    epsilon = 1e-8  # guards the divisions
    neutral_ratio = 1.0
    elo_lookup = elo_scores or {}
    home_away_diff = {}

    # Without both xG columns no ratio can be formed. Substituting 0 would
    # yield a ratio of 0 for every team instead of the neutral value.
    has_xg = 'home_xg' in match_df.columns and 'away_xg' in match_df.columns

    for team in team_positions_df['team_name'].unique():
        for season in match_df['season'].unique():
            team_matches = match_df[
                ((match_df['home_team_name'] == team) | (match_df['away_team_name'] == team)) &
                (match_df['season'] == season)
            ]
            if team_matches.empty:
                continue

            key = f"{team}_{season}"
            if not has_xg:
                home_away_diff[key] = neutral_ratio
                continue

            # Opponents in order of first appearance, so the weighted average
            # is accumulated in a reproducible order.
            participants = pd.unique(team_matches[['home_team_name', 'away_team_name']].values.ravel())
            opponents = [name for name in participants if name != team]

            diff_ratios = []
            weights = []
            for opponent in opponents:
                home_match = team_matches[
                    (team_matches['home_team_name'] == team) & (team_matches['away_team_name'] == opponent)]
                away_match = team_matches[
                    (team_matches['away_team_name'] == team) & (team_matches['home_team_name'] == opponent)]
                if home_match.empty or away_match.empty:
                    continue

                xg_for_home = home_match['home_xg'].values[0]      # team's xG at home
                xg_for_away = away_match['away_xg'].values[0]      # team's xG away
                xg_against_home = home_match['away_xg'].values[0]  # xG conceded at home
                xg_against_away = away_match['home_xg'].values[0]  # xG conceded away

                xg_values = np.array(
                    [xg_for_home, xg_for_away, xg_against_home, xg_against_away], dtype=float)
                # Skip pairs with missing values or a non-positive denominator;
                # dividing by epsilon alone would produce a value near 1e8.
                if not np.all(np.isfinite(xg_values)) or xg_for_away <= 0 or xg_against_away <= 0:
                    continue

                ratio_xg = xg_for_home / (xg_for_away + epsilon)
                ratio_conceded = xg_against_home / (xg_against_away + epsilon)
                diff_ratios.append((ratio_xg + ratio_conceded) / 2)
                weights.append(elo_lookup.get(opponent, 1500))

            if diff_ratios:
                home_away_diff[key] = np.average(diff_ratios, weights=weights)
            else:
                home_away_diff[key] = neutral_ratio

    return home_away_diff

# ====================== 数据加载函数（含交叉验证） ======================
def _fold_averaged_pca(features, kfold, n_components=2):
    """Project every row of ``features`` onto principal components, averaged over folds.

    For each fold a scaler and a PCA are fitted on the training rows only and
    then applied to *all* rows, so row ``i`` of the result always describes
    row ``i`` of ``features``. Component signs are aligned to the first fold
    before averaging because the sign of a principal axis is arbitrary.
    """
    values = np.asarray(features, dtype=float)
    reference_axes = None
    projections = []
    for train_idx, _ in kfold.split(values):
        scaler = StandardScaler().fit(values[train_idx])
        pca = PCA(n_components=n_components).fit(scaler.transform(values[train_idx]))
        projected = pca.transform(scaler.transform(values))
        if reference_axes is None:
            reference_axes = pca.components_
        else:
            signs = np.sign(np.sum(pca.components_ * reference_axes, axis=1))
            signs[signs == 0] = 1.0
            projected = projected * signs
        projections.append(projected)
    return np.mean(projections, axis=0)


def load_all_league_data(base_path, leagues, seasons):
    """Load, clean and featurise every available league/season.

    For each ``(country, league)`` in ``leagues`` and each season the
    ``...-teams-{season}-stats.csv`` and ``...-matches-{season}-stats.csv``
    files under ``base_path`` are read (missing pairs are skipped with a
    warning). Team names are normalised, defensive statistics are merged in,
    and both teams and matches are projected to two principal components.

    Args:
        base_path: Directory containing the CSV files.
        leagues: Iterable of ``(country_name, league_name)`` pairs.
        seasons: Iterable of season strings used in the file names.

    Returns:
        ``(all_team_positions_df, all_match_positions_df)``. The team frame
        carries ``PC1``/``PC2``, the ratios, goal statistics, ``team_season``
        and ``home_away_diff``; the match frame carries team names,
        ``PC1``/``PC2``, ``league``, ``season`` and, when present in the
        input, ``home_xg``/``away_xg``.

    Raises:
        ValueError: If no league/season produced any usable data.
    """
    all_team_positions, all_match_positions = [], []
    stat_columns = ['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6',
                    'total_goals_conceded', 'average_goals_conceded']
    defensive_candidates = ['goals_conceded', 'total_goals_conceded', 'ratio1', 'ratio2', 'ratio3',
                            'ratio4', 'ratio5', 'ratio6', 'normalized_defense_score']

    for country_name, league_name in leagues:
        for season in seasons:
            print(f"加载数据: {country_name} - {league_name} - {season}")
            team_file = os.path.join(base_path, f"{country_name}-{league_name}-teams-{season}-stats.csv")
            match_file = os.path.join(base_path, f"{country_name}-{league_name}-matches-{season}-stats.csv")
            if not os.path.exists(team_file) or not os.path.exists(match_file):
                print(f"警告: {country_name} - {league_name} - {season} 文件缺失")
                continue

            team_df = pd.read_csv(team_file)
            match_df = pd.read_csv(match_file)

            # Normalise team names; prefer 'common_name', fall back to 'team_name'.
            name_column = next((c for c in ('common_name', 'team_name') if c in team_df.columns), None)
            if name_column is None:
                print(f"警告: {country_name} - {league_name} - {season} 缺少球队名称列")
                continue
            team_df['team_name'] = team_df[name_column].str.strip().str.lower()
            match_df['home_team_name'] = match_df['home_team_name'].str.strip().str.lower()
            match_df['away_team_name'] = match_df['away_team_name'].str.strip().str.lower()

            # Keep only matches where both sides are known teams.
            team_names = team_df['team_name'].unique()
            original_match_count = len(match_df)
            match_df = match_df[
                match_df['home_team_name'].isin(team_names) & match_df['away_team_name'].isin(team_names)]
            if len(match_df) < original_match_count:
                print(f"警告: 过滤了 {original_match_count - len(match_df)} 场比赛")

            if match_df.empty:
                print("警告: 无有效比赛数据")
                continue

            defensive_stats_df = compute_defensive_stats(match_df, team_df)
            if defensive_stats_df.empty:
                print("警告: 未计算出防守统计")
                continue

            team_df = team_df.merge(defensive_stats_df, on='team_name', how='left')
            for col in stat_columns:
                if col not in team_df.columns:
                    print(f"警告: {col} 列缺失，设为0")
                    team_df[col] = 0

            # Min-max defence score: fewest goals conceded -> 1, most -> 0.
            conceded = team_df['total_goals_conceded']
            if conceded.max() != conceded.min():
                team_df['normalized_defense_score'] = (
                    (conceded.max() - conceded) / (conceded.max() - conceded.min() + 1e-8))
            else:
                team_df['normalized_defense_score'] = 0

            # PCA on the defensive features (fold-averaged, row-aligned).
            defensive_columns = [col for col in team_df.columns if col in defensive_candidates]
            team_df_defensive = team_df[defensive_columns].fillna(0)
            kf = KFold(n_splits=5, shuffle=True, random_state=42)
            X_pca_avg = _fold_averaged_pca(team_df_defensive, kf)

            team_positions = pd.DataFrame({
                'team_name': team_df['team_name'], 'PC1': X_pca_avg[:, 0], 'PC2': X_pca_avg[:, 1],
                'points_per_game': team_df['points_per_game'], 'league': league_name, 'season': season,
                'ratio1': team_df['ratio1'], 'ratio2': team_df['ratio2'], 'ratio3': team_df['ratio3'],
                'ratio4': team_df['ratio4'], 'ratio5': team_df['ratio5'], 'ratio6': team_df['ratio6'],
                'normalized_defense_score': team_df['normalized_defense_score'],
                'total_goals_conceded': team_df['total_goals_conceded'],
                'average_goals_conceded': team_df['average_goals_conceded']
            })
            team_positions['team_season'] = league_name + '_' + team_positions['team_name'] + '_' + team_positions['season']
            all_team_positions.append(team_positions)

            # PCA on the numeric match columns.
            match_df = match_df.drop(
                columns=['timestamp', 'date_GMT', 'status', 'attendance', 'referee', 'stadium_name', 'Game Week'],
                errors='ignore')
            numeric_cols = match_df.select_dtypes(include=[np.number]).columns
            # Mean-impute, then zero-fill columns that are entirely NaN so the
            # scaler and PCA never receive missing values.
            match_df[numeric_cols] = match_df[numeric_cols].fillna(match_df[numeric_cols].mean()).fillna(0)
            X_pca_match_avg = _fold_averaged_pca(match_df[numeric_cols], kf)

            match_columns = {
                'home_team_name': match_df['home_team_name'], 'away_team_name': match_df['away_team_name'],
                'PC1': X_pca_match_avg[:, 0], 'PC2': X_pca_match_avg[:, 1], 'league': league_name, 'season': season
            }
            # Carry xG through: the home/away ratio is computed from these
            # columns and cannot be derived without them.
            for xg_col in ('home_xg', 'away_xg'):
                if xg_col in match_df.columns:
                    match_columns[xg_col] = match_df[xg_col]
            all_match_positions.append(pd.DataFrame(match_columns))

    if not all_team_positions or not all_match_positions:
        raise ValueError(f"未在 {base_path} 中加载到任何有效的联赛数据")

    all_team_positions_df = pd.concat(all_team_positions, ignore_index=True)
    all_match_positions_df = pd.concat(all_match_positions, ignore_index=True)

    # Home/away ratio per team and season. Without both xG columns every team
    # keeps the neutral value 1.0.
    if {'home_xg', 'away_xg'}.issubset(all_match_positions_df.columns):
        # Seed Elo from every loaded team so opponents from all leagues get a
        # rating, not only those from the first file that happened to load.
        elo_scores = initialize_elo_scores(all_team_positions_df)
        home_away_diff = compute_home_away_diff(all_match_positions_df, all_team_positions_df, elo_scores)
        # compute_home_away_diff keys its result by "{team}_{season}" (no
        # league prefix), so the lookup key is built the same way.
        diff_key = all_team_positions_df['team_name'] + '_' + all_team_positions_df['season']
        all_team_positions_df['home_away_diff'] = (
            diff_key.map(home_away_diff).replace([np.inf, -np.inf], np.nan).fillna(1.0))
    else:
        all_team_positions_df['home_away_diff'] = 1.0

    return all_team_positions_df, all_match_positions_df

# ====================== 使用Adaboost优化权重 ======================
def compute_adaboost_weights(team_positions_df):
    """Derive weights for the six defensive ratios from an AdaBoost classifier.

    Teams are labelled by whether their ``average_goals_conceded`` is in the
    lower (0) or upper (1) half. An AdaBoost classifier is tuned with a
    5-fold grid search on an 80/20 train/test split, test metrics are
    printed, and the classifier's feature importances, normalised to sum to
    1, are returned as the weights.

    Args:
        team_positions_df: Frame with ``ratio1``..``ratio6`` and
            ``average_goals_conceded`` columns.

    Returns:
        ``np.float32`` array with one weight per ratio. Uniform weights are
        returned when the labels contain a single class or when the fitted
        model reports no usable importances.
    """
    ratio_columns = ['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6']
    uniform_weights = np.full(len(ratio_columns), 1.0 / len(ratio_columns), dtype=np.float32)

    ratios = team_positions_df[ratio_columns].replace([np.inf, -np.inf], np.nan).fillna(0)
    correlation_matrix = ratios.corr()
    print("特征相关性矩阵:\n", correlation_matrix)

    scaler = StandardScaler()
    ratios_scaled = scaler.fit_transform(ratios)

    conceded = team_positions_df['average_goals_conceded']
    try:
        labels = pd.qcut(conceded, q=2, labels=[0, 1], duplicates='drop').fillna(0).astype(int)
    except ValueError:
        # qcut raises when duplicate bin edges are dropped and the two labels
        # no longer match the number of bins; fall back to a median split.
        labels = (conceded > conceded.median()).astype(int)

    if labels.nunique() < 2:
        print("警告: 标签只有一个类别，使用均匀权重")
        return uniform_weights

    X_train, X_test, y_train, y_test = train_test_split(ratios_scaled, labels, test_size=0.2, random_state=42)

    ada = AdaBoostClassifier(algorithm='SAMME', random_state=42)
    param_grid = {
        'estimator': [DecisionTreeClassifier(max_depth=5), DecisionTreeClassifier(max_depth=10)],
        'n_estimators': [500, 1000, 1500, 250, 50, 200, 150, 100],
        'learning_rate': [0.01, 0.1, 0.5, 0.05, 0.25]
    }

    grid_search = GridSearchCV(ada, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("最佳参数:", best_params)

    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the full training split; reuse it instead of training
    # an identical model a second time.
    adaboost = grid_search.best_estimator_

    y_pred = adaboost.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    # ROC-AUC is undefined when the test split contains a single class.
    if len(np.unique(y_test)) > 1:
        roc_auc = roc_auc_score(y_test, adaboost.predict_proba(X_test)[:, 1])
    else:
        roc_auc = float('nan')
    conf_matrix = confusion_matrix(y_test, y_pred)

    print("测试集准确率:", accuracy)
    print("测试集F1分数:", f1)
    print("测试集ROC-AUC:", roc_auc)
    print("混淆矩阵:\n", conf_matrix)

    # Importances of the base estimators, weighted by each estimator's boosting
    # weight, as exposed by the fitted ensemble.
    feature_importances = np.asarray(adaboost.feature_importances_, dtype=float)

    total_importance = np.sum(feature_importances)
    if not np.isfinite(total_importance) or total_importance <= 0:
        # Normalising by zero would hand NaN weights to the optimiser.
        print("警告: 特征重要性无效，使用均匀权重")
        return uniform_weights

    weights = feature_importances / total_importance
    print(f"计算得到的权重: {weights}")

    return weights.astype(np.float32)

# **修改后的损失函数：加入主客场差异比值**
def compute_total_loss(positions, match_home_idx, match_away_idx, match_PC1, match_PC2, points_per_game, rank_scale,
                       ratios, w, normalized_defense_score, lambda_defense, lambda_supervision, lambda_reg, elo_scores,
                       home_away_diff):
    """Return the scalar training loss for the current team positions.

    The loss is the sum of four terms:

    * match term - min-max normalised distances between each match point
      ``(match_PC1, match_PC2)`` and the home/away team positions, weighted
      per match (see below) and averaged;
    * defence term - squared gap between ``positions[:, 1]`` and the negated
      ``|w|``-weighted ratio sum, scaled by ``lambda_defense``;
    * supervision term - squared gap between that same target and
      ``normalized_defense_score``, scaled by ``lambda_supervision``;
    * L2 penalty on ``|w|``, scaled by ``lambda_reg``.

    ``points_per_game`` is accepted but not read. A constant 0.0 is returned
    when there are no matches, and a NaN/inf total is replaced by 0.0.
    """
    epsilon = 1e-8
    if tf.shape(match_home_idx)[0] == 0:
        return tf.constant(0.0, dtype=tf.float32)

    # --- match term -------------------------------------------------------
    targets = tf.stack([match_PC1, match_PC2], axis=1)
    home_positions = tf.gather(positions, match_home_idx)
    away_positions = tf.gather(positions, match_away_idx)
    dist_home = tf.norm(home_positions - targets + epsilon, axis=1)
    dist_away = tf.norm(away_positions - targets + epsilon, axis=1)

    # Min-max scale both distance sets with shared bounds.
    pooled = tf.concat([dist_home, dist_away], axis=0)
    lowest = tf.reduce_min(pooled)
    spread = tf.reduce_max(pooled) - lowest + epsilon
    scaled_home = (dist_home - lowest) / spread
    scaled_away = (dist_away - lowest) / spread

    # Per-match Elo ratings and home/away ratios.
    elo_home = tf.gather(elo_scores, match_home_idx)
    elo_away = tf.gather(elo_scores, match_away_idx)
    ratio_home = tf.gather(home_away_diff, match_home_idx)  # gathered but not used in the weights below
    ratio_away = tf.gather(home_away_diff, match_away_idx)

    # The home weight shrinks with the Elo gap; the away weight is the home
    # weight times the away side's home/away ratio. Both are then normalised
    # to sum to (almost) one.
    elo_gap = tf.abs(elo_home - elo_away)
    raw_home = 1.0 / (1.0 + elo_gap * rank_scale + epsilon)
    raw_away = raw_home * ratio_away
    weight_sum = raw_home + raw_away
    share_home = raw_home / (weight_sum + epsilon)
    share_away = raw_away / (weight_sum + epsilon)

    match_loss = tf.reduce_mean(share_home * scaled_home + share_away * scaled_away)

    # --- defence and supervision terms -------------------------------------
    abs_w = tf.abs(w)
    defense_target = -tf.reduce_sum(abs_w * ratios, axis=1)
    defense_loss = tf.reduce_mean(tf.square(positions[:, 1] - defense_target))
    supervision_loss = tf.reduce_mean(tf.square(defense_target - normalized_defense_score))

    # --- regularisation -----------------------------------------------------
    regularization_loss = lambda_reg * tf.reduce_sum(tf.square(abs_w))

    total_loss = match_loss + lambda_defense * defense_loss + lambda_supervision * supervision_loss + regularization_loss
    is_invalid = tf.math.is_nan(total_loss) | tf.math.is_inf(total_loss)
    return tf.where(is_invalid, 0.0, total_loss)

# **修改后的Adam优化函数**
def adam_optimize_positions(team_positions_df, match_positions_df, initial_lr=0.0005, decay_steps=200000,
                            decay_rate=0.9, clipnorm=0.5, iterations=30000, verbose_interval=1000, random_seed=42,
                            lambda_defense=0.1, lambda_supervision=0.1, lambda_reg=0.01, patience=100, w=None):
    """Optimise team positions with Adam against ``compute_total_loss``.

    Positions (``PC1``/``PC2``) and ratios are standardised, then positions,
    ``rank_scale`` and the ratio weights ``w`` are trained jointly with an
    exponentially decaying learning rate, gradient-norm clipping and early
    stopping after ``patience`` non-improving iterations.

    Args:
        team_positions_df: Team frame with ``team_season``, ``team_name``,
            ``PC1``, ``PC2``, ``ratio1``..``ratio6``,
            ``normalized_defense_score``, ``points_per_game`` and
            ``home_away_diff``. Rows are assumed to have unique
            ``team_season`` values (the per-row tensors are indexed by the
            position of each unique ``team_season``).
        match_positions_df: Match frame with ``league``, ``season``,
            ``home_team_name``, ``away_team_name``, ``PC1`` and ``PC2``.
        initial_lr, decay_steps, decay_rate: Learning-rate schedule.
        clipnorm: Gradient-norm clip passed to Adam.
        iterations: Maximum number of optimisation steps.
        verbose_interval: Print progress every this many steps.
        random_seed: Seed passed to ``tf.random.set_seed``.
        lambda_defense, lambda_supervision, lambda_reg: Loss term weights.
        patience: Early-stopping patience in iterations.
        w: Initial ratio weights; uniform ``1/6`` when ``None``.

    Returns:
        ``(losses, team_positions_df, best_rank_scale, best_w)``. The frame
        carries the optimised ``PC1``/``PC2`` in the original scale. When no
        match can be mapped to known teams, returns
        ``([], team_positions_df, None, None)``.
    """
    team_seasons = team_positions_df['team_season'].unique()
    team_season_to_idx = {t: i for i, t in enumerate(team_seasons)}

    elo_scores_dict = initialize_elo_scores(team_positions_df)
    # Look the team name up from the frame instead of splitting the
    # "league_team_season" key on '_', which breaks if a name contains '_'.
    name_by_team_season = dict(zip(team_positions_df['team_season'], team_positions_df['team_name']))
    elo_scores = tf.convert_to_tensor(
        [elo_scores_dict[name_by_team_season[key]] for key in team_seasons], dtype=tf.float32)

    # Home/away performance ratio per team-season row.
    home_away_diff = tf.convert_to_tensor(team_positions_df['home_away_diff'].values, dtype=tf.float32)

    team_positions_df = team_positions_df.set_index('team_season')
    init_positions = team_positions_df[['PC1', 'PC2']].values
    ratios = team_positions_df[['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6']].values
    scaler_pos = StandardScaler()
    scaler_rat = StandardScaler()
    positions_scaled = scaler_pos.fit_transform(init_positions)
    ratios_scaled = scaler_rat.fit_transform(ratios)

    positions = tf.Variable(positions_scaled, dtype=tf.float32)
    ratios = tf.constant(ratios_scaled, dtype=tf.float32)
    normalized_defense_score = tf.constant(team_positions_df['normalized_defense_score'].values, dtype=tf.float32)
    points_per_game = tf.constant(team_positions_df['points_per_game'].values, dtype=tf.float32)

    # Map each match to the indices of its two team-seasons; matches that
    # reference an unknown team-season are dropped.
    home_indices, away_indices, target_pc1, target_pc2 = [], [], [], []
    for _, row in match_positions_df.iterrows():
        home_idx = team_season_to_idx.get(f"{row['league']}_{row['home_team_name']}_{row['season']}")
        away_idx = team_season_to_idx.get(f"{row['league']}_{row['away_team_name']}_{row['season']}")
        if home_idx is None or away_idx is None:
            continue
        home_indices.append(home_idx)
        away_indices.append(away_idx)
        target_pc1.append(row['PC1'])
        target_pc2.append(row['PC2'])

    if not home_indices:
        print("警告: 无有效比赛数据")
        return [], team_positions_df.reset_index(), None, None

    match_home_idx = tf.constant(home_indices, dtype=tf.int32)
    match_away_idx = tf.constant(away_indices, dtype=tf.int32)
    match_PC1 = tf.constant(target_pc1, dtype=tf.float32)
    match_PC2 = tf.constant(target_pc2, dtype=tf.float32)

    tf.random.set_seed(random_seed)
    rank_scale = tf.Variable(1.0, dtype=tf.float32)
    w = tf.Variable(np.abs(w) if w is not None else np.ones(6, dtype=np.float32) / 6, dtype=tf.float32)
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_lr, decay_steps, decay_rate)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=clipnorm)
    variables = [positions, rank_scale, w]

    losses, best_loss, patience_counter = [], float('inf'), 0
    best_positions, best_rank_scale, best_w = positions.numpy().copy(), 1.0, w.numpy().copy()

    for i in range(iterations):
        with tf.GradientTape() as tape:
            loss = compute_total_loss(positions, match_home_idx, match_away_idx, match_PC1, match_PC2, points_per_game,
                                      rank_scale, ratios, w, normalized_defense_score, lambda_defense,
                                      lambda_supervision, lambda_reg, elo_scores, home_away_diff)
        grads = tape.gradient(loss, variables)
        if any(g is None for g in grads):
            # Nothing is updated on this path, so every later iteration would
            # hit the same condition; stop instead of repeating the warning.
            print(f"警告: 迭代 {i + 1} 梯度为None")
            break

        loss_val = float(loss.numpy())
        losses.append(loss_val)

        if loss_val < best_loss:
            best_loss = loss_val
            # Snapshot BEFORE applying the gradients: `loss_val` was measured
            # at the current (pre-update) variable values.
            best_positions = positions.numpy().copy()
            best_rank_scale = float(rank_scale.numpy())
            best_w = w.numpy().copy()
            patience_counter = 0
        else:
            patience_counter += 1

        optimizer.apply_gradients(zip(grads, variables))

        if patience_counter >= patience:
            print(f"早停触发，训练在第 {i + 1} 轮停止")
            break

        if (i + 1) % verbose_interval == 0:
            print(f"迭代 {i + 1}/{iterations}, 损失 = {loss_val:.4f}, rank_scale = {rank_scale.numpy():.4f}")

    # Restore the best iterate and map positions back to the original scale.
    positions.assign(best_positions)
    rank_scale.assign(best_rank_scale)
    w.assign(best_w)
    final_pos = scaler_pos.inverse_transform(positions.numpy())
    for idx, team_season in enumerate(team_seasons):
        team_positions_df.loc[team_season, 'PC1'] = final_pos[idx, 0]
        team_positions_df.loc[team_season, 'PC2'] = final_pos[idx, 1]
    team_positions_df = team_positions_df.reset_index()
    return losses, team_positions_df, best_rank_scale, best_w

# ====================== 随机搜索超参数调整 ======================
def random_search_hyperparameters(team_positions, match_positions, w, n_iter=10, random_state=42):
    """Randomly sample loss weights and keep the combination with the lowest loss.

    Each trial draws ``lambda_defense``, ``lambda_supervision`` and
    ``lambda_reg`` and runs ``adam_optimize_positions`` on copies of the
    inputs. Trials are compared by the lowest loss reached during the run,
    because the optimiser returns its best iterate rather than its last one.

    Args:
        team_positions: Team frame passed to ``adam_optimize_positions``.
        match_positions: Match frame passed to ``adam_optimize_positions``.
        w: Initial ratio weights forwarded to the optimiser.
        n_iter: Number of random trials.
        random_state: Seed for NumPy's global RNG, which the draws use.

    Returns:
        dict with the best ``lambda_*`` values, or ``None`` if no trial
        produced any loss value.
    """
    np.random.seed(random_state)
    best_loss, best_params = float('inf'), None

    for _ in range(n_iter):
        # scipy's uniform(loc, scale) samples from [loc, loc + scale].
        params = {
            'lambda_defense': uniform(0.01, 0.2).rvs(),
            'lambda_supervision': uniform(0.01, 0.2).rvs(),
            'lambda_reg': uniform(0.001, 0.02).rvs()
        }
        print(f"\n随机搜索: {params}")

        losses, _, _, _ = adam_optimize_positions(
            team_positions.copy(), match_positions.copy(), w=w, **params
        )
        if not losses:
            continue

        # Early stopping means the last recorded loss is usually NOT the best
        # one; score the trial by the minimum, which matches the returned state.
        trial_loss = min(losses)
        if trial_loss < best_loss:
            best_loss = trial_loss
            best_params = params

    print(f"\n最佳超参数: {best_params}, 最佳损失: {best_loss:.4f}")
    return best_params

# ====================== 可视化函数 ======================
def visualize_team_evolution_by_league_static(team_positions_df, seasons_order,
                                              output_dir="/Users/peixuanma/Downloads/Output_Graphs"):
    """Save one PC1/PC2 trajectory plot per league.

    For every league, the teams present in all of ``seasons_order`` are drawn
    (all teams of the league if none qualifies); each team is a line through
    its per-season positions, with every point annotated by its season.

    Args:
        team_positions_df: Frame with ``league``, ``team_name``, ``season``,
            ``PC1`` and ``PC2`` columns.
        seasons_order: Sequence of seasons; only its length is used, to decide
            which teams appear in every season.
        output_dir: Directory the PNG files are written to (created if
            missing). Defaults to the previously hard-coded location.
    """
    from matplotlib import font_manager

    os.makedirs(output_dir, exist_ok=True)

    # The titles and axis labels contain Chinese characters. NOTE(review): the
    # recorded run emitted repeated warnings at tight_layout()/savefig(),
    # consistent with missing CJK glyphs in the default font. Prefer a
    # CJK-capable font when one is installed; only installed families are
    # listed so that no "font not found" warnings are added.
    installed_fonts = {font.name for font in font_manager.fontManager.ttflist}
    cjk_candidates = ('PingFang SC', 'Heiti TC', 'Songti SC', 'Hiragino Sans GB', 'Arial Unicode MS',
                      'Microsoft YaHei', 'SimHei', 'Noto Sans CJK SC', 'WenQuanYi Micro Hei')
    cjk_fonts = [name for name in cjk_candidates if name in installed_fonts]
    rc_overrides = {}
    if cjk_fonts:
        rc_overrides['font.sans-serif'] = cjk_fonts + list(plt.rcParams['font.sans-serif'])
        # Many CJK fonts lack the Unicode minus sign used for negative ticks.
        rc_overrides['axes.unicode_minus'] = False

    # rc_context keeps the font override local to this function.
    with plt.rc_context(rc_overrides):
        for league in team_positions_df['league'].unique():
            league_df = team_positions_df[team_positions_df['league'] == league].copy()
            valid_teams = league_df.groupby("team_name").filter(
                lambda x: x['season'].nunique() == len(seasons_order))['team_name'].unique()
            if not valid_teams.size:
                valid_teams = league_df['team_name'].unique()
            valid_df = league_df[league_df['team_name'].isin(valid_teams)].sort_values(['team_name', 'season'])

            plt.figure(figsize=(10, 8))
            plt.title(f"{league} - 球队演变 (进攻-防守)")
            plt.xlabel("PC1 (进攻)")
            plt.ylabel("PC2 (防守)")
            plt.grid(True)

            for team in valid_df['team_name'].unique():
                sub = valid_df[valid_df['team_name'] == team]
                plt.plot(sub['PC1'], sub['PC2'], marker='o', label=team)
                for _, row in sub.iterrows():
                    plt.text(row['PC1'], row['PC2'], row['season'], fontsize=8, ha='right')

            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.tight_layout()
            filename = os.path.join(output_dir, f"{league}_evolution.png")
            plt.savefig(filename, dpi=150)
            plt.close()
            print(f"保存 {league} 图 -> {filename}")

# ====================== 主函数 ======================
# Script entry point: load data, derive ratio weights, tune the loss weights,
# run the final optimisation, save the result and plot it.
if __name__ == "__main__":
    base_path = '/Users/peixuanma/Downloads/data1'
    leagues = [
        ("england", "premier-league"), ("germany", "bundesliga"), ("spain", "la-liga"),
        ("france", "ligue-1"), ("france", "ligue-2"), ("italy", "serie-a"), ("netherlands", "eredivisie"),
        ("portugal", "ligapro"), ("denmark", "superliga"), ("england", "championship"), ("spain", "segunda-division"),
        ("switzerland", "super-league"), ("portugal", "liga-nos"), ("italy", "serie-b"), ("germany", "2-bundesliga"),
        ("scotland", "premiership"), ("belgium", "pro-league"), ("austria", "bundesliga"),
    ]
    seasons = [
        "2013-to-2014", "2014-to-2015", "2015-to-2016", "2016-to-2017", "2017-to-2018",
        "2018-to-2019", "2019-to-2020", "2020-to-2021", "2021-to-2022", "2022-to-2023", "2023-to-2024"
    ]

    # Load data
    print("===== 加载数据 =====")
    all_team_positions, all_match_positions = load_all_league_data(base_path, leagues, seasons)

    # AdaBoost-derived ratio weights
    print("\n===== 计算Adaboost权重 =====")
    adaboost_weights = compute_adaboost_weights(all_team_positions)
    print("优化后的Adaboost权重:", adaboost_weights)

    # Random hyperparameter search
    print("\n===== 随机搜索超参数 =====")
    best_params = random_search_hyperparameters(all_team_positions, all_match_positions, adaboost_weights)
    if best_params is None:
        # The search returns None when no trial produced a loss; unpacking
        # None with ** would raise TypeError, so fall back to the defaults.
        print("警告: 随机搜索未找到有效超参数，使用默认值")
        best_params = {}

    # Final training run
    print("\n===== 最终训练 =====")
    final_losses, final_team_positions, final_rank_scale, final_w = adam_optimize_positions(
        all_team_positions.copy(), all_match_positions.copy(), w=adaboost_weights, **best_params
    )
    if final_losses:
        final_team_positions.to_csv("trained_team_positions.csv", index=False)
        print(f"最终损失: {final_losses[-1]:.4f}")

    # Visualisation
    print("\n===== 可视化 =====")
    visualize_team_evolution_by_league_static(final_team_positions, seasons)

    print("\n完成！")

scikit-learn 版本: 1.4.2
===== 加载数据 =====
加载数据: england - premier-league - 2013-to-2014
加载数据: england - premier-league - 2014-to-2015
加载数据: england - premier-league - 2015-to-2016
加载数据: england - premier-league - 2016-to-2017
加载数据: england - premier-league - 2017-to-2018
加载数据: england - premier-league - 2018-to-2019
加载数据: england - premier-league - 2019-to-2020
加载数据: england - premier-league - 2020-to-2021
加载数据: england - premier-league - 2021-to-2022
加载数据: england - premier-league - 2022-to-2023
加载数据: england - premier-league - 2023-to-2024
加载数据: germany - bundesliga - 2013-to-2014
加载数据: germany - bundesliga - 2014-to-2015
加载数据: germany - bundesliga - 2015-to-2016
加载数据: germany - bundesliga - 2016-to-2017
加载数据: germany - bundesliga - 2017-to-2018
加载数据: germany - bundesliga - 2018-to-2019
加载数据: germany - bundesliga - 2019-to-2020
加载数据: germany - bundesliga - 2020-to-2021
加载数据: germany - bundesliga - 2021-to-2022
加载数据: germany - bundesliga - 2022-to-2023
加载数据: germany - bundesliga - 2023-

  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 premier-league 图 -> /Users/peixuanma/Downloads/Output_Graphs/premier-league_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 bundesliga 图 -> /Users/peixuanma/Downloads/Output_Graphs/bundesliga_evolution.png
保存 la-liga 图 -> /Users/peixuanma/Downloads/Output_Graphs/la-liga_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 ligue-1 图 -> /Users/peixuanma/Downloads/Output_Graphs/ligue-1_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 ligue-2 图 -> /Users/peixuanma/Downloads/Output_Graphs/ligue-2_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 serie-a 图 -> /Users/peixuanma/Downloads/Output_Graphs/serie-a_evolution.png
保存 eredivisie 图 -> /Users/peixuanma/Downloads/Output_Graphs/eredivisie_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 ligapro 图 -> /Users/peixuanma/Downloads/Output_Graphs/ligapro_evolution.png
保存 superliga 图 -> /Users/peixuanma/Downloads/Output_Graphs/superliga_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 championship 图 -> /Users/peixuanma/Downloads/Output_Graphs/championship_evolution.png
保存 segunda-division 图 -> /Users/peixuanma/Downloads/Output_Graphs/segunda-division_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 super-league 图 -> /Users/peixuanma/Downloads/Output_Graphs/super-league_evolution.png
保存 liga-nos 图 -> /Users/peixuanma/Downloads/Output_Graphs/liga-nos_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 serie-b 图 -> /Users/peixuanma/Downloads/Output_Graphs/serie-b_evolution.png
保存 2-bundesliga 图 -> /Users/peixuanma/Downloads/Output_Graphs/2-bundesliga_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 premiership 图 -> /Users/peixuanma/Downloads/Output_Graphs/premiership_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 pro-league 图 -> /Users/peixuanma/Downloads/Output_Graphs/pro-league_evolution.png

完成！


In [53]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from scipy.stats import uniform
import sklearn

print(f"scikit-learn 版本: {sklearn.__version__}")

# ====================== 计算防守统计数据 ======================
def compute_defensive_stats(match_df, team_positions_df):
    """Aggregate per-team defensive statistics from a table of matches.

    Six "goals conceded per X" ratios are sampled on every home match of a
    team and averaged; goals conceded are totalled over home and away matches.

    Args:
        match_df: DataFrame with one row per match. Columns read:
            ``home_team_name``, ``away_team_name``, ``home_team_goal_count``,
            ``away_team_goal_count``, ``Pre-Match PPG (Away)``,
            ``away_team_corner_count``, ``home_team_yellow_cards``,
            ``home_team_red_cards``, ``home_team_fouls``, ``team_b_xg``,
            ``away_team_shots_on_target``, ``away_team_shots_off_target``
            and ``away_team_possession``.
        team_positions_df: DataFrame whose ``team_name`` column lists the
            teams to report on.

    Returns:
        DataFrame with one row per team and the columns ``team_name``,
        ``ratio1`` .. ``ratio6``, ``total_goals_conceded`` and
        ``average_goals_conceded``. A ratio without any valid sample is 0.
    """
    epsilon = 1e-8  # keeps the division stable for tiny positive denominators
    team_stats = {}
    for team in team_positions_df['team_name'].unique():
        team_stats[team] = {'total_goals_conceded': 0, 'num_matches': 0}
        team_stats[team].update({f'ratio{i}_list': [] for i in range(1, 7)})

    def add_ratio(samples, numerator, denominator):
        # Every ratio is guarded the same way: a zero, negative or NaN
        # denominator means there is nothing meaningful to divide by.
        # Dividing by epsilon alone would inject ~1e8 outliers into the mean.
        if denominator > 0:
            samples.append(numerator / (denominator + epsilon))

    # Home matches: goals scored by the visitors are goals conceded at home.
    for idx, row in match_df.iterrows():
        home_team, away_team = row['home_team_name'], row['away_team_name']
        if home_team not in team_stats or away_team not in team_stats:
            print(f"警告: 主队 {home_team} 或客队 {away_team} 未找到，跳过比赛 {idx}")
            continue
        conceded = row['away_team_goal_count']
        home = team_stats[home_team]
        add_ratio(home['ratio1_list'], conceded, row['Pre-Match PPG (Away)'])
        add_ratio(home['ratio2_list'], conceded, row['away_team_corner_count'])
        add_ratio(home['ratio3_list'], conceded,
                  row['home_team_yellow_cards'] + row['home_team_red_cards'] + row['home_team_fouls'])
        add_ratio(home['ratio4_list'], conceded, row['team_b_xg'])
        add_ratio(home['ratio5_list'], conceded,
                  row['away_team_shots_on_target'] + row['away_team_shots_off_target'])
        add_ratio(home['ratio6_list'], conceded, row['away_team_possession'])
        home['total_goals_conceded'] += conceded
        home['num_matches'] += 1  # one home match played

    # Away matches: only the conceded-goal totals are updated, not the ratios.
    for idx, row in match_df.iterrows():
        away_team = row['away_team_name']
        if away_team not in team_stats:
            print(f"警告: 客队 {away_team} 未找到，跳过比赛 {idx}")
            continue
        team_stats[away_team]['total_goals_conceded'] += row['home_team_goal_count']
        team_stats[away_team]['num_matches'] += 1  # one away match played

    # One output record per team, columns in a fixed order.
    data = []
    for team, stats in team_stats.items():
        num_matches = stats['num_matches']
        record = {'team_name': team}
        for i in range(1, 7):
            samples = stats[f'ratio{i}_list']
            record[f'ratio{i}'] = np.mean(samples) if samples else 0
        record['total_goals_conceded'] = stats['total_goals_conceded']
        record['average_goals_conceded'] = (
            stats['total_goals_conceded'] / num_matches if num_matches > 0 else 0)
        data.append(record)

    return pd.DataFrame(data)

# ====================== ELO评分算法实现 ======================
def initialize_elo_scores(team_positions_df):
    """Seed an ELO rating for every team from its points-per-game standing.

    Every team starts at 1500 and receives ``20 * (n_teams - rank)`` bonus
    points, where ``rank`` is the team's position (1 = best) when all teams
    are ordered by their mean ``points_per_game``. Ties share the average
    rank; teams without a usable ``points_per_game`` are ranked last.

    Args:
        team_positions_df: DataFrame with ``team_name`` and
            ``points_per_game`` columns; a team may appear on several rows.

    Returns:
        dict mapping each team name to its initial ELO rating.
    """
    teams = team_positions_df['team_name'].unique().tolist()
    # Rank ACROSS teams. Ranking inside a single team's rows always yields
    # the same value for every team and makes the seeding meaningless.
    mean_ppg = team_positions_df.groupby('team_name', sort=False)['points_per_game'].mean()
    ranks = mean_ppg.rank(ascending=False, na_option='bottom')
    n_teams = len(teams)
    return {team: 1500 + 20 * (n_teams - ranks[team]) for team in teams}

def update_elo_scores(elo_scores, home_team, away_team, home_score, away_score, K=30):
    """Apply one match result to the ELO table and return the table.

    Args:
        elo_scores: dict of team name -> rating; it is modified in place.
        home_team: key of the home side in ``elo_scores``.
        away_team: key of the away side in ``elo_scores``.
        home_score: goals scored by the home side.
        away_score: goals scored by the away side.
        K: step size of the rating update.

    Returns:
        The same ``elo_scores`` dict, after the update.
    """
    rating_home = elo_scores[home_team]
    rating_away = elo_scores[away_team]

    # Expected result of each side under the logistic ELO model.
    win_prob_home = 1 / (1 + 10 ** ((rating_away - rating_home) / 400))
    win_prob_away = 1 / (1 + 10 ** ((rating_home - rating_away) / 400))

    # Actual result: 1 for a win, 0 for a loss, 0.5 each otherwise.
    if home_score > away_score:
        result_home, result_away = 1, 0
    elif home_score < away_score:
        result_home, result_away = 0, 1
    else:
        result_home, result_away = 0.5, 0.5

    elo_scores[home_team] += K * (result_home - win_prob_home)
    elo_scores[away_team] += K * (result_away - win_prob_away)
    return elo_scores

# ====================== 数据加载函数（含交叉验证） ======================
def _fold_averaged_pca(features, kf, n_components=2):
    """Average, over K-fold training splits, the PCA projection of every row of *features*."""
    values = np.asarray(features, dtype=float)
    reference_axes = None
    projections = []
    for train_idx, _ in kf.split(values):
        # Fit the scaler and the PCA on the training rows only ...
        scaler = StandardScaler().fit(values[train_idx])
        pca = PCA(n_components=n_components).fit(scaler.transform(values[train_idx]))
        # ... but project ALL rows, so that row i of every fold's result
        # refers to the same team/match and the folds can be averaged.
        projected = pca.transform(scaler.transform(values))
        if reference_axes is None:
            reference_axes = pca.components_
        else:
            # The sign of a principal axis is arbitrary; align each fold with
            # the first one so opposite signs do not cancel in the mean.
            signs = np.sign(np.sum(pca.components_ * reference_axes, axis=1))
            signs[signs == 0] = 1.0
            projected = projected * signs
        projections.append(projected)
    return np.mean(projections, axis=0)


def load_all_league_data(base_path, leagues, seasons):
    """Load and preprocess the team and match CSV files of every league/season.

    For each ``(country, league)`` pair and season the files
    ``{country}-{league}-teams-{season}-stats.csv`` and
    ``{country}-{league}-matches-{season}-stats.csv`` are read from
    ``base_path``. Combinations with a missing file, no usable matches or no
    defensive statistics are skipped with a printed warning.

    Args:
        base_path: directory containing the CSV files.
        leagues: iterable of ``(country_name, league_name)`` tuples.
        seasons: iterable of season strings used in the file names.

    Returns:
        Tuple ``(team_positions, match_positions)`` of DataFrames
        concatenated over all loaded league/season combinations. Both carry
        two PCA coordinates (``PC1``, ``PC2``) per row; ``team_positions``
        additionally carries the defensive statistics and a ``team_season``
        key of the form ``{league}_{team}_{season}``.

    Raises:
        ValueError: if no league/season combination could be loaded.
    """
    stat_columns = ['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6',
                    'total_goals_conceded', 'average_goals_conceded']
    all_team_positions, all_match_positions = [], []
    for country_name, league_name in leagues:
        for season in seasons:
            print(f"加载数据: {country_name} - {league_name} - {season}")
            team_file = os.path.join(base_path, f"{country_name}-{league_name}-teams-{season}-stats.csv")
            match_file = os.path.join(base_path, f"{country_name}-{league_name}-matches-{season}-stats.csv")
            if not os.path.exists(team_file) or not os.path.exists(match_file):
                print(f"警告: {country_name} - {league_name} - {season} 文件缺失")
                continue

            team_df = pd.read_csv(team_file)
            match_df = pd.read_csv(match_file)

            # Normalise team names so both files can be joined on them;
            # ``common_name`` takes precedence over ``team_name``.
            name_column = next((c for c in ('common_name', 'team_name') if c in team_df.columns), None)
            if name_column is None:
                print(f"警告: {country_name} - {league_name} - {season} 缺少球队名称列")
                continue
            team_df['team_name'] = team_df[name_column].str.strip().str.lower()
            match_df['home_team_name'] = match_df['home_team_name'].str.strip().str.lower()
            match_df['away_team_name'] = match_df['away_team_name'].str.strip().str.lower()

            # Keep only matches between teams present in the team file.
            # ``.copy()`` makes the later column assignment act on an
            # independent frame rather than on a slice of the original.
            team_names = team_df['team_name'].unique()
            original_match_count = len(match_df)
            match_df = match_df[
                match_df['home_team_name'].isin(team_names) & match_df['away_team_name'].isin(team_names)].copy()
            if len(match_df) < original_match_count:
                print(f"警告: 过滤了 {original_match_count - len(match_df)} 场比赛")

            if match_df.empty:
                print("警告: 无有效比赛数据")
                continue

            defensive_stats_df = compute_defensive_stats(match_df, team_df)
            if defensive_stats_df.empty:
                print("警告: 未计算出防守统计")
                continue

            team_df = team_df.merge(defensive_stats_df, on='team_name', how='left')
            for col in stat_columns:
                if col not in team_df.columns:
                    print(f"警告: {col} 列缺失，设为0")
                    team_df[col] = 0

            # Min-max defensive score: 1 for the fewest goals conceded, 0 for the most.
            conceded = team_df['total_goals_conceded']
            most, fewest = conceded.max(), conceded.min()
            if most != fewest:
                team_df['normalized_defense_score'] = (most - conceded) / (most - fewest + 1e-8)
            else:
                team_df['normalized_defense_score'] = 0

            # Two PCA coordinates per team from the defensive columns.
            defensive_columns = [col for col in team_df.columns if col in [
                'goals_conceded', 'total_goals_conceded', 'ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6',
                'normalized_defense_score'
            ]]
            team_df_defensive = team_df[defensive_columns].fillna(0)
            kf = KFold(n_splits=5, shuffle=True, random_state=42)
            X_pca_avg = _fold_averaged_pca(team_df_defensive, kf)

            team_positions = pd.DataFrame({
                'team_name': team_df['team_name'], 'PC1': X_pca_avg[:, 0], 'PC2': X_pca_avg[:, 1],
                'points_per_game': team_df['points_per_game'], 'league': league_name, 'season': season,
                'ratio1': team_df['ratio1'], 'ratio2': team_df['ratio2'], 'ratio3': team_df['ratio3'],
                'ratio4': team_df['ratio4'], 'ratio5': team_df['ratio5'], 'ratio6': team_df['ratio6'],
                'normalized_defense_score': team_df['normalized_defense_score'],
                'total_goals_conceded': team_df['total_goals_conceded'],
                'average_goals_conceded': team_df['average_goals_conceded']
            })
            team_positions['team_season'] = league_name + '_' + team_positions['team_name'] + '_' + team_positions[
                'season']
            all_team_positions.append(team_positions)

            # Two PCA coordinates per match from all numeric match columns
            # (missing values replaced by the column mean).
            match_df = match_df.drop(
                columns=['timestamp', 'date_GMT', 'status', 'attendance', 'referee', 'stadium_name', 'Game Week'],
                errors='ignore')
            numeric_cols = match_df.select_dtypes(include=[np.number]).columns
            match_df[numeric_cols] = match_df[numeric_cols].fillna(match_df[numeric_cols].mean())
            X_pca_match_avg = _fold_averaged_pca(match_df[numeric_cols], kf)

            match_positions = pd.DataFrame({
                'home_team_name': match_df['home_team_name'], 'away_team_name': match_df['away_team_name'],
                'PC1': X_pca_match_avg[:, 0], 'PC2': X_pca_match_avg[:, 1], 'league': league_name, 'season': season
            })
            all_match_positions.append(match_positions)

    if not all_team_positions:
        # pd.concat([]) would raise the same exception type with an opaque message.
        raise ValueError(f"no league/season data could be loaded from {base_path!r}")

    return pd.concat(all_team_positions, ignore_index=True), pd.concat(all_match_positions, ignore_index=True)

# ====================== 使用Adaboost优化权重 ======================
def compute_adaboost_weights(team_positions_df):
    """Derive weights for the six defensive ratios from an AdaBoost classifier.

    The standardised ratios are used to classify teams into "at or below the
    median" (0) versus "above the median" (1) of ``average_goals_conceded``.
    AdaBoost hyper-parameters are chosen by 5-fold grid search on an 80 %
    training split, the model is evaluated on the remaining 20 %, and the
    estimator-weighted feature importances, normalised to sum to 1, are
    returned.

    Args:
        team_positions_df: DataFrame with columns ``ratio1`` .. ``ratio6``
            and ``average_goals_conceded``.

    Returns:
        ``np.float32`` array of six weights. If the model yields no usable
        importances, equal weights are returned.
    """
    ratio_columns = ['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6']

    # Preprocessing: infinities and missing values count as 0.
    ratios = team_positions_df[ratio_columns].replace([np.inf, -np.inf], np.nan).fillna(0)

    # Show how strongly the features are correlated.
    correlation_matrix = ratios.corr()
    print("特征相关性矩阵:\n", correlation_matrix)

    ratios_scaled = StandardScaler().fit_transform(ratios)

    # Binary label by median split. This assigns the same labels as a
    # two-quantile cut (values equal to the median and missing values get 0)
    # but does not fail when many identical values collapse the bin edges.
    conceded = team_positions_df['average_goals_conceded']
    labels = (conceded > conceded.median()).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(ratios_scaled, labels, test_size=0.2, random_state=42)

    # NOTE(review): the ``algorithm`` argument is reportedly deprecated in
    # recent scikit-learn releases - confirm against the installed version.
    ada = AdaBoostClassifier(algorithm='SAMME', random_state=42)
    param_grid = {
        'estimator': [DecisionTreeClassifier(max_depth=5), DecisionTreeClassifier(max_depth=10)],
        'n_estimators': [500, 1000, 1500,250,50,200,150,100],
        'learning_rate': [0.01, 0.1, 0.5,0.05,0.25]
    }

    grid_search = GridSearchCV(ada, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    print("最佳参数:", best_params)

    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the full training split; fitting it again by hand
    # would only repeat that work.
    adaboost = grid_search.best_estimator_

    # Evaluate on the held-out split.
    y_pred = adaboost.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    if len(np.unique(y_test)) > 1:
        roc_auc = roc_auc_score(y_test, adaboost.predict_proba(X_test)[:, 1])
    else:
        roc_auc = float('nan')  # ROC-AUC is undefined when only one class is present
    conf_matrix = confusion_matrix(y_test, y_pred)

    print("测试集准确率:", accuracy)
    print("测试集F1分数:", f1)
    print("测试集ROC-AUC:", roc_auc)
    print("混淆矩阵:\n", conf_matrix)

    # Feature importances of the weak learners, weighted by each learner's
    # AdaBoost weight.
    feature_importances = np.zeros(len(ratio_columns))
    for estimator_weight, est in zip(adaboost.estimator_weights_, adaboost.estimators_):
        feature_importances += estimator_weight * est.feature_importances_
    feature_importances /= np.sum(adaboost.estimator_weights_)

    total_importance = np.sum(feature_importances)
    if np.isfinite(total_importance) and total_importance > 0:
        weights = feature_importances / total_importance
    else:
        # No learner used any feature: equal weights instead of NaN.
        weights = np.full(len(ratio_columns), 1.0 / len(ratio_columns))
    print(f"计算得到的权重: {weights}")

    return weights.astype(np.float32)

# ====================== 损失函数 ======================
def compute_total_loss(positions, match_home_idx, match_away_idx, match_PC1, match_PC2, points_per_game, rank_scale,
                       ratios, w, normalized_defense_score, lambda_defense, lambda_supervision, lambda_reg, elo_scores):
    """Return the scalar training loss for the current team positions.

    The loss is the sum of four terms:

    * match term - range-normalised distances between each match point
      ``(match_PC1, match_PC2)`` and the positions of its home and away team,
      down-weighted as the ELO gap between the two teams grows;
    * defence term (times ``lambda_defense``) - squared difference between
      the second position coordinate and the negated weighted ratio sum;
    * supervision term (times ``lambda_supervision``) - squared difference
      between that weighted ratio sum and ``normalized_defense_score``;
    * L2 penalty on ``w`` (times ``lambda_reg``).

    ``points_per_game`` is accepted but not used in the computation. With no
    matches the loss is 0; a NaN or infinite total is also returned as 0.
    """
    tiny = 1e-8
    if tf.shape(match_home_idx)[0] == 0:
        return tf.constant(0.0, dtype=tf.float32)

    # --- match term -------------------------------------------------------
    target_xy = tf.stack([match_PC1, match_PC2], axis=1)
    home_xy = tf.gather(positions, match_home_idx)
    away_xy = tf.gather(positions, match_away_idx)
    d_home = tf.norm(home_xy - target_xy + tiny, axis=1)
    d_away = tf.norm(away_xy - target_xy + tiny, axis=1)

    # Rescale both distance vectors with the pooled min/max.
    pooled = tf.concat([d_home, d_away], axis=0)
    span = tf.reduce_max(pooled) - tf.reduce_min(pooled) + tiny
    d_home_unit = (d_home - tf.reduce_min(pooled)) / span
    d_away_unit = (d_away - tf.reduce_min(pooled)) / span

    # Matches between teams with similar ELO get the largest weight.
    elo_gap = tf.abs(tf.gather(elo_scores, match_home_idx) - tf.gather(elo_scores, match_away_idx))
    match_weight = 1.0 / (1.0 + elo_gap * rank_scale + tiny)
    match_loss = tf.reduce_mean(match_weight * (d_home_unit + d_away_unit))

    # --- defence and supervision terms ------------------------------------
    w = tf.abs(w)  # the ratio weights are used as magnitudes
    defense_target = -tf.reduce_sum(w * ratios, axis=1)
    defense_loss = tf.reduce_mean(tf.square(positions[:, 1] - defense_target))
    supervision_loss = tf.reduce_mean(tf.square(defense_target - normalized_defense_score))

    # --- L2 penalty on the ratio weights ----------------------------------
    regularization_loss = lambda_reg * tf.reduce_sum(tf.square(w))

    total_loss = match_loss + lambda_defense * defense_loss + lambda_supervision * supervision_loss + regularization_loss
    not_finite = tf.math.is_nan(total_loss) | tf.math.is_inf(total_loss)
    return tf.where(not_finite, 0.0, total_loss)

# ====================== Adam优化函数 ======================
def adam_optimize_positions(team_positions_df, match_positions_df, initial_lr=0.0005, decay_steps=200000,
                            decay_rate=0.9,
                            clipnorm=0.5, iterations=30000, verbose_interval=1000, random_seed=42, lambda_defense=0.1,
                            lambda_supervision=0.1, lambda_reg=0.01, patience=100, w=None):
    """Refine the 2-D team positions with Adam and early stopping.

    The trainable variables are the standardised team positions, a scalar
    ``rank_scale`` and the six ratio weights ``w``; the objective is
    ``compute_total_loss``.

    Args:
        team_positions_df: DataFrame with ``team_season``, ``team_name``,
            ``PC1``, ``PC2``, ``ratio1`` .. ``ratio6``,
            ``normalized_defense_score`` and ``points_per_game``.
        match_positions_df: DataFrame with ``league``, ``season``,
            ``home_team_name``, ``away_team_name``, ``PC1`` and ``PC2``.
        initial_lr, decay_steps, decay_rate: exponential-decay learning-rate
            schedule passed to Adam.
        clipnorm: gradient-norm clipping value passed to Adam.
        iterations: maximum number of optimisation steps.
        verbose_interval: print progress every this many steps.
        random_seed: TensorFlow random seed.
        lambda_defense, lambda_supervision, lambda_reg: loss-term weights.
        patience: stop after this many consecutive steps without improvement.
        w: optional initial ratio weights (absolute values are used);
            defaults to equal weights.

    Returns:
        Tuple ``(losses, team_positions_df, best_rank_scale, best_w)``.
        ``losses`` is the per-step loss history and the DataFrame carries the
        optimised ``PC1``/``PC2``. Without any usable match the result is
        ``([], team_positions_df, None, None)``.
    """
    team_seasons = team_positions_df['team_season'].unique()
    team_season_to_idx = {t: i for i, t in enumerate(team_seasons)}

    # Look the team name up in the frame instead of parsing it out of the
    # ``team_season`` key, which breaks for names containing '_'.
    elo_scores_dict = initialize_elo_scores(team_positions_df)
    name_by_team_season = team_positions_df.drop_duplicates('team_season').set_index('team_season')['team_name']
    elo_scores = tf.convert_to_tensor(
        [elo_scores_dict[name_by_team_season[t]] for t in team_seasons], dtype=tf.float32)

    team_positions_df = team_positions_df.set_index('team_season')
    init_positions = team_positions_df[['PC1', 'PC2']].values
    ratios = team_positions_df[['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6']].values
    scaler_pos = StandardScaler()
    scaler_rat = StandardScaler()
    positions_scaled = scaler_pos.fit_transform(init_positions)
    ratios_scaled = scaler_rat.fit_transform(ratios)

    positions = tf.Variable(positions_scaled, dtype=tf.float32)
    ratios = tf.constant(ratios_scaled, dtype=tf.float32)
    normalized_defense_score = tf.constant(team_positions_df['normalized_defense_score'].values, dtype=tf.float32)
    points_per_game = tf.constant(team_positions_df['points_per_game'].values, dtype=tf.float32)

    # Keep only the matches whose two teams both have a position.
    home_indices, away_indices, pc1_values, pc2_values = [], [], [], []
    for _, row in match_positions_df.iterrows():
        home_key = f"{row['league']}_{row['home_team_name']}_{row['season']}"
        away_key = f"{row['league']}_{row['away_team_name']}_{row['season']}"
        if home_key in team_season_to_idx and away_key in team_season_to_idx:
            home_indices.append(team_season_to_idx[home_key])
            away_indices.append(team_season_to_idx[away_key])
            pc1_values.append(row['PC1'])
            pc2_values.append(row['PC2'])
    if not home_indices:
        print("警告: 无有效比赛数据")
        return [], team_positions_df.reset_index(), None, None

    match_home_idx = tf.constant(home_indices, dtype=tf.int32)
    match_away_idx = tf.constant(away_indices, dtype=tf.int32)
    match_PC1 = tf.constant(pc1_values, dtype=tf.float32)
    match_PC2 = tf.constant(pc2_values, dtype=tf.float32)

    tf.random.set_seed(random_seed)
    rank_scale = tf.Variable(1.0, dtype=tf.float32)
    w = tf.Variable(np.abs(w) if w is not None else np.ones(6, dtype=np.float32) / 6, dtype=tf.float32)
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_lr, decay_steps, decay_rate)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=clipnorm)
    trainable = [positions, rank_scale, w]

    losses, best_loss, patience_counter = [], float('inf'), 0
    best_positions, best_rank_scale, best_w = positions.numpy().copy(), 1.0, w.numpy().copy()

    for i in range(iterations):
        with tf.GradientTape() as tape:
            loss = compute_total_loss(positions, match_home_idx, match_away_idx, match_PC1, match_PC2, points_per_game,
                                      rank_scale, ratios, w, normalized_defense_score, lambda_defense,
                                      lambda_supervision, lambda_reg, elo_scores)
        grads = tape.gradient(loss, trainable)
        if any(g is None for g in grads):
            print(f"警告: 迭代 {i + 1} 梯度为None")
            continue
        loss_val = float(loss.numpy())
        losses.append(loss_val)

        # The loss belongs to the variables as they are NOW, before the
        # update, so the snapshot has to be taken before apply_gradients.
        improved = loss_val < best_loss
        if improved:
            snapshot = (positions.numpy().copy(), float(rank_scale.numpy()), w.numpy().copy())
            # The loss function reports a NaN/inf total as 0, so a diverged
            # state must not be mistaken for the best one.
            improved = all(bool(np.all(np.isfinite(value))) for value in snapshot)

        if improved:
            best_loss = loss_val
            best_positions, best_rank_scale, best_w = snapshot
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"早停触发，训练在第 {i + 1} 轮停止")
            break

        optimizer.apply_gradients(zip(grads, trainable))

        if (i + 1) % verbose_interval == 0:
            print(f"迭代 {i + 1}/{iterations}, 损失 = {loss_val:.4f}, rank_scale = {rank_scale.numpy():.4f}")

    # Restore the best state and map the positions back to the original scale.
    positions.assign(best_positions)
    rank_scale.assign(best_rank_scale)
    w.assign(best_w)
    final_pos = scaler_pos.inverse_transform(positions.numpy())
    for idx, team_season in enumerate(team_seasons):
        team_positions_df.loc[team_season, 'PC1'] = final_pos[idx, 0]
        team_positions_df.loc[team_season, 'PC2'] = final_pos[idx, 1]
    team_positions_df = team_positions_df.reset_index()
    return losses, team_positions_df, best_rank_scale, best_w

# ====================== 随机搜索超参数调整 ======================
def random_search_hyperparameters(team_positions, match_positions, w, n_iter=10, random_state=42):
    """Random search over the three loss-term weights.

    Each trial draws ``lambda_defense`` and ``lambda_supervision`` uniformly
    from [0.01, 0.21] and ``lambda_reg`` from [0.001, 0.021], runs
    ``adam_optimize_positions`` on copies of the inputs, and is scored by the
    lowest loss reached during that run.

    Args:
        team_positions: team DataFrame passed to the optimiser.
        match_positions: match DataFrame passed to the optimiser.
        w: initial ratio weights passed to the optimiser.
        n_iter: number of random trials.
        random_state: seed for NumPy's global random generator.

    Returns:
        dict with the best ``lambda_defense``, ``lambda_supervision`` and
        ``lambda_reg``, or ``None`` if no trial produced a loss history.
    """
    np.random.seed(random_state)
    best_loss, best_params = float('inf'), None

    for _ in range(n_iter):
        params = {
            'lambda_defense': uniform(0.01, 0.2).rvs(),
            'lambda_supervision': uniform(0.01, 0.2).rvs(),
            'lambda_reg': uniform(0.001, 0.02).rvs()
        }
        print(f"\n随机搜索: {params}")

        losses, _, _, _ = adam_optimize_positions(
            team_positions.copy(), match_positions.copy(), w=w, **params
        )
        if not losses:
            continue

        # The optimiser hands back the state with the lowest loss of the run.
        # Under early stopping the final loss is never the best one, so the
        # trial is scored by its minimum.
        trial_loss = min(losses)
        if trial_loss < best_loss:
            best_loss = trial_loss
            best_params = params

    print(f"\n最佳超参数: {best_params}, 最佳损失: {best_loss:.4f}")
    return best_params

# ====================== 可视化函数 ======================
def visualize_team_evolution_by_league_static(team_positions_df, seasons_order,
                                              output_dir="/Users/peixuanma/Downloads/Output_Graphs"):
    """Save one PC1/PC2 trajectory plot per league.

    For each league, teams that appear in exactly ``len(seasons_order)``
    distinct seasons are plotted; if no team qualifies, every team of the
    league is plotted instead. Each team is drawn as a line through its
    season points (sorted by team name, then season label) and every point is
    annotated with its season.

    Args:
        team_positions_df: DataFrame with ``league``, ``team_name``,
            ``season``, ``PC1`` and ``PC2`` columns.
        seasons_order: Sequence of seasons; only its length is used, as the
            number of seasons a team must cover to count as complete.
        output_dir: Directory that receives ``<league>_evolution.png``. It is
            created if missing and defaults to the previously hard-coded path.
    """
    # NOTE(review): the title and axis labels contain CJK characters; presumably a
    # CJK-capable matplotlib font is needed to avoid missing-glyph warnings - confirm.
    os.makedirs(output_dir, exist_ok=True)
    required_seasons = len(seasons_order)

    for league in team_positions_df['league'].unique():
        league_df = team_positions_df[team_positions_df['league'] == league]
        seasons_per_team = league_df.groupby("team_name")['season'].nunique()
        valid_teams = seasons_per_team.index[seasons_per_team == required_seasons]
        if valid_teams.empty:
            # No team spans every season: fall back to plotting all of them.
            valid_teams = league_df['team_name'].unique()
        valid_df = league_df[league_df['team_name'].isin(valid_teams)].sort_values(['team_name', 'season'])

        fig, ax = plt.subplots(figsize=(10, 8))
        ax.set_title(f"{league} - 球队演变 (进攻-防守)")
        ax.set_xlabel("PC1 (进攻)")
        ax.set_ylabel("PC2 (防守)")
        ax.grid(True)

        # One pass over the sorted frame instead of re-filtering it per team.
        for team, sub in valid_df.groupby('team_name', sort=False):
            ax.plot(sub['PC1'], sub['PC2'], marker='o', label=team)
            for pc1, pc2, season in zip(sub['PC1'], sub['PC2'], sub['season']):
                ax.text(pc1, pc2, season, fontsize=8, ha='right')

        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        fig.tight_layout()
        filename = os.path.join(output_dir, f"{league}_evolution.png")
        fig.savefig(filename, dpi=150)
        plt.close(fig)
        print(f"保存 {league} 图 -> {filename}")

# ====================== 主函数 ======================
if __name__ == "__main__":
    # End-to-end pipeline: load data, compute AdaBoost weights, search the
    # hyperparameters, run the final training, export the CSV and plot.
    base_path = '/Users/peixuanma/Downloads/data1'
    leagues = [
        ("england", "premier-league"), ("germany", "bundesliga"), ("spain", "la-liga"),
        ("france", "ligue-1"), ("france", "ligue-2"), ("italy", "serie-a"),
        ("netherlands", "eredivisie"), ("portugal", "ligapro"), ("denmark", "superliga"),
        ("england", "championship"), ("spain", "segunda-division"), ("switzerland", "super-league"),
        ("portugal", "liga-nos"), ("italy", "serie-b"), ("germany", "2-bundesliga"),
        ("scotland", "premiership"), ("belgium", "pro-league"), ("austria", "bundesliga"),
    ]
    seasons = [
        "2013-to-2014", "2014-to-2015", "2015-to-2016", "2016-to-2017", "2017-to-2018",
        "2018-to-2019", "2019-to-2020", "2020-to-2021", "2021-to-2022", "2022-to-2023", "2023-to-2024"
    ]

    # Load data
    print("===== 加载数据 =====")
    all_team_positions, all_match_positions = load_all_league_data(base_path, leagues, seasons)

    # Compute AdaBoost weights
    print("\n===== 计算Adaboost权重 =====")
    adaboost_weights = compute_adaboost_weights(all_team_positions)
    print("优化后的Adaboost权重:", adaboost_weights)

    # Random hyperparameter search
    print("\n===== 随机搜索超参数 =====")
    best_params = random_search_hyperparameters(all_team_positions, all_match_positions, adaboost_weights)

    # Final training
    print("\n===== 最终训练 =====")
    # The search returns None when no trial produced a loss. Unpacking None with
    # ** raises TypeError, so fall back to the optimizer's default lambdas.
    final_params = best_params or {}
    final_losses, final_team_positions, final_rank_scale, final_w = adam_optimize_positions(
        all_team_positions.copy(), all_match_positions.copy(), w=adaboost_weights, **final_params
    )
    if final_losses:
        final_team_positions.to_csv("trained_team_positions.csv", index=False)
        print(f"最终损失: {final_losses[-1]:.4f}")

    # Visualization
    print("\n===== 可视化 =====")
    visualize_team_evolution_by_league_static(final_team_positions, seasons)

    print("\n完成！")

scikit-learn 版本: 1.4.2
===== 加载数据 =====
加载数据: england - premier-league - 2013-to-2014
加载数据: england - premier-league - 2014-to-2015
加载数据: england - premier-league - 2015-to-2016
加载数据: england - premier-league - 2016-to-2017
加载数据: england - premier-league - 2017-to-2018
加载数据: england - premier-league - 2018-to-2019
加载数据: england - premier-league - 2019-to-2020
加载数据: england - premier-league - 2020-to-2021
加载数据: england - premier-league - 2021-to-2022
加载数据: england - premier-league - 2022-to-2023
加载数据: england - premier-league - 2023-to-2024
加载数据: germany - bundesliga - 2013-to-2014
加载数据: germany - bundesliga - 2014-to-2015
加载数据: germany - bundesliga - 2015-to-2016
加载数据: germany - bundesliga - 2016-to-2017
加载数据: germany - bundesliga - 2017-to-2018
加载数据: germany - bundesliga - 2018-to-2019
加载数据: germany - bundesliga - 2019-to-2020
加载数据: germany - bundesliga - 2020-to-2021
加载数据: germany - bundesliga - 2021-to-2022
加载数据: germany - bundesliga - 2022-to-2023
加载数据: germany - bundesliga - 2023-

  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 premier-league 图 -> /Users/peixuanma/Downloads/Output_Graphs/premier-league_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 bundesliga 图 -> /Users/peixuanma/Downloads/Output_Graphs/bundesliga_evolution.png
保存 la-liga 图 -> /Users/peixuanma/Downloads/Output_Graphs/la-liga_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 ligue-1 图 -> /Users/peixuanma/Downloads/Output_Graphs/ligue-1_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 ligue-2 图 -> /Users/peixuanma/Downloads/Output_Graphs/ligue-2_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 serie-a 图 -> /Users/peixuanma/Downloads/Output_Graphs/serie-a_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()


保存 eredivisie 图 -> /Users/peixuanma/Downloads/Output_Graphs/eredivisie_evolution.png


  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 ligapro 图 -> /Users/peixuanma/Downloads/Output_Graphs/ligapro_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 superliga 图 -> /Users/peixuanma/Downloads/Output_Graphs/superliga_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 championship 图 -> /Users/peixuanma/Downloads/Output_Graphs/championship_evolution.png
保存 segunda-division 图 -> /Users/peixuanma/Downloads/Output_Graphs/segunda-division_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 super-league 图 -> /Users/peixuanma/Downloads/Output_Graphs/super-league_evolution.png
保存 liga-nos 图 -> /Users/peixuanma/Downloads/Output_Graphs/liga-nos_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 serie-b 图 -> /Users/peixuanma/Downloads/Output_Graphs/serie-b_evolution.png
保存 2-bundesliga 图 -> /Users/peixuanma/Downloads/Output_Graphs/2-bundesliga_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 premiership 图 -> /Users/peixuanma/Downloads/Output_Graphs/premiership_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 pro-league 图 -> /Users/peixuanma/Downloads/Output_Graphs/pro-league_evolution.png

完成！


In [59]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from scipy.stats import uniform
import sklearn

# Print the installed scikit-learn version at the start of the run.
print(f"scikit-learn 版本: {sklearn.__version__}")

# ====================== 计算防守统计数据 ======================
def compute_defensive_stats(match_df, team_positions_df):
    """Aggregate per-team defensive statistics from a set of matches.

    Six per-match ratios are collected for the home team of every match, each
    dividing the goals scored by the away side by another quantity:

    * ratio1: away pre-match PPG (``Pre-Match PPG (Away)``)
    * ratio2: away corner count
    * ratio3: home yellow cards + red cards + fouls
    * ratio4: away expected goals (``team_b_xg``)
    * ratio5: away shots on target + off target
    * ratio6: away possession

    A ratio is recorded only when its denominator is strictly positive.
    Zero, negative and NaN denominators are skipped for all six ratios, so a
    missing statistic cannot inject ``goals / 1e-8`` or NaN into the mean.
    Goals conceded and match counts are accumulated for both home and away
    appearances.

    Args:
        match_df: Match-level DataFrame with the columns listed above plus
            ``home_team_name``, ``away_team_name``, ``home_team_goal_count``
            and ``away_team_goal_count``.
        team_positions_df: DataFrame whose ``team_name`` column lists the
            teams to report on.

    Returns:
        DataFrame with one row per team and the columns ``team_name``,
        ``ratio1`` .. ``ratio6`` (mean of the recorded values, 0 when none),
        ``total_goals_conceded`` and ``average_goals_conceded``.
    """
    epsilon = 1e-8  # keeps divisions stable for very small denominators
    ratio_keys = ['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6']

    team_stats = {}
    for team in team_positions_df['team_name'].unique():
        stats = {f'{key}_list': [] for key in ratio_keys}
        stats['total_goals_conceded'] = 0
        stats['num_matches'] = 0
        team_stats[team] = stats

    def record_ratio(stats, key, numerator, denominator):
        # NaN fails the comparison as well, so missing values are skipped too.
        if denominator > 0:
            stats[key].append(numerator / (denominator + epsilon))

    # Home matches: ratios and goals conceded by the home side.
    for idx, row in match_df.iterrows():
        home_team, away_team = row['home_team_name'], row['away_team_name']
        if home_team not in team_stats or away_team not in team_stats:
            print(f"警告: 主队 {home_team} 或客队 {away_team} 未找到，跳过比赛 {idx}")
            continue
        home_stats = team_stats[home_team]
        goals_against = row['away_team_goal_count']

        record_ratio(home_stats, 'ratio1_list', goals_against, row['Pre-Match PPG (Away)'])
        record_ratio(home_stats, 'ratio2_list', goals_against, row['away_team_corner_count'])
        record_ratio(home_stats, 'ratio3_list', goals_against,
                     row['home_team_yellow_cards'] + row['home_team_red_cards'] + row['home_team_fouls'])
        record_ratio(home_stats, 'ratio4_list', goals_against, row['team_b_xg'])
        record_ratio(home_stats, 'ratio5_list', goals_against,
                     row['away_team_shots_on_target'] + row['away_team_shots_off_target'])
        record_ratio(home_stats, 'ratio6_list', goals_against, row['away_team_possession'])

        home_stats['total_goals_conceded'] += goals_against
        home_stats['num_matches'] += 1  # home appearance

    # Away matches: only goals conceded and the match count are tracked.
    for idx, row in match_df.iterrows():
        away_team = row['away_team_name']
        if away_team not in team_stats:
            print(f"警告: 客队 {away_team} 未找到，跳过比赛 {idx}")
            continue
        team_stats[away_team]['total_goals_conceded'] += row['home_team_goal_count']
        team_stats[away_team]['num_matches'] += 1  # away appearance

    # Collapse the per-match lists into one summary row per team.
    data = []
    for team, stats in team_stats.items():
        num_matches = stats['num_matches']
        record = {'team_name': team}
        for key in ratio_keys:
            values = stats[f'{key}_list']
            record[key] = np.mean(values) if values else 0
        record['total_goals_conceded'] = stats['total_goals_conceded']
        record['average_goals_conceded'] = (
            stats['total_goals_conceded'] / num_matches if num_matches > 0 else 0
        )
        data.append(record)

    return pd.DataFrame(data)

# ====================== ELO评分算法实现 ======================
def initialize_elo_scores(team_positions_df):
    """Build initial Elo ratings for every team from its points per game.

    Every team starts at 1500 and receives a bonus of ``20 * (n_teams - rank)``.
    ``rank`` is the team's position when teams are ordered by their mean
    ``points_per_game`` over all of their rows, best team first (rank 1).
    The ranking is computed across teams; ranking one team's own rows would
    not compare teams with each other.

    NOTE(review): assumes a higher points-per-game should yield a higher
    starting rating - confirm this is the intended direction.

    Args:
        team_positions_df: DataFrame with ``team_name`` and
            ``points_per_game`` columns; a team may span several rows.

    Returns:
        dict mapping each team name (in order of first appearance) to its
        initial rating. Teams without a usable points-per-game value stay at
        the 1500 baseline.
    """
    teams = team_positions_df['team_name'].unique().tolist()
    mean_ppg = team_positions_df.groupby('team_name')['points_per_game'].mean()
    # Descending rank: the strongest team gets rank 1 and the largest bonus.
    ranks = mean_ppg.rank(ascending=False)

    n_teams = len(teams)
    team_elo = {}
    for team in teams:
        rank = ranks.get(team)
        has_rank = rank is not None and not pd.isna(rank)
        team_elo[team] = 1500 + (20 * (n_teams - rank) if has_rank else 0)
    return team_elo

def update_elo_scores(elo_scores, home_team, away_team, home_score, away_score, K=30):
    """Apply one match result to the Elo table in place and return it.

    Expected scores follow the logistic Elo curve with a 400-point scale.
    The actual score is 1 for a win, 0 for a loss and 0.5 for anything else
    (a draw), and each rating moves by ``K * (actual - expected)``.

    Args:
        elo_scores: Mutable mapping from team name to rating.
        home_team: Key of the home side.
        away_team: Key of the away side.
        home_score: Goals scored by the home side.
        away_score: Goals scored by the away side.
        K: Update step size.

    Returns:
        The same ``elo_scores`` mapping, after modification.
    """
    rating_home = elo_scores[home_team]
    rating_away = elo_scores[away_team]
    exp_home = 1 / (1 + 10 ** ((rating_away - rating_home) / 400))
    exp_away = 1 / (1 + 10 ** ((rating_home - rating_away) / 400))

    # Translate the scoreline into the actual result for each side.
    if home_score > away_score:
        result_home, result_away = 1, 0
    elif home_score < away_score:
        result_home, result_away = 0, 1
    else:
        result_home, result_away = 0.5, 0.5

    elo_scores[home_team] += K * (result_home - exp_home)
    elo_scores[away_team] += K * (result_away - exp_away)
    return elo_scores

# ====================== 数据加载函数（含交叉验证） ======================
def _cross_validated_pca(features, kf, n_components=2):
    """Average fold-wise PCA projections of every row of ``features``.

    For each fold, a scaler and a PCA are fitted on the training rows only,
    then all rows are projected so that row ``i`` of the result always
    corresponds to row ``i`` of ``features``. Component signs are aligned to
    the first fold before averaging, because the sign of a principal
    component is arbitrary and could otherwise cancel out across folds.
    """
    values = np.asarray(features, dtype=float)
    projections = []
    for train_idx, _ in kf.split(values):
        scaler = StandardScaler().fit(values[train_idx])
        pca = PCA(n_components=n_components).fit(scaler.transform(values[train_idx]))
        projected = pca.transform(scaler.transform(values))
        if projections:
            signs = np.sign(np.sum(projected * projections[0], axis=0))
            signs[signs == 0] = 1.0
            projected = projected * signs
        projections.append(projected)
    return np.mean(projections, axis=0)


def load_all_league_data(base_path, leagues, seasons):
    """Load and preprocess team and match CSV files for every league/season.

    For each ``(country, league)`` pair and season, the files
    ``<country>-<league>-teams-<season>-stats.csv`` and
    ``<country>-<league>-matches-<season>-stats.csv`` are read from
    ``base_path``. Team names are stripped and lower-cased, matches between
    unknown teams are dropped, defensive statistics are merged into the team
    table, and both teams and matches are reduced to two principal
    components. Missing files and empty seasons are skipped with a warning.

    Args:
        base_path: Directory containing the CSV files.
        leagues: Iterable of ``(country_name, league_name)`` pairs.
        seasons: Iterable of season labels used in the file names.

    Returns:
        Tuple ``(team_positions, match_positions)`` of concatenated
        DataFrames covering every league/season that could be loaded.

    Raises:
        ValueError: If no league/season could be loaded at all.
    """
    stat_columns = ['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6',
                    'total_goals_conceded', 'average_goals_conceded']
    defensive_feature_names = [
        'goals_conceded', 'total_goals_conceded', 'ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6',
        'normalized_defense_score'
    ]
    ignored_match_columns = ['timestamp', 'date_GMT', 'status', 'attendance', 'referee', 'stadium_name',
                             'Game Week']

    all_team_positions, all_match_positions = [], []
    for country_name, league_name in leagues:
        for season in seasons:
            print(f"加载数据: {country_name} - {league_name} - {season}")
            team_file = os.path.join(base_path, f"{country_name}-{league_name}-teams-{season}-stats.csv")
            match_file = os.path.join(base_path, f"{country_name}-{league_name}-matches-{season}-stats.csv")
            if not os.path.exists(team_file) or not os.path.exists(match_file):
                print(f"警告: {country_name} - {league_name} - {season} 文件缺失")
                continue

            team_df = pd.read_csv(team_file)
            match_df = pd.read_csv(match_file)

            # Normalize team names so team and match files can be joined.
            team_df['team_name'] = team_df.get('common_name', team_df.get('team_name', None)).str.strip().str.lower()
            match_df['home_team_name'] = match_df['home_team_name'].str.strip().str.lower()
            match_df['away_team_name'] = match_df['away_team_name'].str.strip().str.lower()

            team_names = team_df['team_name'].unique()
            original_match_count = len(match_df)
            match_df = match_df[
                match_df['home_team_name'].isin(team_names) & match_df['away_team_name'].isin(team_names)]
            if len(match_df) < original_match_count:
                print(f"警告: 过滤了 {original_match_count - len(match_df)} 场比赛")

            if match_df.empty:
                print("警告: 无有效比赛数据")
                continue

            defensive_stats_df = compute_defensive_stats(match_df, team_df)
            if defensive_stats_df.empty:
                print("警告: 未计算出防守统计")
                continue

            team_df = team_df.merge(defensive_stats_df, on='team_name', how='left')
            for col in stat_columns:
                if col not in team_df.columns:
                    print(f"警告: {col} 列缺失，设为0")
                    team_df[col] = 0

            # Min-max normalized defense score: fewer goals conceded -> closer to 1.
            conceded = team_df['total_goals_conceded']
            conceded_max, conceded_min = conceded.max(), conceded.min()
            if conceded_max != conceded_min:
                team_df['normalized_defense_score'] = (conceded_max - conceded) / (
                    conceded_max - conceded_min + 1e-8)
            else:
                team_df['normalized_defense_score'] = 0

            # PCA on the defensive features (one row per team).
            defensive_columns = [col for col in team_df.columns if col in defensive_feature_names]
            team_df_defensive = team_df[defensive_columns].fillna(0)
            kf = KFold(n_splits=5, shuffle=True, random_state=42)
            X_pca_avg = _cross_validated_pca(team_df_defensive, kf)

            team_positions = pd.DataFrame({
                'team_name': team_df['team_name'], 'PC1': X_pca_avg[:, 0], 'PC2': X_pca_avg[:, 1],
                'points_per_game': team_df['points_per_game'], 'league': league_name, 'season': season,
                'ratio1': team_df['ratio1'], 'ratio2': team_df['ratio2'], 'ratio3': team_df['ratio3'],
                'ratio4': team_df['ratio4'], 'ratio5': team_df['ratio5'], 'ratio6': team_df['ratio6'],
                'normalized_defense_score': team_df['normalized_defense_score'],
                'total_goals_conceded': team_df['total_goals_conceded'],
                'average_goals_conceded': team_df['average_goals_conceded']
            })
            team_positions['team_season'] = league_name + '_' + team_positions['team_name'] + '_' + team_positions[
                'season']
            all_team_positions.append(team_positions)

            # PCA on the numeric match features (one row per match).
            match_df = match_df.drop(columns=ignored_match_columns, errors='ignore')
            numeric_cols = match_df.select_dtypes(include=[np.number]).columns
            # Mean-impute; columns that are entirely NaN have no mean, so they
            # are zero-filled to keep NaN out of the scaler and PCA.
            match_df[numeric_cols] = match_df[numeric_cols].fillna(match_df[numeric_cols].mean()).fillna(0)
            X_pca_match_avg = _cross_validated_pca(match_df[numeric_cols], kf)

            match_positions = pd.DataFrame({
                'home_team_name': match_df['home_team_name'], 'away_team_name': match_df['away_team_name'],
                'PC1': X_pca_match_avg[:, 0], 'PC2': X_pca_match_avg[:, 1], 'league': league_name, 'season': season
            })
            all_match_positions.append(match_positions)

    if not all_team_positions:
        # pd.concat would raise an opaque ValueError on an empty list.
        raise ValueError(f"no league data could be loaded from {base_path!r}")

    return pd.concat(all_team_positions, ignore_index=True), pd.concat(all_match_positions, ignore_index=True)

# ====================== 损失函数 ======================
def compute_total_loss(positions, match_home_idx, match_away_idx, match_PC1, match_PC2, points_per_game, rank_scale,
                       ratios, w, normalized_defense_score, lambda_defense, lambda_supervision, lambda_reg, elo_scores):
    """Return the scalar training loss for the current team positions.

    The loss is the sum of four terms:

    * match loss - Elo-weighted mean of the min-max normalized distances
      between each match point and its home and away team positions;
    * defense loss (times ``lambda_defense``) - squared gap between the second
      position coordinate and the defense target ``-sum(|w| * ratios)``;
    * supervision loss (times ``lambda_supervision``) - squared gap between the
      defense target and ``normalized_defense_score``;
    * regularization - ``lambda_reg`` times the sum of squared weights.

    ``points_per_game`` is accepted but not used by this function. With no
    matches the result is a constant 0.0, and a NaN or infinite total is also
    replaced by 0.0.
    """
    eps = 1e-8
    if tf.shape(match_home_idx)[0] == 0:
        return tf.constant(0.0, dtype=tf.float32)

    # --- match term ---------------------------------------------------------
    match_points = tf.stack([match_PC1, match_PC2], axis=1)
    home_dist = tf.norm(tf.gather(positions, match_home_idx) - match_points + eps, axis=1)
    away_dist = tf.norm(tf.gather(positions, match_away_idx) - match_points + eps, axis=1)
    pooled = tf.concat([home_dist, away_dist], axis=0)
    spread = tf.reduce_max(pooled) - tf.reduce_min(pooled) + eps
    home_dist_scaled = (home_dist - tf.reduce_min(pooled)) / spread
    away_dist_scaled = (away_dist - tf.reduce_min(pooled)) / spread

    # Matches between teams with a large Elo gap get a smaller weight.
    elo_gap = tf.abs(tf.gather(elo_scores, match_home_idx) - tf.gather(elo_scores, match_away_idx))
    match_weight = 1.0 / (1.0 + elo_gap * rank_scale + eps)
    match_term = tf.reduce_mean(match_weight * (home_dist_scaled + away_dist_scaled))

    # --- defense and supervision terms ---------------------------------------
    abs_w = tf.abs(w)  # weights are used by magnitude only
    target = -tf.reduce_sum(abs_w * ratios, axis=1)
    defense_term = tf.reduce_mean(tf.square(positions[:, 1] - target))
    supervision_term = tf.reduce_mean(tf.square(target - normalized_defense_score))

    # --- weight regularization ------------------------------------------------
    reg_term = lambda_reg * tf.reduce_sum(tf.square(abs_w))

    combined = match_term + lambda_defense * defense_term + lambda_supervision * supervision_term + reg_term
    not_finite = tf.math.is_nan(combined) | tf.math.is_inf(combined)
    return tf.where(not_finite, 0.0, combined)

# ====================== Adam优化函数 ======================
def adam_optimize_positions(team_positions_df, match_positions_df, initial_lr=0.0005, decay_steps=200000,
                            decay_rate=0.9,
                            clipnorm=0.5, iterations=30000, verbose_interval=1000, random_seed=42, lambda_defense=0.1,
                            lambda_supervision=0.1, lambda_reg=0.01, patience=100, w=None):
    """Optimize team positions, ``rank_scale`` and ratio weights with Adam.

    Positions (``PC1``/``PC2``) and the eight ratio/goal columns are
    standardized, then ``compute_total_loss`` is minimized with respect to the
    positions, the scalar ``rank_scale`` and the weight vector ``w``. Training
    stops early after ``patience`` consecutive iterations without a new best
    loss. The parameters that produced the best loss are restored and the
    positions are written back to the frame in their original scale.

    Args:
        team_positions_df: Team table with ``team_season``, ``team_name``,
            ``PC1``, ``PC2``, ``points_per_game``, ``normalized_defense_score``
            and the ratio/goal columns.
        match_positions_df: Match table with ``league``, ``season``,
            ``home_team_name``, ``away_team_name``, ``PC1`` and ``PC2``.
        initial_lr, decay_steps, decay_rate: Exponential-decay learning-rate
            schedule parameters.
        clipnorm: Gradient-norm clipping value passed to Adam.
        iterations: Maximum number of optimization steps.
        verbose_interval: Progress is printed every this many steps.
        random_seed: Seed passed to ``tf.random.set_seed``.
        lambda_defense, lambda_supervision, lambda_reg: Loss-term weights.
        patience: Early-stopping patience in iterations.
        w: Initial ratio weights (absolute values are used); defaults to eight
            equal weights of 1/8.

    Returns:
        Tuple ``(losses, team_positions_df, best_rank_scale, best_w)``. When
        no match can be linked to known team seasons the result is
        ``([], team_positions_df, None, None)`` with the frame unchanged.
    """
    team_seasons = team_positions_df['team_season'].unique()
    team_season_to_idx = {t: i for i, t in enumerate(team_seasons)}

    elo_scores_dict = initialize_elo_scores(team_positions_df)
    # Take team names from the column, in the same first-occurrence order as
    # `team_seasons`, instead of parsing them back out of the composite key.
    team_names = team_positions_df.drop_duplicates('team_season')['team_name']
    elo_scores = tf.convert_to_tensor([elo_scores_dict[name] for name in team_names], dtype=tf.float32)

    team_positions_df = team_positions_df.set_index('team_season')
    init_positions = team_positions_df[['PC1', 'PC2']].values
    ratios = team_positions_df[['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6', 'total_goals_conceded', 'average_goals_conceded']].values
    scaler_pos = StandardScaler()
    scaler_rat = StandardScaler()
    positions_scaled = scaler_pos.fit_transform(init_positions)
    ratios_scaled = scaler_rat.fit_transform(ratios)

    positions = tf.Variable(positions_scaled, dtype=tf.float32)
    ratios = tf.constant(ratios_scaled, dtype=tf.float32)
    normalized_defense_score = tf.constant(team_positions_df['normalized_defense_score'].values, dtype=tf.float32)
    points_per_game = tf.constant(team_positions_df['points_per_game'].values, dtype=tf.float32)

    # Vectorized lookup of the team-season index for both sides of each match.
    # Matches whose home or away team season is unknown are dropped.
    league_part = match_positions_df['league'].astype(str)
    season_part = match_positions_df['season'].astype(str)
    home_keys = league_part + '_' + match_positions_df['home_team_name'].astype(str) + '_' + season_part
    away_keys = league_part + '_' + match_positions_df['away_team_name'].astype(str) + '_' + season_part
    home_lookup = home_keys.map(team_season_to_idx)
    away_lookup = away_keys.map(team_season_to_idx)
    linked = home_lookup.notna() & away_lookup.notna()
    if not linked.any():
        print("警告: 无有效比赛数据")
        return [], team_positions_df.reset_index(), None, None

    match_home_idx = tf.constant(home_lookup[linked].to_numpy(dtype=np.int32), dtype=tf.int32)
    match_away_idx = tf.constant(away_lookup[linked].to_numpy(dtype=np.int32), dtype=tf.int32)
    match_PC1 = tf.constant(match_positions_df.loc[linked, 'PC1'].to_numpy(dtype=np.float32), dtype=tf.float32)
    match_PC2 = tf.constant(match_positions_df.loc[linked, 'PC2'].to_numpy(dtype=np.float32), dtype=tf.float32)

    tf.random.set_seed(random_seed)
    rank_scale = tf.Variable(1.0, dtype=tf.float32)
    w = tf.Variable(np.abs(w) if w is not None else np.ones(8, dtype=np.float32) / 8, dtype=tf.float32)
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(initial_lr, decay_steps, decay_rate)
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=clipnorm)
    trainable = [positions, rank_scale, w]

    losses, best_loss, patience_counter = [], float('inf'), 0
    best_positions, best_rank_scale, best_w = positions.numpy().copy(), 1.0, w.numpy().copy()

    for i in range(iterations):
        with tf.GradientTape() as tape:
            loss = compute_total_loss(positions, match_home_idx, match_away_idx, match_PC1, match_PC2, points_per_game,
                                      rank_scale, ratios, w, normalized_defense_score, lambda_defense,
                                      lambda_supervision, lambda_reg, elo_scores)
        grads = tape.gradient(loss, trainable)
        if any(g is None for g in grads):
            print(f"警告: 迭代 {i + 1} 梯度为None")
            continue
        loss_val = float(loss.numpy())
        losses.append(loss_val)

        if loss_val < best_loss:
            # Snapshot BEFORE applying the gradients: `loss_val` was computed
            # from the current variable values, so these are the parameters
            # that achieved it.
            best_loss = loss_val
            best_positions, best_rank_scale, best_w = positions.numpy().copy(), float(
                rank_scale.numpy()), w.numpy().copy()
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"早停触发，训练在第 {i + 1} 轮停止")
            break

        optimizer.apply_gradients(zip(grads, trainable))

        if (i + 1) % verbose_interval == 0:
            print(f"迭代 {i + 1}/{iterations}, 损失 = {loss_val:.4f}, rank_scale = {rank_scale.numpy():.4f}")

    # Restore the best parameters and map positions back to the original scale.
    positions.assign(best_positions)
    rank_scale.assign(best_rank_scale)
    w.assign(best_w)
    final_pos = scaler_pos.inverse_transform(positions.numpy())
    for team_season, (pc1, pc2) in zip(team_seasons, final_pos):
        team_positions_df.loc[team_season, 'PC1'] = pc1
        team_positions_df.loc[team_season, 'PC2'] = pc2
    team_positions_df = team_positions_df.reset_index()
    return losses, team_positions_df, best_rank_scale, best_w

# ====================== 随机搜索超参数调整 ======================
def random_search_hyperparameters(team_positions, match_positions, w, n_iter=10, random_state=42):
    """Randomly search the loss-weight hyperparameters of ``adam_optimize_positions``.

    Each trial draws ``lambda_defense`` and ``lambda_supervision`` from
    ``uniform(0.01, 0.2)`` and ``lambda_reg`` from ``uniform(0.001, 0.02)``
    (scipy's ``loc``/``scale`` parameterization, i.e. the interval
    ``[loc, loc + scale]``), then runs a full optimization on copies of the
    input frames.

    Args:
        team_positions: Team-position DataFrame forwarded to the optimizer.
        match_positions: Match-position DataFrame forwarded to the optimizer.
        w: Initial ratio weights forwarded to the optimizer as ``w``.
        n_iter: Number of random trials.
        random_state: Seed applied to NumPy's global random state, which the
            unseeded ``rvs()`` draws rely on.

    Returns:
        The parameter dict of the best trial, or ``None`` when no trial
        produced any loss value.
    """
    np.random.seed(random_state)
    best_loss, best_params = float('inf'), None

    for _ in range(n_iter):
        params = {
            'lambda_defense': uniform(0.01, 0.2).rvs(),
            'lambda_supervision': uniform(0.01, 0.2).rvs(),
            'lambda_reg': uniform(0.001, 0.02).rvs()
        }
        print(f"\n随机搜索: {params}")

        losses, _, _, _ = adam_optimize_positions(
            team_positions.copy(), match_positions.copy(), w=w, **params
        )
        if not losses:
            # The optimizer found nothing to train on; this trial cannot be ranked.
            continue

        # The optimizer restores its best-loss state before returning, so a
        # trial is scored by its minimum loss. The last recorded loss is the
        # value after up to `patience` non-improving steps and is stale.
        trial_loss = min(losses)
        if trial_loss < best_loss:
            best_loss, best_params = trial_loss, params

    print(f"\n最佳超参数: {best_params}, 最佳损失: {best_loss:.4f}")
    return best_params

# ====================== 可视化函数 ======================
def visualize_team_evolution_by_league_static(team_positions_df, seasons_order,
                                              output_dir="/Users/peixuanma/Downloads/Output_Graphs"):
    """Save one PNG per league showing each team's (PC1, PC2) path over seasons.

    Teams that appear in as many distinct seasons as ``seasons_order`` has
    entries are plotted; if a league has no such team, all of its teams are
    plotted instead.

    Args:
        team_positions_df: Frame with ``league``, ``team_name``, ``season``,
            ``PC1`` and ``PC2`` columns.
        seasons_order: Sequence of season labels; only its length is used.
        output_dir: Directory that receives the ``<league>_evolution.png``
            files. It is created when missing. Defaults to the original
            hard-coded location.
    """
    os.makedirs(output_dir, exist_ok=True)
    expected_seasons = len(seasons_order)

    for league in team_positions_df['league'].unique():
        league_df = team_positions_df[team_positions_df['league'] == league].copy()
        valid_teams = league_df.groupby("team_name").filter(
            lambda grp: grp['season'].nunique() == expected_seasons)['team_name'].unique()
        if not valid_teams.size:
            # No team covers every season: fall back to plotting all of them.
            valid_teams = league_df['team_name'].unique()
        valid_df = league_df[league_df['team_name'].isin(valid_teams)].sort_values(['team_name', 'season'])

        plt.figure(figsize=(10, 8))
        try:
            plt.title(f"{league} - 球队演变 (进攻-防守)")
            plt.xlabel("PC1 (进攻)")
            plt.ylabel("PC2 (防守)")
            plt.grid(True)

            for team in valid_df['team_name'].unique():
                sub = valid_df[valid_df['team_name'] == team]
                plt.plot(sub['PC1'], sub['PC2'], marker='o', label=team)
                # Label every point with its season.
                for _, row in sub.iterrows():
                    plt.text(row['PC1'], row['PC2'], row['season'], fontsize=8, ha='right')

            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.tight_layout()
            filename = os.path.join(output_dir, f"{league}_evolution.png")
            plt.savefig(filename, dpi=150)
        finally:
            # Always release the figure, even when plotting or saving fails.
            plt.close()
        print(f"保存 {league} 图 -> {filename}")

# ====================== 主函数 ======================
# Script entry point: load data, read the defensive weights interactively,
# search hyperparameters, train, export the positions and plot them.
if __name__ == "__main__":
    base_path = '/Users/peixuanma/Downloads/data1'
    # (country, league) pairs passed to the loader.
    leagues = [
        ("england", "premier-league"), ("germany", "bundesliga"), ("spain", "la-liga"),
        ("france", "ligue-1"), ("france", "ligue-2"),("italy", "serie-a"), ("netherlands", "eredivisie"),
        ("portugal", "ligapro"), ("denmark", "superliga"), ("england", "championship"),("spain", "segunda-division"),("switzerland","super-league"),
        ("portugal", "liga-nos"), ("italy", "serie-b"), ("germany", "2-bundesliga"),("scotland","premiership"),("belgium","pro-league"),("austria","bundesliga"),
    ]
    seasons = [
        "2013-to-2014", "2014-to-2015", "2015-to-2016", "2016-to-2017", "2017-to-2018",
        "2018-to-2019", "2019-to-2020","2020-to-2021", "2021-to-2022", "2022-to-2023", "2023-to-2024"
    ]

    # Load data
    print("===== 加载数据 =====")
    all_team_positions, all_match_positions = load_all_league_data(base_path, leagues, seasons)

    # Read one weight per defensive feature from standard input
    print("\n===== 输入防守权重 =====")
    feature_names = ['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6', 'total_goals_conceded', 'average_goals_conceded']
    weights = []
    for i, feature in enumerate(feature_names):
        weight = float(input(f"请输入第 {i+1} 个权重（对应特征: {feature}）: "))
        weights.append(weight)
    weights = np.array(weights, dtype=np.float32)
    weight_sum = np.sum(weights)
    # Dividing by a zero or non-finite sum would silently turn every weight
    # into NaN/inf and poison the optimisation, so stop with a clear error.
    if not np.isfinite(weight_sum) or weight_sum == 0:
        raise ValueError("权重之和为 0 或不是有限值，无法归一化")
    weights /= weight_sum  # normalise so the weights sum to 1
    print("输入的归一化权重:", weights)

    # Random hyperparameter search
    print("\n===== 随机搜索超参数 =====")
    best_params = random_search_hyperparameters(all_team_positions, all_match_positions, weights)
    if best_params is None:
        # The search returns None when no trial produced a loss; unpacking
        # None with ** below would only raise an opaque TypeError.
        raise SystemExit("随机搜索未得到任何有效损失，无法进行最终训练")

    # Final training with the selected hyperparameters
    print("\n===== 最终训练 =====")
    final_losses, final_team_positions, final_rank_scale, final_w = adam_optimize_positions(
        all_team_positions.copy(), all_match_positions.copy(), w=weights, **best_params
    )
    if final_losses:
        final_team_positions.to_csv("trained_team_positions.csv", index=False)
        print(f"最终损失: {final_losses[-1]:.4f}")

    # Visualisation
    print("\n===== 可视化 =====")
    visualize_team_evolution_by_league_static(final_team_positions, seasons)

    print("\n完成！")

scikit-learn 版本: 1.4.2
===== 加载数据 =====
加载数据: england - premier-league - 2013-to-2014
加载数据: england - premier-league - 2014-to-2015
加载数据: england - premier-league - 2015-to-2016
加载数据: england - premier-league - 2016-to-2017
加载数据: england - premier-league - 2017-to-2018
加载数据: england - premier-league - 2018-to-2019
加载数据: england - premier-league - 2019-to-2020
加载数据: england - premier-league - 2020-to-2021
加载数据: england - premier-league - 2021-to-2022
加载数据: england - premier-league - 2022-to-2023
加载数据: england - premier-league - 2023-to-2024
加载数据: germany - bundesliga - 2013-to-2014
加载数据: germany - bundesliga - 2014-to-2015
加载数据: germany - bundesliga - 2015-to-2016
加载数据: germany - bundesliga - 2016-to-2017
加载数据: germany - bundesliga - 2017-to-2018
加载数据: germany - bundesliga - 2018-to-2019
加载数据: germany - bundesliga - 2019-to-2020
加载数据: germany - bundesliga - 2020-to-2021
加载数据: germany - bundesliga - 2021-to-2022
加载数据: germany - bundesliga - 2022-to-2023
加载数据: germany - bundesliga - 2023-

  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 premier-league 图 -> /Users/peixuanma/Downloads/Output_Graphs/premier-league_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 bundesliga 图 -> /Users/peixuanma/Downloads/Output_Graphs/bundesliga_evolution.png
保存 la-liga 图 -> /Users/peixuanma/Downloads/Output_Graphs/la-liga_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 ligue-1 图 -> /Users/peixuanma/Downloads/Output_Graphs/ligue-1_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 ligue-2 图 -> /Users/peixuanma/Downloads/Output_Graphs/ligue-2_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 serie-a 图 -> /Users/peixuanma/Downloads/Output_Graphs/serie-a_evolution.png
保存 eredivisie 图 -> /Users/peixuanma/Downloads/Output_Graphs/eredivisie_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 ligapro 图 -> /Users/peixuanma/Downloads/Output_Graphs/ligapro_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 superliga 图 -> /Users/peixuanma/Downloads/Output_Graphs/superliga_evolution.png
保存 championship 图 -> /Users/peixuanma/Downloads/Output_Graphs/championship_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 segunda-division 图 -> /Users/peixuanma/Downloads/Output_Graphs/segunda-division_evolution.png
保存 super-league 图 -> /Users/peixuanma/Downloads/Output_Graphs/super-league_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 liga-nos 图 -> /Users/peixuanma/Downloads/Output_Graphs/liga-nos_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 serie-b 图 -> /Users/peixuanma/Downloads/Output_Graphs/serie-b_evolution.png
保存 2-bundesliga 图 -> /Users/peixuanma/Downloads/Output_Graphs/2-bundesliga_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 premiership 图 -> /Users/peixuanma/Downloads/Output_Graphs/premiership_evolution.png


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)
  plt.savefig(filename, dpi=150)


保存 pro-league 图 -> /Users/peixuanma/Downloads/Output_Graphs/pro-league_evolution.png

完成！


In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix
from scipy.stats import uniform
import sklearn

# Print the installed scikit-learn version.
print(f"scikit-learn 版本: {sklearn.__version__}")

# ====================== 攻防ELO评分模块 (无修改) ======================
class OffensiveDefensiveELO:
    """Keep separate offensive and defensive Elo ratings per team, updated from xG.

    ``ratings`` maps a team name to ``{'off': float, 'def': float}``. Teams
    passed to the constructor start at ``initial_elo``; a team first seen in
    ``update_ratings`` is registered at ``initial_elo`` as well.
    """

    def __init__(self, teams, initial_elo=1500, k_factor=30):
        """Create the rating table.

        Args:
            teams: Iterable of team names to register up front.
            initial_elo: Starting value for both ratings of every team.
            k_factor: Multiplier applied to the xG surprise in each update.
        """
        self.ratings = {team: {'off': initial_elo, 'def': initial_elo} for team in teams}
        self.k = k_factor
        self.initial_elo = initial_elo

    def get_rating(self, team):
        """Return the rating dict of ``team``, or initial ratings if unknown.

        Unknown teams are not added to ``ratings`` by this lookup.
        """
        return self.ratings.get(team, {'off': self.initial_elo, 'def': self.initial_elo})

    def _entry(self, team):
        """Return the stored, mutable rating dict of ``team``, creating it if needed."""
        return self.ratings.setdefault(
            team, {'off': self.initial_elo, 'def': self.initial_elo})

    def update_ratings(self, home_team, away_team, home_xg, away_xg):
        """Update both teams' ratings from one match's expected-goals figures.

        The match's total xG is split between the sides in proportion to the
        logistic expectations of each offence against the opposing defence.
        Each side's surprise (actual minus expected xG) times ``k`` is added
        to its own ``off`` rating and subtracted from the opponent's ``def``
        rating. A match whose total xG is 0 changes nothing.
        """
        home_ratings = self.get_rating(home_team)
        away_ratings = self.get_rating(away_team)
        home_off_elo, home_def_elo = home_ratings['off'], home_ratings['def']
        away_off_elo, away_def_elo = away_ratings['off'], away_ratings['def']

        exp_home_off_vs_away_def = 1 / (1 + 10 ** ((away_def_elo - home_off_elo) / 400))
        exp_away_off_vs_home_def = 1 / (1 + 10 ** ((home_def_elo - away_off_elo) / 400))

        total_xg = home_xg + away_xg
        if total_xg == 0:
            return

        exp_total = exp_home_off_vs_away_def + exp_away_off_vs_home_def
        expected_xg_home = total_xg * (exp_home_off_vs_away_def / exp_total)
        expected_xg_away = total_xg * (exp_away_off_vs_home_def / exp_total)
        home_offense_performance = self.k * (home_xg - expected_xg_home)
        away_offense_performance = self.k * (away_xg - expected_xg_away)

        # get_rating() tolerates unknown teams, so the update must too:
        # register them here instead of raising KeyError on the writes below.
        home_entry = self._entry(home_team)
        away_entry = self._entry(away_team)
        home_entry['off'] += home_offense_performance
        away_entry['def'] -= home_offense_performance
        away_entry['off'] += away_offense_performance
        home_entry['def'] -= away_offense_performance

def calculate_elo_history(match_df, all_teams):
    """Replay matches in timestamp order and record each side's pre-match Elo.

    ``match_df`` is modified in place: ``home_xg`` / ``away_xg`` are created
    with 0.5 when either is missing, then coerced to numeric with
    unparseable values replaced by 0.5.

    Args:
        match_df: Matches with ``timestamp``, ``home_team_name`` and
            ``away_team_name`` columns (xG columns optional).
        all_teams: Team names used to initialise the rating system.

    Returns:
        ``(history_df, ratings)`` where ``history_df`` has one row per match
        (``match_id`` is the position in the sorted order) holding the
        ratings before that match, and ``ratings`` is the final rating dict.

    Raises:
        ValueError: If ``match_df`` has no ``timestamp`` column.
    """
    if 'timestamp' not in match_df.columns:
        raise ValueError("DataFrame必须包含 'timestamp' 列用于排序")

    xg_columns = ('home_xg', 'away_xg')
    if any(column not in match_df.columns for column in xg_columns):
        print("警告: xG数据列缺失，将使用0.5作为默认值")
        match_df['home_xg'], match_df['away_xg'] = 0.5, 0.5
    for column in xg_columns:
        match_df[column] = pd.to_numeric(match_df[column], errors='coerce').fillna(0.5)

    ordered = match_df.sort_values(by='timestamp').reset_index(drop=True)
    tracker = OffensiveDefensiveELO(teams=all_teams)

    snapshots = []
    for match_id, match in ordered.iterrows():
        host = match['home_team_name']
        guest = match['away_team_name']
        host_elo = tracker.get_rating(host)
        guest_elo = tracker.get_rating(guest)
        # Record the ratings as they stood before this match is applied.
        snapshots.append({
            'match_id': match_id,
            'home_team_name': host,
            'away_team_name': guest,
            'home_off_elo_before': host_elo['off'],
            'home_def_elo_before': host_elo['def'],
            'away_off_elo_before': guest_elo['off'],
            'away_def_elo_before': guest_elo['def'],
        })
        tracker.update_ratings(host, guest, match['home_xg'], match['away_xg'])

    return pd.DataFrame(snapshots), tracker.ratings

# ====================== 其他函数 (无修改) ======================
def compute_defensive_stats(match_df, team_positions_df):
    """Aggregate per-team defensive ratios and goals conceded.

    For every home match whose two teams both appear in
    ``team_positions_df``, six "away goals divided by X" ratios are collected
    for the home team (X = away pre-match PPG, away corners, home cards plus
    fouls, away xG, away shots, away possession). Goals conceded and match
    counts are accumulated for home matches (same filter) and for every away
    match of a known team.

    Returns:
        DataFrame with ``team_name``, the mean of each ratio (0 when no
        sample), ``total_goals_conceded`` and ``average_goals_conceded``.
    """
    eps = 1e-8
    list_keys = [f'ratio{n}_list' for n in range(1, 7)]

    accumulators = {}
    for name in team_positions_df['team_name'].unique():
        entry = {'total_goals_conceded': 0, 'num_matches': 0}
        for key in list_keys:
            entry[key] = []
        accumulators[name] = entry

    # Pass 1: home matches, both sides must be known teams.
    for _, match in match_df.iterrows():
        host, guest = match['home_team_name'], match['away_team_name']
        if not (host in accumulators and guest in accumulators):
            continue
        host_stats = accumulators[host]

        guest_ppg = match.get('Pre-Match PPG (Away)', 0)
        if guest_ppg > 0:
            host_stats['ratio1_list'].append(match['away_team_goal_count'] / (guest_ppg + eps))

        guest_corners = match.get('away_team_corner_count', 0)
        if guest_corners > 0:
            host_stats['ratio2_list'].append(match['away_team_goal_count'] / (guest_corners + eps))

        discipline = (match.get('home_team_yellow_cards', 0)
                      + match.get('home_team_red_cards', 0)
                      + match.get('home_team_fouls', 0)
                      + eps)
        host_stats['ratio3_list'].append(match.get('away_team_goal_count', 0) / discipline)

        guest_xg = match.get('away_xg', 0)
        if guest_xg > 0:
            host_stats['ratio4_list'].append(match['away_team_goal_count'] / (guest_xg + eps))

        guest_shots = (match.get('away_team_shots_on_target', 0)
                       + match.get('away_team_shots_off_target', 0)
                       + eps)
        host_stats['ratio5_list'].append(match.get('away_team_goal_count', 0) / guest_shots)

        guest_possession = match.get('away_team_possession', 0)
        if guest_possession > 0:
            host_stats['ratio6_list'].append(
                match.get('away_team_goal_count', 0) / (guest_possession + eps))

        host_stats['total_goals_conceded'] += match.get('away_team_goal_count', 0)
        host_stats['num_matches'] += 1

    # Pass 2: away matches, only the away side must be a known team.
    for _, match in match_df.iterrows():
        guest = match['away_team_name']
        if guest not in accumulators:
            continue
        guest_stats = accumulators[guest]
        guest_stats['total_goals_conceded'] += match.get('home_team_goal_count', 0)
        guest_stats['num_matches'] += 1

    records = []
    for name, stats in accumulators.items():
        played = stats['num_matches']
        record = {'team_name': name}
        for n in range(1, 7):
            samples = stats[f'ratio{n}_list']
            record[f'ratio{n}'] = np.mean(samples) if samples else 0
        record['total_goals_conceded'] = stats['total_goals_conceded']
        record['average_goals_conceded'] = stats['total_goals_conceded'] / played if played > 0 else 0
        records.append(record)
    return pd.DataFrame(records)

def compute_home_away_diff(match_df, team_positions_df, final_elos):
    """Compute an Elo-weighted home/away xG ratio per team and season.

    For each opponent a team met both at home and away in a season, the
    ratio is the mean of (xG created at home / xG created away) and
    (xG conceded at home / xG conceded away), using the first such match of
    each kind. Ratios are averaged with the opponent's mean of offensive and
    defensive Elo as weight (1500 for opponents missing from ``final_elos``).

    Returns:
        Dict keyed ``"<team>_<season>"``; the value is 1.0 when the team
        played that season but had no opponent met both home and away.
        Team/season pairs with no matches are absent.
    """
    eps = 1e-8
    mean_elo = {name: (rating['off'] + rating['def']) / 2 for name, rating in final_elos.items()}
    result = {}

    for team in team_positions_df['team_name'].unique():
        for season in match_df['season'].unique():
            involved = (match_df['home_team_name'] == team) | (match_df['away_team_name'] == team)
            season_games = match_df[involved & (match_df['season'] == season)]
            if season_games.empty:
                continue

            rivals = set(season_games['home_team_name'].unique()) | set(season_games['away_team_name'].unique())
            rivals.discard(team)

            ratios = []
            elo_weights = []
            for rival in rivals:
                at_home = season_games[(season_games['home_team_name'] == team)
                                       & (season_games['away_team_name'] == rival)]
                on_road = season_games[(season_games['away_team_name'] == team)
                                       & (season_games['home_team_name'] == rival)]
                if at_home.empty or on_road.empty:
                    continue
                # Only the first home and first away fixture against a rival are used.
                created_home = at_home.get('home_xg', pd.Series([0])).values[0]
                created_away = on_road.get('away_xg', pd.Series([0])).values[0]
                conceded_home = at_home.get('away_xg', pd.Series([0])).values[0]
                conceded_away = on_road.get('home_xg', pd.Series([0])).values[0]
                attack_ratio = created_home / (created_away + eps)
                defence_ratio = conceded_home / (conceded_away + eps)
                ratios.append((attack_ratio + defence_ratio) / 2)
                elo_weights.append(mean_elo.get(rival, 1500))

            key = f"{team}_{season}"
            if ratios:
                result[key] = np.average(ratios, weights=elo_weights)
            else:
                result[key] = 1.0
    return result

def load_all_league_data(base_path, leagues, seasons):
    """Load and combine the team and match CSVs of every league/season pair.

    Files are expected at
    ``<base_path>/<country>-<league>-teams-<season>-stats.csv`` and
    ``...-matches-<season>-stats.csv``; pairs with a missing file or with no
    match between known teams are skipped. Team names are stripped and
    lower-cased, ``league`` / ``season`` columns are added to both tables,
    and the output of ``compute_defensive_stats`` is merged onto the teams.

    Args:
        base_path: Directory containing the CSV files.
        leagues: Iterable of ``(country_name, league_name)`` pairs.
        seasons: Iterable of season labels.

    Returns:
        ``(combined_teams, combined_matches)``; teams are de-duplicated on
        ``(team_name, season)``.

    Raises:
        ValueError: If no league/season pair yielded any data.
    """
    all_team_positions, all_match_positions = [], []
    for country_name, league_name in leagues:
        for season in seasons:
            prefix = f"{country_name}-{league_name}"
            team_file = os.path.join(base_path, f"{prefix}-teams-{season}-stats.csv")
            match_file = os.path.join(base_path, f"{prefix}-matches-{season}-stats.csv")
            if not os.path.exists(team_file) or not os.path.exists(match_file):
                continue

            team_df, match_df = pd.read_csv(team_file), pd.read_csv(match_file)
            # Prefer 'common_name' and fall back to an existing 'team_name' column.
            team_df['team_name'] = team_df.get('common_name', team_df.get('team_name', None)).str.strip().str.lower()
            match_df['home_team_name'] = match_df['home_team_name'].str.strip().str.lower()
            match_df['away_team_name'] = match_df['away_team_name'].str.strip().str.lower()

            team_names = team_df['team_name'].unique()
            known = match_df['home_team_name'].isin(team_names) & match_df['away_team_name'].isin(team_names)
            # Copy the filtered slice so the column assignments below write to
            # an independent frame instead of a view of the raw CSV data.
            match_df = match_df[known].copy()
            if match_df.empty:
                continue

            match_df['league'], match_df['season'] = league_name, season
            team_df['league'], team_df['season'] = league_name, season
            team_df = team_df.merge(compute_defensive_stats(match_df, team_df), on='team_name', how='left')
            all_team_positions.append(team_df)
            all_match_positions.append(match_df)

    if not all_team_positions:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f"在 {base_path} 下未找到任何可用的联赛数据文件")

    combined_teams = pd.concat(all_team_positions, ignore_index=True).drop_duplicates(subset=['team_name', 'season'])
    combined_matches = pd.concat(all_match_positions, ignore_index=True)
    return combined_teams, combined_matches

# ====================== 核心修改：Adaboost优化权重 ======================
def compute_adaboost_weights(team_positions_df):
    """Derive weights for the six defensive ratios from AdaBoost feature importances.

    Teams are labelled 1 when their ``average_goals_conceded`` is above the
    median and 0 otherwise (missing values count as 0). An AdaBoost
    classifier is tuned with a grid search on an 80% stratified training
    split of the standardised ``ratio1``..``ratio6`` columns, and the
    estimator-weighted mean of the trees' feature importances, normalised to
    sum to 1, is returned.

    Uniform weights (1/6 each) are returned, with a printed warning, when
    there are fewer than 10 rows, the target has fewer than two distinct
    values, a class has fewer than two samples, or the training split cannot
    support 2-fold cross-validation. Uniform weights are also returned when
    all importances are zero.

    Returns:
        ``numpy.ndarray`` of shape ``(6,)`` and dtype ``float32``.
    """
    # Need enough rows for the split and for cross-validation.
    if len(team_positions_df) < 10:
        print("警告: 数据量过小，无法进行Adaboost权重计算。返回默认权重。")
        return np.ones(6, dtype=np.float32) / 6

    # The label column must vary, otherwise there is nothing to classify.
    target = team_positions_df['average_goals_conceded']
    if target.nunique() < 2:
        print("警告: 目标变量 'average_goals_conceded' 缺乏变化，无法进行分类。返回默认权重。")
        return np.ones(6, dtype=np.float32) / 6

    # Median split. This gives the same labels as a two-bin quantile cut
    # (values <= median -> 0, values > median -> 1, NaN -> 0) but does not
    # raise when the median coincides with the minimum or maximum.
    labels = (target > target.median()).astype(int)
    class_counts = labels.value_counts()
    if len(class_counts) < 2 or class_counts.min() < 2:
        # Stratified splitting needs both classes with at least two members.
        print("警告: 按中位数划分后类别样本不足，无法进行分层拆分。返回默认权重。")
        return np.ones(6, dtype=np.float32) / 6

    ratios = team_positions_df[['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6']].replace([np.inf, -np.inf], np.nan).fillna(0)
    ratios_scaled = StandardScaler().fit_transform(ratios)

    # Only the training part is used; the held-out 20% is discarded.
    X_train, _, y_train, _ = train_test_split(ratios_scaled, labels, test_size=0.2, random_state=42, stratify=labels)

    # The number of folds cannot exceed the smallest class and must be >= 2.
    min_class_count = y_train.value_counts().min()
    cv_value = min(5, min_class_count)
    if cv_value < 2:
        print(f"警告: 训练集中最小类别样本数({min_class_count})过少，无法进行交叉验证。返回默认权重。")
        return np.ones(6, dtype=np.float32) / 6

    ada = AdaBoostClassifier(algorithm='SAMME', random_state=42)
    param_grid = {
        'estimator': [DecisionTreeClassifier(max_depth=5), DecisionTreeClassifier(max_depth=10)],
        'n_estimators': [50, 100, 200, 500],
        'learning_rate': [0.01, 0.1, 0.5],
    }
    # n_jobs=1 is kept from the original, which set it to avoid a protobuf error.
    grid_search = GridSearchCV(ada, param_grid, cv=cv_value, scoring='accuracy', n_jobs=1)
    grid_search.fit(X_train, y_train)

    # With the default refit=True the search has already refitted the best
    # configuration on the whole training split, so no second fit is needed.
    adaboost = grid_search.best_estimator_

    feature_importances = np.zeros(6)
    if hasattr(adaboost, 'estimator_weights_'):
        for est_weight, est in zip(adaboost.estimator_weights_, adaboost.estimators_):
            feature_importances += est_weight * est.feature_importances_
        weight_total = np.sum(adaboost.estimator_weights_)
        if weight_total > 0:
            feature_importances /= weight_total

    importance_total = np.sum(feature_importances)
    weights = feature_importances / importance_total if importance_total > 0 else np.ones(6) / 6
    return weights.astype(np.float32)

# ====================== 损失函数和优化器 (无修改) ======================
def compute_total_loss(positions, match_data, points_per_game, rank_scale, ratios, w, normalized_defense_score, lambda_defense, lambda_supervision, lambda_reg):
    """Return the scalar training loss for the current team positions.

    The loss is the sum of four terms:
      * match term: mean over matches of an Elo-based weight times the
        min-max-normalised distances from both teams' positions to the
        match point (columns 2 and 3 of ``match_data``);
      * ``lambda_defense`` times the mean squared gap between each team's
        second coordinate and ``-sum(|w| * ratios)``;
      * ``lambda_supervision`` times the mean squared gap between that same
        target and ``normalized_defense_score``;
      * ``lambda_reg`` times the sum of squared ``|w|``.

    ``match_data`` columns are read as: home index, away index, match PC1,
    match PC2, home off Elo, home def Elo, away off Elo, away def Elo.
    ``points_per_game`` is accepted but not used. A NaN or infinite total is
    replaced by 0.0.
    """
    eps = 1e-8

    # Unpack the match matrix column by column.
    home_idx = tf.cast(match_data[:, 0], dtype=tf.int32)
    away_idx = tf.cast(match_data[:, 1], dtype=tf.int32)
    match_pc1 = match_data[:, 2]
    match_pc2 = match_data[:, 3]
    home_off = match_data[:, 4]
    home_def = match_data[:, 5]
    away_off = match_data[:, 6]
    away_def = match_data[:, 7]

    home_xy = tf.gather(positions, home_idx)
    away_xy = tf.gather(positions, away_idx)
    anchor = tf.stack([match_pc1, match_pc2], axis=1)

    # Distance of each side to the match point, min-max scaled over both sides.
    home_dist = tf.norm(home_xy - anchor + eps, axis=1)
    away_dist = tf.norm(away_xy - anchor + eps, axis=1)
    pooled = tf.concat([home_dist, away_dist], axis=0)
    spread = tf.reduce_max(pooled) - tf.reduce_min(pooled) + eps
    home_scaled = (home_dist - tf.reduce_min(pooled)) / spread
    away_scaled = (away_dist - tf.reduce_min(pooled)) / spread

    # Matches with a larger Elo mismatch receive a smaller weight.
    mismatch = tf.abs((home_off - away_def) - (away_off - home_def))
    match_weight = 1.0 / (1.0 + mismatch * rank_scale + eps)
    match_term = tf.reduce_mean(match_weight * (home_scaled + away_scaled))

    abs_w = tf.abs(w)
    defense_target = -tf.reduce_sum(abs_w * ratios, axis=1)
    defense_term = tf.reduce_mean(tf.square(positions[:, 1] - defense_target))
    supervision_term = tf.reduce_mean(tf.square(defense_target - normalized_defense_score))
    reg_term = lambda_reg * tf.reduce_sum(tf.square(abs_w))

    total = match_term + lambda_defense * defense_term + lambda_supervision * supervision_term + reg_term
    not_finite = tf.math.is_nan(total) | tf.math.is_inf(total)
    return tf.where(not_finite, 0.0, total)

def adam_optimize_positions(team_positions_df, match_positions_with_elo_df, initial_lr=0.0005, iterations=50000, patience=100, w=None, lambda_defense=0.1, lambda_supervision=0.1, lambda_reg=0.01):
    """Fit 2-D team positions, the Elo scale and the ratio weights with Adam.

    Positions start from random values (standardised), are optimised against
    ``compute_total_loss`` with early stopping, and the parameters that
    produced the lowest recorded loss are restored before returning.

    Args:
        team_positions_df: Frame with a ``team_season`` key column plus
            ``ratio1``..``ratio6`` (missing ratio columns become 0),
            ``normalized_defense_score`` and ``points_per_game``.
        match_positions_with_elo_df: Matches with ``league``, ``season``,
            team names and the four ``*_elo_before`` columns; ``PC1`` /
            ``PC2`` default to 0 when absent.
        initial_lr: Adam learning rate.
        iterations: Maximum number of optimisation steps.
        patience: Number of consecutive non-improving steps before stopping.
        w: Initial ratio weights (absolute value is used); uniform if None.
        lambda_defense, lambda_supervision, lambda_reg: Loss term weights.

    Returns:
        ``(losses, team_positions_df, rank_scale, w)``. ``losses`` is the
        per-step loss list, the frame has ``PC1`` / ``PC2`` filled in, and
        ``rank_scale`` / ``w`` are the values at the best step. When no match
        maps onto known team-seasons, returns ``([], frame, None, None)``.
    """
    team_seasons = team_positions_df['team_season'].unique()
    team_season_to_idx = {t: i for i, t in enumerate(team_seasons)}
    team_positions_df = team_positions_df.set_index('team_season')

    init_positions = np.random.rand(len(team_seasons), 2)
    ratios = team_positions_df.loc[team_seasons].reindex(columns=['ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6']).fillna(0).values
    scaler_pos, scaler_rat = StandardScaler(), StandardScaler()
    positions_scaled = scaler_pos.fit_transform(init_positions)
    ratios_scaled = scaler_rat.fit_transform(ratios)
    positions = tf.Variable(positions_scaled, dtype=tf.float32)
    ratios_tf = tf.constant(ratios_scaled, dtype=tf.float32)
    normalized_defense_score = tf.constant(team_positions_df.loc[team_seasons, 'normalized_defense_score'].fillna(0).values, dtype=tf.float32)
    points_per_game = tf.constant(team_positions_df.loc[team_seasons, 'points_per_game'].fillna(0).values, dtype=tf.float32)

    # One row per match whose two team-seasons are both known.
    match_array = []
    for _, row in match_positions_with_elo_df.iterrows():
        home_ts = f"{row['league']}_{row['home_team_name']}_{row['season']}"
        away_ts = f"{row['league']}_{row['away_team_name']}_{row['season']}"
        if home_ts in team_season_to_idx and away_ts in team_season_to_idx:
            match_array.append([team_season_to_idx[home_ts], team_season_to_idx[away_ts], row.get('PC1', 0), row.get('PC2', 0), row['home_off_elo_before'], row['home_def_elo_before'], row['away_off_elo_before'], row['away_def_elo_before']])
    if not match_array:
        print("警告: 无有效比赛数据用于优化")
        return [], team_positions_df.reset_index(), None, None
    match_data_tf = tf.constant(np.array(match_array, dtype=np.float32))

    rank_scale = tf.Variable(1.0, dtype=tf.float32)
    w_tf = tf.Variable(np.abs(w) if w is not None else np.ones(6, dtype=np.float32) / 6, dtype=tf.float32)
    optimizer = tf.keras.optimizers.Adam(learning_rate=initial_lr)

    losses, best_loss, patience_counter = [], float('inf'), 0
    best_positions = positions.numpy().copy()
    best_rank_scale = float(rank_scale.numpy())
    best_w = w_tf.numpy().copy()

    for i in range(iterations):
        with tf.GradientTape() as tape:
            loss = compute_total_loss(positions, match_data_tf, points_per_game, rank_scale, ratios_tf, w_tf, normalized_defense_score, lambda_defense, lambda_supervision, lambda_reg)
        grads = tape.gradient(loss, [positions, rank_scale, w_tf])
        if grads[0] is None:
            continue

        loss_val = float(loss.numpy())
        losses.append(loss_val)
        if loss_val < best_loss:
            # Snapshot BEFORE applying the gradients: the loss was evaluated
            # on the current values, so these are the parameters that
            # actually achieved it. All three trainables are saved so the
            # returned rank_scale and w match the returned positions.
            best_loss = loss_val
            best_positions = positions.numpy().copy()
            best_rank_scale = float(rank_scale.numpy())
            best_w = w_tf.numpy().copy()
            patience_counter = 0
        else:
            patience_counter += 1

        optimizer.apply_gradients(zip(grads, [positions, rank_scale, w_tf]))

        if patience_counter >= patience:
            print(f"早停触发，训练在第 {i + 1} 轮停止")
            break
        if (i + 1) % 1000 == 0:
            print(f"迭代 {i + 1}/{iterations}, 损失 = {loss_val:.4f}")

    # Roll every trainable back to the best recorded step.
    positions.assign(best_positions)
    rank_scale.assign(best_rank_scale)
    w_tf.assign(best_w)

    # Map the standardised positions back through the scaler that was fitted
    # on the random initial positions.
    final_pos = scaler_pos.inverse_transform(positions.numpy())
    team_positions_df['PC1'], team_positions_df['PC2'] = final_pos[:, 0], final_pos[:, 1]
    return losses, team_positions_df.reset_index(), float(rank_scale.numpy()), w_tf.numpy()

def random_search_hyperparameters(*args, **kwargs):
    """No-op stub: accept any arguments, do nothing and return None."""
    return None
def visualize_team_evolution_by_league_static(*args, **kwargs):
    """Placeholder stub: accept any arguments, do nothing, return None."""
    return None

# ====================== 主函数 (流程重构) ======================
if __name__ == "__main__":
    # Script entry point: load league data, build the ELO history, derive
    # AdaBoost weights, run the position optimisation and save the result.
    base_path = '/Users/peixuanma/Downloads/data1'
    leagues = [
        ("england", "premier-league"),
        ("germany", "bundesliga"),
        ("spain", "la-liga"),
    ]
    seasons = ["2022-to-2023", "2023-to-2024"]

    print("===== 1. 加载所有联赛和赛季数据 =====")
    all_teams_df, all_matches_df = load_all_league_data(base_path, leagues, seasons)

    # Make sure every column needed downstream exists; a missing column is
    # filled with uniform random values in [0, 1).
    for col in ['points_per_game', 'average_goals_conceded', 'normalized_defense_score', 'ratio1', 'ratio2', 'ratio3', 'ratio4', 'ratio5', 'ratio6']:
        if col in all_teams_df.columns:
            continue
        all_teams_df[col] = np.random.rand(len(all_teams_df))

    # Composite key "<league>_<team_name>_<season>" identifying a team-season.
    all_teams_df['team_season'] = (
        all_teams_df['league'].astype(str)
        + '_' + all_teams_df['team_name'].astype(str)
        + '_' + all_teams_df['season'].astype(str)
    )

    print("===== 2. 计算全局攻防ELO历史 =====")
    all_teams_list = all_teams_df['team_name'].unique()
    elo_history_df, final_elos = calculate_elo_history(all_matches_df, all_teams_list)

    # match_id is the row position after sorting matches by timestamp.
    # NOTE(review): presumably this matches the match_id numbering produced by
    # calculate_elo_history -- confirm against that function.
    all_matches_df_sorted = all_matches_df.sort_values(by='timestamp').reset_index(drop=True)
    all_matches_df_sorted = all_matches_df_sorted.reset_index().rename(columns={'index': 'match_id'})
    matches_with_elo_df = all_matches_df_sorted.merge(
        elo_history_df,
        on=['match_id', 'home_team_name', 'away_team_name'],
        how='left',
    )
    print("ELO历史已合并到比赛数据中。")

    print("===== 3. 计算其他特征 (Adaboost, 主客场差异等) =====")
    adaboost_weights = compute_adaboost_weights(all_teams_df)

    print("\n===== 4. 最终训练 =====")
    best_params = {'lambda_defense': 0.1, 'lambda_supervision': 0.1, 'lambda_reg': 0.01}
    # Copies are passed in, so the optimiser works on its own frames.
    final_losses, final_team_positions, final_rank_scale, final_w = adam_optimize_positions(
        all_teams_df.copy(),
        matches_with_elo_df.copy(),
        w=adaboost_weights,
        **best_params,
    )

    # An empty loss history means no optimisation step was recorded.
    if not final_losses:
        print("\n训练失败或没有有效数据。")
    else:
        final_team_positions.to_csv("trained_team_positions_with_od_elo.csv", index=False)
        print(f"\n训练完成！最终损失: {final_losses[-1]:.4f}")
        print("最终位置数据已保存到 'trained_team_positions_with_od_elo.csv'")

    print("\n完成！")

scikit-learn 版本: 1.4.2
===== 1. 加载所有联赛和赛季数据 =====
===== 2. 计算全局攻防ELO历史 =====
警告: xG数据列缺失，将使用0.5作为默认值
ELO历史已合并到比赛数据中。
===== 3. 计算其他特征 (Adaboost, 主客场差异等) =====

===== 4. 最终训练 =====
迭代 1000/50000, 损失 = 0.4824
迭代 2000/50000, 损失 = 0.1244


KeyboardInterrupt: 