In [1]:
import pandas as pd
import csv
import io

# 原始数据
input_filename = 'Spotify_Dataset_V3.csv'
# 清洗后数据
output_filename = 'Spotify_Weekly_Data_Cleaned.csv'


def robust_clean_spotify_data(file_path):
    """
    最终版清洗流程:
    1. 使用 utf-8-sig 编码和 csv 模块进行健壮的解析.
    2. 将干净的数据加载到 pandas.
    3. 按 (歌曲, 周) 聚合.
    4. 对结果进行排序.
    """
    try:
        #1: 解析 CSV 文件 
        print("步骤1:开始以文本形式加载并使用csv模块解析")
        
        parsed_rows = []
        # 使用'utf-8-sig'来处理文件开头的BOM隐藏字符
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            reader = csv.reader(f, delimiter=';', quotechar='"')
            for row in reader:
                parsed_rows.append(row)

        header = parsed_rows[0]
        cleaned_header = [col.split(',')[0].strip() for col in header]

        df = pd.DataFrame(parsed_rows[1:], columns=cleaned_header)
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        df.dropna(axis=1, how='all', inplace=True)
        
        print(f"成功加载并解析文件, 原始数据形状: {df.shape}")
        print(f"解析后的列名: {df.columns.tolist()}")

        # 步骤2:数据类型转换和清理
        print("\n步骤2:开始进行数据类型转换和清理")

        df.rename(columns={'# of Artist': 'Artist_Rank_Str', '# of Nationality': 'Nationality_Rank_Str'}, inplace=True)
        
        if 'id' not in df.columns or 'Date' not in df.columns:
            raise ValueError("关键列'id'或'Date'不在数据中")

        # 将 'Date' 列转换为 datetime 对象
        df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')

        numeric_cols = ['Rank', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 
                        'Acousticness', 'Instrumentalness', 'Valence', 'Points (Total)', 
                        'Points (Ind for each Artist/Nat)']
        for col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            
        df.dropna(subset=['id', 'Date'], inplace=True)
        print("数据类型转换完成")
        
        #  步骤3:转换数据形状 (按周和歌曲ID聚合)
        print("\n步骤 3: 开始转换数据形状 (按周聚合)")
        
        aggregation_rules = {
            'Rank': 'first', 'Title': 'first', 'Artists': 'first',
            'Danceability': 'first', 'Energy': 'first', 'Loudness': 'first', 'Speechiness': 'first',
            'Acousticness': 'first', 'Instrumentalness': 'first', 'Valence': 'first',
            'Points (Total)': 'first', 'Points (Ind for each Artist/Nat)': 'first', 'Song URL': 'first',
            'Artist (Ind.)': lambda x: list(x),
            'Nationality': lambda x: list(x),
            'Continent': lambda x: list(x),
        }

        # groupby自动按日期排序,
        df_weekly = df.groupby(['Date', 'id']).agg(aggregation_rules).reset_index()

        artist_counts = df.groupby(['Date', 'id']).size().reset_index(name='Artist_Count')
        nationality_counts = df.groupby(['Date', 'id'])['Nationality'].nunique().reset_index(name='Nationality_Count')

        df_weekly = pd.merge(df_weekly, artist_counts, on=['Date', 'id'])
        df_weekly = pd.merge(df_weekly, nationality_counts, on=['Date', 'id'])
        
        print(f"数据形状转换完成, 新数据形状: {df_weekly.shape}")

        #步骤 :清洗 Loudness列
        print("\n步骤4:开始修正Loudness列")

        def correct_loudness(loudness):
            if pd.isna(loudness): return None
            if loudness < -100: return loudness / 1000.0
            return loudness

        df_weekly['Loudness_Corrected'] = df_weekly['Loudness'].apply(correct_loudness)
        print("Loudness列修正完成。")
        
        # 进行最终排序数据
        print("\n步骤 5: 开始排序最终数据")
        # 首先按日期降序 (新日期在前), 然后在同一日期内按排名升序 (Rank 1 在前)
        df_weekly.sort_values(by=['Date', 'Rank'], ascending=[False, True], inplace=True)
        print("数据排序完成。")

        #保存数据 
        df_weekly.to_csv(output_filename, index=False, sep=';', encoding='utf-8-sig')
        print(f"\n清洗完成! 适合每周预测的数据已保存到 '{output_filename}'")
        
        return df_weekly

    except FileNotFoundError:
        print(f"错误: 文件 '{file_path}' 未找到。")
        return None
    except Exception as e:
        print(f"处理过程中发生了一个错误: {e}")
        return None

#主程序入口 
if __name__ == "__main__":
    cleaned_data = robust_clean_spotify_data(input_filename)
    if cleaned_data is not None:
        print("\n最终数据预览 (前5行):")
        pd.set_option('display.max_colwidth', 100)
        print(cleaned_data.head())


步骤1:开始以文本形式加载并使用csv模块解析
成功加载并解析文件, 原始数据形状: (651936, 20)
解析后的列名: ['Rank', 'Title', 'Artists', 'Date', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence', '# of Artist', 'Artist (Ind.)', '# of Nationality', 'Nationality', 'Continent', 'Points (Total)', 'Points (Ind for each Artist/Nat)', 'id', 'Song URL']

步骤2:开始进行数据类型转换和清理
数据类型转换完成

步骤 3: 开始转换数据形状 (按周聚合)
数据形状转换完成, 新数据形状: (464475, 20)

步骤4:开始修正Loudness列
Loudness列修正完成。

步骤 5: 开始排序最终数据
数据排序完成。

清洗完成! 适合每周预测的数据已保存到 'Spotify_Weekly_Data_Cleaned.csv'

最终数据预览 (前5行):
             Date                      id  Rank              Title  \
464368 2023-05-29  3qQbCzHBycnDpGskqOWY0E   1.0    Ella Baila Sola   
464471 2023-05-29  7ro0hRteUMfnOioTFI5TG1   2.0     WHERE SHE GOES   
464341 2023-05-29  2UW7JaomAMuX9pZrjVpHAU   3.0    La Bebe - Remix   
464456 2023-05-29  7FbrGaHYVDmfr7KoLIZnQ7   4.0  Cupid - Twin Ver.   
464451 2023-05-29  6pD0ufEQq0xdHSsRbg9LBK   5.0          un x100to   

                 

In [5]:
import pandas as pd
import numpy as np
import ast

input_filename = 'Spotify_Weekly_Data_Cleaned.csv'
output_filename = 'Spotify_Model_Ready_Features_V2.csv'

def create_advanced_features(file_path):
    try:
        print("Step1:Loading weekly aggregated data")
        df = pd.read_csv(file_path, sep=';')
        
        #确保日期列是日期时间对象
        df['Date'] = pd.to_datetime(df['Date'])
        
        print(f"File loaded successfully. Shape: {df.shape}")
        print("\nStep2:Creating advanced features")

        #必须先按歌曲ID分组再按日期升序排序
        df.sort_values(by=['id', 'Date'], ascending=[True, True], inplace=True)
        
        # 第一个新特征创建歌曲生命周期特征上榜周数
        # cumcount函数是从零开始的累积计数加一后即为上榜周数
        df['Weeks_on_chart'] = df.groupby('id').cumcount() + 1
        print("Successfully created 'Weeks_on_chart' feature.")

        # 第二个新特征创建艺人热度特征这是一个多步骤的过程
        # 检查并转换艺人独立列
        # CSV读取时列表会变成字符串需要用此函数安全地转换回来
        if isinstance(df['Artist (Ind.)'].iloc[0], str):
            df['Artist (Ind.)'] = df['Artist (Ind.)'].apply(ast.literal_eval)
        
        # 展开数据框让每一行代表一个艺人歌曲周的组合
        artist_df = df[['Date', 'id', 'Artist (Ind.)', 'Points (Total)']].explode('Artist (Ind.)')

        # 计算每个艺人截至当前日期的历史平均分不包含当前周防止数据泄露
        artist_df.sort_values(['Artist (Ind.)', 'Date'], inplace=True)
        artist_df['artist_hist_avg'] = artist_df.groupby('Artist (Ind.)')['Points (Total)'].transform(
            lambda x: x.shift(1).expanding().mean()
        )
        # 对于艺人的首次出现历史平均分为空值用零填充
        artist_df['artist_hist_avg'].fillna(0, inplace=True)

        # 聚合得到每首歌在每一周的最高艺人热度
        # 即合作艺人中历史表现最好的那个人的分数
        artist_hotness = artist_df.groupby(['Date', 'id'])['artist_hist_avg'].max().reset_index(name='Artist_Hotness')

        # 将计算出的热度特征合并回主数据框
        df = pd.merge(df, artist_hotness, on=['Date', 'id'], how='left')
        print("Successfully created 'Artist_Hotness' feature.")
        print("\n Step 3: Creating basic time-series features")
        
        #按ID分组
        grouped = df.groupby('id')
        
        #创建滞后特征即上一周的表现
        df['Rank_last_week'] = grouped['Rank'].shift(1)
        df['Points_last_week'] = grouped['Points (Total)'].shift(1)
        
        #创建变化量特征
        df['Rank_change'] = df['Rank_last_week'] - df['Rank'] #正数表示排名上升
        df['Points_change'] = df['Points (Total)'] - df['Points_last_week']
        
        #创建滚动特征即最近四周的平均表现
        df['Points_rolling_mean_4w'] = grouped['Points (Total)'].transform(lambda x: x.shift(1).rolling(window=4, min_periods=1).mean())
        df['Rank_rolling_mean_4w'] = grouped['Rank'].transform(lambda x: x.shift(1).rolling(window=4, min_periods=1).mean())
        
        print("Successfully created lag, difference, and rolling features.")


        print("\n Step4:Creating the target variable (next week's points) ")
        #目标是预测下n周的积分所以我们将积分向上移动n行
        df['Points_next_week'] = grouped['Points (Total)'].shift(-1)
        df['Points_next_2weeks'] = grouped['Points (Total)'].shift(-2)
        df['Points_next_4weeks'] = grouped['Points (Total)'].shift(-4)
        print("Target variable 'Points_next_week' created successfully.")
        
        # --- 必要修改 1: 新增排名升降目标变量 ---
        df['Rank_next_week'] = grouped['Rank'].shift(-1)
        rank_change_next_week = df['Rank'] - df['Rank_next_week']
        conditions = [
            rank_change_next_week > 0,
            rank_change_next_week < 0
        ]
        choices = ['Rise', 'Fall']
        df['Rank_change_direction_next_week'] = np.select(conditions, choices, default='Stable')


        print("\nStep5:Cleaning missing values and selecting final features")
        
        # 对于一首歌第一次上榜的记录它的历史特征是空值
        # 用零来填充这可以作为模型识别新上榜的信号
        feature_cols_to_fill = [
            'Rank_last_week', 'Points_last_week', 'Rank_change', 'Points_change',
            'Points_rolling_mean_4w', 'Rank_rolling_mean_4w', 'Artist_Hotness'
        ]
        df[feature_cols_to_fill] = df[feature_cols_to_fill].fillna(0)
        
        # df.dropna(subset=['Points_next_week'], inplace=True)
        
        # 选择最终特征集
        final_features = [
            # 核心音频特征
            'Danceability', 'Energy', 'Loudness_Corrected', 'Speechiness', 
            'Acousticness', 'Instrumentalness', 'Valence',
            # 合作相关特征
            'Artist_Count', 'Nationality_Count',
            # 当前状态特征
            'Rank', 'Points (Total)',
            # 创建历史特征
            'Rank_last_week', 'Points_last_week',
            'Rank_change', 'Points_change', 'Points_rolling_mean_4w', 'Rank_rolling_mean_4w',
            'Weeks_on_chart', 'Artist_Hotness',
            # 目标变量
            'Points_next_week','Points_next_2weeks', 'Points_next_4weeks',
            'Rank_change_direction_next_week'
        ]
        
        # 加入上下文信息列以便后续按时间划分数据集
        context_features = ['Date', 'id', 'Title', 'Artists']
        final_df = df[context_features + final_features].copy()

        print("Final feature selection is complete.")
        print(f"Shape of the model-ready dataset: {final_df.shape}")
        
        #Save Data
        final_df.to_csv(output_filename, index=False, sep=';', encoding='utf-8-sig')
        print(f"\nFeature engineering complete! Model-ready data saved to '{output_filename}'")
        
        return final_df

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found. Please ensure the previous step ran successfully.")
        return None
    except Exception as e:
        print(f"An error occurred during processing: {e}")
        return None

# Main execution block 
if __name__ == "__main__":
    model_ready_data = create_advanced_features(input_filename)
    if model_ready_data is not None:
        print("\nFinal data preview (first 5 rows):")
        pd.set_option('display.max_columns', None) # 显示所有列
        print(model_ready_data.head())



Step1:Loading weekly aggregated data
File loaded successfully. Shape: (464475, 21)

Step2:Creating advanced features
Successfully created 'Weeks_on_chart' feature.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  artist_df['artist_hist_avg'].fillna(0, inplace=True)


Successfully created 'Artist_Hotness' feature.

 Step 3: Creating basic time-series features
Successfully created lag, difference, and rolling features.

 Step4:Creating the target variable (next week's points) 
Target variable 'Points_next_week' created successfully.

Step5:Cleaning missing values and selecting final features
Final feature selection is complete.
Shape of the model-ready dataset: (455363, 26)

Feature engineering complete! Model-ready data saved to 'Spotify_Model_Ready_Features_V2.csv'

Final data preview (first 5 rows):
        Date                      id                                 Title  \
0 2017-03-24  000xQL6tZNLJzIrtIgxqSl  Still Got Time (feat. PARTYNEXTDOOR)   
1 2017-03-25  000xQL6tZNLJzIrtIgxqSl  Still Got Time (feat. PARTYNEXTDOOR)   
2 2017-03-26  000xQL6tZNLJzIrtIgxqSl  Still Got Time (feat. PARTYNEXTDOOR)   
3 2017-03-27  000xQL6tZNLJzIrtIgxqSl  Still Got Time (feat. PARTYNEXTDOOR)   
4 2017-03-28  000xQL6tZNLJzIrtIgxqSl  Still Got Time (feat. PARTYN

### Motivation for RQ4: From Missing Values to Chart (Re-)Entry Prediction

在构建特征时，我们为每首歌曲生成了其上一周的相关特征，例如：

```python
df['Rank_last_week'] = grouped['Rank'].shift(1)
df['Points_last_week'] = grouped['Points (Total)'].shift(1)

在这一过程中，我们注意到：如果某首歌在下一周未再出现在榜单中（即下榜），其对应的 Points_next_week 将被设置为缺失值并最终被剔除。
这意味着模型在 RQ1 的训练中，只能学习那些“连续上榜”的样本，而未能考虑“下榜”或“重新上榜”的动态行为。

基于这一发现，我们进一步提出了新的研究任务 —— 上下榜预测（Drop & Re-entry Prediction）。
这一扩展的任务旨在弥补 RQ1 在逻辑与任务本质上的局限，使模型不仅能够预测积分变化趋势，还能识别歌曲热度的转折点：

下榜预测（Drop Prediction）：判断当前上榜歌曲是否将在下一周退出榜单；

上榜预测（Re-entry Prediction）：判断当前未上榜的歌曲是否会在下一周重新进入榜单。

In [3]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, precision_score, recall_score
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns # 导入seaborn库以美化图表
import numpy as np
import joblib
from lightgbm import early_stopping

#  Configurations
input_filename = 'Spotify_Model_Ready_Features_V2.csv'
metrics_output_filename = 'model_performance_metrics_cv.csv'
importance_output_filename = 'feature_importance_final_model.csv'
model_output_filename = 'lgbm_spotify_model.pkl'

def train_with_time_series_cv(file_path, model_params=None, target_column='Points_next_week'):
    try:
        sns.set_theme(style="whitegrid", palette="viridis", font_scale=1.1)
        
        print("Step 1: Loading model-ready data")
        df = pd.read_csv(file_path, sep=';')
        df['Date'] = pd.to_datetime(df['Date'])
        df.sort_values('Date', inplace=True)
        print(f"File loaded and sorted by date. Shape: {df.shape}")

        # 自动区分不同预测周期的输出文件名
        suffix = target_column.replace('Points_', '')
        metrics_output_filename = f"metrics_{suffix}.csv"
        importance_output_filename = f"importance_{suffix}.csv"
        model_output_filename = f"model_{suffix}.pkl"

        #  Define Features and Target 
        print("\nStep 2: Defining features and target")

        feature_columns = [
            'Danceability', 'Energy', 'Loudness_Corrected', 'Speechiness', 
            'Acousticness', 'Instrumentalness', 'Valence', 'Artist_Count', 
            'Nationality_Count', 'Rank', 'Points (Total)', 'Rank_last_week', 
            'Points_last_week', 'Rank_change', 'Points_change', 
            'Points_rolling_mean_4w', 'Rank_rolling_mean_4w', 
            'Weeks_on_chart', 'Artist_Hotness'
        ]

        X = df[feature_columns]
        y = df[target_column]

        # Time Series Cross-Validation
        print("\nStep 3: Performing Time Series Cross-Validation")

        tscv = TimeSeriesSplit(n_splits=5)
        
        mae_scores, r2_scores, spearman_scores = [], [], []

        # 我们只在最后一次分割上进行可视化，以避免生成过多图片
        last_val_indices = None

        # 如果没有指定参数，就用默认参数
        if model_params is None:
            model_params = {'random_state': 42, 'n_estimators': 500, 'learning_rate': 0.05, 'num_leaves': 31}
        
        print(f"Using model parameters: {model_params}")

        for fold, (train_index, val_index) in enumerate(tscv.split(X)):
            last_val_indices = val_index # 持续更新，最后一次循环时即为最终测试集
            print(f"Fold {fold+1}/5")
            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y.iloc[train_index], y.iloc[val_index]
            
            train_dates = df.iloc[train_index]['Date']
            val_dates = df.iloc[val_index]['Date']
            print(f"Training on {len(X_train)} samples from {train_dates.min().date()} to {train_dates.max().date()}")
            print(f"Validating on {len(X_val)} samples from {val_dates.min().date()} to {val_dates.max().date()}")

            lgbm = lgb.LGBMRegressor(**model_params, device='gpu')
            lgbm.fit(
                X_train, y_train,
                eval_set=[(X_val, y_val)],
                eval_metric='l1',
                callbacks=[early_stopping(stopping_rounds=50, verbose=False)]
            )
            
            predictions = lgbm.predict(X_val)
            
            mae = mean_absolute_error(y_val, predictions)
            r2 = r2_score(y_val, predictions)
            spearman_corr, _ = spearmanr(y_val, predictions)
            
            mae_scores.append(mae)
            r2_scores.append(r2)
            spearman_scores.append(spearman_corr)
            print(f"MAE: {mae:.2f}, R2: {r2:.2f}, Spearman Corr: {spearman_corr:.2f}\n")

        print("\nStep4:Aggregating Cross-Validation Result")

        metrics_df = pd.DataFrame({
            'Metric': ['Average MAE (5-fold CV)', 'Std Dev of MAE (5-fold CV)', 'Average R2 (5-fold CV)', 'Average Spearman Correlation'],
            'Value': [np.mean(mae_scores), np.std(mae_scores), np.mean(r2_scores), np.mean(spearman_scores)],
            'Description': [
                '5次验证的平均绝对误差，这是模型性能的稳健估计',
                '误差的标准差，衡量模型性能的稳定性，越小越好',
                '5次验证的平均R平方，衡量模型的平均解释能力',
                '预测排名与真实排名的一致性，越接近1越好'
            ]
        })
        
        print("\nModel Performance (Cross-Validated)")
        print(metrics_df.to_string())
        metrics_df.to_csv(metrics_output_filename, index=False, sep=';', encoding='utf-8-sig')
        print(f"\nCross-validated metrics saved to '{metrics_output_filename}'")

        # Train Final Model and Analyze Importance 
        print("\nStep 5 Training final model on ALL data for feature analysis")
    
        final_model = lgb.LGBMRegressor(**model_params, device='gpu')
        final_model.fit(X, y)
        print("Final model training complete.")

        joblib.dump(final_model, model_output_filename)
        print(f"Final model saved to '{model_output_filename}'")
        
        #Step 6: Final Evaluation and Plotting
        print("\n Step 6: Final evaluation on a hold-out set and plotting")
        
        #使用最后一次交叉验证的验证集作为我们的最终测试集，以保持一致性
        test_df = df.iloc[last_val_indices].copy()
        final_predictions = final_model.predict(test_df[feature_columns])
        y_test = test_df[target_column]

        #1.特征重要性图 (Figure 2)
        feature_importance_df = pd.DataFrame({
            'feature': feature_columns,
            'importance': final_model.feature_importances_
        }).sort_values('importance', ascending=False).reset_index(drop=True)
        feature_importance_df.to_csv(importance_output_filename, index=False, sep=';', encoding='utf-8-sig')
        print(f"\nFinal model's feature importances saved to '{importance_output_filename}'")
        
        plt.figure(figsize=(12, 8))
        sns.barplot(x="importance", y="feature", data=feature_importance_df.head(15), palette="viridis")
        plt.title("Top 15 Feature Importances (Final Model)", fontsize=18, weight='bold')
        plt.xlabel("LightGBM Feature Importance", fontsize=14)
        plt.ylabel("Feature", fontsize=14)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        plt.close()
        print("Feature importance plot saved as 'feature_importance.png'")
        
        # 2.真实值 vs 预测值散点图 (Figure 3)
        fig, ax = plt.subplots(figsize=(8, 8))
        sns.scatterplot(x=y_test, y=final_predictions, alpha=0.5, ax=ax, edgecolor='k', s=80)
        ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Perfect Prediction')
        ax.set_xlabel("Actual Points (Next Week)", fontsize=14)
        ax.set_ylabel("Predicted Points (Next Week)", fontsize=14)
        ax.set_title("Actual vs. Predicted Points on Test Set", fontsize=16, weight='bold')
        ax.legend()
        ax.set_aspect('equal', adjustable='box')
        plt.tight_layout()
        plt.savefig('actual_vs_predicted.png')
        plt.close()
        print("Actual vs. Predicted plot saved as 'actual_vs_predicted.png'")

        # 3.残差图 (Figure 5)
        residuals = y_test - final_predictions
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x=final_predictions, y=residuals, alpha=0.5, edgecolor='k', s=80)
        plt.axhline(y=0, color='r', linestyle='--')
        plt.xlabel("Predicted Points", fontsize=14)
        plt.ylabel("Residuals (Actual - Predicted)", fontsize=14)
        plt.title("Residual Plot", fontsize=16, weight='bold')
        plt.tight_layout()
        plt.savefig('residuals_plot.png')
        plt.close()
        print("Residual plot saved as 'residuals_plot.png'")
        
        # 4.预测误差分布图 (Figure 6)
        plt.figure(figsize=(10, 6))
        sns.histplot(residuals, kde=True, bins=50)
        plt.title('Distribution of Prediction Errors (Residuals)', fontsize=16, weight='bold')
        plt.xlabel('Prediction Error (Actual - Predicted)', fontsize=14)
        plt.ylabel('Frequency', fontsize=14)
        plt.axvline(x=residuals.mean(), color='r', linestyle='--', label=f'Mean Error: {residuals.mean():.2f}')
        plt.legend()
        plt.tight_layout()
        plt.savefig('prediction_error_distribution.png')
        plt.close()
        print("Prediction error distribution plot saved as 'prediction_error_distribution.png'")

                # --- Step 7: Out-of-Time (OOT) Hold-Out Testing ---
        print("\n Step 7: Performing Out-of-Time (OOT) Hold-Out Testing")

        # 划分最近 3 个月的数据作为未来未见的测试集
        split_date = df['Date'].max() - pd.DateOffset(months=3)
        oot_df = df[df['Date'] >= split_date].copy()
        X_oot = oot_df[feature_columns]
        y_oot = oot_df[target_column]

        oot_predictions = final_model.predict(X_oot)

        # 计算指标
        mae_oot = mean_absolute_error(y_oot, oot_predictions)
        r2_oot = r2_score(y_oot, oot_predictions)
        spearman_oot, _ = spearmanr(y_oot, oot_predictions)

        print("\n--- OOT Hold-Out Results ---")
        print(f"Time Range: {oot_df['Date'].min().date()} → {oot_df['Date'].max().date()}")
        print(f"Samples: {len(oot_df)}")
        print(f"MAE: {mae_oot:.2f}")
        print(f"R²: {r2_oot:.2f}")
        print(f"Spearman Corr: {spearman_oot:.2f}")

        # 保存结果到文件
        oot_results_df = pd.DataFrame({
            'Metric': ['MAE', 'R2', 'Spearman'],
            'Value': [mae_oot, r2_oot, spearman_oot],
            'Description': [
                '绝对误差（越低越好）',
                '解释方差（越高越好）',
                '预测与真实的排名一致性（越高越好）'
            ]
        })
        oot_results_df.to_csv('oot_holdout_results.csv', index=False, sep=';', encoding='utf-8-sig')
        print("OOT metrics saved to 'oot_holdout_results.csv'")

        # 绘制 OOT 测试结果散点图
        plt.figure(figsize=(8, 8))
        sns.scatterplot(x=y_oot, y=oot_predictions, alpha=0.5, edgecolor='k', s=80)
        plt.plot([y_oot.min(), y_oot.max()], [y_oot.min(), y_oot.max()], 'r--', lw=2)
        plt.title('OOT Test: Actual vs Predicted (Unseen Future Period)', fontsize=16, weight='bold')
        plt.xlabel('Actual Points (Next Week)', fontsize=14)
        plt.ylabel('Predicted Points (Next Week)', fontsize=14)
        plt.tight_layout()
        plt.savefig('oot_actual_vs_predicted.png')
        plt.close()
        print("OOT actual vs predicted plot saved as 'oot_actual_vs_predicted.png'")


        results = lgbm.evals_result_
        plt.plot(results['valid_0']['l1'])
        plt.title(f'Fold {fold+1} Validation MAE over Iterations')
        plt.xlabel('Iteration')
        plt.ylabel('MAE')
        plt.tight_layout()
        plt.savefig(f'learning_curve_fold_{fold+1}.png')
        plt.close()

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred during processing: {e}")

def hyperparameter_search(X, y):
    """
    (可选) 使用随机搜索寻找最佳超参数。
    """
    print("\n (Optional) Starting Hyperparameter Search")
    param_dist = {
        'n_estimators': [500, 1000, 1500, 2000],
        'learning_rate': [0.01, 0.02, 0.05, 0.1],
        'num_leaves': [31, 50, 70, 100],
        'max_depth': [-1, 10, 20, 30],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'subsample': [0.7, 0.8, 0.9, 1.0],
    }

    tscv = TimeSeriesSplit(n_splits=5)
    lgbm = lgb.LGBMRegressor(random_state=42)

    random_search = RandomizedSearchCV(
        estimator=lgbm, param_distributions=param_dist, n_iter=25,
        scoring='neg_mean_absolute_error', cv=tscv, n_jobs=-1,
        verbose=2, random_state=42
    )

    random_search.fit(X, y)
    print("\nBest parameters found:")
    print(random_search.best_params_)
    return random_search.best_params_


if __name__ == "__main__":

    print("\n\n== RUNNING IN HYPERPARAMETER SEARCH MODE ===")
    df = pd.read_csv(input_filename, sep=';')
    df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values('Date', inplace=True)

    feature_columns = [
        'Danceability', 'Energy', 'Loudness_Corrected', 'Speechiness', 
        'Acousticness', 'Instrumentalness', 'Valence', 'Artist_Count', 
        'Nationality_Count', 'Rank', 'Points (Total)', 'Rank_last_week', 
        'Points_last_week', 'Rank_change', 'Points_change', 
        'Points_rolling_mean_4w', 'Rank_rolling_mean_4w', 
        'Weeks_on_chart', 'Artist_Hotness'
    ]

    target_columns = [
        'Points_next_week',
        'Points_next_2weeks',
        'Points_next_4weeks'
    ]

    short_term_df = df.dropna(subset=['Points_next_week'])
    X_all = short_term_df[feature_columns]
    y_all = short_term_df['Points_next_week']
    best_params = hyperparameter_search(X_all, y_all)
    best_params['random_state'] = 42

    for target in target_columns:
        print(f"\n\n===== Training model for target: {target} =====")
        train_with_time_series_cv(input_filename, model_params=best_params, target_column=target)



== RUNNING IN HYPERPARAMETER SEARCH MODE ===

 (Optional) Starting Hyperparameter Search
Fitting 5 folds for each of 25 candidates, totalling 125 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013307 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3932
[LightGBM] [Info] Number of data points in the train set: 455363, number of used features: 19
[LightGBM] [Info] Start training from score 100.749850

Best parameters found:
{'subsample': 0.9, 'num_leaves': 50, 'n_estimators': 1000, 'max_depth': 30, 'learning_rate': 0.01, 'colsample_bytree': 0.8}


===== Training model for target: Points_next_week =====
Step 1: Loading model-ready data
File loaded and sorted by date. Shape: (455363, 24)

Step 2: Defining features and target

Step 3: Performing Time Series Cross-Validation
Using model parameters: {'subsample': 0.9, 'num_leaves': 50, 'n_estimators': 1000, 'max_depth': 30, 'learning_rate': 0.01, '

In [4]:
import pandas as pd
import joblib
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

model_filename = 'lgbm_spotify_model.pkl'
data_filename = 'Spotify_Model_Ready_Features_V2.csv'
output_metrics_filename = 'rq2_performance_comparison_advanced.csv'
output_plot_filename = 'rq2_mae_comparison_advanced.png'

def analyze_rq2_advanced(model_path, data_path):
    """
    执行RQ2的深度分析：比较模型在不同启动类型（冷/温/热启动）上的表现。
    """
    try:
        print("Step 1: Loading final model and data")
        model = joblib.load(model_path)
        df = pd.read_csv(data_path, sep=';')
        df['Date'] = pd.to_datetime(df['Date'])
        print(f"Model and data loaded successfully. Data shape: {df.shape}")

        print("\nStep 2: Defining test set and making predictions")
        split_date = df['Date'].max() - pd.DateOffset(months=3)
        test_df = df[df['Date'] >= split_date].copy()
        
        feature_columns = [
            'Danceability', 'Energy', 'Loudness_Corrected', 'Speechiness', 
            'Acousticness', 'Instrumentalness', 'Valence', 'Artist_Count', 
            'Nationality_Count', 'Rank', 'Points (Total)', 'Rank_last_week', 
            'Points_last_week', 'Rank_change', 'Points_change', 
            'Points_rolling_mean_4w', 'Rank_rolling_mean_4w', 
            'Weeks_on_chart', 'Artist_Hotness'
        ]
        target_column = 'Points_next_week'

        X_test = test_df[feature_columns]
        y_test = test_df[target_column]
        
        predictions = model.predict(X_test)
        test_df['predictions'] = predictions

        print("\nStep 3: Defining granular groups for analysis")
        
        def define_start_type(row):
            is_new_song = row['Weeks_on_chart'] == 1
            is_new_artist = row['Artist_Hotness'] == 0
            
            if is_new_song and is_new_artist:
                return 'True Cold Start (New Song, New Artist)'
            elif is_new_song and not is_new_artist:
                return 'Warm Start (New Song, Established Artist)'
            else: # is_old_song
                return 'Hot Start (Old Song)'

        test_df['start_type'] = test_df.apply(define_start_type, axis=1)
        
        print("Groups defined based on 'start_type'.")
        print("\nGroup Counts:")
        print(test_df['start_type'].value_counts())




        print("\nStep 4: Calculating performance metrics for each group ")
        results = []

        def evaluate_group(group_df, group_name):
            if len(group_df) == 0: return None
            mae = mean_absolute_error(group_df[target_column], group_df['predictions'])
            r2 = r2_score(group_df[target_column], group_df['predictions'])
            spearman_corr, _ = spearmanr(group_df[target_column], group_df['predictions'])
            return {
                'Group': group_name,
                'Sample_Size': len(group_df),
                'MAE': mae,
                'R2_Score': r2,
                'Spearman_Correlation': spearman_corr
            }
        
        # 按我们新定义的“启动类型”进行分组评估
        for group_name, group_df in test_df.groupby('start_type'):
            result = evaluate_group(group_df, group_name)
            if result:
                results.append(result)
        
        # 结果汇总
        print("\nStep 5: Aggregating and saving results")
        
        results_df = pd.DataFrame(results).sort_values('MAE')
        
        print("\nRQ2 Performance Comparison (Advanced)")
        print(results_df.to_string())
        
        results_df.to_csv(output_metrics_filename, index=False, sep=';', encoding='utf-8-sig')
        print(f"\nRQ2 analysis results saved to '{output_metrics_filename}'")

        #可视化对比结果
        print("\nStep 6: Generating comparison plot")
        
        sns.set_theme(style="whitegrid", palette="viridis", font_scale=1.1)
        plt.figure(figsize=(12, 7))
        
        sns.barplot(x='MAE', y='Group', data=results_df, orient='h')
        
        plt.title('Model Prediction Error (MAE) Comparison by Start Type', fontsize=18, weight='bold')
        plt.xlabel('Mean Absolute Error (Lower is Better)', fontsize=14)
        plt.ylabel('Group Type', fontsize=14)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.tight_layout()
        plt.savefig(output_plot_filename)
        plt.close()
        
        print(f"Comparison plot saved to '{output_plot_filename}'")
        
    except FileNotFoundError as e:
        print(f"Error: {e}. Make sure '{model_path}' and '{data_path}' exist.")
    except Exception as e:
        print(f"An error occurred during processing: {e}")

# 主程序入口
if __name__ == "__main__":
    analyze_rq2_advanced(model_filename, data_filename)


Step 1: Loading final model and data
Model and data loaded successfully. Data shape: (455363, 24)

Step 2: Defining test set and making predictions

Step 3: Defining granular groups for analysis
Groups defined based on 'start_type'.

Group Counts:
start_type
Hot Start (Old Song)                         17458
Warm Start (New Song, Established Artist)      170
True Cold Start (New Song, New Artist)          23
Name: count, dtype: int64

Step 4: Calculating performance metrics for each group 

Step 5: Aggregating and saving results

RQ2 Performance Comparison (Advanced)
                                       Group  Sample_Size        MAE  R2_Score  Spearman_Correlation
0                       Hot Start (Old Song)        17458   7.275243  0.961700              0.980375
2  Warm Start (New Song, Established Artist)          170  23.821281  0.716141              0.841260
1     True Cold Start (New Song, New Artist)           23  34.008921 -0.019694              0.363726

RQ2 analysis results 

### 模型性能分析：热启动、温启动与冷启动

在这一分析中，我们根据模型在不同情境下的表现，分别探讨了热启动（Hot Start）、温启动（Warm Start）和真·冷启动（True Cold Start）的情况。以下是三种情况下的详细表现及其解释。

---

#### 1. **热启动 (Hot Start / 老歌)** - **模型的主场优势**
   - **MAE (平均误差)**: **7.28分** 
   - **R² (解释力)**: **0.96** -
   - **Spearman (排名相关性)**: **0.98** -

   **分析**：
   当一首歌已经在榜单上并且有了足够的历史数据（例如 `Weeks_on_chart > 1`），模型的表现非常出色。在这种情况下，模型依赖于 **上一周的积分（Points_last_week）** 和 **排名变化趋势（Rank_change）** 等特征，能够非常准确地预测歌手的下周表现。模型在这样的情境下非常自信：“这首歌上周表现很好，趋势向上，所以我预测它下周会继续表现优异。”

---

#### 2. **温启动 (Warm Start / 新歌但歌手是大牌)**
   - **MAE**: **23.82分** - 误差明显增大，但仍在可接受范围内。
   - **R²**: **0.72** - 仍然不错，说明模型能够解释72%的分数变化。
   - **Spearman**: **0.84** - 这是个亮点!!!！尽管积分预测不那么准确，但模型在预测新歌的相对排名方面依然非常出色。

   **分析**：
   对于新歌，尤其是那些由知名歌手演唱的歌曲，模型虽然没有完整的历史数据（例如 **Points_last_week** 特征为0），但它能够通过 **艺人热度（Artist_Hotness）** 特征来弥补这一不足。这一特征帮助模型判断：“我虽然不认识这首歌，但我认识它的歌手（如 Taylor Swift）。他的/她的其他歌曲表现出色，因此这首新歌的起点也不会低。” 这种 **“明星光环”效应** 为模型提供了有力的预测支持。

---

#### 3. **真·冷启动 (True Cold Start / 新歌+新歌手)**
   - **MAE**: **34.01分** - 误差非常大。
   - **R²**: **-0.019** - 这是最关键的发现！R²为负数，意味着模型的预测结果甚至比直接猜所有歌曲的平均分还要差。
   - **Spearman**: **0.36** - 非常低，说明模型几乎无法正确判断这些歌曲的相对排名。

   **分析**：
   在最极端的冷启动情况下，即 **新歌+新歌手**，模型的表现非常差。此时，模型既没有历史数据，也不知道歌手的知名度（**Artist_Hotness** 为0）。因此，模型只能依赖歌曲本身的音频特征（如 **Danceability**）进行预测，这显然是非常有限的。这个结果清楚地表明，音频特征本身几乎没有预测能力，真正决定一首歌能否成功的因素是它的历史表现和艺人的人气。

---

### 总结
- **热启动（老歌）**：模型在已有历史数据的基础上表现优秀，能够高效预测歌曲的未来表现。
- **温启动（新歌但歌手是大牌）**：虽然缺乏完整的历史数据，模型依然能够通过艺人热度等特征做出较为准确的预测。
- **冷启动（新歌+新歌手）**：当模型缺乏关键特征时，其预测能力显著下降，特别是音频特征对预测的贡献非常有限。

这些分析表明，**历史表现和艺人知名度** 是预测榜单成绩的关键因素，而**音频特征** 在冷启动时的作用相对较小。
