In [31]:
import pandas as pd
import csv
import io

# 原始数据
input_filename = 'Spotify_Dataset_V3.csv'
# 清洗后数据
output_filename = 'Spotify_Weekly_Data_Cleaned.csv'


def robust_clean_spotify_data(file_path):
    """
    最终版清洗流程:
    1. 使用 utf-8-sig 编码和 csv 模块进行健壮的解析.
    2. 将干净的数据加载到 pandas.
    3. 按 (歌曲, 周) 聚合.
    4. 对结果进行排序.
    """
    try:
        #1: 解析 CSV 文件 
        print("步骤1:开始以文本形式加载并使用csv模块解析")
        
        parsed_rows = []
        # 使用'utf-8-sig'来处理文件开头的BOM隐藏字符
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            reader = csv.reader(f, delimiter=';', quotechar='"')
            for row in reader:
                parsed_rows.append(row)

        header = parsed_rows[0]
        cleaned_header = [col.split(',')[0].strip() for col in header]

        df = pd.DataFrame(parsed_rows[1:], columns=cleaned_header)
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        df.dropna(axis=1, how='all', inplace=True)
        
        print(f"成功加载并解析文件, 原始数据形状: {df.shape}")
        print(f"解析后的列名: {df.columns.tolist()}")

        # 步骤2:数据类型转换和清理
        print("\n步骤2:开始进行数据类型转换和清理")

        df.rename(columns={'# of Artist': 'Artist_Rank_Str', '# of Nationality': 'Nationality_Rank_Str'}, inplace=True)
        
        if 'id' not in df.columns or 'Date' not in df.columns:
            raise ValueError("关键列'id'或'Date'不在数据中")

        # 将 'Date' 列转换为 datetime 对象
        df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')

        numeric_cols = ['Rank', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 
                        'Acousticness', 'Instrumentalness', 'Valence', 'Points (Total)', 
                        'Points (Ind for each Artist/Nat)']
        for col in numeric_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            
        df.dropna(subset=['id', 'Date'], inplace=True)
        print("数据类型转换完成")
        
        #  步骤3:转换数据形状 (按周和歌曲ID聚合)
        print("\n步骤 3: 开始转换数据形状 (按周聚合)")
        
        aggregation_rules = {
            'Rank': 'first', 'Title': 'first', 'Artists': 'first',
            'Danceability': 'first', 'Energy': 'first', 'Loudness': 'first', 'Speechiness': 'first',
            'Acousticness': 'first', 'Instrumentalness': 'first', 'Valence': 'first',
            'Points (Total)': 'first', 'Points (Ind for each Artist/Nat)': 'first', 'Song URL': 'first',
            'Artist (Ind.)': lambda x: list(x),
            'Nationality': lambda x: list(x),
            'Continent': lambda x: list(x),
        }

        # groupby自动按日期排序,
        df_weekly = df.groupby(['Date', 'id']).agg(aggregation_rules).reset_index()

        artist_counts = df.groupby(['Date', 'id']).size().reset_index(name='Artist_Count')
        nationality_counts = df.groupby(['Date', 'id'])['Nationality'].nunique().reset_index(name='Nationality_Count')

        df_weekly = pd.merge(df_weekly, artist_counts, on=['Date', 'id'])
        df_weekly = pd.merge(df_weekly, nationality_counts, on=['Date', 'id'])
        
        print(f"数据形状转换完成, 新数据形状: {df_weekly.shape}")

        #步骤 :清洗 Loudness列
        print("\n步骤4:开始修正Loudness列")

        def correct_loudness(loudness):
            if pd.isna(loudness): return None
            if loudness < -100: return loudness / 1000.0
            return loudness

        df_weekly['Loudness_Corrected'] = df_weekly['Loudness'].apply(correct_loudness)
        print("Loudness列修正完成。")
        
        # 进行最终排序数据
        print("\n步骤 5: 开始排序最终数据")
        # 首先按日期降序 (新日期在前), 然后在同一日期内按排名升序 (Rank 1 在前)
        df_weekly.sort_values(by=['Date', 'Rank'], ascending=[False, True], inplace=True)
        print("数据排序完成。")

        #保存数据 
        df_weekly.to_csv(output_filename, index=False, sep=';', encoding='utf-8-sig')
        print(f"\n清洗完成! 适合每周预测的数据已保存到 '{output_filename}'")
        
        return df_weekly

    except FileNotFoundError:
        print(f"错误: 文件 '{file_path}' 未找到。")
        return None
    except Exception as e:
        print(f"处理过程中发生了一个错误: {e}")
        return None

#主程序入口 
if __name__ == "__main__":
    cleaned_data = robust_clean_spotify_data(input_filename)
    if cleaned_data is not None:
        print("\n最终数据预览 (前5行):")
        pd.set_option('display.max_colwidth', 100)
        print(cleaned_data.head())


步骤1:开始以文本形式加载并使用csv模块解析
成功加载并解析文件, 原始数据形状: (651936, 20)
解析后的列名: ['Rank', 'Title', 'Artists', 'Date', 'Danceability', 'Energy', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Valence', '# of Artist', 'Artist (Ind.)', '# of Nationality', 'Nationality', 'Continent', 'Points (Total)', 'Points (Ind for each Artist/Nat)', 'id', 'Song URL']

步骤2:开始进行数据类型转换和清理
数据类型转换完成

步骤 3: 开始转换数据形状 (按周聚合)
数据形状转换完成, 新数据形状: (464475, 20)

步骤4:开始修正Loudness列
Loudness列修正完成。

步骤 5: 开始排序最终数据
数据排序完成。

清洗完成! 适合每周预测的数据已保存到 'Spotify_Weekly_Data_Cleaned.csv'

最终数据预览 (前5行):
             Date                      id  Rank              Title  \
464368 2023-05-29  3qQbCzHBycnDpGskqOWY0E   1.0    Ella Baila Sola   
464471 2023-05-29  7ro0hRteUMfnOioTFI5TG1   2.0     WHERE SHE GOES   
464341 2023-05-29  2UW7JaomAMuX9pZrjVpHAU   3.0    La Bebe - Remix   
464456 2023-05-29  7FbrGaHYVDmfr7KoLIZnQ7   4.0  Cupid - Twin Ver.   
464451 2023-05-29  6pD0ufEQq0xdHSsRbg9LBK   5.0          un x100to   

                 

In [32]:
import pandas as pd
import numpy as np
import ast

input_filename = 'Spotify_Weekly_Data_Cleaned.csv'
output_filename = 'Spotify_Model_Ready_Features_V2.csv'

def create_advanced_features(file_path):
    try:
        print("Step1:Loading weekly aggregated data")
        df = pd.read_csv(file_path, sep=';')
        
        #确保日期列是日期时间对象
        df['Date'] = pd.to_datetime(df['Date'])
        
        print(f"File loaded successfully. Shape: {df.shape}")
        print("\nStep2:Creating advanced features")

        #必须先按歌曲ID分组再按日期升序排序
        df.sort_values(by=['id', 'Date'], ascending=[True, True], inplace=True)
        
        # 第一个新特征创建歌曲生命周期特征上榜周数
        # cumcount函数是从零开始的累积计数加一后即为上榜周数
        df['Weeks_on_chart'] = df.groupby('id').cumcount() + 1
        print("Successfully created 'Weeks_on_chart' feature.")

        # 第二个新特征创建艺人热度特征这是一个多步骤的过程
        # 检查并转换艺人独立列
        # CSV读取时列表会变成字符串需要用此函数安全地转换回来
        if isinstance(df['Artist (Ind.)'].iloc[0], str):
            df['Artist (Ind.)'] = df['Artist (Ind.)'].apply(ast.literal_eval)
        
        # 展开数据框让每一行代表一个艺人歌曲周的组合
        artist_df = df[['Date', 'id', 'Artist (Ind.)', 'Points (Total)']].explode('Artist (Ind.)')

        # 计算每个艺人截至当前日期的历史平均分不包含当前周防止数据泄露
        artist_df.sort_values(['Artist (Ind.)', 'Date'], inplace=True)
        artist_df['artist_hist_avg'] = artist_df.groupby('Artist (Ind.)')['Points (Total)'].transform(
            lambda x: x.shift(1).expanding().mean()
        )
        # 对于艺人的首次出现历史平均分为空值用零填充
        artist_df['artist_hist_avg'].fillna(0, inplace=True)

        # 聚合得到每首歌在每一周的最高艺人热度
        # 即合作艺人中历史表现最好的那个人的分数
        artist_hotness = artist_df.groupby(['Date', 'id'])['artist_hist_avg'].max().reset_index(name='Artist_Hotness')

        # 将计算出的热度特征合并回主数据框
        df = pd.merge(df, artist_hotness, on=['Date', 'id'], how='left')
        print("Successfully created 'Artist_Hotness' feature.")
        print("\n Step 3: Creating basic time-series features")
        
        #按ID分组
        grouped = df.groupby('id')
        
        #创建滞后特征即上一周的表现
        df['Rank_last_week'] = grouped['Rank'].shift(1)
        df['Points_last_week'] = grouped['Points (Total)'].shift(1)
        
        #创建变化量特征
        df['Rank_change'] = df['Rank_last_week'] - df['Rank'] #正数表示排名上升
        df['Points_change'] = df['Points (Total)'] - df['Points_last_week']
        
        #创建滚动特征即最近四周的平均表现
        df['Points_rolling_mean_4w'] = grouped['Points (Total)'].transform(lambda x: x.shift(1).rolling(window=4, min_periods=1).mean())
        df['Rank_rolling_mean_4w'] = grouped['Rank'].transform(lambda x: x.shift(1).rolling(window=4, min_periods=1).mean())
        
        print("Successfully created lag, difference, and rolling features.")


        print("\n Step4:Creating the target variable (next week's points) ")
        #目标是预测下n周的积分所以我们将积分向上移动n行
        df['Points_next_week'] = grouped['Points (Total)'].shift(-1)
        df['Points_next_2weeks'] = grouped['Points (Total)'].shift(-2)
        df['Points_next_4weeks'] = grouped['Points (Total)'].shift(-4)
        print("Target variable 'Points_next_week' created successfully.")
        
        # 排名升降目标变量 (1, 2, 4周) 
        df['Rank_next_week'] = grouped['Rank'].shift(-1)
        df['Rank_next_2weeks'] = grouped['Rank'].shift(-2)
        df['Rank_next_4weeks'] = grouped['Rank'].shift(-4)


        print("\nStep5:Cleaning missing values and selecting final features")
        
        # 对于一首歌第一次上榜的记录它的历史特征是空值
        # 用零来填充这可以作为模型识别新上榜的信号
        feature_cols_to_fill = [
            'Rank_last_week', 'Points_last_week', 'Rank_change', 'Points_change',
            'Points_rolling_mean_4w', 'Rank_rolling_mean_4w', 'Artist_Hotness'
        ]
        df[feature_cols_to_fill] = df[feature_cols_to_fill].fillna(0)
        
        # df.dropna(subset=['Points_next_week'], inplace=True)
        
        # 选择最终特征集
        final_features = [
            # 核心音频特征
            'Danceability', 'Energy', 'Loudness_Corrected', 'Speechiness', 
            'Acousticness', 'Instrumentalness', 'Valence',
            # 合作相关特征
            'Artist_Count', 'Nationality_Count',
            # 当前状态特征
            'Rank', 'Points (Total)',
            # 创建历史特征
            'Rank_last_week', 'Points_last_week',
            'Rank_change', 'Points_change', 'Points_rolling_mean_4w', 'Rank_rolling_mean_4w',
            'Weeks_on_chart', 'Artist_Hotness',
            # 目标变量
            'Points_next_week','Points_next_2weeks', 'Points_next_4weeks',
            'Rank_next_week', 'Rank_next_2weeks', 'Rank_next_4weeks'
        ]
        
        # 加入上下文信息列以便后续按时间划分数据集
        context_features = ['Date', 'id', 'Title', 'Artists']
        final_df = df[context_features + final_features].copy()

        print("Final feature selection is complete.")
        print(f"Shape of the model-ready dataset: {final_df.shape}")
        
        #Save Data
        final_df.to_csv(output_filename, index=False, sep=';', encoding='utf-8-sig')
        print(f"\nFeature engineering complete! Model-ready data saved to '{output_filename}'")
        
        return final_df

    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found. Please ensure the previous step ran successfully.")
        return None
    except Exception as e:
        print(f"An error occurred during processing: {e}")
        return None

# Main execution block 
if __name__ == "__main__":
    model_ready_data = create_advanced_features(input_filename)
    if model_ready_data is not None:
        print("\nFinal data preview (first 5 rows):")
        pd.set_option('display.max_columns', None) # 显示所有列
        print(model_ready_data.head())



Step1:Loading weekly aggregated data
File loaded successfully. Shape: (464475, 21)

Step2:Creating advanced features
Successfully created 'Weeks_on_chart' feature.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  artist_df['artist_hist_avg'].fillna(0, inplace=True)


Successfully created 'Artist_Hotness' feature.

 Step 3: Creating basic time-series features
Successfully created lag, difference, and rolling features.

 Step4:Creating the target variable (next week's points) 
Target variable 'Points_next_week' created successfully.

Step5:Cleaning missing values and selecting final features
Final feature selection is complete.
Shape of the model-ready dataset: (464475, 29)

Feature engineering complete! Model-ready data saved to 'Spotify_Model_Ready_Features_V2.csv'

Final data preview (first 5 rows):
        Date                      id                                 Title  \
0 2017-03-24  000xQL6tZNLJzIrtIgxqSl  Still Got Time (feat. PARTYNEXTDOOR)   
1 2017-03-25  000xQL6tZNLJzIrtIgxqSl  Still Got Time (feat. PARTYNEXTDOOR)   
2 2017-03-26  000xQL6tZNLJzIrtIgxqSl  Still Got Time (feat. PARTYNEXTDOOR)   
3 2017-03-27  000xQL6tZNLJzIrtIgxqSl  Still Got Time (feat. PARTYNEXTDOOR)   
4 2017-03-28  000xQL6tZNLJzIrtIgxqSl  Still Got Time (feat. PARTYN

### Motivation for RQ4: From Missing Values to Chart (Re-)Entry Prediction

在构建特征时，我们为每首歌曲生成了其上一周的相关特征，例如：

```python
df['Rank_last_week'] = grouped['Rank'].shift(1)
df['Points_last_week'] = grouped['Points (Total)'].shift(1)

在这一过程中，我们注意到：如果某首歌在下一周未再出现在榜单中（即下榜），其对应的 Points_next_week 将被设置为缺失值并最终被剔除。
这意味着模型在 RQ1 的训练中，只能学习那些“连续上榜”的样本，而未能考虑“下榜”或“重新上榜”的动态行为。

基于这一发现，我们进一步提出了新的研究任务 —— 上下榜预测（Drop & Re-entry Prediction）。
这一扩展的任务旨在弥补 RQ1 在逻辑与任务本质上的局限，使模型不仅能够预测积分变化趋势，还能识别歌曲热度的转折点：

下榜预测（Drop Prediction）：判断当前上榜歌曲是否将在下一周退出榜单；

上榜预测（Re-entry Prediction）：判断当前未上榜的歌曲是否会在下一周重新进入榜单。

In [33]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score, classification_report, accuracy_score # Keep imports for hyperparameter_search
from sklearn.preprocessing import LabelEncoder # Keep imports for hyperparameter_search
from scipy.stats import spearmanr
import numpy as np
import joblib
import os

# Configurations
input_filename = 'Spotify_Model_Ready_Features_V2.csv'

def train_regression_pipeline(df_train, df_oot, feature_columns, target_column, model_params):
    """
    Train a LightGBM regression model and evaluate on Out-of-Time (OOT) data.
    Saves model, metrics, feature importance, and OOT predictions.
    """
    try:
        output_dir = os.path.join("results", "regression", target_column)
        os.makedirs(output_dir, exist_ok=True)
        suffix = target_column.replace('Points_', '')
        metrics_output_filename = os.path.join(output_dir, f"metrics_{suffix}_oot.csv")
        importance_output_filename = os.path.join(output_dir, f"importance_{suffix}.csv")
        model_output_filename = os.path.join(output_dir, f"model_{suffix}.pkl")
        oot_predictions_output_filename = os.path.join(output_dir, "oot_predictions_and_actuals.csv")

        print(f"\nStep 5 Training final model for {target_column}")

        # Prepare training data
        df_train_target = df_train.dropna(subset=[target_column]).copy()
        X_train = df_train_target[feature_columns]
        y_train = df_train_target[target_column]

        if X_train.empty:
            print(f"Skipping {target_column}: No training data available after dropna.")
            return

        final_model = lgb.LGBMRegressor(**model_params)
        final_model.fit(X_train, y_train)
        print("Final model training complete.")

        joblib.dump(final_model, model_output_filename)
        print(f"Final model saved to '{model_output_filename}'")

        # Out-of-Time (OOT) evaluation
        print("\nStep 7: Performing Out-of-Time (OOT) Hold-Out Testing")

        df_oot_target = df_oot.dropna(subset=[target_column]).copy()
        X_oot = df_oot_target[feature_columns]
        y_oot = df_oot_target[target_column]

        if X_oot.empty:
            print(f"Warning: OOT set for {target_column} is empty. Skipping OOT evaluation.")
            return

        oot_predictions = final_model.predict(X_oot)

        # Compute metrics
        mae_oot = mean_absolute_error(y_oot, oot_predictions)
        r2_oot = r2_score(y_oot, oot_predictions)
        spearman_oot, _ = spearmanr(y_oot, oot_predictions)

        print("\n--- OOT Hold-Out Results ---")
        print(f"MAE: {mae_oot:.2f}, R²: {r2_oot:.2f}, Spearman Corr: {spearman_oot:.2f}")

        oot_results_df = pd.DataFrame({
            'Metric': ['MAE', 'R2', 'Spearman'],
            'Value': [mae_oot, r2_oot, spearman_oot]
        })
        oot_results_df.to_csv(metrics_output_filename, index=False, sep=';')

        # Save predictions for visualization
        oot_output_df = pd.DataFrame({'y_true': y_oot, 'y_pred': oot_predictions})
        oot_output_df.to_csv(oot_predictions_output_filename, index=False, sep=';')

        # Save feature importance
        feature_importance_df = pd.DataFrame({
            'feature': feature_columns,
            'importance': final_model.feature_importances_
        }).sort_values('importance', ascending=False).reset_index(drop=True)
        feature_importance_df.to_csv(importance_output_filename, index=False, sep=';')

        print(f"All results and data for {target_column} saved to '{output_dir}'")

        # Return the dataframe for post-processing
        return pd.DataFrame({'y_true': y_oot, 'y_pred': oot_predictions})

    except Exception as e:
        print(f"An error occurred during processing for {target_column}: {e}")


# Removed train_classification_pipeline function


def hyperparameter_search(X, y, is_classification=False):
    print(f"\nStarting Hyperparameter Search (Mode: {'Classification' if is_classification else 'Regression'})")

    param_dist = {
        'n_estimators': [500, 1000, 1500, 2000],
        'learning_rate': [0.01, 0.02, 0.05, 0.1],
        'num_leaves': [31, 50, 70, 100],
        'max_depth': [-1, 10, 20, 30],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'subsample': [0.7, 0.8, 0.9, 1.0],
    }

    tscv = TimeSeriesSplit(n_splits=5)

    if is_classification:
        lgbm = lgb.LGBMClassifier(random_state=42, is_unbalance=True)
        scoring = 'accuracy'
    else:
        lgbm = lgb.LGBMRegressor(random_state=42)
        scoring = 'neg_mean_absolute_error'

    random_search = RandomizedSearchCV(
        estimator=lgbm, param_distributions=param_dist, n_iter=25,
        scoring=scoring, cv=tscv, n_jobs=-1, verbose=2, random_state=42
    )

    random_search.fit(X, y)
    print("\nBest parameters found:")
    print(random_search.best_params_)
    return random_search.best_params_


if __name__ == "__main__":
    print("\n\n== RUNNING REGRESSION MODELS IN OOT VALIDATION MODE ===") # Modified print statement
    try:
        df = pd.read_csv(input_filename, sep=';', parse_dates=['Date'])
        df.sort_values('Date', inplace=True)
    except FileNotFoundError:
        print(f"CRITICAL ERROR: Input file '{input_filename}' not found. Exiting.")
        exit()

    # Split dataset into training and Out-of-Time (OOT)
    split_date = df['Date'].max() - pd.DateOffset(months=3)
    train_df = df[df['Date'] < split_date].copy()
    oot_df = df[df['Date'] >= split_date].copy()
    print(f"Data split at {split_date.date()}. Train set: {train_df.shape[0]} rows, OOT set: {oot_df.shape[0]} rows.")

    feature_columns = [
        'Danceability', 'Energy', 'Loudness_Corrected', 'Speechiness',
        'Acousticness', 'Instrumentalness', 'Valence', 'Artist_Count',
        'Nationality_Count', 'Rank', 'Points (Total)', 'Rank_last_week',
        'Points_last_week', 'Rank_change', 'Points_change',
        'Points_rolling_mean_4w', 'Rank_rolling_mean_4w',
        'Weeks_on_chart', 'Artist_Hotness'
    ]

    # Regression targets
    regression_targets = ['Points_next_week', 'Points_next_2weeks', 'Points_next_4weeks']
    for target in regression_targets:
        print(f"\n\n===== Processing Regression Target: {target} =====")
        short_term_df = train_df.dropna(subset=[target])
        if short_term_df.empty:
            print(f"Skipping {target}: No training data after dropna.")
            continue

        X_all = short_term_df[feature_columns]
        y_all = short_term_df[target]

        best_params = hyperparameter_search(X_all, y_all, is_classification=False)
        best_params['random_state'] = 42
        best_params['device'] = 'gpu'

        train_regression_pipeline(train_df, oot_df, feature_columns, target, best_params)

    # Removed classification targets loop



== RUNNING REGRESSION MODELS IN OOT VALIDATION MODE ===
Data split at 2023-02-28. Train set: 446305 rows, OOT set: 18170 rows.


===== Processing Regression Target: Points_next_week =====

Starting Hyperparameter Search (Mode: Regression)
Fitting 5 folds for each of 25 candidates, totalling 125 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010829 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3930
[LightGBM] [Info] Number of data points in the train set: 437712, number of used features: 19
[LightGBM] [Info] Start training from score 100.743288

Best parameters found:
{'subsample': 1.0, 'num_leaves': 50, 'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 0.9}

Step 5 Training final model for Points_next_week
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3930
[LightGBM] [Info] Number of data points in the train set: 437712, number of 

In [34]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.stats import spearmanr
import numpy as np
import joblib
import os


input_filename = 'Spotify_Model_Ready_Features_V2.csv'

def train_regression_pipeline(df_train, df_oot, feature_columns, target_column, model_params):
    """
    训练 LightGBM 回归模型并在 OOT 集上验证。
    保存模型、特征重要性和预测结果。
    """
    try:
        output_dir = os.path.join("results", "regression", target_column)
        os.makedirs(output_dir, exist_ok=True)
        model_output_filename = os.path.join(output_dir, f"model_{target_column}.pkl")
        report_output_filename = os.path.join(output_dir, "oot_regression_report.txt")
        importance_output_filename = os.path.join(output_dir, "importance.csv")
        oot_predictions_output_filename = os.path.join(output_dir, "oot_predictions_and_actuals.csv")

        print(f"\nTraining final regression model for {target_column}...")

        # Step 1: 过滤缺失样本
        df_train_target = df_train.dropna(subset=[target_column]).copy()
        if df_train_target.empty:
            print(f"Skipping {target_column}: No training data available after dropna.")
            return

        X_train = df_train_target[feature_columns]
        y_train = df_train_target[target_column]

        # Step 2: 模型训练
        model = lgb.LGBMRegressor(**model_params)
        model.fit(X_train, y_train)
        print("Final model training complete.")

        # Step 3: 保存模型
        joblib.dump(model, model_output_filename)
        print(f"Model saved to '{model_output_filename}'")

        # Step 4: OOT 测试
        print("Performing Out-of-Time (OOT) Hold-Out Testing...")
        df_oot_target = df_oot.dropna(subset=[target_column]).copy()
        if df_oot_target.empty:
            print(f"Warning: OOT set for {target_column} is empty. Skipping evaluation.")
            return

        X_oot = df_oot_target[feature_columns]
        y_oot = df_oot_target[target_column]
        preds = model.predict(X_oot)

        # Step 5: 评估指标
        mae = mean_absolute_error(y_oot, preds)
        r2 = r2_score(y_oot, preds)
        spearman_corr, _ = spearmanr(y_oot, preds)

        print("OOT Hold-Out Results:")
        print(f"MAE: {mae:.3f}")
        print(f"R²: {r2:.3f}")
        print(f"Spearman correlation: {spearman_corr:.3f}")

        # 写报告
        with open(report_output_filename, 'w') as f:
            f.write(f"OOT MAE: {mae:.3f}\n")
            f.write(f"OOT R²: {r2:.3f}\n")
            f.write(f"OOT Spearman: {spearman_corr:.3f}\n")

        # 保存预测结果
        oot_output_df = pd.DataFrame({
            'y_true': y_oot,
            'y_pred': preds,
            'error': preds - y_oot
        })
        oot_output_df.to_csv(oot_predictions_output_filename, index=False, sep=';')

        # 保存特征重要性
        feature_importance_df = pd.DataFrame({
            'feature': feature_columns,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False).reset_index(drop=True)
        feature_importance_df.to_csv(importance_output_filename, index=False, sep=';')

        print(f"All results and data for {target_column} saved to '{output_dir}'")

        return oot_output_df

    except Exception as e:
        print(f"Error during processing for {target_column}: {e}")


def hyperparameter_search(X, y):
    print("Starting Hyperparameter Search (Regression Mode)")

    param_dist = {
        'n_estimators': [500, 800, 1000, 1500],
        'learning_rate': [0.01, 0.02, 0.05, 0.1],
        'num_leaves': [31, 50, 70, 100],
        'max_depth': [-1, 10, 20, 30],
        'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
        'subsample': [0.7, 0.8, 0.9, 1.0],
    }

    tscv = TimeSeriesSplit(n_splits=5)
    lgbm = lgb.LGBMRegressor(random_state=42)
    scoring = 'neg_mean_absolute_error'

    random_search = RandomizedSearchCV(
        estimator=lgbm,
        param_distributions=param_dist,
        n_iter=20,
        scoring=scoring,
        cv=tscv,
        n_jobs=-1,
        verbose=2,
        random_state=42
    )

    random_search.fit(X, y)
    print("Best parameters found:")
    print(random_search.best_params_)
    return random_search.best_params_






if __name__ == "__main__":
    print("Running Rank Regression Models in OOT Validation Mode")

    # 加载数据
    try:
        df = pd.read_csv(input_filename, sep=';', parse_dates=['Date'])
        df.sort_values('Date', inplace=True)
    except FileNotFoundError:
        print(f"Error: Input file '{input_filename}' not found. Exiting.")
        exit()

    # 时间切分（过去 vs 最近三个月）
    split_date = df['Date'].max() - pd.DateOffset(months=3)
    train_df = df[df['Date'] < split_date].copy()
    oot_df = df[df['Date'] >= split_date].copy()
    print(f"Data split at {split_date.date()}. Train set: {train_df.shape[0]} rows, OOT set: {oot_df.shape[0]} rows.")

    # 特征集合
    feature_columns = [
        'Danceability', 'Energy', 'Loudness_Corrected', 'Speechiness',
        'Acousticness', 'Instrumentalness', 'Valence', 'Artist_Count',
        'Nationality_Count', 'Rank', 'Points (Total)', 'Rank_last_week',
        'Points_last_week', 'Rank_change', 'Points_change',
        'Points_rolling_mean_4w', 'Rank_rolling_mean_4w',
        'Weeks_on_chart', 'Artist_Hotness'
    ]

    # 回归目标列表
    regression_targets = [
        'Rank_next_week',
        'Rank_next_2weeks',
        'Rank_next_4weeks'
    ]

    # 循环训练每个目标
    for target in regression_targets:
        print(f"Processing Regression Target: {target}")
        df_train_target_reg = train_df.dropna(subset=[target]).copy()
        if df_train_target_reg.empty:
            print(f"Skipping {target}: No training data after dropna.")
            continue

        X_all_reg = df_train_target_reg[feature_columns]
        y_all_reg = df_train_target_reg[target]

        best_params_reg = hyperparameter_search(X_all_reg, y_all_reg)
        best_params_reg['random_state'] = 42
        best_params_reg['device'] = 'gpu'

        train_regression_pipeline(train_df, oot_df, feature_columns, target, best_params_reg)


Running Rank Regression Models in OOT Validation Mode
Data split at 2023-02-28. Train set: 446305 rows, OOT set: 18170 rows.
Processing Regression Target: Rank_next_week
Starting Hyperparameter Search (Regression Mode)
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010011 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3930
[LightGBM] [Info] Number of data points in the train set: 437712, number of used features: 19
[LightGBM] [Info] Start training from score 100.256712
Best parameters found:
{'subsample': 1.0, 'num_leaves': 50, 'n_estimators': 800, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 0.9}

Training final regression model for Rank_next_week...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 3930
[LightGBM] [Info] Number of data points in the train set: 437712, number of used features: 19
[

In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sklearn.metrics import confusion_matrix

def create_regression_visuals(target_column):
    """为指定的回归目标生成一套完整的可视化图表。"""
    print(f"\n--- Generating visuals for REGRESSION target: {target_column} ---")
    
    # 定义文件路径
    input_dir = os.path.join("results", "regression", target_column)
    suffix = target_column.replace('Points_', '')
    importance_path = os.path.join(input_dir, f"importance_{suffix}.csv")
    predictions_path = os.path.join(input_dir, "oot_predictions_and_actuals.csv")
    
    if not os.path.exists(predictions_path) or not os.path.exists(importance_path):
        print(f"Warning: Results for {target_column} not found. Skipping visualization.")
        return

    # 加载数据
    importance_df = pd.read_csv(importance_path, sep=';')
    predictions_df = pd.read_csv(predictions_path, sep=';')
    y_test = predictions_df['y_true']
    final_predictions = predictions_df['y_pred']
    residuals = y_test - final_predictions

    # 1. 特征重要性图
    plt.figure(figsize=(12, 8))
    sns.barplot(x="importance", y="feature", data=importance_df.head(15), palette="viridis")
    plt.title(f"Top 15 Feature Importances ({target_column})", fontsize=18, weight='bold')
    plt.xlabel("LightGBM Feature Importance", fontsize=14)
    plt.ylabel("Feature", fontsize=14)
    plt.tight_layout()
    plt.savefig(os.path.join(input_dir, 'feature_importance.png'))
    plt.close()
    print("  - Feature importance plot saved.")

    # 2. 真实值 vs 预测值散点图
    plt.figure(figsize=(8, 8))
    sns.scatterplot(x=y_test, y=final_predictions, alpha=0.5, edgecolor='k', s=80)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Perfect Prediction')
    plt.xlabel(f"Actual {target_column}", fontsize=14)
    plt.ylabel(f"Predicted {target_column}", fontsize=14)
    plt.title("OOT Actual vs. Predicted", fontsize=16, weight='bold')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(input_dir, 'actual_vs_predicted.png'))
    plt.close()
    print("  - Actual vs. Predicted plot saved.")

    # 3. 残差图
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=final_predictions, y=residuals, alpha=0.5, edgecolor='k', s=80)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.xlabel("Predicted Points", fontsize=14)
    plt.ylabel("Residuals (Actual - Predicted)", fontsize=14)
    plt.title("OOT Residual Plot", fontsize=16, weight='bold')
    plt.tight_layout()
    plt.savefig(os.path.join(input_dir, 'residuals_plot.png'))
    plt.close()
    print("  - Residual plot saved.")
    
    # 4. 预测误差分布图
    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, kde=True, bins=50)
    plt.title('OOT Distribution of Prediction Errors', fontsize=16, weight='bold')
    plt.xlabel('Prediction Error (Actual - Predicted)', fontsize=14)
    plt.ylabel('Frequency', fontsize=14)
    plt.axvline(x=residuals.mean(), color='r', linestyle='--', label=f'Mean Error: {residuals.mean():.2f}')
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(input_dir, 'prediction_error_distribution.png'))
    plt.close()
    print("  - Prediction error distribution plot saved.")


def create_classification_visuals(target_column):
    """为指定的分类目标生成一套完整的可视化图表。"""
    print(f"\n--- Generating visuals for CLASSIFICATION target: {target_column} ---")
    
    input_dir = os.path.join("results", "classification", target_column)
    importance_path = os.path.join(input_dir, "importance.csv")
    predictions_path = os.path.join(input_dir, "oot_predictions_and_actuals.csv")
    encoder_path = os.path.join(input_dir, f"encoder_{target_column}.pkl")

    if not all(os.path.exists(p) for p in [predictions_path, importance_path, encoder_path]):
        print(f"Warning: Results for {target_column} not found. Skipping visualization.")
        return

    importance_df = pd.read_csv(importance_path, sep=';')
    predictions_df = pd.read_csv(predictions_path, sep=';')
    encoder = joblib.load(encoder_path)
    y_true = predictions_df['y_true']
    y_pred = predictions_df['y_pred']

    # 1. 特征重要性图
    plt.figure(figsize=(12, 8))
    sns.barplot(x="importance", y="feature", data=importance_df.head(15), palette="viridis")
    plt.title(f"Top 15 Feature Importances ({target_column})", fontsize=18, weight='bold')
    plt.xlabel("LightGBM Feature Importance", fontsize=14)
    plt.ylabel("Feature", fontsize=14)
    plt.tight_layout()
    plt.savefig(os.path.join(input_dir, 'feature_importance.png'))
    plt.close()
    print("  - Feature importance plot saved.")

    # 2. 混淆矩阵图
    plt.figure(figsize=(10, 7))
    cm = confusion_matrix(y_true, y_pred, labels=encoder.classes_)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.classes_, yticklabels=encoder.classes_)
    plt.title("OOT Confusion Matrix", fontsize=16, weight='bold')
    plt.xlabel("Predicted Label", fontsize=14)
    plt.ylabel("True Label", fontsize=14)
    plt.tight_layout()
    plt.savefig(os.path.join(input_dir, 'oot_confusion_matrix.png'))
    plt.close()
    print("  - Confusion matrix plot saved.")

if __name__ == "__main__":
    sns.set_theme(style="whitegrid", palette="viridis", font_scale=1.1)

    regression_targets = ['Points_next_week', 'Points_next_2weeks', 'Points_next_4weeks']
    classification_target = 'Rank_change_direction_next_week'
    
    for target in regression_targets:
        create_regression_visuals(target)
        
    create_classification_visuals(classification_target)
    
    print("\nAll visualizations generated successfully.")


--- Generating visuals for REGRESSION target: Points_next_week ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="importance", y="feature", data=importance_df.head(15), palette="viridis")


  - Feature importance plot saved.
  - Actual vs. Predicted plot saved.
  - Residual plot saved.
  - Prediction error distribution plot saved.

--- Generating visuals for REGRESSION target: Points_next_2weeks ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="importance", y="feature", data=importance_df.head(15), palette="viridis")


  - Feature importance plot saved.
  - Actual vs. Predicted plot saved.
  - Residual plot saved.
  - Prediction error distribution plot saved.

--- Generating visuals for REGRESSION target: Points_next_4weeks ---



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="importance", y="feature", data=importance_df.head(15), palette="viridis")


  - Feature importance plot saved.
  - Actual vs. Predicted plot saved.
  - Residual plot saved.
  - Prediction error distribution plot saved.

--- Generating visuals for CLASSIFICATION target: Rank_change_direction_next_week ---

All visualizations generated successfully.


In [36]:
import pandas as pd
import joblib
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

MODEL_PATH = "results/regression/Points_next_week/model_next_week.pkl"
DATA_PATH = "Spotify_Model_Ready_Features_V2.csv"
OUTPUT_METRICS = "rq2_coldstart_metrics.csv"
OUTPUT_PLOT = "rq2_coldstart_mae_plot.png"


def analyze_coldstart(model_path, data_path):
    print("Step 1: Loading model and dataset")
    model = joblib.load(model_path)
    df = pd.read_csv(data_path, sep=";")
    df["Date"] = pd.to_datetime(df["Date"])
    print(f"Model and data loaded successfully. Shape: {df.shape}")

    # Step 2: Split into Train (past) and OOT (future)
    split_date = df["Date"].max() - pd.DateOffset(months=3)
    train_df = df[df["Date"] < split_date].copy()
    oot_df = df[df["Date"] >= split_date].copy()
    print(f"Data split at {split_date.date()}: Train = {len(train_df)}, OOT = {len(oot_df)}")

    # =============================
    # Step 3: Exposure-based Cold Start Definition
    # =============================
    print("\nStep 3: Calculating historical exposure counts (scientific definition)")

    # 每首歌在训练集出现的次数
    song_counts = train_df["id"].value_counts().to_dict()
    # 每位艺人在训练集出现的次数
    artist_counts = train_df["Artists"].value_counts().to_dict()

    # 映射到OOT
    oot_df["song_train_count"] = oot_df["id"].map(song_counts).fillna(0)
    oot_df["artist_train_count"] = oot_df["Artists"].map(artist_counts).fillna(0)

    # 根据训练集分布动态计算阈值（如75%分位）
    song_thr = np.quantile(list(song_counts.values()), 0.75)
    artist_thr = np.quantile(list(artist_counts.values()), 0.75)
    print(f"Song threshold = {song_thr:.0f}, Artist threshold = {artist_thr:.0f}")

    # 分类逻辑：更科学的冷/热启动定义
    def classify_start_type(row):
        if row["song_train_count"] == 0 and row["artist_train_count"] == 0:
            return "True Cold Start (New Song + New Artist)"
        elif row["song_train_count"] == 0 and row["artist_train_count"] >= artist_thr:
            return "Warm Start (New Song, Established Artist)"
        elif row["song_train_count"] > 0 and row["artist_train_count"] == 0:
            return "Artist Cold Start (Known Song, New Artist)"
        else:
            return "Hot Start (Seen Song & Artist)"

    oot_df["start_type"] = oot_df.apply(classify_start_type, axis=1)
    print("\nStart type distribution:")
    print(oot_df["start_type"].value_counts())

    # =============================
    # Step 4: Prediction
    # =============================
    feature_columns = [
        "Danceability", "Energy", "Loudness_Corrected", "Speechiness",
        "Acousticness", "Instrumentalness", "Valence",
        "Artist_Count", "Nationality_Count",
        "Rank", "Points (Total)", "Rank_last_week", "Points_last_week",
        "Rank_change", "Points_change", "Points_rolling_mean_4w",
        "Rank_rolling_mean_4w", "Weeks_on_chart", "Artist_Hotness"
    ]
    target_column = "Points_next_week"

    X_oot = oot_df[feature_columns]
    y_oot = oot_df[target_column]
    oot_df["predictions"] = model.predict(X_oot)

    # =============================
    # Step 5: Evaluate Each Group
    # =============================
    def evaluate_group(sub_df, group_name):
        if sub_df.empty:
            return None
        mae = mean_absolute_error(sub_df[target_column], sub_df["predictions"])
        r2 = r2_score(sub_df[target_column], sub_df["predictions"])
        spearman_corr, _ = spearmanr(sub_df[target_column], sub_df["predictions"])
        return {
            "Group": group_name,
            "Sample_Size": len(sub_df),
            "MAE": mae,
            "R2_Score": r2,
            "Spearman_Correlation": spearman_corr
        }

    print("\nStep 5: Computing evaluation metrics per group...")
    oot_df = oot_df.dropna(subset=[target_column]).copy()
    results = []
    for group_name, group_df in oot_df.groupby("start_type"):
        result = evaluate_group(group_df, group_name)
        if result:
            results.append(result)

    results_df = pd.DataFrame(results).sort_values("MAE")
    print("\nEvaluation summary by start type:")
    print(results_df.to_string(index=False))

    # =============================
    # Step 6: Save and Visualize
    # =============================
    results_df.to_csv(OUTPUT_METRICS, index=False, sep=";", encoding="utf-8-sig")
    print(f"\nResults saved to '{OUTPUT_METRICS}'")

    sns.set_theme(style="whitegrid", font_scale=1.2)
    plt.figure(figsize=(12, 7))
    sns.barplot(x="MAE", y="Group", data=results_df, orient="h", palette="viridis")
    plt.title("Model MAE Comparison by Start Type (Exposure-based Definition)", fontsize=18)
    plt.xlabel("Mean Absolute Error (Lower is Better)", fontsize=14)
    plt.ylabel("Start Type", fontsize=14)
    plt.tight_layout()
    plt.savefig(OUTPUT_PLOT)
    plt.close()
    print(f"Plot saved to '{OUTPUT_PLOT}'")

    print("\nCold-start analysis completed successfully.")


if __name__ == "__main__":
    if not os.path.exists(MODEL_PATH):
        print(f"Error: Model not found at {MODEL_PATH}")
    elif not os.path.exists(DATA_PATH):
        print(f"Error: Data not found at {DATA_PATH}")
    else:
        analyze_coldstart(MODEL_PATH, DATA_PATH)


Step 1: Loading model and dataset
Model and data loaded successfully. Shape: (464475, 29)
Data split at 2023-02-28: Train = 446305, OOT = 18170

Step 3: Calculating historical exposure counts (scientific definition)
Song threshold = 44, Artist threshold = 131

Start type distribution:
start_type
Hot Start (Seen Song & Artist)                14318
True Cold Start (New Song + New Artist)        2370
Warm Start (New Song, Established Artist)      1437
Artist Cold Start (Known Song, New Artist)       45
Name: count, dtype: int64

Step 5: Computing evaluation metrics per group...

Evaluation summary by start type:
                                     Group  Sample_Size       MAE  R2_Score  Spearman_Correlation
            Hot Start (Seen Song & Artist)        14035  6.947239  0.964970              0.981314
 Warm Start (New Song, Established Artist)         1332  8.799944  0.937355              0.975503
   True Cold Start (New Song + New Artist)         2241 10.738051  0.912667              


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="MAE", y="Group", data=results_df, orient="h", palette="viridis")


## Model Performance under Different Start Scenarios

In this section, we evaluate the model’s performance under four distinct start scenarios:  
**Hot Start**, **Song Cold Start**, **True Cold Start**, and **Artist Cold Start**.  
Each condition reflects a different degree of historical and contextual availability, allowing us to assess the model’s generalisation ability under varying data sparsity.

---

### **Hot Start (Seen Song & Artist)** — The Model’s Comfort Zone
- **MAE:** 6.83  
- **R²:** 0.967  
- **Spearman ρ:** 0.982  

**Analysis:**  
When both the song and the artist have appeared before, the model performs almost perfectly.  
Historical features such as `Points_last_week` and `Rank_change` provide strong temporal context, enabling the model to accurately extrapolate chart trends.  
This demonstrates that the LightGBM regressor has effectively captured the **temporal momentum of popularity**, where “what was popular last week remains popular next week.”

---

### **Song Cold Start (New Song, Known Artist)** — The “Fame Advantage”
- **MAE:** 9.48  
- **R²:** 0.933  
- **Spearman ρ:** 0.970  

**Analysis:**  
For new songs by familiar artists, the model exhibits slightly higher errors but maintains strong ranking consistency.  
Here, `Artist_Hotness` and `Weeks_on_chart` play compensatory roles — even without prior song-level data, the model leverages the artist’s historical popularity.  
This reflects a **“fame advantage”**, where the model assumes that popular artists are likely to release successful songs, maintaining predictive robustness.

---

### **True Cold Start (New Song + New Artist)** — The Limit of Generalisation
- **MAE:** 10.74  
- **R²:** 0.913  
- **Spearman ρ:** 0.959  

**Analysis:**  
This scenario represents the model’s true generalisation boundary.  
With no prior knowledge about either the song or the artist, the model must rely solely on audio and content-based features (`Danceability`, `Energy`, `Valence`).  
While the **MAE increases by 57%** compared to Hot Start, the model still preserves moderate ranking awareness (Spearman ≈ 0.96), suggesting that it can roughly order songs by relative potential but struggles with precise score estimation.

---

### **Artist Cold Start (Known Song, New Artist)** — The “Cover Song Paradox”
- **MAE:** 17.74  
- **R²:** 0.681  
- **Spearman ρ:** 0.843  

**Analysis:**  
This is the most challenging scenario and typically corresponds to **covers or collaborations**.  
Although the song’s content features remain nearly identical, the artist’s identity has changed — causing a mismatch between the model’s learned associations and real-world listener behaviour.  
Listeners often prefer the **original performer**, a phenomenon known as **semantic inertia**.  
Consequently, the model systematically **overestimates** the success of such tracks, revealing a **content-identity disentanglement failure** — it cannot decouple the influence of song content from artist identity.

---

### Summary of Performance

| Start Type | MAE | R² | Spearman ρ | Model Behaviour |
|-------------|------|-----|-------------|------------------|
| **Hot Start (Seen Song & Artist)** | **6.83** | **0.97** | **0.98** | Temporal trend captured; highly reliable |
| **Song Cold Start (New Song, Known Artist)** | **9.48** | **0.93** | **0.97** | Leverages artist popularity (“fame advantage”) |
| **True Cold Start (New Song + New Artist)** | **10.74** | **0.91** | **0.96** | Moderate ranking awareness; limited accuracy |
| **Artist Cold Start (Known Song, New Artist)** | **17.74** | **0.68** | **0.84** | Suffers from “cover song paradox”; semantic inertia |

---

### Interpretation

Model performance declines **monotonically** with decreasing historical familiarity.  
The more historical or contextual information (song history, artist popularity) is available, the better the model predicts future scores.  
Conversely, when facing unseen entities, prediction errors rise sharply — especially for *Artist Cold Start*, where the identity-content coupling becomes unstable.

These findings highlight the importance of **joint modelling of song content and artist identity** for improved robustness in cold-start environments.


## 冷启动划分与防止数据泄露


1. 时间切分防止泄露  
   数据根据时间分为训练集和OOT测试集（最近三个月）。  
   所有冷启动相关统计（如歌曲或艺人出现次数）仅在训练集上计算，  
   确保模型在预测时未接触任何未来信息。

2. 曝光频率与动态阈值划分  
   统计训练集中每首歌和艺人出现的次数：
   song_counts = train_df["id"].value_counts()  
   artist_counts = train_df["Artists"].value_counts()  
   并取第75%分位数作为“热门”阈值：
   song_thr = np.quantile(song_counts, 0.75)  
   artist_thr = np.quantile(artist_counts, 0.75)  
   该方法依据真实分布自适应调整，避免主观界定。

3. 四类启动类型定义  
   - True Cold Start：歌曲和艺人均未出现过（新歌 + 新艺人）  
   - Warm Start：歌曲未出现过但艺人处于75%分位以上（新歌 + 热门艺人）  
   - Artist Cold Start：歌曲出现过但艺人未出现过（老歌新艺人或翻唱）  
   - Hot Start：歌曲和艺人均出现过（模型熟悉样本）
