In [None]:
!pip install catboost

In [None]:
# Extract the dataset archive into the 智慧赢销dataset directory (-d).
# -n never overwrites files that already exist, so re-running this cell is safe.
!unzip -n 智慧赢销dataset.zip -d 智慧赢销dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import joblib, re, gc

In [None]:
# ================== Data preprocessing ==================
def _bucketed_count_to_int(series, bucket_label='小于100', bucket_value=50):
    """Convert a count column that mixes numbers with a bucket label to ints.

    Numeric entries (numbers or numeric strings) keep their value, entries equal
    to ``bucket_label`` become ``bucket_value``, and anything else (missing or
    unparseable) becomes 0.
    """
    numeric = pd.to_numeric(series, errors='coerce')
    # The label cannot be parsed as a number, so it is NaN at this point;
    # overwrite exactly those positions with the bucket's representative value.
    numeric = numeric.mask(series == bucket_label, bucket_value)
    return numeric.fillna(0).astype(int)


def preprocess_data(df, is_train=True):
    """Add date-derived and author-level features to a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``publish_time`` and ``update_time`` (parsed with the
        ``%Y%m%d`` format), ``fans_cnt``, ``coin_cnt`` and ``video_cnt``.
    is_train : bool, default True
        Not used by the function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) in which the two time
        columns are datetimes, ``fans_cnt`` / ``coin_cnt`` are integers, and the
        columns ``publish_day``, ``publish_weekday``, ``publish_month``,
        ``update_delay_days``, ``author_popularity``, ``fans_video_ratio`` and
        ``author_power`` have been added.
    """
    df = df.copy()

    # Parse the two date columns.
    for time_col in ('publish_time', 'update_time'):
        df[time_col] = pd.to_datetime(df[time_col], format='%Y%m%d')

    # Calendar features of the publish date.
    df['publish_day'] = df['publish_time'].dt.day
    df['publish_weekday'] = df['publish_time'].dt.weekday
    df['publish_month'] = df['publish_time'].dt.month

    # Days elapsed between publishing and the last update.
    df['update_delay_days'] = (df['update_time'] - df['publish_time']).dt.days

    # Count columns: keep real numbers and map only the '小于100' bucket to 50.
    # (Series.map with a dict would turn every value missing from the dict into
    # NaN and thereby zero out all genuine counts.)
    for count_col in ('fans_cnt', 'coin_cnt'):
        df[count_col] = _bucketed_count_to_int(df[count_col])

    # Author-level ratios; the +1 keeps the denominators non-zero for zero counts.
    df['author_popularity'] = df['coin_cnt'] / (df['fans_cnt'] + 1)
    df['fans_video_ratio'] = df['fans_cnt'] / (df['video_cnt'] + 1)
    df['author_power'] = np.log1p(df['fans_cnt']) * np.log1p(df['coin_cnt'])

    return df

# ================== Text feature processing ==================
def process_text_features(df, text_columns, n_components=50):
    """Turn free-text columns into dense TF-IDF + truncated-SVD features.

    For every column in ``text_columns`` a fresh ``TfidfVectorizer`` (at most
    5000 terms) and ``TruncatedSVD`` are fitted on all rows of ``df``; missing
    text is treated as an empty string. The per-column SVD outputs are stacked
    side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns.
    text_columns : list of str
        Names of the columns to vectorise.
    n_components : int, default 50
        Number of SVD components kept per text column.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (default integer index) with columns
        ``text_0 ... text_{k-1}``.

    Notes
    -----
    NOTE(review): the vectoriser uses scikit-learn's default tokenisation; if
    the text is unsegmented Chinese, a character-level analyzer or a word
    segmenter may be more appropriate — confirm against the data.
    """
    reduced_blocks = []

    for col in text_columns:
        tfidf = TfidfVectorizer(max_features=5000)
        # Fixed random_state: TruncatedSVD's default solver is randomised, so
        # without a seed the text features change from run to run.
        svd = TruncatedSVD(n_components=n_components, random_state=42)

        tfidf_matrix = tfidf.fit_transform(df[col].fillna(''))
        reduced_blocks.append(svd.fit_transform(tfidf_matrix))

    # Merge the per-column blocks and give the columns positional names.
    stacked = np.hstack(reduced_blocks)
    feature_names = [f'text_{i}' for i in range(stacked.shape[1])]

    return pd.DataFrame(stacked, columns=feature_names)

# ================== Out-of-fold target-statistic feature generator ==================
def generate_kfold_features(df, target='interaction_cnt', group_key='uid', agg_funcs=['mean', 'std', 'max']):
    """Build per-group statistics of ``target`` without leaking a row's own label.

    Training rows (``istest == 0``) are split into 5 shuffled folds
    (``random_state=42``); each row's feature is the aggregate of ``target``
    over rows of the same ``group_key`` in the *other* folds. Test rows
    (``istest == 1``) use aggregates over all training rows. Values that are
    still missing (e.g. a group not seen in the fitting rows) are filled with
    the column mean over train and test together.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``istest``, ``group_key`` and ``target``.
    target : str
        Column to aggregate.
    group_key : str
        Column to group by.
    agg_funcs : list of str
        Aggregations passed to ``groupby(...).agg``; one feature named
        ``{group_key}_{target}_{func}`` is produced for each.

    Returns
    -------
    pandas.DataFrame
        Only the generated feature columns, with a fresh integer index. Rows
        are ordered as all training rows followed by all test rows (each in
        their input order), so positional joins require ``df`` to be laid out
        the same way.
    """
    n_splits = 5

    # Reset the index of both parts: KFold yields *positions*, and the frame
    # passed in may carry duplicate or non-default labels (e.g. after concat).
    train = df[df.istest == 0].reset_index(drop=True)
    test = df[df.istest == 1].reset_index(drop=True)

    # Assign every training row to its validation fold, by position.
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_ids = np.full(len(train), -1, dtype=int)
    for fold, (_, val_pos) in enumerate(folds.split(train)):
        fold_ids[val_pos] = fold
    train['fold'] = fold_ids

    features = []
    for func in agg_funcs:
        feat_name = f'{group_key}_{target}_{func}'
        # Start as float NaN so the column never has to change dtype.
        train[feat_name] = np.nan

        # Out-of-fold fill: statistics come from the other folds only.
        for fold in range(n_splits):
            in_fold = train['fold'] == fold
            agg_values = train.loc[~in_fold].groupby(group_key)[target].agg(func)
            train.loc[in_fold, feat_name] = train.loc[in_fold, group_key].map(agg_values)

        # Test rows may use every labelled training row.
        test_agg = train.groupby(group_key)[target].agg(func)
        test[feat_name] = test[group_key].map(test_agg)

        features.append(feat_name)

    # Stack train over test and fill remaining gaps with the column means.
    full_df = pd.concat([train, test], axis=0, ignore_index=True)
    full_df[features] = full_df[features].fillna(full_df[features].mean())

    return full_df[features]

# ================== Full feature engineering ==================
def build_features(df):
    """Assemble the complete model input table from the raw combined frame.

    Steps: basic preprocessing, text embeddings for the four text columns,
    out-of-fold group statistics of the raw interaction count and of the
    per-day interaction rate (grouped by user, user x site and user x post
    type), and finally a column-wise join with the base columns.

    The statistic frames come back ordered train rows first, then test rows,
    and everything is joined by position, so ``df`` is expected to list all
    ``istest == 0`` rows before the ``istest == 1`` rows.
    """
    df = preprocess_data(df)

    # Dense text features.
    text_cols = ['title', 'content', 'cover_ocr_content', 'video_content']
    df_text = process_text_features(df, text_cols)

    # Composite grouping keys.
    uid_as_str = df['uid'].astype(str)
    df['user_site'] = uid_as_str + '_' + df['site_id'].astype(str)
    df['user_post'] = uid_as_str + '_' + df['post_type'].astype(str)

    # Average interactions per day up to the update date.
    df['interaction_cnt_update_delay_days'] = df['interaction_cnt'] / (df['update_delay_days'] + 1)

    # Group statistics: every target crossed with every grouping key, in the
    # order (target outer, key inner) that fixes the output column order.
    stat_targets = ['interaction_cnt', 'interaction_cnt_update_delay_days']
    group_keys = ['uid', 'user_site', 'user_post']
    stat_frames = [
        generate_kfold_features(df, target=stat_target, group_key=key,
                                agg_funcs=['mean', 'max', 'min', 'median'])
        for stat_target in stat_targets
        for key in group_keys
    ]

    # Columns carried over unchanged (including the split marker and the label).
    base_features = [
        'site_id', 'gender', 'age', 'city', 'post_type', 'fans_cnt',
        'video_cnt', 'coin_cnt', 'publish_day', 'publish_weekday', 'publish_month',
        'update_delay_days','author_popularity','fans_video_ratio','author_power',
        'istest','interaction_cnt'
    ]

    # Positional, column-wise join of all parts.
    parts = [df[base_features].reset_index(drop=True), df_text]
    parts.extend(stat_frames)
    return pd.concat(parts, axis=1)

# ================== Model training ==================
def cross_validation_train(train_features, target, test_features, cat_features, seed=42, n_splits=5):
    """Train one CatBoost MAE regressor per fold and average their test predictions.

    Parameters
    ----------
    train_features : pandas.DataFrame
        Training feature matrix.
    target : pandas.Series
        Training labels, positionally aligned with ``train_features``.
    test_features : pandas.DataFrame
        Rows to predict; same columns as ``train_features``.
    cat_features : list of str
        Names of the categorical columns.
    seed : int or None, default 42
        Master seed from which the fold-split seed and each model's seed are
        drawn, so repeated runs use the same seeds. Pass ``None`` for a
        different random draw on every call.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    (list, numpy.ndarray)
        The fitted models (one per fold) and the mean of their predictions on
        ``test_features``. Per-fold and overall out-of-fold MAE are printed.
    """
    import random

    # All seeds are derived from one generator so a single value controls the run.
    rng = random.Random(seed)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=rng.randint(0, 10000))
    models = []
    oof_preds = np.zeros(len(train_features))
    test_preds = np.zeros(len(test_features))

    for fold, (train_idx, val_idx) in enumerate(kf.split(train_features)):
        print(f"\nFold {fold+1}/{n_splits}")

        X_train = train_features.iloc[train_idx]
        y_train = target.iloc[train_idx]
        X_val = train_features.iloc[val_idx]
        y_val = target.iloc[val_idx]

        model = CatBoostRegressor(
            iterations=2000,
            learning_rate=0.1,
            depth=7,
            loss_function='MAE',
            eval_metric='MAE',
            cat_features=cat_features,
            random_seed=rng.randint(0, 10000),
            early_stopping_rounds=200,
            verbose=200
        )

        # The validation fold doubles as the early-stopping eval set.
        model.fit(
            Pool(X_train, y_train, cat_features=cat_features),
            eval_set=Pool(X_val, y_val, cat_features=cat_features)
        )

        val_preds = model.predict(X_val)
        oof_preds[val_idx] = val_preds
        models.append(model)
        print(f"Fold MAE: {mean_absolute_error(y_val, val_preds):.2f}")

        # Each fold contributes an equal share to the averaged test prediction.
        test_preds += model.predict(test_features) / n_splits

    print(f"\nOOF MAE: {mean_absolute_error(target, oof_preds):.2f}")
    return models, test_preds

In [None]:
%%time
train_df = pd.read_csv("智慧赢销dataset/train.txt", sep='\t')
test_df = pd.read_csv("智慧赢销dataset/A.txt", sep='\t')
print(train_df.shape, test_df.shape)

# 结果文件
result_df = test_df[['id']]

# 训练集和测试集标记
train_df['istest'] = 0
test_df['istest'] = 1

# 合并训练集和测试集
df = pd.concat([train_df, test_df], axis=0)
del train_df,test_df
gc.collect()

In [None]:
%%time
# 构建特征
df = build_features(df)

# 扩展特征，根据每天平均互动量的统计结果还原互动量
add_cols = ['uid_interaction_cnt_update_delay_days_mean','uid_interaction_cnt_update_delay_days_median',
            'user_site_interaction_cnt_update_delay_days_mean','user_site_interaction_cnt_update_delay_days_median',
            'user_post_interaction_cnt_update_delay_days_mean','user_post_interaction_cnt_update_delay_days_median']
for col in add_cols:
    df[f'restore_{col}'] = df[col] * df['update_delay_days']
    
cat_features = ['site_id', 'gender', 'age', 'post_type', 'city']

# 缺失值填充
for col in cat_features:
    df[col] = df[col].replace('', np.nan).fillna('未知')

# 拆分训练集和测试集
train_df = df[df.istest==0]
test_df = df[df.istest==1]
del df
gc.collect()

# 入模特征
input_cols = [f for f in train_df.columns if f not in ['istest','interaction_cnt']]

In [None]:
%%time
# Run cross-validated training on the training rows and predict the test rows.
models, test_preds = cross_validation_train(train_df[input_cols], train_df['interaction_cnt'], test_df[input_cols], cat_features)

def postprocess_predictions(preds):
    """Convert raw predictions to non-negative integers.

    Values are rounded with ``np.round`` (exact halves go to the nearest even
    integer), anything that is negative after rounding is replaced by 0, and
    the result is cast to ``int``.
    """
    rounded = np.round(preds)
    is_negative = rounded < 0
    non_negative = np.where(is_negative, 0, rounded)
    return non_negative.astype(int)

# Attach the post-processed predictions. assign() builds a new frame, so nothing
# is written into the column slice that result_df was originally taken from
# (avoids pandas' SettingWithCopyWarning).
# NOTE(review): the predictions are halved *after* being rounded to integers, so
# the submitted values can end in .5 — confirm this scaling is intentional.
result_df = result_df.assign(interaction_cnt=postprocess_predictions(test_preds) / 2)

result_df.to_csv("final_results.txt", index=False, sep='\t')

In [None]:
def get_feature_importance(model, feature_names):
    """Return a feature/importance table sorted from most to least important.

    ``model`` must expose ``get_feature_importance()``; its values are paired
    positionally with ``feature_names``.
    """
    scores = model.get_feature_importance()
    table = pd.DataFrame({'feature': feature_names, 'importance': scores})
    return table.sort_values('importance', ascending=False)

# Feature importances of the first fold's model.
importance_df = get_feature_importance(models[0], input_cols)

# Print the complete ranking as a plain-text table framed by divider lines.
divider = "=" * 50
print(divider)
print("特征重要性排序（完整列表）：")
print(importance_df.to_string(index=False))
print(divider)