In [1]:
import pandas as pd
import os
from multiprocessing import Pool, cpu_count
from tqdm import tqdm
from collections import defaultdict
from transformers import pipeline
from concurrent.futures import ThreadPoolExecutor, as_completed
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

In [2]:
# 数据路径
DATA_PATH = r"C:/Users/xqy16/Desktop/多模态视频推荐/MGTV_AI_Challenge_Video_Recommend_Rank_12th-main/Data_A/"
CLICK_PATH = os.path.join(DATA_PATH, "用户历史点击数据")
PLAY_PATH = os.path.join(DATA_PATH, "用户历史播放数据")
SHOW_PATH = os.path.join(DATA_PATH, "用户历史曝光数据")
TEST_PRED_PATH = os.path.join(DATA_PATH, "./A榜待预测的did/testA_pred_did.csv")
TEST_SHOW_PATH = os.path.join(DATA_PATH, "./A榜用户曝光数据/testA_did_show.csv")
VID_INFO_PATH = os.path.join(DATA_PATH, "./vid_info/vid_info_table.csv")
DID_FEATURES_PATH = os.path.join(DATA_PATH, "did_features/did_features_table.csv")
DANMU_PATH = os.path.join(DATA_PATH, "弹幕文本数据")
DATA_PARQUET_PATH = r"C:\Users\xqy16\Desktop\多模态视频推荐\用户历史日志_自加特征"

In [3]:
# 内存优化参数
CHUNK_SIZE = 500000
PARALLEL_JOBS = max(1, cpu_count() - 2)

# 分块加载数据
def load_csv_chunks(file_path, day, dtype=None):
    chunks = []
    for chunk in pd.read_csv(file_path, chunksize=CHUNK_SIZE, dtype=dtype):
        chunk['day'] = day 
        chunks.append(chunk)
    return pd.concat(chunks, ignore_index=True) if chunks else pd.DataFrame()

### 弹幕特征分析

In [4]:
pd.set_option('display.max_columns', None)  # 显示所有列

In [5]:
danmu_parquet_df = pd.read_parquet(os.path.join(DATA_PARQUET_PATH, "bullet_fea.parquet"))

In [6]:
danmu_parquet_df.head(10)

Unnamed: 0,vid,content,emotion,bullet_cnt,bullet_len
0,562941,2025.4.15.23点03。好几年都没看过了，来回忆回忆。2025.4.15.23点03...,1,5,62
1,609881,2025。2025。2025年4月11日又来打卡。2025年哦都2025年了哦。2025年4...,1,5,54
2,610769,笑死我了。笑死我了,1,2,9
3,823055,2025.4.15打卡,1,1,11
4,836386,赵英俊！。2025.4.21,1,2,14
5,851923,好可爱的小女孩。2025四月二刷。2025了。看个开头已经开始想哭了😭。2025.4.13二...,1,12,114
6,852959,字幕呢。没字幕啊。很讨厌电影里讲方言。受不了了！我得去找粤语版的看。尬不尬。方言听的真难受。...,0,7,60
7,852961,好牛。怎么没有粤语的,1,2,10
8,860138,电影把徐若瑄的画面全删了。为啥没字幕,0,2,18
9,861277,2055.4.21。wa ～前面的朋友，我2025年4月17日打卡。你们难道不是山里的吗？。...,1,47,588


In [7]:
danmu_parquet_df.describe()

Unnamed: 0,vid,emotion,bullet_cnt,bullet_len
count,7991.0,7991.0,7991.0,7991.0
mean,15379660.0,0.651733,189.340758,2030.318
std,5901000.0,0.476451,3641.430137,36219.46
min,562941.0,0.0,1.0,1.0
25%,10694940.0,0.0,2.0,25.0
50%,16415730.0,1.0,8.0,91.0
75%,20631300.0,1.0,31.0,376.5
max,23336560.0,1.0,236407.0,2282155.0


### baseline代码给出的对弹幕特征的处理

这里加一个添加情感判断，魔塔那个跟环境配置冲突了，我用huggingface的transformers

In [8]:
model_path = "./saved_models/roberta"
sentiment_pipeline = pipeline("sentiment-analysis", model = model_path, tokenizer=model_path)
result = sentiment_pipeline("好！")[0]['label']
print("positive" in result)

Device set to use cuda:0


True


In [9]:
# 读取弹幕数据文件
def read_danmu_files():
    all_dfs = []        # list
    for i in tqdm(range(1, 19), desc="读取弹幕数据文件"): 
        file_name = f"{i}.xlsx"
        file_path = os.path.join(DANMU_PATH, file_name)
        df = pd.read_excel(file_path)
        all_dfs.append(df)
    print("弹幕数据读取完成...")
    result = pd.concat(all_dfs, ignore_index=True)

    model_path = "./saved_models/roberta"
    sentiment_pipeline = pipeline("sentiment-analysis", model = model_path)

    # 定义一个函数来处理每条文本并转换为 0 或 1
    def get_sentiment(text):
        try:
            # 获取情感分析结果
            result = sentiment_pipeline(text)[0]['label']
            
            # 根据模型输出转换为 0 或 1
            # 不同模型的输出标签可能不同，需要根据实际情况调整
            if "positive" in result:
                return 1
            else:
                return 0
        except:
            # 处理可能的错误
            return None

    # 对 DataFrame 的 'content' 列应用情感分析，并将结果保存到新列 'emotion'
    result['emotion'] = result['content'].apply(get_sentiment)

    del all_dfs
    return result

In [10]:
# 加载弹幕数据，添加列名
def load_danmu_data():
    df_danmu = read_danmu_files()
    if not df_danmu.empty:  
        df_danmu = df_danmu.rename(columns={'videoid': 'vid'})
        print(f"弹幕数据加载完成，记录总数: {len(df_danmu)}")
    else:
        print("未加载到弹幕数据")
    return df_danmu

In [11]:
# df_danmu = load_danmu_data()

In [12]:
# df_danmu.head(10)

In [13]:
# df_danmu.to_csv("df_danmu.csv")

In [14]:
df_danmu = pd.read_csv("df_danmu.csv")

新特征：

1. 每个vid的弹幕量

2. 每个vid的弹幕密度

3. 每个vid的所有弹幕的情感分析，

4. 每个vid的情感复杂度分析

5. 高质量弹幕比例，这里的长度先设定为>10，可以以后再调

返回一个dict，包含着vid和弹幕量/弹幕密度的元组pair

In [15]:
# 构建视频弹幕的特征
def calculate_danmu_features(danmu_parquet_df, df_danmu):
    print("计算视频弹幕热度")
    if danmu_parquet_df.empty:
        print("弹幕数据为空...")
        return {}
    
    # 按视频统计弹幕数量
    vid_danmu_counts = danmu_parquet_df.set_index('vid')['bullet_cnt'].to_dict()
    
    # 计算弹幕密度（弹幕数量/视频时长）
    df_vid_info = pd.read_csv(os.path.join(DATA_PATH,"vid_info/vid_info_table.csv"))
    vid_to_duration = df_vid_info.set_index('vid')['item_duration'].to_dict()

    vid_density = {}
    for vid, counts in vid_danmu_counts.items():
        duration = vid_to_duration.get(vid, 2766)   # 优化！duration缺失值可以用mean代替：2766
        vid_density[vid] = counts / duration

    # 计算视频情感分析
    print("计算视频整体情感")
    emotion_judgement = danmu_parquet_df.set_index('vid')['emotion'].to_dict()

    print("计算每个vid的情感复杂度")
    def calculate_entropy(x):
        # 计算情感值的分布频率
        p = x.value_counts(normalize=True)
        # 计算熵值 (-sum(p*log(p)))
        entropy = -np.sum(p * np.log2(p))
        return entropy
    emotion_entropy_df = df_danmu.groupby('vid')['emotion'].agg(calculate_entropy).reset_index()
    emotion_entropy_df.columns = ['vid', 'emotion_entropy']
    emotion_entropy = dict(zip(emotion_entropy_df['vid'], emotion_entropy_df['emotion_entropy']))

    # 计算高质量弹幕比例
    print("计算视频高质量弹幕比例")
    high_quality_length = 10
    df_danmu['content_length'] = df_danmu['content'].str.len()

    high_quality_ratio = (df_danmu['content_length'] > high_quality_length).groupby(df_danmu['vid']).mean().to_dict()
    

    danmu_features = {
        'vid_danmu_count': vid_danmu_counts,
        'vid_danmu_density': vid_density,
        'vid_emotion_judgement': emotion_judgement,
        'vid_emotion_entropy': emotion_entropy,
        'high_quality_ratio': high_quality_ratio
    }

    # 清理内存
    del df_vid_info, vid_to_duration, vid_danmu_counts, vid_density, emotion_judgement, high_quality_ratio, emotion_entropy_df
    
    return danmu_features

In [None]:
# danmu_features = calculate_danmu_features(danmu_parquet_df, df_danmu)

计算视频弹幕热度
计算视频整体情感
计算每个vid的情感复杂度
计算视频高质量弹幕比例


In [17]:
danmu_features['vid_emotion_entropy']  # 查看高质量弹幕比例

{562941: -0.0,
 562942: 0.9182958340544896,
 562943: 1.0,
 562944: -0.0,
 563512: 0.8112781244591328,
 563513: -0.0,
 563514: -0.0,
 563515: -0.0,
 563516: -0.0,
 563517: -0.0,
 563519: -0.0,
 563520: -0.0,
 563521: -0.0,
 563524: -0.0,
 563526: 1.0,
 563527: -0.0,
 563530: 0.8112781244591328,
 563531: 1.0,
 563532: -0.0,
 563536: -0.0,
 563537: -0.0,
 563538: 1.0,
 563539: -0.0,
 563541: -0.0,
 563542: 1.0,
 563543: -0.0,
 563544: -0.0,
 563546: -0.0,
 563547: 0.8112781244591328,
 563548: -0.0,
 563567: -0.0,
 563568: -0.0,
 563569: -0.0,
 563903: 0.8112781244591328,
 563905: -0.0,
 563921: -0.0,
 563944: -0.0,
 563946: -0.0,
 563948: -0.0,
 563976: 0.863120568566631,
 563981: 0.9852281360342515,
 563982: -0.0,
 564023: 1.0,
 564025: -0.0,
 564028: -0.0,
 564030: 0.954434002924965,
 564558: -0.0,
 564642: 0.6949749673676273,
 564643: 0.9182958340544896,
 564644: 0.7811153285444683,
 564645: 0.8586370819183629,
 564646: 0.5770042503157248,
 564647: 0.7682814090975242,
 564648: 0.794102

### 用户历史日志特征分析

这边可以加,我还没写完

In [18]:
df_demo = pd.read_parquet(os.path.join(DATA_PARQUET_PATH, "user_history_day02.parquet"))

In [19]:
df_demo.head(5)

Unnamed: 0,did,vid,click_time,play_time,item_cid,item_type,item_duration,item_assetSource,item_classify,item_isIntact,item_serialno,sid,stype,show_sum,click_sum,ctr,ptr,item_cid_click_sum,item_cid_ctr,item_cid_ptr,item_type_click_sum,item_type_ctr,item_type_ptr,item_assetSource_click_sum,item_assetSource_ctr,item_assetSource_ptr,item_classify_click_sum,item_classify_ctr,item_classify_ptr,item_isIntact_click_sum,item_isIntact_ctr,item_isIntact_ptr,sid_click_sum,sid_ctr,sid_ptr,stype_click_sum,stype_ctr,stype_ptr
0,000098cabe490a5bd6773009400a9a92,17620959,2025-03-24 12:26:52,2819.0,1389675,0,5358,0,1,1,5,17119196,0,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
1,000098cabe490a5bd6773009400a9a92,20161959,,,1482548,0,6467,0,1,1,35,19606404,0,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
2,000098cabe490a5bd6773009400a9a92,20290603,,,1499162,2,7726,0,1,1,1,10719952,2,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
3,000098cabe490a5bd6773009400a9a92,23172217,,,1658049,2,7337,0,1,1,25,10998600,2,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}
4,000098cabe490a5bd6773009400a9a92,23173979,,,1563214,0,2770,0,2,1,40,22633828,0,0,0,0.0,0.0,{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}


In [20]:
df_demo.describe()

Unnamed: 0,vid,play_time,item_cid,item_type,item_duration,item_assetSource,item_classify,item_isIntact,item_serialno,sid,stype,show_sum,click_sum,ctr,ptr
count,1346235.0,135532.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0,1346235.0
mean,21777310.0,1023.369677,1573323.0,0.283169,5262.463,0.01439496,1.623791,0.9999302,20.08543,19618510.0,0.283169,3.118155,0.3009831,0.04108089,0.07230594
std,3081467.0,970.831498,112311.5,0.6972472,2513.594,0.4647624,3.417269,0.008355809,15.25194,4678840.0,0.6972472,4.430264,0.5819513,0.08226001,0.1883416
min,609881.0,11.0,931765.0,0.0,41.0,0.0,0.0,0.0,0.0,194.0,0.0,0.0,0.0,0.0,0.0
25%,22386020.0,97.0,1547193.0,0.0,2770.0,0.0,1.0,1.0,3.0,18099110.0,0.0,0.0,0.0,0.0,0.0
50%,23173980.0,779.0,1627943.0,0.0,5400.0,0.0,1.0,1.0,22.0,22625780.0,0.0,0.0,0.0,0.0,0.0
75%,23181180.0,1750.0,1658049.0,0.0,7337.0,0.0,2.0,1.0,33.0,22633320.0,0.0,7.0,0.0,0.0,0.0
max,23192560.0,3599.0,1673412.0,2.0,19016.0,34.0,50.0,1.0,437.0,22701810.0,2.0,59.0,9.0,1.0,1.056575


In [21]:
df_demo['item_cid_click_sum'].unique()

array(['{}', '{"1650171": 1}', '{"1563214": 1}', ...,
       '{"1519449": 1, "1525220": 1}',
       '{"1598138": 1, "1650171": 1, "1563214": 1}',
       '{"1296229": 1, "1368420": 3}'], dtype=object)

### 对did，vid侧进行特征工程

In [22]:
# 加载历史的曝光点击播放数据，data_type代表着选择哪一类数据
def load_day_data(data_type, day):
    day_str = f"{day:02d}"
    path = {
        "click": os.path.join(CLICK_PATH, f"day{day_str}/day{day_str}_data.csv"),
        "play": os.path.join(PLAY_PATH, f"day{day_str}/day{day_str}_data.csv"),
        "show": os.path.join(SHOW_PATH, f"day{day_str}/did_show_data{day_str}.csv")
    }.get(data_type)
    
    return load_csv_chunks(path, day)

In [25]:
# 加载数据
def load_full_data():    
    # 加载点击及播放数据
    print("\n加载点击数据...")
    click_data = []
    for day in tqdm(range(1, 31), desc="天"):
        click = load_day_data("click", day)  # 点击数据
        click['click_time'] = pd.to_datetime(click['click_time'])
        play = load_day_data("play", day)   # 播放数据
        
        if not click.empty and not play.empty:  # 两者皆不为空的情况
            click = click.merge(play[['did', 'vid', 'play_time']], on=['did', 'vid'], how='left')
        click_data.append(click)
    
    df_click = pd.concat(click_data, ignore_index=True)
    print(f"点击数据加载完成，点击记录总数: {len(df_click)}")
    
    # 加载曝光数据
    print("\n加载曝光数据...")
    show_data = []
    for day in tqdm(range(1, 31), desc="天"):
        show = load_day_data("show", day)
        show_data.append(show)
    
    df_show = pd.concat(show_data, ignore_index=True)
    print(f"曝光数据加载完成，曝光记录总数: {len(df_show)}")
    
    # 加载视频信息
    print("\n加载视频信息...")
    df_vid_info = pd.read_csv(VID_INFO_PATH)
    
    # 合并视频合集信息
    # vid为key，item_cid为value的一个dict，然后做一个map
    vid_to_item_cid = df_vid_info.set_index('vid')['item_cid'].to_dict()
    df_click['item_cid'] = df_click['vid'].map(vid_to_item_cid).fillna(-1)
    df_show['item_cid'] = df_show['vid'].map(vid_to_item_cid).fillna(-1)

    # 增加！
    # 把df_vid_info的其他列也添加到df_click和df_show中
    df_click = df_click.merge(df_vid_info.drop(columns=['item_cid']), on='vid', how='left')
    df_show = df_show.merge(df_vid_info.drop(columns=['item_cid']), on='vid', how='left')
    
    # 加载弹幕数据
    # df_danmu = load_danmu_data()
    df_danmu = pd.read_csv("df_danmu.csv")
    danmu_parquet_df = pd.read_parquet(os.path.join(DATA_PARQUET_PATH, "bullet_fea.parquet"))
    danmu_features = calculate_danmu_features(danmu_parquet_df, df_danmu)
    
    return df_click, df_show, df_vid_info, df_danmu, danmu_features

In [26]:
df_click, df_show, df_vid_info, df_danmu, danmu_features = load_full_data()


加载点击数据...


天: 100%|██████████| 30/30 [00:09<00:00,  3.08it/s]


点击数据加载完成，点击记录总数: 3771396

加载曝光数据...


天: 100%|██████████| 30/30 [00:19<00:00,  1.52it/s]


曝光数据加载完成，曝光记录总数: 31905025

加载视频信息...
计算视频弹幕热度
计算视频整体情感
计算每个vid的情感复杂度
计算视频高质量弹幕比例


In [27]:
df_click.head(5)

Unnamed: 0,did,vid,item_cid,click_time,day,play_time,item_type,item_duration,item_assetSource,item_classify,item_isIntact,item_serialno,sid,stype
0,606c6851da77438cb5e59004a0ea8ae8,852961,940017,2025-03-23 15:13:09,1,540.0,0,6222,0,3,1,1,3328088,0
1,2c1177e30df6aedad76f294501969ea0,1148463,980728,2025-03-23 21:14:35,1,25.0,0,6503,0,3,1,1,5585838,0
2,3fae714c164af6703339fb2f2ffb14fc0b4a661e,1266474,986084,2025-03-23 21:11:10,1,780.0,0,7407,0,3,1,1,3730714,0
3,d49a53cbaceef4c899f461da65f10ceb143fecf3,1266478,931765,2025-03-23 11:12:20,1,2829.0,0,7364,0,0,0,0,3737974,0
4,3515c5d17245247b7edcb5fff39a5f48,1266478,931765,2025-03-23 20:27:30,1,594.0,0,7364,0,0,0,0,3737974,0


In [28]:
df_click['item_cid'].mode()[0]

np.int64(1671676)

In [29]:
print("NaN 值数量:", df_click['play_time'].isna().sum())
print("NaN 值数量:", df_click['click_time'].min())

NaN 值数量: 267703
NaN 值数量: 2025-03-23 00:00:00


In [30]:
# 特征工程
def build_features(df_click, df_show, df_danmu, danmu_features, start_date=21, end_date=30):
    df_click['play_time'] = df_click['play_time'].fillna(df_click['play_time'].min())
    print("\n===== 构建特征 =====")
    
    df_click_filtered = df_click[(df_click['day'] >= start_date) & (df_click['day'] < end_date)]
    df_show_filtered = df_show[(df_show['day'] >= start_date) & (df_show['day'] < end_date)]
    
    data = {}
    
    # 用户侧的特征

    # 活跃度指标
    ## 每个did有观看记录的天数，可以认为是“用户活跃天数”
    data['did_active_days'] = df_click_filtered.groupby('did')['click_time'].apply(lambda x: x.dt.date.nunique()).to_dict()
    ## 每个did每日平均观看次数，可以认为是“”用户平均每日观看次数”
    total_views = df_click_filtered.groupby('did').size()
    data['did_avg_daily_views'] = (total_views / pd.Series(data['did_active_days'])).to_dict()
    # 每个did点击的不同vid多少，可以认为是“用户点击活跃情况”
    data['did_click_unique_vid'] = df_click_filtered.groupby('did')['vid'].nunique().to_dict()
    # 每个did点击的不同vid的观看时长总和，可以认为是“用户观看总时长”
    data['did_click_total_watch_time'] = df_click_filtered.groupby('did')['play_time'].sum().to_dict()

    # 用户内容偏好特征
    # 每个did点击的不同item_cid多少，可以认为是“用户爱好广泛性”
    data['did_click_unique_item_cid'] = df_click_filtered.groupby('did')['item_cid'].nunique().to_dict()
    # 每个did点击的最多的item_cid多少，可以认为是“用户偏好合集id”, 这里默认值给最多的item_cid为1671676
    data['did_click_maximun_item_cid'] = df_click_filtered.groupby('did')['item_cid'].agg(lambda x: x.mode()[0] if not x.mode().empty else 1671676).to_dict()
    # 每个did点击的最多的sid多少，可以认为是“用户偏好系列id”, 这里默认值给最多的sid为11012562
    data['did_click_maximun_item_cid'] = df_click_filtered.groupby('did')['sid'].agg(lambda x: x.mode()[0] if not x.mode().empty else 11012562).to_dict()
    # 每个did点击的最多的item_type多少，可以认为是“用户偏好类型”，(横屏/竖屏？)这个和stype的数据分布是一样的
    data['did_click_maximun_item_type'] = df_click_filtered.groupby('did')['item_type'].agg(lambda x: x.mode()[0] if not x.mode().empty else 50).to_dict()
    # 每个did看长视频的比例，可以认为是“用户偏好视频长度”
    global_avg_play = df_click_filtered['item_duration'].mean()
    long_play_threshold = global_avg_play * 1.5
    data['did_click_long_play_ratio'] = df_click_filtered.groupby('did').apply(lambda x: (x['item_duration'] > long_play_threshold).mean()).to_dict()
    # 每个did偏好item_cid占比，可以认为是“用户品味大众还是小众程度”
    def top_cid_ratio(group):
        if len(group) == 0:
            return 0
        top_cid = group['item_cid'].mode()[0]
        return (group['item_cid'] == top_cid).mean()
    data['did_click_top_cid_ratio'] = df_click_filtered.groupby('did').apply(top_cid_ratio).to_dict()
    # 每个did首次观看的vid占该did的所有vid的比例，可以认为是“用户新内容探索率”
    user_first_views = df_click_filtered.groupby('vid')['click_time'].min().reset_index()
    user_first_views['is_first_view'] = True
    df_merged = pd.merge(df_click_filtered, user_first_views, 
                         on=['vid', 'click_time'], how='left', suffixes=('', '_first'))
    df_merged['is_first_view'] = df_merged['is_first_view'].fillna(False)
    data['did_click_new_content_ratio'] = df_merged.groupby('did')['is_first_view'].mean().to_dict()

    # 时间类特征
    # 每个did点击的每天的不同vid的平均时间段，可以认为是“用户活跃时间”
    data['did_click_avg_watch_time'] = df_click_filtered.groupby('did')['play_time'].mean().to_dict()
    # 每个did看视频的时间的众数，可以认为是“用户偏好观看时间段”
    df_click_filtered['click_hour'] = df_click_filtered['click_time'].dt.hour
    data['did_click_most_hour'] = df_click_filtered.groupby('did')['click_hour'].agg(lambda x: x.mode()[0] if not x.mode().empty else 20).to_dict()
    # 每个did观看时间小时的标准差，可以认为是“观看时间波动”
    hour_std = df_click_filtered.groupby('did').apply(
        lambda x: x['click_time'].dt.hour.std())
    data['did_click_hour_std'] = hour_std.fillna(0).to_dict()

    # 最后点击特征
    # 每个did最后点击的视频的vid的集号，可以认为是“用户最后观看集数”，这个感觉是一个很强的特征
    latest_play_data = df_click_filtered.loc[df_click_filtered.groupby('did')['click_time'].idxmax()]
    latest_play_data['combined'] = latest_play_data['item_cid'].astype(str) + '_' + latest_play_data['item_serialno'].astype(str)
    data['did_click_latest_item_serialno'] = latest_play_data.set_index('did')['combined'].to_dict()

    # 视频侧特征

    # 视频热度指标
    # 这个filter的时间段总共有多少个不同的vid被点击了，可以认为是“时间段总体热度1”
    data['vid_click_count'] = df_click_filtered['vid'].value_counts().to_dict()
    # 这个filter的时间段总共有多少个item_cid被点击了，可以认为是“时间段总体热度2”
    data['item_cid_click_count'] = df_click_filtered['item_cid'].value_counts().to_dict()
    # 每个视频的播放时间的平均值和标准差，可以认为是“视频观看时长特征”
    vid_play_stats = df_click_filtered.groupby('vid')['play_time'].agg(['mean', 'std'])
    data['vid_avg_play_time'] = vid_play_stats['mean'].to_dict()
    data['vid_play_time_std'] = vid_play_stats['std'].fillna(0).to_dict()
    # 每个vid的平均完播率，是每个did的play_time除以item_duration的平均值，可以认为是“视频完播率”
    df_click_filtered['completion_rate'] = df_click_filtered['play_time'] / df_click_filtered['item_duration']
    data['vid_avg_completion_rate'] = df_click_filtered.groupby('vid')['completion_rate'].mean().to_dict()
    # 每个vid的“视频重复观看率”
    data['vid_repeat_rate'] = df_click_filtered.groupby('vid')['did'].agg(lambda x: x.duplicated().mean() if len(x) > 1 else 0).to_dict()
    
    # 视频合集内部特征
    # 每个item_cid的vid含有多少，“合集内视频多样性”
    data['collection_vid_diversity'] = df_click_filtered.groupby('item_cid')['vid'].nunique().to_dict()
    # 每个item_cid的vid完播率多少，“合集平均观看深度”
    data['collection_avg_completion'] = df_click_filtered.groupby('item_cid')['completion_rate'].mean().to_dict()
    # 每个item_cid的观看次数，计算用户在合集内的重复观看行为，“合集用户留存率”
    collection_user_retention = df_click_filtered.groupby(['item_cid', 'did']).size().reset_index(name='views')
    collection_user_retention['is_retained'] = collection_user_retention['views'] > 1
    collection_retention_rate = collection_user_retention.groupby('item_cid')['is_retained'].mean()
    data['collection_retention_rate'] = collection_retention_rate.to_dict()
    # 计算每个vid视频在item_cid中的点击率排名，“视频在合集内的排名”
    def vid_rank_in_collection(df):
        # 计算合集内每个视频的点击量
        collection_vid_counts = df.groupby(['item_cid', 'vid']).size().reset_index(name='counts')
        
        # 在每个合集内按点击量排名
        collection_vid_counts['rank'] = collection_vid_counts.groupby('item_cid')['counts'].rank(
            method='dense', ascending=False)
        
        return collection_vid_counts.set_index('vid')['rank'].to_dict()

    data['vid_rank_in_collection'] = vid_rank_in_collection(df_click_filtered)

    
    # 交叉点击特征
    data['did_vid_clicks'] = defaultdict(int)
    for (did, vid), count in df_click_filtered.groupby(['did', 'vid']).size().items():
        data['did_vid_clicks'][(did, vid)] = count

    # 弹幕特征
    data.update(danmu_features)
    
    # 计算CTR特征
    print("计算视频CTR...")
    # 相当于增加一个action列，然后不同的操作label不同
    df_all = pd.concat([
        df_click[['did', 'vid', 'item_cid']].assign(action='click'),
        df_show[['did', 'vid', 'item_cid']].assign(action='show')
    ], ignore_index=True)
    
    # 分块计算CTR
    # 即初始化为key为vid，value为一个字典，字典包含clicks和impressions两个键值对，初始值为0
    vid_stats = defaultdict(lambda: {'clicks': 0, 'impressions': 0})
    for i in range(0, len(df_all), CHUNK_SIZE):
        chunk = df_all.iloc[i:i+CHUNK_SIZE]
        for vid, group in chunk.groupby('vid'): # 注意group是一个dataframe
            clicks = (group['action'] == 'click').sum()
            impressions = len(group)
            vid_stats[vid]['clicks'] += clicks
            vid_stats[vid]['impressions'] += impressions
    
    # 转换为CTR字典
    data['vid_ctr'] = {
        vid: stats['clicks'] / stats['impressions'] if stats['impressions'] > 0 else 0
        for vid, stats in vid_stats.items()
    }
    
    # 释放内存
    del df_all, vid_stats, latest_play_data, df_click_filtered
    
    return data

In [31]:
feature_data = build_features(df_click, df_show, df_danmu, danmu_features, start_date=21, end_date=30)


===== 构建特征 =====


  data['did_click_long_play_ratio'] = df_click_filtered.groupby('did').apply(lambda x: (x['item_duration'] > long_play_threshold).mean()).to_dict()
  data['did_click_top_cid_ratio'] = df_click_filtered.groupby('did').apply(top_cid_ratio).to_dict()
  df_merged['is_first_view'] = df_merged['is_first_view'].fillna(False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_click_filtered['click_hour'] = df_click_filtered['click_time'].dt.hour
  hour_std = df_click_filtered.groupby('did').apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_click_filtered['comp

计算视频CTR...


In [33]:
import pickle

# 保存字典到Pickle文件
file_path = "feature_data_dict.pkl"
with open(file_path, 'wb') as f:  # 'wb'表示二进制写入
    pickle.dump(feature_data, f)

In [34]:
with open(file_path, 'rb') as f:  # 'rb'表示二进制读取
    loaded_dict = pickle.load(f)

# 查看字典内容（使用pprint美化输出）
from pprint import pprint

print("\n特征字典内容：")
pprint(loaded_dict['did_click_latest_item_serialno'])


特征字典内容：
{'000014bdc80cae9dce5f69d950827f70': '1260489_13',
 '00001a47e0e8e747bceaed2a9ed3bf64': '1553281_21',
 '0000a5a708f705b71ee16a5f230356b6': '1663282_29',
 '0000a829ee9f519e59c717b77fb02d98cd7fc1fd': '1658049_40',
 '0000a93d6cd3a41fdab4239444621b5b': '1446211_12',
 '0000dd2ce9e15fcc3aeae09a4704b1a831f7ad6b': '1658049_40',
 '0000e25083f5e4d01ead8433a03098e25e4a6874': '1580366_1',
 '00016f5c8c002da69fa06fb9c23e7e9c86547545': '1671676_22',
 '000191bfb534d6d8961f54988f35112c': '1598138_21',
 '0001f33896e51de5b23d771b35b14171': '1671676_2',
 '000258ba988ef5ebc37a74438a36ba8972b22974': '1549159_35',
 '000276c8f94c7fc12ca7dace73d05afc97506a64': '1671676_22',
 '0002af23a1a20955329925a13a1c5c4b2295086e': '1343165_11',
 '0002b0d96ed78085bf511ad212a762c5860a8fbc': '1305632_2',
 '0002be48a1244e1d483319ca93902879bd4894bf': '1261736_7',
 '00036272b1816746f58d793056e80942cffe26cd': '1390132_16',
 '00037719e4b22835ed9c2eda46055d84': '1671676_17',
 '000377857a5af14233fad3272ae9c77b814b390b': '16

KeyboardInterrupt: 