# 数据集探索

本笔记本用于探索数据集的结构和特征。

In [None]:
import torch
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
from tqdm.notebook import tqdm
from omegaconf import OmegaConf
from data.dataset import Dataset
from data.datamodule import DataModule
import pandas as pd
import hydra
from hydra import compose, initialize

from hydra.core.config_store import ConfigStore
from hydra.core.hydra_config import HydraConfig

def load_config(config):
    """Register ``config`` with Hydra and compose it into a config object.

    The node is stored in Hydra's ``ConfigStore`` under the name ``"cfg"``,
    composed inside an ``initialize(config_path="configs")`` context, and the
    composed result is installed into the ``HydraConfig`` singleton.

    Args:
        config: Config node handed to ``ConfigStore.store`` as ``node``.

    Returns:
        The object returned by ``compose(config_name="cfg",
        return_hydra_config=True)``.
    """
    ConfigStore.instance().store(name="cfg", node=config)
    with initialize(config_path="configs", version_base="1.3"):
        composed = compose(config_name="cfg", return_hydra_config=True)
        # Set HydraConfig by hand — presumably needed because the notebook
        # does not go through a @hydra.main entry point (TODO confirm).
        HydraConfig.instance().set_config(composed)
    return composed

## Config & Dataset

In [None]:
from configs.experiments.base import BaseTrainConfig
# Compose the base training config and print it as YAML for inspection.
# `cfg` is reused by the cells that follow.
cfg = load_config(BaseTrainConfig)
print(OmegaConf.to_yaml(cfg))

In [None]:
# Instantiate the dataset described by cfg.dataset.
train_val_dataset = hydra.utils.instantiate(cfg.dataset)

# Targets of that dataset; the distribution cell asserts this is a 1-D NumPy array.
views = train_val_dataset.targets

## Data distribution

In [None]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# `views` is expected to be a one-dimensional np.ndarray defined earlier.
# Example: views = np.random.randint(0, 1000000, size=10000)
assert isinstance(views, np.ndarray), "views must be a NumPy array"
assert views.ndim == 1, "views must be a 1D array"

# log1p-transformed targets, shared by the plots and the statistics below.
log_views = np.log1p(views)

# Visualisation: a 2x2 grid of distribution plots.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top-left: histogram of the raw values.
ax = axes[0, 0]
sns.histplot(x=views, bins=50, ax=ax)
ax.set_title('Views Distribution')
ax.set_xlabel('Views')
ax.set_ylabel('Frequency')

# Top-right: same histogram with a logarithmic x axis.
ax = axes[0, 1]
sns.histplot(x=views, bins=50, log_scale=(True, False), ax=ax)
ax.set_title('Views Distribution (Logarithmic X Scale)')
ax.set_xlabel('Views (Log)')
ax.set_ylabel('Frequency')

# Bottom-left: histogram of log1p(views).
ax = axes[1, 0]
sns.histplot(x=log_views, bins=50, ax=ax)
ax.set_title('log1p(Views) Distribution')
ax.set_xlabel('log1p(Views)')
ax.set_ylabel('Frequency')

# Bottom-right: boxplot of log1p(views).
ax = axes[1, 1]
sns.boxplot(y=log_views, ax=ax)
ax.set_title('log1p(Views) Boxplot')
ax.set_ylabel('log1p(Views)')

plt.tight_layout()
plt.show()

# Summary statistics, printed for the raw and the log1p-transformed values.
summary_stats = (
    ("Min", np.min),
    ("Max", np.max),
    ("Average", np.mean),
    ("Median", np.median),
    ("Standard Deviation", np.std),
)

print("Views Statistics:")
for label, reducer in summary_stats:
    print(f"{label}: {reducer(views):,.0f}")

print("\nlog1p(Views) Statistics:")
for label, reducer in summary_stats:
    print(f"{label}: {reducer(log_views):.2f}")


## Baseline prediction - mean

In [None]:
from pathlib import Path

from torch.utils.data import DataLoader

# Constant baseline: the mean of log1p(views), mapped back to a view count.
# The mean is the constant that minimises squared error in log1p space.
baseline_value = np.expm1(log_views.mean())

test_dataset = Dataset(
    cfg.datamodule.dataset_path,
    "test",
    transforms=hydra.utils.instantiate(cfg.datamodule.test_transform),
    metadata=cfg.datamodule.metadata,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=20,
)

# Build the submission records: one row per test id, all with the same
# constant prediction. Each batch exposes its ids under the "id" key.
records = [
    {"ID": id_.item(), "views": baseline_value}
    for batch in test_loader
    for id_ in batch["id"]
]

# Write the CSV. Create the target directory first so a fresh checkout
# without a `submissions/` folder does not fail inside `to_csv`.
submission_dir = Path(str(cfg.root_dir)) / "submissions"
submission_dir.mkdir(parents=True, exist_ok=True)
submission_path = submission_dir / "mean_baseline.csv"

# Explicit columns keep the header stable even if the loader yields nothing.
submission = pd.DataFrame(records, columns=["ID", "views"])
submission.to_csv(submission_path, index=False)
# Report the file that was actually written, not just the project root.
print(f"Baseline submission saved to: {submission_path}")

## Predictions on validation set

In [None]:
# Validation predictions exported as CSV.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
val_csv = r"D:\Personal\Polytechnique\2A\DL_projet\CSC_43M04_EP_challenge\outputs\2025-05-17\17-00-56\weighted_loss_experiment_epoch32_20250517-172946.csv"
# Later cells read the ID, actual_views and predicted_views columns of this frame.
df = pd.read_csv(val_csv)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# ===== Derived columns =====
# Everything is analysed in log1p space; the squared log error of a row is
# its contribution to the MSLE.
df['log_actual'] = np.log1p(df['actual_views'])
df['log_pred'] = np.log1p(df['predicted_views'])
df['log_error'] = df['log_pred'] - df['log_actual']
df['msle_contribution'] = df['log_error'] ** 2

# ===== Figure 1: Predicted vs Actual (log1p scale) =====
plt.figure(figsize=(10, 8))
plt.scatter(df['log_actual'], df['log_pred'], alpha=0.5)
# y = x reference line. Span the larger of the two ranges so the line is not
# cut short when predictions exceed the largest actual value.
diag_max = df[['log_actual', 'log_pred']].max().max()
plt.plot([0, diag_max], [0, diag_max], 'r--')
plt.xlabel('log1p(Actual Views)')
plt.ylabel('log1p(Predicted Views)')
plt.title('Predicted vs Actual (log1p scale)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
# plt.savefig('pred_vs_actual_log1p.png', dpi=300)
plt.show()

# ===== Figure 2: Log Residuals =====
plt.figure(figsize=(10, 6))
plt.scatter(df['log_actual'], df['log_error'], alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('log1p(Actual Views)')
plt.ylabel('log Residual (log(pred+1) - log(actual+1))')
plt.title('Log-space Residuals (Core to MSLE)')
plt.grid(True, alpha=0.3)
plt.tight_layout()
# plt.savefig('log_residuals.png', dpi=300)
plt.show()

# ===== Figure 3: Log Residual Histogram =====
plt.figure(figsize=(8, 6))
# Residuals are clipped to [-5, 5] so outliers pile up in the edge bins
# instead of stretching the axis.
plt.hist(df['log_error'].clip(-5, 5), bins=50)
plt.xlabel('Log Residual')
plt.ylabel('Frequency')
plt.title('Distribution of log-space residuals')
plt.grid(True, alpha=0.3)
plt.tight_layout()
# plt.savefig('log_residual_hist.png', dpi=300)
plt.show()

# ===== Figure 4: Boxplot of Log Residuals by log1p Actual Bin =====
# Ten equal-width bins over log1p(actual); reused for the heatmap below.
df['log_bin'] = pd.cut(df['log_actual'], bins=10)
plt.figure(figsize=(10, 6))
sns.boxplot(x='log_bin', y='log_error', data=df)
plt.xticks(rotation=45)
plt.xlabel('log1p(Actual Views)')
plt.ylabel('log Residual')
plt.title('Log Residuals by log1p Actual View Count Bins')
plt.grid(True, alpha=0.3)
plt.tight_layout()
# plt.savefig('log_error_by_log_actual.png', dpi=300)
plt.show()

# ===== Figure 5: Top 15 MSLE Contributors =====
top_msle = df.sort_values('msle_contribution', ascending=False).head(15)
plt.figure(figsize=(12, 8))
# Plot IDs as string labels: with a numeric ID column seaborn orders the
# categories by value, which would lose the descending-contribution ranking.
sns.barplot(x=top_msle['ID'].astype(str), y=top_msle['msle_contribution'])
plt.xticks(rotation=90)
plt.xlabel('Video ID')
plt.ylabel('MSLE Contribution')
plt.title('Top 15 MSLE-Contributing Examples')
plt.tight_layout()
# NOTE: this is the only figure in the cell that is written to disk.
plt.savefig('top_msle_contributors.png', dpi=300)
plt.show()

# ===== Figure 6: MSLE Contribution Heatmap =====
df['log_pred_bin'] = pd.cut(df['log_pred'], bins=10)
# Same binning as Figure 4; reuse it rather than recomputing.
df['log_actual_bin'] = df['log_bin']

# observed=False keeps the full 10x10 grid (including empty bins) and makes
# the choice explicit instead of relying on pandas' changing default.
# NOTE: empty bins are filled with 0, so they look like zero-error cells.
pivot = pd.pivot_table(df, values='msle_contribution',
                       index='log_actual_bin',
                       columns='log_pred_bin',
                       aggfunc='mean', fill_value=0,
                       observed=False)

plt.figure(figsize=(10, 8))
sns.heatmap(pivot, cmap='YlOrRd')
plt.xlabel('log1p(Predicted Views)')
plt.ylabel('log1p(Actual Views)')
plt.title('MSLE Contribution Heatmap (Mean per Bin)')
plt.tight_layout()
# plt.savefig('msle_heatmap_logbins.png', dpi=300)
plt.show()


In [None]:
# Viral cutoff, defined in log1p space: log1p(views) = 12, which is
# 162,753 raw views after truncation to int.
# NOTE(review): the original comment described this as "over 1 million views",
# which does not match expm1(12) — confirm the intended cutoff.
viral_threshold = int(np.expm1(12))

# Boolean mask of rows whose actual view count exceeds the cutoff.
is_viral = df['actual_views'] > viral_threshold

# Totals and share of viral videos.
total_videos = len(df)
viral_videos = is_viral.sum()
viral_ratio = viral_videos / total_videos

print(f"总视频数：{total_videos}")
print(f"爆款（>{viral_threshold:,} views）数量：{viral_videos}")
print(f"占比：{viral_ratio:.2%}")


## Idea: new loss

Penalize underestimation of popular videos

In [None]:
# Sigmoid-shaped weight that rises from 1 towards 3.
# The transition window is [13.5, 16.5]: the sigmoid argument runs from -3 to
# +3 across it, so most of the rise happens inside the window.
# NOTE(review): x is presumably log1p(views) — confirm against the loss code.
window_lo, window_hi = 13.5, 16.5

x = torch.linspace(10, 18, 200)
center = (window_lo + window_hi) / 2
sharpness = 6 / (window_hi - window_lo)
w = 1 + 2 * torch.sigmoid(sharpness * (x - center))
plt.plot(x.numpy(), w.numpy())