# Chapter 7: Validation and Leakage Protocol
AFML focus: avoid label overlap leakage with purging/embargo mindset.


In [1]:
import math
import matplotlib.pyplot as plt
import polars as pl
import openquant
import sys
from pathlib import Path
sys.path.insert(0, str(Path('notebooks/python/scripts').resolve()))
from afml_chapter_utils import (
    fetch_panel,
    simple_returns,
    probs_and_sides_from_momentum,
    timestamps_from_dates,
    lag_corr,
    fracdiff_ffd,
)

panel = fetch_panel(window=900)
dates = panel['date'].to_list()
uso = panel['USO'].to_list()
uso_ret = simple_returns(uso)
probs, sides = probs_and_sides_from_momentum(uso)
timestamps = timestamps_from_dates(dates)
asset_names = ['USO', 'BNO', 'XLE', 'GLD', 'UNG']
asset_prices = panel.select(asset_names).rows()
print('rows', panel.height, 'range', dates[0], dates[-1])


rows 900 range 2022-07-08 2026-02-06


In [2]:
idx = openquant.filters.cusum_filter_indices(uso, 0.001)[:220]
label_endtime = [(i, min(i + 10, len(uso)-1)) for i in idx]
bar_index = list(range(len(uso)))
ind_mat = openquant.sampling.get_ind_matrix(label_endtime, bar_index)

n = len(idx)
split = int(n * 0.7)
embargo = 8
train = list(range(0, split))
test = list(range(split + embargo, n))

# overlap check by interval intersection
intervals = label_endtime
leaks = 0
for ti in train:
    s1, e1 = intervals[ti]
    for vi in test:
        s2, e2 = intervals[vi]
        if not (e1 < s2 or e2 < s1):
            leaks += 1

plt.figure(figsize=(10,2.2))
mask = [0]*n
for i in train: mask[i]=1
for i in test: mask[i]=2
plt.plot(mask, lw=1)
plt.yticks([0,1,2], ['dropped','train','test'])
plt.title('Chapter 7: Purge/Embargo-style Split Mask')
plt.tight_layout()
plt.show()

print('train', len(train), 'test', len(test), 'interval overlaps', leaks)


train 154 test 58 interval overlaps 3


## Interpretation
This demonstrates leakage-aware protocol design: with embargo and interval checks, train/test overlap pressure is explicitly controlled.

