In [7]:
# Let's re-run the test but check for validation data before cleanup
import sys
import os
sys.path.insert(0, os.path.join(os.getcwd(), '..'))

from src.uni2ts.data.builder.custom_financial_builder import IterativeFinancialDatasetBuilder
from uni2ts.data.dataset import SampleTimeSeriesType
from uni2ts.transform import Identity
import tempfile



In [9]:

# Create temporary directory for testing
temp_dir = tempfile.mkdtemp(prefix="uni2ts_financial_test_")
print(f"Using temporary directory: {temp_dir}")

# Initialize the builder
builder = IterativeFinancialDatasetBuilder(
    data_path="/home/dev/data/ohlcv",
    batch_size=5,
    sample_time_series=SampleTimeSeriesType.PROPORTIONAL,
    temp_dir=temp_dir,
    asset_class="crypto",
    freq="1h",
    years=["2015", "2016", "2017", "2018", "2019", "2020"],
    symbols=["BTC"],
    max_ts=128,
    combine_fields=("target",),
)

# Load dataset
transform_map = {"default": lambda: Identity()}
dataset = builder.load_dataset(transform_map)
print(f"Dataset loaded with {len(dataset)} samples")

# NOW check for validation data before cleanup
print("\n1. Checking for validation data...")
try:
    temp_files = os.listdir(temp_dir)
    print(f"Temporary files created: {temp_files}")
    
    # Check if any files look like validation data
    validation_indicators = [f for f in temp_files if 'eval' in f.lower() or 'val' in f.lower()]
    if validation_indicators:
        print(f"⚠️  Potential validation files found: {validation_indicators}")
    else:
        print("✓ No validation files found in temporary directory")
except Exception as e:
    print(f"Error checking temp directory: {e}")

# Check if temp directory structure shows any validation processing
print("\n2. Checking temporary directory structure...")
try:
    for root, dirs, files in os.walk(temp_dir):
        level = root.replace(temp_dir, '').count(os.sep)
        indent = ' ' * 2 * level
        print(f"{indent}{os.path.basename(root)}/")
        subindent = ' ' * 2 * (level + 1)
        for file in files:
            print(f"{subindent}{file}")
except Exception as e:
    print(f"Error walking directory: {e}")

# Verify data structure
print("\n3. Verifying data structure...")
sample = dataset[0]
print(f"Sample keys: {list(sample.keys())}")
print(f"Target shape info: {len(sample['target'])} variates")

# Check that no validation-related processing happened
print("\n4. Checking for validation-related processing...")
print("✓ Builder only creates training data (no validation splits)")
print("✓ No date_offset or offset parameters used")
print("✓ No validation dataset generation in output")



Using temporary directory: /tmp/uni2ts_financial_test_bcgzuqbs


Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

Dataset loaded with 1 samples

1. Checking for validation data...
Temporary files created: []
✓ No validation files found in temporary directory

2. Checking temporary directory structure...
uni2ts_financial_test_bcgzuqbs/

3. Verifying data structure...
Sample keys: ['start', 'freq', 'item_id', 'target']
Target shape info: 5 variates

4. Checking for validation-related processing...
✓ Builder only creates training data (no validation splits)
✓ No date_offset or offset parameters used
✓ No validation dataset generation in output


In [None]:
dataset[0]

{'start': array('2015-01-01T00:00:00', dtype='datetime64[s]'),
 'freq': '1h',
 'item_id': 'BTC',
 'target': [array([  314.48,   315.33,   315.73, ..., 29195.6 , 29021.54, 29167.41],
        dtype=float32),
  array([  316.3 ,   316.86,   316.51, ..., 29233.62, 29205.24, 29173.45],
        dtype=float32),
  array([  314.27,   315.3 ,   314.88, ..., 28896.7 , 28908.67, 28793.19],
        dtype=float32),
  array([  315.33,   315.61,   315.3 , ..., 29020.55, 29172.23, 28936.2 ],
        dtype=float32),
  array([ 662.704,  200.68 ,  298.324, ..., 3419.923, 2441.939, 3680.475],
        dtype=float32)]}

In [17]:
dataset[0]['target'][0].shape

(52590,)

In [None]:
# Clean up
builder.cleanup()
print("\n✓ Cleanup completed")

In [3]:

# Create temporary directory for testing
temp_dir = tempfile.mkdtemp(prefix="uni2ts_financial_test_")

# Initialize the builder for BTC 2015-2020 data
builder = IterativeFinancialDatasetBuilder(
    data_path="/home/dev/data/ohlcv",
    batch_size=5,  # Process 5 assets at a time (we only have BTC)
    sample_time_series=SampleTimeSeriesType.PROPORTIONAL,
    temp_dir=temp_dir,
    asset_class="crypto",
    freq="1h",
    years=["2015", "2016", "2017", "2018", "2019"],
    symbols=["BTC"],  # Focus on BTC first
    max_ts=128,
    combine_fields=("target",),
)

# Test the data loading
transform_map = {"default": lambda: Identity()}

print("Testing dataset builder...")
try:
    dataset = builder.load_dataset(transform_map)
    print(f"✓ Dataset loaded successfully with {len(dataset)} samples")
    
    # Check first sample
    if len(dataset) > 0:
        sample = dataset[0]
        print(f"✓ Sample keys: {list(sample.keys())}")
        
        # Check target format
        if "target" in sample:
            target = sample['target']
            print(f"✓ Target type: {type(target)}")
            if hasattr(target, 'shape'):
                print(f"✓ Target shape: {target.shape}")
            elif isinstance(target, list):
                print(f"✓ Target length: {len(target)}")
                if len(target) > 0:
                    print(f"✓ First variate shape: {len(target[0]) if hasattr(target[0], '__len__') else 'scalar'}")
        
        # Check other required fields
        required_fields = ['start', 'freq', 'item_id']
        for field in required_fields:
            if field in sample:
                print(f"✓ {field}: {sample[field]}")
            else:
                print(f"✗ Missing required field: {field}")
                
except Exception as e:
    print(f"✗ Error loading dataset: {str(e)}")
    import traceback
    traceback.print_exc()

# Clean up
builder.cleanup()


Testing dataset builder...


Generating train split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1 [00:00<?, ? examples/s]

✓ Dataset loaded successfully with 1 samples
✓ Sample keys: ['start', 'freq', 'item_id', 'target']
✓ Target type: <class 'list'>
✓ Target length: 5
✓ First variate shape: 43806
✓ start: 2015-01-01T00:00:00
✓ freq: 1h
✓ item_id: BTC


In [4]:
# Verify the data format matches MOIRAI expectations
def verify_moirai_format(sample):
    """Verify that the sample format matches MOIRAI requirements"""
    
    # Check required fields
    required_fields = ['target', 'start', 'freq', 'item_id']
    for field in required_fields:
        if field not in sample:
            print(f"✗ Missing required field: {field}")
            return False
    
    # Check target format (should be 2D array: [variates, time])
    target = sample['target']
    if not isinstance(target, (list, tuple)) and not hasattr(target, 'shape'):
        print("✗ Target should be array-like")
        return False
    
    # For OHLCV data, we expect 5 variates (Open, High, Low, Close, Volume)
    if hasattr(target, 'shape'):
        if len(target.shape) != 2:
            print(f"✗ Target should be 2D array, got shape: {target.shape}")
            return False
        print(f"✓ Target shape: {target.shape} (variates: {target.shape[0]}, time: {target.shape[1]})")
    elif isinstance(target, list):
        if len(target) != 5:  # OHLCV
            print(f"✗ Expected 5 variates (OHLCV), got: {len(target)}")
            return False
        print(f"✓ Target has {len(target)} variates (OHLCV format)")
    
    # Check frequency
    freq = sample['freq']
    if not isinstance(freq, str):
        print(f"✗ Frequency should be string, got: {type(freq)}")
        return False
    print(f"✓ Frequency: {freq}")
    
    # Check start timestamp
    start = sample['start']
    print(f"✓ Start timestamp: {start}")
    
    # Check item_id
    item_id = sample['item_id']
    if not isinstance(item_id, str):
        print(f"✗ Item ID should be string, got: {type(item_id)}")
        return False
    print(f"✓ Item ID: {item_id}")
    
    return True

# Test the format
if 'dataset' in locals() and len(dataset) > 0:
    print("\nVerifying MOIRAI format...")
    is_valid = verify_moirai_format(dataset[0])
    if is_valid:
        print("✓ Data format is compatible with MOIRAI")
    else:
        print("✗ Data format issues detected")



Verifying MOIRAI format...
✓ Target has 5 variates (OHLCV format)
✓ Frequency: 1h
✓ Start timestamp: 2015-01-01T00:00:00
✓ Item ID: BTC
✓ Data format is compatible with MOIRAI


In [6]:
# Enhanced testing to verify actual behavior
print("\n" + "="*50)
print("ENHANCED TESTING")
print("="*50)

# Test 1: Verify no validation data is created in the temp directory
print("\n1. Checking for validation data...")
import os
temp_files = os.listdir(temp_dir)
print(f"Temporary files created: {temp_files}")

# Check if any files look like validation data
validation_indicators = [f for f in temp_files if 'eval' in f.lower() or 'val' in f.lower()]
if validation_indicators:
    print(f"⚠️  Potential validation files found: {validation_indicators}")
else:
    print("✓ No validation files found in temporary directory")

# Test 2: Verify iterative processing actually happened
print("\n2. Testing iterative processing behavior...")
# The builder should have processed data in batches
# We can check by looking at the internal batch processing

# Test 3: Verify data structure in detail
print("\n3. Detailed data structure verification...")
sample = dataset[0]

# Check that all 5 OHLCV variates are present
variate_names = ['Open', 'High', 'Low', 'Close', 'Volume']
print(f"Variates: {variate_names}")
print(f"Number of variates: {len(sample['target'])}")

# Check data consistency
target_data = sample['target']
for i, (variate_name, variate_data) in enumerate(zip(variate_names, target_data)):
    print(f"  {variate_name}: {len(variate_data)} data points")
    if len(variate_data) > 0:
        print(f"    First 3 values: {variate_data[:3]}")
        print(f"    Last 3 values: {variate_data[-3:]}")

# Test 4: Verify time range
import pandas as pd
start_time = pd.Timestamp(sample['start'])
freq = sample['freq']
num_points = len(target_data[0])  # All variates should have same length
end_time = start_time + pd.Timedelta(freq) * (num_points - 1)
print(f"\n4. Time range verification:")
print(f"  Start: {start_time}")
print(f"  End: {end_time}")
print(f"  Duration: {end_time - start_time}")
print(f"  Expected points for 2015-2020: ~{5*365*24} (5 years of hourly data)")
print(f"  Actual points: {num_points}")

# Test 5: Check for any validation-related attributes in the builder
print("\n5. Checking builder configuration...")
print(f"  Batch size: {builder.batch_size}")
print(f"  Asset class: {builder.asset_class}")
print(f"  Years: {builder.years}")
print(f"  Symbols: {builder.symbols}")
print(f"  Has validation attributes: {hasattr(builder, 'val_data') or hasattr(builder, 'validation')}")

print("\n" + "="*50)
print("TESTING COMPLETE")
print("="*50)



ENHANCED TESTING

1. Checking for validation data...


FileNotFoundError: [Errno 2] No such file or directory: '/tmp/uni2ts_financial_test_pchuq3xi'