### Load dataframe

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime

# Read the raw CSV and immediately discard every row that has at least one
# missing value, so later stages never see NaNs from the source file.
RAW_TICKS_CSV = "/Users/aleksandr/Desktop/Meta_Test.csv"
df = pd.read_csv(RAW_TICKS_CSV).dropna()

### Initial cleaning

In [2]:
from clean import preprocess_tick_data

# Run the project's tick preprocessing. It hands back three objects; only the
# cleaned frame is used further down in this notebook.
# NOTE(review): the other two look like a diagnostics frame and per-method
# outlier counts (judging by their names) — confirm against clean.py.
df_clean, df_diagnostics, outlier_counter = preprocess_tick_data(df)

# Continue with the cleaned ticks, without the VOLATILITY column.
df = df_clean.drop(columns="VOLATILITY")

Starting preprocessing with 570771 rows
After filtering trading hours: 282810 rows
After cleaning outliers: 282301 rows
Final clean dataset: 278585 rows

Outlier counts by detection method:
  zscore: 64
  extreme_deviation: 69
  isolated_point: 390
  price_reversal: 93
  timestamp_group: 34
  price_velocity: 3703
  suspicious_cluster: 52
  wavelet_outlier: 24


### Volatility estimation

In [3]:
from volatility import estimate_tick_volatility

# Estimator variant to use; the result replaces the working frame.
VOLATILITY_METHOD = "wavelet"
df = estimate_tick_volatility(df, method=VOLATILITY_METHOD)

Estimating advanced tick-level volatility for 278585 ticks...
Computing wavelet-based volatility for META.O...
Completed advanced tick-level volatility estimation


In [4]:
# Drop the helper columns and give the remaining ones title-case names.
#
# The frame is rebound instead of mutated with inplace=True, and the drop uses
# errors="ignore", so running this cell a second time is a no-op rather than a
# KeyError on the already-removed columns. rename() skips labels that are not
# present by default, so it is safe to repeat as well.
df = (
    df
    .drop(columns=["return", "SYMBOL"], errors="ignore")
    .rename(columns={
        "wavelet_vol": "Volatility",
        "TIMESTAMP": "Timestamp",
        "VALUE": "Value",
        "VOLUME": "Volume",
    })
)
df.head()

Unnamed: 0,Timestamp,Value,Volume,Volatility
0,2025-01-30 09:30:00.740000+00:00,694.24,13.0,0.00026
1,2025-01-30 09:30:00.740000+00:00,694.17,15.0,0.00026
2,2025-01-30 09:30:00.740000+00:00,694.17,15.0,0.000261
3,2025-01-30 09:30:00.740000+00:00,694.11,8.0,0.000261
4,2025-01-30 09:30:00.740000+00:00,694.1,249.0,0.000261


In [7]:
# Take the first 2000 rows by position as a small working sample.
SAMPLE_ROWS = 2000
t = df.iloc[:SAMPLE_ROWS]

In [1]:
import pandas as pd

# Load the working sample from disk.
# NOTE(review): presumably this file is the 2000-row slice saved from the
# cleaned frame; the cell that wrote it is not in the notebook — confirm.
SAMPLE_CSV = "/Users/aleksandr/Desktop/my_data.csv"
t = pd.read_csv(SAMPLE_CSV)

In [1]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import seaborn as sns
from Feature_engineering.volatility_regime_model import VolatilityFeatureTransformer
import torch

# Seed the global numpy and torch generators before the model is constructed so
# that repeated runs of this cell start from the same random state.
# NOTE(review): this only makes training repeatable if
# VolatilityFeatureTransformer draws from the global generators and does not
# reseed internally — confirm against volatility_regime_model.py.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

t = pd.read_csv("/Users/aleksandr/Desktop/my_data.csv")

# Initialize the model with reasonable parameters for your data
model = VolatilityFeatureTransformer(
    d_model=128,         # Size of transformer embeddings
    n_heads=8,           # Number of attention heads
    n_encoder_layers=4,  # Number of transformer layers
    dim_feedforward=512, # Size of feedforward network
    dropout=0.1,         # Dropout rate
    context_length=50,   # Look-back window size
    latent_dim=32        # Size of final feature vector
)

# Build the model inputs from the sample frame, telling the model which
# column plays which role.
features, volatility = model.prepare_data(
    df=t,
    timestamp_col='Timestamp',
    price_col='Value',
    volume_col='Volume',
    volatility_col='Volatility'
)

print(f"Feature tensor shape: {features.shape}")
print(f"Volatility tensor shape: {volatility.shape}")

# Train the model on the prepared tensors; the returned history is kept for
# later inspection.
history = model.train(
    train_features=features,
    train_volatility=volatility,
    num_epochs=10,
    batch_size=32,
    learning_rate=1e-4
)

# Extract learned features
latent_features = model.extract_features(features)
print(f"\nExtracted feature shape: {latent_features.shape}")

# Save model if needed
#model.save_model('volatility_feature_model.pth')

# Summary statistics of the extracted features. The table is the cell's last
# expression so the notebook renders it as a rich table instead of the
# wrapped, truncated text that print() produces.
feature_df = pd.DataFrame(latent_features)
print("\nFeature Statistics:")
feature_df.describe()

2025-04-07 01:48:50,886 - INFO - Initializing VolatilityFeatureTransformer...
2025-04-07 01:48:50,887 - INFO - Context length: 50
2025-04-07 01:48:50,887 - INFO - Model dimension (d_model): 128
2025-04-07 01:48:50,887 - INFO - Number of heads: 8
2025-04-07 01:48:50,887 - INFO - Number of encoder layers: 4
2025-04-07 01:48:50,887 - INFO - Transformer config:
2025-04-07 01:48:50,888 - INFO - Input size: 1
2025-04-07 01:48:50,888 - INFO - d_model: 128
2025-04-07 01:48:50,936 - INFO - 
=== Data Preparation Start ===
2025-04-07 01:48:50,936 - INFO - Input DataFrame shape: (2000, 4)
2025-04-07 01:48:50,936 - INFO - Input DataFrame columns: ['Timestamp', 'Value', 'Volume', 'Volatility']
2025-04-07 01:48:51,074 - INFO - 
=== Feature Information ===
2025-04-07 01:48:51,074 - INFO - Number of features extracted: 26
2025-04-07 01:48:51,074 - INFO - Feature columns: ['price_change', 'log_return', 'time_delta', 'trade_direction', 'is_buy', 'tick_imbalance', 'jump_diffusion', 'jump_magnitude', 'jump

Feature tensor shape: torch.Size([1951, 50, 26])
Volatility tensor shape: torch.Size([1951, 1])


2025-04-07 01:49:10,258 - INFO - 
=== Epoch 1/10 Summary ===
2025-04-07 01:49:10,259 - INFO - Average Loss: 1.129089
2025-04-07 01:49:10,259 - INFO - Min Batch Loss: 0.011985
2025-04-07 01:49:10,261 - INFO - Max Batch Loss: 3.207731
2025-04-07 01:49:10,262 - INFO - Std Dev Loss: 0.890762
2025-04-07 01:49:29,310 - INFO - 
=== Epoch 2/10 Summary ===
2025-04-07 01:49:29,311 - INFO - Average Loss: 1.064486
2025-04-07 01:49:29,311 - INFO - Min Batch Loss: 0.019321
2025-04-07 01:49:29,312 - INFO - Max Batch Loss: 3.127303
2025-04-07 01:49:29,312 - INFO - Std Dev Loss: 0.865999
2025-04-07 01:49:48,568 - INFO - 
=== Epoch 3/10 Summary ===
2025-04-07 01:49:48,569 - INFO - Average Loss: 1.040285
2025-04-07 01:49:48,570 - INFO - Min Batch Loss: 0.019502
2025-04-07 01:49:48,570 - INFO - Max Batch Loss: 3.090244
2025-04-07 01:49:48,571 - INFO - Std Dev Loss: 0.850452
2025-04-07 01:50:09,057 - INFO - 
=== Epoch 4/10 Summary ===
2025-04-07 01:50:09,058 - INFO - Average Loss: 1.038447
2025-04-07 01:50


Extracted feature shape: (1951, 32)

Feature Statistics:
                0            1            2            3            4   \
count  1951.000000  1951.000000  1951.000000  1951.000000  1951.000000   
mean     -0.360930    -0.297049     0.054054     0.407858    -0.152264   
std       0.178175     0.167602     0.339746     0.281333     0.236324   
min      -0.980967    -0.746818    -0.950620    -0.798142    -0.732960   
25%      -0.463479    -0.417250    -0.100244     0.264574    -0.329449   
50%      -0.389228    -0.326974     0.176970     0.458038    -0.163243   
75%      -0.282185    -0.212338     0.298806     0.590614     0.015202   
max       0.477165     0.349649     0.541864     1.183748     0.488788   

                5            6            7            8            9   ...  \
count  1951.000000  1951.000000  1951.000000  1951.000000  1951.000000  ...   
mean      0.163402    -0.235848    -0.029627     0.363024     0.054032  ...   
std       0.506976     0.236444     0.

In [3]:
# Shape check on the extracted features: the row count is expected to equal the
# first dimension of the prepared feature tensor, and the column count the
# model's latent_dim (32).
feature_df.shape

(1951, 32)

### Transformer - Encoder feature engine