### Load dataframe

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from datetime import datetime

# Read the raw tick CSV and discard every row that contains a missing value.
# NOTE(review): this is an absolute local path; it only resolves on the author's machine.
df = pd.read_csv("/Users/aleksandr/Desktop/Meta_Test.csv").dropna()

### Initial cleaning

In [2]:
from clean import preprocess_tick_data

# Run the project's tick-data preprocessing and unpack its three results
# (cleaned frame, diagnostics, and the outlier counter).
df_clean, df_diagnostics, outlier_counter = preprocess_tick_data(df)

# Carry on with the cleaned frame, minus its VOLATILITY column
# (presumably because volatility is re-estimated in the next step -- confirm).
df = df_clean.drop(columns=["VOLATILITY"])

Starting preprocessing with 570771 rows
After filtering trading hours: 282810 rows
After cleaning outliers: 282301 rows
Final clean dataset: 278585 rows

Outlier counts by detection method:
  zscore: 64
  extreme_deviation: 69
  isolated_point: 390
  price_reversal: 93
  timestamp_group: 34
  price_velocity: 3703
  suspicious_cluster: 52
  wavelet_outlier: 24


### Volatility estimation

In [3]:
from volatility import estimate_tick_volatility

# Estimate tick-level volatility with the 'wavelet' method; the returned frame replaces df.
df = estimate_tick_volatility(df, method="wavelet")

Estimating advanced tick-level volatility for 278585 ticks...
Computing wavelet-based volatility for META.O...
Completed advanced tick-level volatility estimation


In [4]:
# Drop the columns that are no longer needed and switch to title-case names.
# Written as one non-mutating chain instead of inplace=True, so the frame
# object produced by the previous cell is not modified in place; only the
# name `df` is rebound to the new result.
df = (
    df.drop(columns=["return", "SYMBOL"])
      .rename(columns={
          "wavelet_vol": "Volatility",
          "TIMESTAMP": "Timestamp",
          "VALUE": "Value",
          "VOLUME": "Volume",
      })
)
df.head()

Unnamed: 0,Timestamp,Value,Volume,Volatility
0,2025-01-30 09:30:00.740000+00:00,694.24,13.0,0.00026
1,2025-01-30 09:30:00.740000+00:00,694.17,15.0,0.00026
2,2025-01-30 09:30:00.740000+00:00,694.17,15.0,0.000261
3,2025-01-30 09:30:00.740000+00:00,694.11,8.0,0.000261
4,2025-01-30 09:30:00.740000+00:00,694.1,249.0,0.000261


### Transformer-encoder feature engineering

In [4]:
import torch
import pandas as pd
import numpy as np
from Feature_engineering.feature_model import (
    VolatilityRegimeFeatureExtractor,
    prepare_data_for_model,
    train_model
)

# 0. Reproducibility
# Seed the RNGs before the model is built so that weight initialisation and
# dropout are repeatable between runs (as far as the backend allows).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# 1. Load and prepare real data
# NOTE(review): this is an absolute local path; it only resolves on the author's machine.
df = pd.read_csv("/Users/aleksandr/Desktop/my_data.csv")

# Normalised row position in [0, 1), used below as the time feature.
# This assignment is unconditional: an existing 'time_idx' column is overwritten.
df['time_idx'] = np.arange(len(df)) / len(df)

# 2. Set model parameters
# The columns fed to the model; input_size is derived from this list.
feature_cols = ['Value', 'Volume', 'Volatility']
input_size = len(feature_cols)

model_params = {
    'input_size': input_size,     # Number of entries in feature_cols
    'context_length': 50,         # Also passed to prepare_data_for_model below
    'd_model': 64,
    'num_encoder_layers': 3,
    'num_attention_heads': 4,
    'dim_feedforward': 128,
    'dropout': 0.1,
    'attention_dropout': 0.1,
    'batch_size': 64,
    'learning_rate': 1e-4
}

# 3. Initialize model
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = VolatilityRegimeFeatureExtractor(**model_params).to(device)

# 4. Prepare data
time_feature_cols = ['time_idx']

train_data = prepare_data_for_model(
    df=df,
    feature_cols=feature_cols,
    context_length=model_params['context_length'],
    time_feature_cols=time_feature_cols
)

# 5. Train model
training_params = {
    'num_epochs': 10,
    'alpha': 0.1,
    'beta': 0.1,
    'gamma': 0.2,
    'verbose': True
}

history = train_model(
    model=model,
    train_data=train_data,
    **training_params
)

# 6. Extract features
# Put the model in evaluation mode first: if it is left in training mode,
# dropout layers stay active and the extracted features vary from call to call.
# no_grad() additionally avoids building an autograd graph over the whole dataset.
# NOTE(review): assumes the model is a torch.nn.Module (it is moved with
# .to(device) above); harmless if extract_features already does this itself.
model.eval()
with torch.no_grad():
    features = model.extract_features(
        values=train_data['values'].to(device),
        time_features=train_data['time_features'].to(device),
        attention_mask=train_data['attention_mask'].to(device)
    )

# Convert to numpy array (moved back to host memory first).
features_np = features.detach().cpu().numpy()

print("Input shape:", df.shape)
print("Features shape:", features_np.shape)

# 7. Save the trained model
#torch.save(model.state_dict(), 'volatility_regime_model.pth')

# 8. Optional: Save the extracted features
#features_df = pd.DataFrame(features_np)
#features_df.to_csv('extracted_features.csv', index=False)

Epoch 5/10 - Loss: 1.0488 - Recon: 1.4090 - Temp: 0.0397 - Div: -3.8874 - Vol: 0.1229
Epoch 10/10 - Loss: 0.9284 - Recon: 1.2970 - Temp: 0.0180 - Div: -3.8884 - Vol: 0.0921
Input shape: (2000, 5)
Features shape: (1951, 16)
