In [None]:
import pandas as pd
import numpy as np

# Load the feature/tag table. It is expected to have a 'feature' name column
# plus one column per tag (assumed to be boolean or 0/1 flags — TODO confirm
# against features.csv).
features_df = pd.read_csv('features.csv')

# Features that were observed to contain NaN values in the time series
nan_features = ['feature_00', 'feature_01', 'feature_02', 'feature_03', 'feature_04', 'feature_73', 'feature_74']

# Rows of the tag table that belong to those features
nan_patterns = features_df[features_df['feature'].isin(nan_features)]
print("Tag patterns for features that show NaN values:")
print(nan_patterns)

# Per tag column, count how many of the NaN features have the tag set.
# The name column is dropped by label so the count does not depend on
# 'feature' happening to be the first column of the CSV.
print("\nCount of True values for each tag in NaN features:")
true_counts = nan_patterns.drop(columns='feature').sum()
print(true_counts[true_counts > 0])  # Only show tags that are set for at least one of these features

# For comparison: tag rows of some features that do not show NaN values
non_nan_features = ['feature_05', 'feature_06', 'feature_07', 'feature_08', 'feature_09']
non_nan_patterns = features_df[features_df['feature'].isin(non_nan_features)]
print("\nTag patterns for features that don't show NaN values:")
print(non_nan_patterns)

In [None]:
import pandas as pd
from methods import prepare_regression_data, train_model, evaluate_model, plot_separate_timeseries, create_timeseries_for_symbol

# Demo pipeline: one partition -> one symbol -> time series -> plot -> fit -> evaluate.
# All helpers come from the project-local `methods` module.
if __name__ == "__main__":
    # Read a single parquet partition and pick the first symbol id that appears in it
    df = pd.read_parquet("train.parquet/partition_id=0/part-0.parquet")
    symbol_id = df['symbol_id'].unique()[0]

    # Build the per-symbol series, then visualise them before any modelling
    features, responders, target = create_timeseries_for_symbol(df, symbol_id)
    plot_separate_timeseries(features, responders, target)

    # Turn the series into a design matrix / target pair
    X, y = prepare_regression_data(features, responders, target)

    # Fit the model; train_model also hands back the train/test split it used,
    # which is passed straight on to the evaluation helper
    model, X_train, X_test, y_train, y_test = train_model(X, y)
    evaluate_model(model, X_train, X_test, y_train, y_test)

In [None]:
from methods import train_xgboost_model

# Demo pipeline using train_xgboost_model, followed by a feature-importance ranking.
# NOTE: this cell uses pd, create_timeseries_for_symbol, prepare_regression_data and
# evaluate_model without importing them itself; it relies on an earlier cell having
# brought them into the kernel namespace.
if __name__ == "__main__":
    # Read a single parquet partition and pick the first symbol id that appears in it
    df = pd.read_parquet("train.parquet/partition_id=0/part-0.parquet")
    symbol_id = df['symbol_id'].unique()[0]

    # Per-symbol series -> design matrix / target pair
    features, responders, target = create_timeseries_for_symbol(df, symbol_id)
    X, y = prepare_regression_data(features, responders, target)

    # Fit and evaluate; the split returned by the trainer is reused for evaluation
    model, X_train, X_test, y_train, y_test = train_xgboost_model(X, y)
    evaluate_model(model, X_train, X_test, y_train, y_test)

    # Pair each column of X with the fitted model's importance score,
    # ordered from most to least important
    feature_importance = (
        pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_})
        .sort_values(by='importance', ascending=False)
    )

    print("\nTop 10 most important features:")
    print(feature_importance.head(10))