# Feature Engineering for Cryptocurrency Forecasting

This notebook demonstrates the feature engineering process for cryptocurrency price prediction.

In [None]:
# Import necessary libraries
import sys
import os
# Add the sibling src/ directory to the import path; presumably this is where the
# `data` and `utils` packages imported below live. The path is relative, so it is
# resolved against the notebook's current working directory.
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# including deprecation and numerical warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Import our modules
from data.data_loader import CryptoDataLoader
from data.feature_engineering import FeatureEngineer
from data.preprocessor import DataPreprocessor
from utils.helpers import calculate_returns, calculate_volatility

## 1. Load Data

In [None]:
# Initialize data loader (reads from the raw-data directory next to the notebook folder)
loader = CryptoDataLoader('../data/raw')

# Load Bitcoin data for the last year
ticker = 'BTC-USD'
data = loader.get_latest_data(ticker, days=365)

# Fail early with a clear message; otherwise the date-range line below dies
# with an opaque IndexError on data.index[0] when nothing was loaded.
if data is None or len(data) == 0:
    raise ValueError(f"No records loaded for {ticker}")

print(f"Loaded {len(data)} records for {ticker}")
print(f"Date range: {data.index[0].date()} to {data.index[-1].date()}")

# Display basic info
print("\nData Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
data.info()

# Display first few rows
print("\nFirst 5 rows:")
print(data.head())

## 2. Basic Price Features

In [None]:
# Build the engineered frame from a copy so the raw `data` frame is left untouched.
# Each new column is derived only from the original OHLC columns.
df = data.copy().assign(
    Price_Change=lambda frame: frame['Close'].pct_change(),
    Log_Return=lambda frame: np.log(frame['Close'] / frame['Close'].shift(1)),
    High_Low_Ratio=lambda frame: frame['High'] / frame['Low'],
    Close_Open_Ratio=lambda frame: frame['Close'] / frame['Open'],
)

# One panel per feature: (column, panel title, y-axis label), laid out row by row
price_panels = [
    ('Price_Change', 'Daily Price Change', 'Price Change'),
    ('Log_Return', 'Log Returns', 'Log Return'),
    ('High_Low_Ratio', 'High-Low Ratio', 'Ratio'),
    ('Close_Open_Ratio', 'Close-Open Ratio', 'Ratio'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# axes.flat walks the 2x2 grid in row-major order, matching the panel list above
for panel, (column, title, ylabel) in zip(axes.flat, price_panels):
    panel.plot(df.index, df[column])
    panel.set_title(title)
    panel.set_ylabel(ylabel)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Moving Averages

In [None]:
# Simple and exponential moving averages of the close for each look-back length
for span in (5, 10, 20, 50):
    close_series = df['Close']
    df[f'SMA_{span}'] = close_series.rolling(window=span).mean()
    df[f'EMA_{span}'] = close_series.ewm(span=span).mean()

# Overlay a few of the averages on the close price
ma_fig, ma_ax = plt.subplots(figsize=(14, 7))
ma_ax.plot(df.index, df['Close'], label='Close Price', linewidth=2, color='black')

overlays = [
    ('SMA_20', '20-day SMA'),
    ('EMA_20', '20-day EMA'),
    ('SMA_50', '50-day SMA'),
]
for column, legend_label in overlays:
    ma_ax.plot(df.index, df[column], label=legend_label, linewidth=2)

ma_ax.set_title('Bitcoin Price with Moving Averages', fontsize=16)
ma_ax.set_xlabel('Date', fontsize=12)
ma_ax.set_ylabel('Price (USD)', fontsize=12)
ma_ax.legend(fontsize=12)
ma_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Relative gap between a faster and a slower SMA: positive when the faster
# average sits above the slower one, negative when it sits below.
# Each entry: (fast column, slow column, output column, panel title)
crossovers = [
    ('SMA_5', 'SMA_20', 'SMA_5_20_Cross', 'SMA 5-20 Crossover'),
    ('SMA_20', 'SMA_50', 'SMA_20_50_Cross', 'SMA 20-50 Crossover'),
]

for fast_col, slow_col, cross_col, _ in crossovers:
    df[cross_col] = df[fast_col] / df[slow_col] - 1

# One stacked panel per crossover, each with a dashed zero line
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

for panel, (_, _, cross_col, title) in zip(axes, crossovers):
    panel.plot(df.index, df[cross_col])
    panel.axhline(y=0, color='red', linestyle='--', alpha=0.5)
    panel.set_title(title)
    panel.set_ylabel('Crossover Value')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Momentum Indicators

In [None]:
# Initialize feature engineer
engineer = FeatureEngineer()

# Add technical indicators using our feature engineer.
# Note: this is fed the raw `data` frame, not the `df` frame that carries the
# manually built features from the previous sections.
df_with_indicators = engineer.add_technical_indicators(data)

# Display momentum indicators
momentum_cols = ['RSI', 'Stoch_K', 'Stoch_D', 'Williams_R', 'UO']
print("Momentum Indicators:")
print(df_with_indicators[momentum_cols].describe())

# Horizontal reference lines per indicator: (upper level drawn in red, lower level drawn in green)
reference_levels = {
    'RSI': (70, 30),
    'Stoch_K': (80, 20),
    'Stoch_D': (80, 20),
}

# Plot momentum indicators
fig, axes = plt.subplots(3, 2, figsize=(15, 12))
axes = axes.flatten()

for ax, col in zip(axes, momentum_cols):
    ax.plot(df_with_indicators.index, df_with_indicators[col])
    ax.set_title(col)
    ax.set_ylabel('Value')
    ax.grid(True, alpha=0.3)

    # Add reference lines if applicable
    if col in reference_levels:
        upper, lower = reference_levels[col]
        ax.axhline(y=upper, color='red', linestyle='--', alpha=0.5)
        ax.axhline(y=lower, color='green', linestyle='--', alpha=0.5)

# The 3x2 grid has six panels but there are only five indicators; hide the
# leftover panel so it does not render as an empty frame.
for ax in axes[len(momentum_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 5. Trend Indicators

In [None]:
# Summary statistics for the trend-indicator columns
trend_cols = ['MACD', 'MACD_Signal', 'MACD_Diff', 'ADX', 'CCI']
print("Trend Indicators:")
print(df_with_indicators[trend_cols].describe())

# MACD chart: two lines plus the difference series drawn as translucent bars
macd_fig, macd_ax = plt.subplots(figsize=(14, 7))

for column, legend_label in (('MACD', 'MACD'), ('MACD_Signal', 'Signal')):
    macd_ax.plot(df_with_indicators.index, df_with_indicators[column],
                 label=legend_label, linewidth=2)
macd_ax.bar(df_with_indicators.index, df_with_indicators['MACD_Diff'],
            alpha=0.3, label='Difference')

macd_ax.set_title('MACD Indicator', fontsize=16)
macd_ax.set_xlabel('Date', fontsize=12)
macd_ax.set_ylabel('Value', fontsize=12)
macd_ax.legend(fontsize=12)
macd_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Two stacked panels: ADX on top, CCI underneath
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
adx_ax, cci_ax = axes

# ADX with a single labelled threshold line at 25
adx_ax.plot(df_with_indicators.index, df_with_indicators['ADX'])
adx_ax.axhline(y=25, color='red', linestyle='--', alpha=0.5, label='Trend Threshold')
adx_ax.set(title='ADX (Average Directional Index)', ylabel='ADX Value')
adx_ax.legend()
adx_ax.grid(True, alpha=0.3)

# CCI with dashed reference lines at +100 (red) and -100 (green)
cci_ax.plot(df_with_indicators.index, df_with_indicators['CCI'])
for level, line_color in ((100, 'red'), (-100, 'green')):
    cci_ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.5)
cci_ax.set(title='CCI (Commodity Channel Index)', ylabel='CCI Value')
cci_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Volatility Indicators

In [None]:
# Close price with the three Bollinger band lines and a shaded channel
bb_fig, bb_ax = plt.subplots(figsize=(14, 7))

bb_ax.plot(df_with_indicators.index, df_with_indicators['Close'],
           label='Close Price', linewidth=2, color='black')

band_lines = [
    ('BB_Upper', 'Upper Band'),
    ('BB_Middle', 'Middle Band'),
    ('BB_Lower', 'Lower Band'),
]
for column, legend_label in band_lines:
    bb_ax.plot(df_with_indicators.index, df_with_indicators[column],
               label=legend_label, linewidth=2)

# Shade the region between the upper and lower band
bb_ax.fill_between(
    df_with_indicators.index,
    df_with_indicators['BB_Upper'],
    df_with_indicators['BB_Lower'],
    color='gray',
    alpha=0.2,
)

bb_ax.set_title('Bollinger Bands', fontsize=16)
bb_ax.set_xlabel('Date', fontsize=12)
bb_ax.set_ylabel('Price (USD)', fontsize=12)
bb_ax.legend(fontsize=12)
bb_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Two stacked panels: band width on top, band position underneath
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
width_ax, position_ax = axes

# Band width, no reference lines
width_ax.plot(df_with_indicators.index, df_with_indicators['BB_Width'])
width_ax.set(title='Bollinger Band Width', ylabel='Width')
width_ax.grid(True, alpha=0.3)

# Band position with labelled dashed thresholds at 0.8 and 0.2
position_ax.plot(df_with_indicators.index, df_with_indicators['BB_Position'])
thresholds = ((0.8, 'red', 'Overbought'), (0.2, 'green', 'Oversold'))
for level, line_color, legend_label in thresholds:
    position_ax.axhline(y=level, color=line_color, linestyle='--', alpha=0.5,
                        label=legend_label)
position_ax.set(title='Bollinger Band Position', ylabel='Position')
position_ax.legend()
position_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Volume Indicators

In [None]:
# Plot volume indicators
volume_cols = ['Volume', 'Volume_SMA', 'Volume_Ratio', 'OBV', 'VWAP']

# Only plot columns that the feature engineer actually produced, and say which
# ones were skipped instead of silently leaving a blank panel in their place.
available_volume_cols = [col for col in volume_cols if col in df_with_indicators.columns]
missing_volume_cols = [col for col in volume_cols if col not in df_with_indicators.columns]
if missing_volume_cols:
    print(f"Skipping volume columns not present in the data: {missing_volume_cols}")

fig, axes = plt.subplots(3, 2, figsize=(15, 12))
axes = axes.flatten()

for ax, col in zip(axes, available_volume_cols):
    ax.plot(df_with_indicators.index, df_with_indicators[col])
    ax.set_title(col)
    ax.set_ylabel('Value')
    ax.grid(True, alpha=0.3)

# The 3x2 grid has more panels than there are series; hide whatever is left
# over so the figure does not end with empty frames.
for ax in axes[len(available_volume_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 8. Statistical Features

In [None]:
# One stacked panel per rolling statistic; a column that is absent from the
# frame leaves its panel empty.
stat_cols = ['Volatility', 'Skewness', 'Kurtosis']

fig, axes = plt.subplots(3, 1, figsize=(14, 12))

for panel, col in zip(axes, stat_cols):
    if col not in df_with_indicators.columns:
        continue

    panel.plot(df_with_indicators.index, df_with_indicators[col])
    panel.set_title(f'{col} (20-day rolling)')
    panel.set_ylabel('Value')
    panel.grid(True, alpha=0.3)

    # Skewness alone gets a dashed zero reference line
    if col == 'Skewness':
        panel.axhline(y=0, color='red', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

## 9. Time-Based Features

In [None]:
# Analyze time-based patterns: mean log return grouped by calendar feature.
# NOTE(review): this reads 'Log_Return' from df_with_indicators, which was built
# from the raw `data` frame; confirm FeatureEngineer adds that column.
time_cols = ['Day_of_Week', 'Month', 'Quarter']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Tick labels are derived from the group keys actually present rather than
# from fixed-length lists applied by position. With a short history (missing
# weekdays, months or quarters) positional labels would be wrong or would make
# set_xticklabels fail on a tick/label count mismatch.
# Assumes pandas calendar conventions: Day_of_Week 0=Mon..6=Sun, Month 1..12,
# Quarter 1..4 -- TODO confirm against FeatureEngineer.

# Day of Week
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
day_returns = df_with_indicators.groupby('Day_of_Week')['Log_Return'].mean()
day_labels = [day_names[int(key)] for key in day_returns.index]
axes[0].bar(range(len(day_returns)), day_returns.values)
axes[0].set_xticks(range(len(day_returns)))
axes[0].set_xticklabels(day_labels)
axes[0].set_title('Average Returns by Day of Week')
axes[0].set_ylabel('Average Return')
axes[0].grid(True, alpha=0.3)

# Month
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_returns = df_with_indicators.groupby('Month')['Log_Return'].mean()
month_labels = [month_names[int(key) - 1] for key in month_returns.index]
axes[1].bar(range(len(month_returns)), month_returns.values)
axes[1].set_xticks(range(len(month_returns)))
axes[1].set_xticklabels(month_labels, rotation=45)
axes[1].set_title('Average Returns by Month')
axes[1].set_ylabel('Average Return')
axes[1].grid(True, alpha=0.3)

# Quarter
quarter_returns = df_with_indicators.groupby('Quarter')['Log_Return'].mean()
quarter_labels = [f'Q{int(key)}' for key in quarter_returns.index]
axes[2].bar(range(len(quarter_returns)), quarter_returns.values)
axes[2].set_xticks(range(len(quarter_returns)))
axes[2].set_xticklabels(quarter_labels)
axes[2].set_title('Average Returns by Quarter')
axes[2].set_ylabel('Average Return')
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 10. Feature Correlation Analysis

In [None]:
# Calculate correlation with target.
# Only numeric columns are kept: correlation is undefined for non-numeric data,
# and feature_cols is reused further down as model input, which must be numeric too.
numeric_cols = df_with_indicators.select_dtypes(include=np.number).columns
feature_cols = [col for col in numeric_cols if col != 'Close']
correlations = df_with_indicators[feature_cols].corrwith(df_with_indicators['Close'])

# Get top correlated features: rank by absolute strength, but keep the signed
# coefficient so negative relationships stay visible in the table and the chart.
strongest = correlations.abs().sort_values(ascending=False).head(20).index
top_correlations = correlations.loc[strongest]
print("Top 20 Features Correlated with Close Price:")
print(top_correlations)

# Plot top correlations (bars below zero are negative correlations)
plt.figure(figsize=(12, 8))
top_correlations.plot(kind='bar')
plt.title('Top 20 Features Correlated with Close Price')
plt.ylabel('Correlation Coefficient')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations among the features ranked in the previous cell
top_features = list(top_correlations.index)
correlation_matrix = df_with_indicators[top_features].corr()

# Annotated heatmap, colour scale centred on zero
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    fmt='.2f',
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix of Top Features', fontsize=16)
plt.tight_layout()
plt.show()

## 11. Feature Selection

In [None]:
from collections import Counter

# Run each selection strategy and remember what it picked
selection_methods = ['correlation', 'mutual_info', 'variance']
selected_features = {}

for method in selection_methods:
    picked = engineer.select_features(df_with_indicators, method=method, top_k=30)
    selected_features[method] = picked
    print(f"\n{method.title()} method selected {len(picked)} features:")
    print(picked[:10])  # Show first 10

# Tally how many strategies chose each feature
feature_counts = Counter(
    name
    for picked in selected_features.values()
    for name in picked
)

# Features chosen most often across strategies, as (feature, count) pairs
common_features = feature_counts.most_common(20)
print("\nMost Commonly Selected Features:")
for name, n_methods in common_features:
    print(f"{name}: {n_methods} methods")

## 12. Feature Importance Analysis

In [None]:
# Rank features by Random Forest importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Drop every row that has a missing value, then split into inputs and target
df_clean = df_with_indicators.dropna()
X, y = df_clean[feature_cols], df_clean['Close']

# Standardise the inputs
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit the forest (fixed seed so the ranking is reproducible)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_scaled, y)

# Pair each feature name with its importance, most important first
importance = rf.feature_importances_
feature_importance = (
    pd.DataFrame({'feature': feature_cols, 'importance': importance})
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the 20 most important features, largest at the top
top_20 = feature_importance.head(20)
importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
importance_ax.barh(top_20['feature'], top_20['importance'])
importance_ax.set_title('Top 20 Feature Importance (Random Forest)')
importance_ax.set_xlabel('Importance')
importance_ax.invert_yaxis()
importance_ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 13. Create Sequences for Modeling

In [None]:
# Create sequences using selected features.
# common_features comes from Counter.most_common(), so each entry is a
# (feature, count) pair; keep only the feature names of the top 15 so that
# engineer.feature_columns holds column names rather than tuples.
selected_feature_list = [feature for feature, _ in common_features[:15]]
engineer.feature_columns = selected_feature_list

# Create sequences with different window sizes
window_sizes = [7, 14, 30]

for window_size in window_sizes:
    # X and y are rebound on every iteration; after the loop they hold the
    # sequences for the last window size.
    X, y = engineer.create_sequences(
        df_with_indicators,
        sequence_length=window_size,
        prediction_horizon=1
    )

    print(f"\nWindow Size: {window_size}")
    print(f"X shape: {X.shape}")
    print(f"y shape: {y.shape}")
    print(f"Sample X[0]:\n{X[0][:3]}")  # Show first 3 timesteps
    print(f"Sample y[0]: {y[0]}")

## 14. Summary of Feature Engineering

### Key Features Created:

1. **Price-based Features**: Price changes, log returns, high-low ratios, close-open ratios
2. **Moving Averages**: SMA and EMA for multiple windows
3. **Momentum Indicators**: RSI, Stochastic, Williams %R, Ultimate Oscillator
4. **Trend Indicators**: MACD, ADX, CCI, DMI
5. **Volatility Indicators**: Bollinger Bands, ATR, Keltner Channels
6. **Volume Indicators**: OBV, VWAP, Volume ratios
7. **Statistical Features**: Rolling volatility, skewness, kurtosis
8. **Time-based Features**: Day of week, month, quarter
9. **Lag Features**: Previous values for different time lags

### Feature Selection Methods:

1. **Correlation-based**: Select features highly correlated with target
2. **Mutual Information**: Select features with high mutual information
3. **Variance-based**: Select features with high variance
4. **Tree-based**: Use Random Forest feature importance

### Key Insights:

1. **Lag Features**: Recent prices and returns are most predictive
2. **Moving Averages**: Medium-term trends (20-50 days) are important
3. **Volatility**: Recent volatility is a strong predictor
4. **Volume**: Volume-related features provide additional information
5. **Technical Indicators**: RSI and MACD are among the most useful

These features will be used to train our deep learning models for cryptocurrency price prediction.