# NoIQTrader - Phase 2: Machine Learning Modeling & Prediction

This notebook implements machine learning models to predict Bitcoin trading signals based on technical indicators.

## Objectives:
1. Define target variable: Buy/Sell/Hold signals based on next-day returns
2. Train multiple ML models (Logistic Regression, Random Forest, optional LSTM)
3. Evaluate model performance with comprehensive metrics
4. Generate next-day trading recommendations with confidence scores

## Trading Signal Definition:
- **Buy (1)**: Next-day return > +1%
- **Sell (-1)**: Next-day return < -1%
- **Hold (0)**: Next-day return between -1% and +1%

## 1. Import Libraries and Setup

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.metrics import classification_report, confusion_matrix
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

# Custom modules
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))
from ml_models import TradingSignalPredictor

# Display settings
pd.set_option('display.max_columns', None)

# The 'seaborn-v0_8' style name only exists in matplotlib >= 3.6; older
# releases ship the same style sheet as plain 'seaborn'. Fall back instead of
# aborting the whole notebook on an older install.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
sns.set_palette("husl")

print(" Libraries imported successfully!")
print(f"Analysis date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 2. Load Data and Create Target Variable

In [None]:
# Point the predictor at the feature-engineered BTC dataset
predictor = TradingSignalPredictor(data_path='../data/btc_featured_data.csv')

# Read the dataset through the predictor
print(" Loading Bitcoin dataset with engineered features...")
data = predictor.load_data()

# Headline facts about the loaded frame: size, covered dates, last close
print(f"\n Dataset Overview:")
print(f"Shape: {data.shape}")
first_day = data.index[0].date()
last_day = data.index[-1].date()
print(f"Date Range: {first_day} to {last_day}")
n_columns = len(data.columns)
print(f"Features: {n_columns}")
latest_close = data['Close'].iloc[-1]
print(f"Current BTC Price: ${latest_close:,.2f}")

# Preview the last rows for a handful of indicator columns, silently
# skipping any name this dataset does not contain
print("\n Sample of Original Features:")
sample_cols = ['Close', 'MA10', 'MA50', 'RSI', 'MACD', 'BB_position', 'volatility_20d']
available_cols = list(filter(lambda col: col in data.columns, sample_cols))
display(data[available_cols].tail())

In [None]:
# Create target variable based on next-day returns
print(" Creating trading signal target variable...")
data_with_target = predictor.create_target_variable(return_threshold=0.01)  # 1% threshold

# Visualize target distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Bar plot of target distribution.
# Reindex the counts onto the fixed class list so that bars, tick labels and
# colours always line up: value_counts() omits classes that never occur, which
# would otherwise shift the labels or make set_xticklabels() fail.
signal_classes = [-1, 0, 1]
target_labels = ['Sell (-1)', 'Hold (0)', 'Buy (1)']
colors = ['red', 'gray', 'green']
target_counts = (
    data_with_target['target']
    .value_counts()
    .reindex(signal_classes, fill_value=0)
)

bar_positions = range(len(signal_classes))
axes[0].bar(bar_positions, target_counts.values, color=colors, alpha=0.7)
axes[0].set_xticks(bar_positions)
axes[0].set_xticklabels(target_labels)
axes[0].set_title('Trading Signal Distribution', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Count')
axes[0].grid(True, alpha=0.3)

# Add count and percentage labels above each bar
total = target_counts.sum()
for i, count in enumerate(target_counts.values):
    # Guard against an empty target column so the label is still printable
    pct = count / total * 100 if total else 0.0
    axes[0].text(i, count + 10, f'{count}\n({pct:.1f}%)', ha='center', va='bottom', fontweight='bold')

# Time series of next-day returns, with the +/-1% signal thresholds marked
returns = data_with_target['next_day_return'] * 100  # Convert to percentage
axes[1].plot(data_with_target.index, returns, alpha=0.7, linewidth=0.8, color='blue')
axes[1].axhline(y=1, color='green', linestyle='--', alpha=0.7, label='Buy Threshold (+1%)')
axes[1].axhline(y=-1, color='red', linestyle='--', alpha=0.7, label='Sell Threshold (-1%)')
axes[1].axhline(y=0, color='gray', linestyle='-', alpha=0.5)
axes[1].set_title('Next-Day Returns Over Time', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Next-Day Return (%)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print statistics
print(f"\n Target Variable Statistics:")
print(f"Total samples: {len(data_with_target):,}")
for class_val, label in zip(signal_classes, ['Sell', 'Hold', 'Buy']):
    count = (data_with_target['target'] == class_val).sum()
    pct = count / len(data_with_target) * 100
    print(f"  {label}: {count:,} ({pct:.1f}%)")

print(f"\n Next-Day Returns Statistics:")
print(f"Mean: {returns.mean():.3f}%")
print(f"Std: {returns.std():.3f}%")
print(f"Min: {returns.min():.2f}%")
print(f"Max: {returns.max():.2f}%")

## 3. Feature Preparation and Data Splitting

In [None]:
# Build the model inputs (X) and labels (y) via the predictor
print(" Preparing features for modeling...")
X, y = predictor.prepare_features()

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"Features used: {len(predictor.feature_names)}")


def _feature_type(feat):
    """Bucket a feature name into a coarse category by substring match.

    Rules are tried top to bottom and the first hit wins, so a name that
    contains both 'RSI' and 'lag' is counted as a technical indicator.
    """
    if any(tag in feat for tag in ['MA', 'RSI', 'MACD', 'BB']):
        return 'Technical Indicator'
    if 'volatility' in feat or 'ATR' in feat:
        return 'Volatility'
    if 'lag' in feat:
        return 'Lag Feature'
    if any(tag in feat for tag in ['ratio', 'change', 'return']):
        return 'Price Feature'
    return 'Other'


# Tabulate each feature name next to its category
print("\n Features included in modeling:")
feature_df = pd.DataFrame({
    'Feature': predictor.feature_names,
    'Type': list(map(_feature_type, predictor.feature_names)),
})

feature_type_counts = feature_df['Type'].value_counts()
print(feature_type_counts)

# For the three most frequent categories, list up to five example features
for feature_type in feature_type_counts.index[:3]:
    of_this_type = feature_df['Type'] == feature_type
    features = feature_df.loc[of_this_type, 'Feature'].head(5).tolist()
    print(f"\n{feature_type} examples: {', '.join(features)}")

In [None]:
# Split data into train/test sets (time-series aware)
print(" Splitting data for training and testing...")
predictor.split_data(X, y, test_size=0.2)

print(f"\n Data split completed:")
print(f"Training set: {predictor.X_train.shape[0]:,} samples")
print(f"Test set: {predictor.X_test.shape[0]:,} samples")

# Visualize the train/test split.
# NOTE(review): the plot treats the first len(X_train) rows as the training
# period — presumably split_data() keeps chronological order; confirm in
# ml_models.
split_point = len(predictor.X_train)

# A positional offset into data_with_target is only correct when X kept
# exactly the same rows. If rows were dropped while building X and X still
# carries an index that exists in data_with_target, plot against X's own rows
# so the boundary lands on the right date.
feature_index = getattr(X, 'index', None)
if (
    isinstance(feature_index, pd.Index)
    and len(X) != len(data_with_target)
    and feature_index.isin(data_with_target.index).all()
):
    split_frame = data_with_target.loc[feature_index]
else:
    split_frame = data_with_target

dates = split_frame.index
prices = split_frame['Close']

fig, ax = plt.subplots(figsize=(12, 6))

# Plot train and test periods (positional slices on both axes)
ax.plot(dates[:split_point], prices.iloc[:split_point], label='Training Data', color='blue', alpha=0.7)
ax.plot(dates[split_point:], prices.iloc[split_point:], label='Test Data', color='red', alpha=0.7)

# Only mark and describe the boundary when both periods are non-empty;
# indexing dates[split_point] would otherwise raise IndexError.
has_both_periods = 0 < split_point < len(dates)
if has_both_periods:
    ax.axvline(x=dates[split_point], color='black', linestyle='--', alpha=0.8, label='Train/Test Split')
ax.set_title('Train/Test Split Visualization', fontsize=14, fontweight='bold')
ax.set_ylabel('BTC Price ($)')
ax.set_xlabel('Date')
ax.legend()
ax.grid(True, alpha=0.3)

plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

if has_both_periods:
    train_date_start = dates[0].date()
    train_date_end = dates[split_point-1].date()
    test_date_start = dates[split_point].date()
    test_date_end = dates[-1].date()

    print(f"\n Training Period: {train_date_start} to {train_date_end}")
    print(f" Testing Period: {test_date_start} to {test_date_end}")
else:
    print("\n Train/test boundary falls outside the plotted data; period summary skipped")

## 4. Model Training and Evaluation

In [None]:
# Baseline model: fit Logistic Regression, then report its headline metrics
print(" Training Logistic Regression model...")
lr_results = predictor.train_logistic_regression()

print(f"\n Logistic Regression Results:")
lr_metrics = lr_results['metrics']
# (printed label, key in the metrics dict), in display order
for label, key in [
    ('Training Accuracy', 'train_accuracy'),
    ('Test Accuracy', 'test_accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1 Score', 'f1_score'),
]:
    print(f"{label}: {lr_metrics[key]:.3f}")

In [None]:
# Fit the Random Forest (200 trees, depth capped at 10) and report its metrics
print(" Training Random Forest model...")
rf_results = predictor.train_random_forest(n_estimators=200, max_depth=10)

print(f"\n Random Forest Results:")
rf_metrics = rf_results['metrics']
# (printed label, key in the metrics dict), in display order
for label, key in [
    ('Training Accuracy', 'train_accuracy'),
    ('Test Accuracy', 'test_accuracy'),
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1 Score', 'f1_score'),
]:
    print(f"{label}: {rf_metrics[key]:.3f}")

# Show the first ten rows of the importance table
# (presumably already sorted most-important-first — verify in ml_models)
print(f"\n Top 10 Most Important Features:")
top_features = rf_results['feature_importance'].head(10)
display(top_features)

In [None]:
# Horizontal bar chart for the first 20 rows of the Random Forest importance
# table (presumably sorted most-important-first — verify in ml_models)
fig, ax = plt.subplots(figsize=(12, 8))

top_20_features = rf_results['feature_importance'].head(20)
n_bars = len(top_20_features)
bar_slots = range(n_bars)
# One viridis shade per bar
colors = plt.cm.viridis(np.linspace(0, 1, n_bars))

bars = ax.barh(bar_slots, top_20_features['importance'], color=colors)
ax.set_yticks(bar_slots)
ax.set_yticklabels(top_20_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 20 Feature Importances (Random Forest)', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')

# Write each bar's value just past its right-hand end
for bar in bars:
    bar_len = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    ax.text(bar_len + 0.001, mid_y, f'{bar_len:.3f}', ha='left', va='center', fontsize=8)

plt.tight_layout()
plt.show()

# Textual summary of the leading features
print("\n Feature Importance Insights:")
leader = top_20_features.iloc[0]
print(f"Most important feature: {leader['feature']} ({leader['importance']:.3f})")
top10_total = top_20_features.head(10)['importance'].sum()
print(f"Total importance of top 10 features: {top10_total:.3f}")


def _importance_group(name):
    """Assign a feature name to a coarse category; the first matching rule wins."""
    if any(indicator in name for indicator in ['MA', 'RSI', 'MACD', 'BB']):
        return 'Technical Indicator'
    if 'volatility' in name or 'ATR' in name:
        return 'Volatility'
    if 'lag' in name:
        return 'Lag Feature'
    if any(price_feat in name for price_feat in ['ratio', 'change', 'return']):
        return 'Price Feature'
    return 'Other'


# Sum importance per category on a copy, leaving rf_results untouched
feature_importance_df = rf_results['feature_importance'].copy()
feature_importance_df['type'] = feature_importance_df['feature'].map(_importance_group)

summed_by_type = feature_importance_df.groupby('type')['importance'].sum()
importance_by_type = summed_by_type.sort_values(ascending=False)
print(f"\nImportance by feature type:")
for feat_type, importance in importance_by_type.items():
    print(f"  {feat_type}: {importance:.3f}")

In [None]:
# Optional model: attempt the LSTM, but never let a failure stop the notebook
try:
    print(" Attempting to train LSTM model...")
    lstm_results = predictor.train_lstm(sequence_length=30, epochs=20, hidden_size=32)

    if not lstm_results:
        # A falsy result means training did not produce a model
        print(" LSTM training failed or PyTorch not available")
    else:
        print(f"\n LSTM Results:")
        lstm_metrics = lstm_results['metrics']
        # (printed label, key in the metrics dict), in display order
        for label, key in [
            ('Training Accuracy', 'train_accuracy'),
            ('Test Accuracy', 'test_accuracy'),
            ('Precision', 'precision'),
            ('Recall', 'recall'),
            ('F1 Score', 'f1_score'),
        ]:
            print(f"{label}: {lstm_metrics[key]:.3f}")

except Exception as e:
    # Deliberately broad: any LSTM problem is reported and the run continues
    print(f" LSTM training failed: {str(e)}")
    print("Continuing with Logistic Regression and Random Forest models...")

## 5. Model Comparison and Performance Analysis

In [None]:
# Collect the trained models' metrics into one comparison table
print(" Comparing model performances...")
comparison_df = predictor.compare_models()

print("\n Model Performance Comparison:")
display(comparison_df)

# Two panels: grouped accuracy bars (left) and a per-metric line plot (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
acc_ax, metric_ax = axes

# Left panel: train vs. test accuracy, one bar pair per model
x_pos = np.arange(len(comparison_df))
width = 0.35
half_width = width / 2
train_scores = comparison_df['Train Accuracy']
test_scores = comparison_df['Test Accuracy']

acc_ax.bar(x_pos - half_width, train_scores, width,
           label='Train Accuracy', alpha=0.8, color='skyblue')
acc_ax.bar(x_pos + half_width, test_scores, width,
           label='Test Accuracy', alpha=0.8, color='lightcoral')

acc_ax.set_xlabel('Models')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Model Accuracy Comparison', fontsize=14, fontweight='bold')
acc_ax.set_xticks(x_pos)
acc_ax.set_xticklabels(comparison_df['Model'], rotation=45)
acc_ax.legend()
acc_ax.grid(True, alpha=0.3)
acc_ax.set_ylim(0, 1)

# Print each bar's value just above it
for slot, (train_acc, test_acc) in enumerate(zip(train_scores, test_scores)):
    for x_at, score in ((slot - half_width, train_acc), (slot + half_width, test_acc)):
        acc_ax.text(x_at, score + 0.01, f'{score:.3f}', ha='center', va='bottom', fontsize=9)

# Right panel: one line per model across the four test metrics
metrics = ['Test Accuracy', 'Precision', 'Recall', 'F1 Score']
x_pos_metrics = np.arange(len(metrics))

for _, row in comparison_df.iterrows():
    values = [row[metric_name] for metric_name in metrics]
    metric_ax.plot(x_pos_metrics, values, marker='o', linewidth=2, label=row['Model'])

metric_ax.set_xlabel('Metrics')
metric_ax.set_ylabel('Score')
metric_ax.set_title('Model Metrics Comparison', fontsize=14, fontweight='bold')
metric_ax.set_xticks(x_pos_metrics)
metric_ax.set_xticklabels(metrics, rotation=45)
metric_ax.legend()
metric_ax.grid(True, alpha=0.3)
metric_ax.set_ylim(0, 1)

plt.tight_layout()
plt.show()

# The table's first row is treated as the winner.
# NOTE(review): presumably compare_models() returns rows sorted best-first —
# confirm in ml_models.
best_model_row = comparison_df.iloc[0]
# Display name -> model key: lower-case, spaces replaced by underscores
best_model_name = '_'.join(best_model_row['Model'].lower().split(' '))
print(f"\n Best performing model: {best_model_row['Model']}")
print(f"   Test Accuracy: {best_model_row['Test Accuracy']:.3f}")
print(f"   F1 Score: {best_model_row['F1 Score']:.3f}")

In [None]:
# Delegate confusion-matrix plotting to the predictor
print(" Generating confusion matrices...")
confusion_figsize = (15, 5)
predictor.plot_confusion_matrices(figsize=confusion_figsize)

In [None]:
# Detailed classification reports, one per trained model.
# Labels are passed explicitly so the three names always map to -1/0/1, even
# when a class is missing from the test targets or from a model's predictions.
class_names = ['Sell', 'Hold', 'Buy']
class_labels = [-1, 0, 1]

for model_name, model_info in predictor.models.items():
    print(f"\n Classification Report - {model_name.replace('_', ' ').title()}:")
    print("=" * 60)

    # Predictions this model stored for the test period
    y_pred_test = model_info['predictions']['test']
    y_true_test = predictor.y_test

    # A model may emit fewer predictions than there are test targets (the
    # LSTM, for instance, when it consumes leading rows as sequence context).
    # Such predictions cannot be paired with targets here, so skip the report
    # rather than score the predictions against themselves, which would
    # always look perfect.
    if len(y_pred_test) != len(y_true_test):
        print(f"Skipped: {len(y_pred_test)} predictions cannot be aligned "
              f"with {len(y_true_test)} test targets")
        continue

    try:
        report = classification_report(y_true_test, y_pred_test,
                                       labels=class_labels,
                                       target_names=class_names,
                                       zero_division=0)
        print(report)
    except (ValueError, TypeError) as e:
        # scikit-learn rejects inconsistent or mixed-type inputs with these
        print(f"Could not generate report: {e}")

## 6. Trading Signal Prediction

In [None]:
# Ask the top-listed model for a next-day recommendation and visualize it
print(" Making next-day trading prediction...")

# Model key = display name of the comparison table's first row, lower-cased
# with spaces turned into underscores
best_model_name = '_'.join(comparison_df.iloc[0]['Model'].lower().split(' '))
prediction_result = predictor.predict_next_action(best_model_name)

chosen_action = prediction_result['action']
confidence = prediction_result['confidence']
run_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

print(f"\n Trading Recommendation - {run_stamp}")
print("=" * 70)
print(f" Date: {prediction_result['date']}")
print(f" Current BTC Price: ${prediction_result['current_price']:,.2f}")
print(f" Model Used: {prediction_result['model_used'].replace('_', ' ').title()}")
print(f"\n RECOMMENDATION: {chosen_action.upper()}")
print(f" Confidence: {confidence:.1%}")

print(f"\n Action Probabilities:")
for action, prob in prediction_result['probabilities'].items():
    print(f"   {action}: {prob:.1%}")

# Left: probability per action. Right: last 30 closes with the call marked.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Sell is red and Hold is gray; anything else falls back to green
action_palette = {'Sell': 'red', 'Hold': 'gray'}
actions = list(prediction_result['probabilities'].keys())
probs = list(prediction_result['probabilities'].values())
colors = [action_palette.get(name, 'green') for name in actions]

bars = axes[0].bar(actions, probs, color=colors, alpha=0.7)
axes[0].set_title('Next-Day Action Probabilities', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Probability')
axes[0].set_ylim(0, 1)
axes[0].grid(True, alpha=0.3)

# Outline the bar that belongs to the recommended action
recommended_idx = actions.index(chosen_action)
recommended_bar = bars[recommended_idx]
recommended_bar.set_edgecolor('black')
recommended_bar.set_linewidth(3)

# Percentage label centred above every bar
for bar, prob in zip(bars, probs):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 0.01
    axes[0].text(label_x, label_y, f'{prob:.1%}', ha='center', va='bottom', fontweight='bold')

# Recent price trend, with the latest close coloured by the recommendation
recent_data = data_with_target.tail(30)
latest_day = recent_data.index[-1]
latest_close = recent_data['Close'].iloc[-1]
marker_color = action_palette.get(chosen_action, 'green')

axes[1].plot(recent_data.index, recent_data['Close'], linewidth=2, color='blue', alpha=0.8)
axes[1].scatter(latest_day, latest_close,
                color=marker_color,
                s=100, zorder=5, edgecolor='black', linewidth=2)

axes[1].set_title('Recent BTC Price Trend', fontsize=14, fontweight='bold')
axes[1].set_ylabel('BTC Price ($)')
axes[1].grid(True, alpha=0.3)
axes[1].tick_params(axis='x', rotation=45)

# Call-out box next to the latest point
axes[1].annotate(f'Prediction: {chosen_action}\nConfidence: {confidence:.1%}',
                 xy=(latest_day, latest_close),
                 xytext=(10, 10), textcoords='offset points',
                 bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7),
                 arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

plt.tight_layout()
plt.show()

## 7. Model Performance Analysis

In [None]:
# Compare stored test predictions with the actual signals over time
print(" Analyzing model predictions over test period...")

# Rows after the first len(X_train) positions are taken as the test period.
# NOTE(review): assumes data_with_target and X share the same row order and
# count — confirm against prepare_features()/split_data().
test_start_idx = len(predictor.X_train)
test_data = data_with_target.iloc[test_start_idx:].copy()

# Attach each model's predictions as a column; models whose prediction count
# does not match the test frame are left out
for model_name, model_info in predictor.models.items():
    predictions = model_info['predictions']['test']
    if len(predictions) != len(test_data):
        continue
    test_data[f'{model_name}_prediction'] = predictions

# Everything below needs the Random Forest column
rf_col = 'random_forest_prediction'
if rf_col in test_data.columns:
    # True where the Random Forest prediction equals the actual signal
    rf_hit = test_data['target'] == test_data[rf_col]

    # Accuracy per calendar month
    test_data['month'] = test_data.index.to_period('M')
    monthly_accuracy = rf_hit.groupby(test_data['month']).mean()

    print(f"\n Monthly Prediction Accuracy (Random Forest):")
    for month, accuracy in monthly_accuracy.items():
        print(f"   {month}: {accuracy:.3f}")

    # Top: running accuracy. Bottom: actual vs. predicted signals.
    fig, axes = plt.subplots(2, 1, figsize=(15, 10))

    # Running mean of the hit flags from the start of the test period
    test_data['correct_prediction'] = rf_hit.astype(int)
    test_data['cumulative_accuracy'] = test_data['correct_prediction'].expanding().mean()
    final_accuracy = test_data['cumulative_accuracy'].iloc[-1]

    axes[0].plot(test_data.index, test_data['cumulative_accuracy'], linewidth=2, color='blue')
    axes[0].axhline(y=final_accuracy, color='red', linestyle='--',
                    label=f'Final Accuracy: {final_accuracy:.3f}')
    axes[0].set_title('Cumulative Prediction Accuracy Over Time', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('Cumulative Accuracy')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)
    axes[0].set_ylim(0, 1)

    # Actual signals as dots on their level; predictions as crosses 0.1 above
    signal_colors = {-1: 'red', 0: 'gray', 1: 'green'}
    signal_names = {-1: 'Sell', 0: 'Hold', 1: 'Buy'}

    for signal, color in signal_colors.items():
        signal_name = signal_names[signal]
        actual_mask = test_data['target'] == signal
        pred_mask = test_data[rf_col] == signal

        if actual_mask.any():
            axes[1].scatter(test_data.index[actual_mask], [signal] * actual_mask.sum(),
                            color=color, alpha=0.6, s=30, label=f'Actual {signal_name}')

        if pred_mask.any():
            axes[1].scatter(test_data.index[pred_mask], [signal + 0.1] * pred_mask.sum(),
                            color=color, alpha=0.8, s=15, marker='x',
                            label=f'Predicted {signal_name}')

    axes[1].set_title('Actual vs Predicted Trading Signals', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('Signal Type')
    axes[1].set_yticks([-1, 0, 1])
    axes[1].set_yticklabels(['Sell', 'Hold', 'Buy'])
    axes[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 8. Summary and Conclusions

In [None]:
# Final summary: recap dataset, models, the current recommendation and
# insights derived from the results computed earlier in the notebook
print(" Phase 2 - Machine Learning Modeling Summary")
print("=" * 60)

print(f"\n Dataset Overview:")
print(f"   • Total samples: {len(data_with_target):,}")
print(f"   • Features used: {len(predictor.feature_names)}")
# Report the split as a share of the rows that were actually split, so the
# two percentages add up to 100% even if some rows never reached X.
n_train = len(predictor.X_train)
n_test = len(predictor.X_test)
n_modeled = n_train + n_test
train_share = n_train / n_modeled * 100 if n_modeled else 0.0
test_share = n_test / n_modeled * 100 if n_modeled else 0.0
print(f"   • Training samples: {n_train:,} ({train_share:.1f}%)")
print(f"   • Test samples: {n_test:,} ({test_share:.1f}%)")

print(f"\n Target Distribution:")
signal_names = {-1: 'Sell', 0: 'Hold', 1: 'Buy'}
target_dist = data_with_target['target'].value_counts().sort_index()
total_samples = len(data_with_target)
for signal, count in target_dist.items():
    # int() lets float-typed labels such as -1.0 resolve to a name too
    signal_name = signal_names.get(int(signal), str(signal))
    print(f"   • {signal_name}: {count:,} ({count/total_samples*100:.1f}%)")

print(f"\n Models Trained:")
for i, (_, row) in enumerate(comparison_df.iterrows(), 1):
    print(f"   {i}. {row['Model']}")
    print(f"      - Test Accuracy: {row['Test Accuracy']:.3f}")
    print(f"      - F1 Score: {row['F1 Score']:.3f}")

print(f"\n Best Model: {best_model_row['Model']}")
print(f"   • Test Accuracy: {best_model_row['Test Accuracy']:.3f}")
print(f"   • Precision: {best_model_row['Precision']:.3f}")
print(f"   • Recall: {best_model_row['Recall']:.3f}")
print(f"   • F1 Score: {best_model_row['F1 Score']:.3f}")

print(f"\n Current Prediction:")
print(f"   • Action: {prediction_result['action']}")
print(f"   • Confidence: {prediction_result['confidence']:.1%}")
print(f"   • Current BTC Price: ${prediction_result['current_price']:,.2f}")

print(f"\n Key Insights:")
if rf_results:
    top_feature = rf_results['feature_importance'].iloc[0]
    print(f"   • Most important feature: {top_feature['feature']} ({top_feature['importance']:.3f})")

# Derive the balance statement from the data instead of asserting it.
# Heuristic: "reasonably balanced" when no single class exceeds half the rows.
labelled_total = target_dist.sum()
if labelled_total > 0:
    largest_share = target_dist.max() / labelled_total
    if largest_share <= 0.5:
        print(f"   • Class distribution is reasonably balanced")
    else:
        print(f"   • Class distribution is imbalanced (largest class: {largest_share:.1%})")

# Compare the two always-trained models on their measured test accuracy
# rather than hard-coding which one won.
rf_test_acc = rf_results['metrics']['test_accuracy']
lr_test_acc = lr_results['metrics']['test_accuracy']
if rf_test_acc > lr_test_acc:
    print(f"   • Random Forest outperformed Logistic Regression")
elif rf_test_acc < lr_test_acc:
    print(f"   • Logistic Regression outperformed Random Forest")
else:
    print(f"   • Random Forest and Logistic Regression reached the same test accuracy")

# NOTE(review): 0.4 is a fixed cut-off; with three classes it may sit below
# the majority-class baseline — consider comparing against that instead.
if best_model_row['Test Accuracy'] > 0.4:
    print(f"   • Model shows promising predictive capability")
else:
    print(f"   • Model performance suggests market prediction is challenging")

print(f"\n Next Steps:")
print(f"   • Implement paper trading simulation")
print(f"   • Create real-time prediction dashboard")
print(f"   • Explore additional features (sentiment, macro indicators)")
print(f"   • Implement ensemble methods")
print(f"   • Add risk management strategies")

print(f"\n Phase 2 Complete - Machine Learning Models Ready!")