In [None]:
# Import libraries
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from src.data_loader import load_data, create_target_column, train_test_split_timeseries
from src.feature_engineering import prepare_features
from src.models import train_all_models, save_model
from src.evaluation import evaluate_model, compare_models, plot_confusion_matrix, plot_feature_importance

import warnings
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'.
# Fall back to the pre-3.6 name so this cell also runs on older installations
# instead of raising on an unknown style.
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
    darkgrid_style = 'seaborn-darkgrid'
plt.style.use(darkgrid_style)
sns.set_palette('husl')

print("Libraries imported successfully!")

## 1. Load and Prepare Data

In [None]:
# Read the raw CSV and pass it straight through create_target_column,
# then report the resulting frame shape.
print("Loading data...")
df = create_target_column(load_data('../data/raw/nifty_intraday.csv'))
print(f"Data loaded: {df.shape}")

## 2. Feature Engineering

In [None]:
# Run prepare_features over the loaded frame.
# NOTE: `df` is rebound to the result, so re-running this cell on its own
# feeds the already-engineered frame back into prepare_features.
print("Applying feature engineering...")
df = prepare_features(df)
engineered_shape = df.shape
print(f"Features created: {engineered_shape}")

## 3. Train-Test Split

In [None]:
# Split the engineered frame with train_ratio=0.7.
# (Presumably the earliest 70% of rows become the training set -- confirm in
# src.data_loader.train_test_split_timeseries.)
split_parts = train_test_split_timeseries(df, train_ratio=0.7)
X_train, X_test, y_train, y_test, test_df = split_parts

for caption, frame in (("\nTrain set", X_train), ("Test set", X_test)):
    print(f"{caption}: {frame.shape}")

# Keep the column order: the scaler returns plain arrays, so the feature
# importance cell needs these names to label its rows.
feature_names = X_train.columns.tolist()
n_features = len(feature_names)
print(f"\nTotal features: {n_features}")

## 4. Feature Scaling

In [None]:
# Standardise the features. The scaler's statistics come from the training
# split only; the test split is transformed with those same statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")
# Sanity check over the whole scaled training matrix.
train_mean = X_train_scaled.mean()
train_std = X_train_scaled.std()
print(f"Train mean: {train_mean:.4f}")
print(f"Train std: {train_std:.4f}")

## 5. Train Models

In [None]:
# Train all models
# Hands the scaled training matrix and training labels to
# src.models.train_all_models. Later cells iterate `models.items()` and index
# `models` by name, so the result is used as a mapping of
# model name -> estimator (presumably already fitted -- confirm in src.models).
models = train_all_models(X_train_scaled, y_train)

## 6. Evaluate Models

In [None]:
# Score every trained model on the scaled test split and keep the model,
# its predictions and its accuracy in `results`, keyed by model name.
results = {}
banner = '=' * 70

for model_name, model in models.items():
    print(f"\n{banner}")
    print(f"Evaluating {model_name}...")
    print(f"{banner}")

    # Predict once and reuse the array for the metric, the report and storage.
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    results[model_name] = dict(model=model, predictions=y_pred, accuracy=accuracy)

    # Headline number first, then the per-class breakdown.
    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"\nClassification Report:")
    report = classification_report(y_test, y_pred, target_names=['Sell (0)', 'Buy (1)'])
    print(report)

## 7. Compare Models

In [None]:
# Compare all models
# compare_models receives every trained model plus the scaled test split and
# returns a key of `models`; that key selects the estimator that is analysed
# and saved in later cells.
# NOTE(review): the winner is chosen using the test split, so its test
# accuracy is presumably an optimistic estimate -- consider selecting on a
# separate validation window.
best_model_name = compare_models(models, X_test_scaled, y_test)
best_model = models[best_model_name]

## 8. Confusion Matrices

In [None]:
# Plot confusion matrices for all models
# One panel per evaluated model. The grid is sized from `results` rather than
# a hard-coded 3, so adding or removing a model neither overruns `axes` nor
# leaves empty panels.
class_names = ['Sell (0)', 'Buy (1)']
n_models = len(results)

if n_models == 0:
    print("No evaluated models to plot.")
else:
    # squeeze=False keeps `axes` two-dimensional even for a single model.
    fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 5), squeeze=False)

    for ax, (model_name, result) in zip(axes[0], results.items()):
        # Pin the label order so the matrix is always 2x2 and lines up with
        # the two tick labels, even if a class never occurs.
        cm = confusion_matrix(y_test, result['predictions'], labels=[0, 1])

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                    xticklabels=class_names,
                    yticklabels=class_names)
        ax.set_title(f'{model_name}\nAccuracy: {result["accuracy"]:.4f}',
                     fontsize=12, fontweight='bold')
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')

    plt.tight_layout()
    plt.show()

## 9. Feature Importance (Random Forest & XGBoost)

In [None]:
# Rank features for the tree-based models via `feature_importances_`.
# Names missing from `models` are skipped.
tree_models = ['Random Forest', 'XGBoost']

for model_name in tree_models:
    if model_name not in models:
        continue

    # Pair each importance with its column name, most important first.
    importance_table = pd.DataFrame({
        'feature': feature_names,
        'importance': models[model_name].feature_importances_
    }).sort_values('importance', ascending=False)

    # Horizontal bar chart of the 20 highest-ranked features.
    top_features = importance_table.head(20)
    bar_positions = range(len(top_features))

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(bar_positions, top_features['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Importance', fontsize=12)
    ax.set_ylabel('Feature', fontsize=12)
    ax.set_title(f'Top 20 Feature Importances - {model_name}',
                 fontsize=14, fontweight='bold')
    ax.invert_yaxis()  # largest importance at the top
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

    print(f"\nTop 10 features for {model_name}:")
    print(importance_table.head(10))

## 10. Prediction Analysis

In [None]:
# Show how often the selected model predicts each class, next to how often
# each class actually occurs in the test labels.
best_predictions = results[best_model_name]['predictions']
predicted_distribution = pd.Series(best_predictions).value_counts()
actual_distribution = y_test.value_counts()

print(f"Best Model: {best_model_name}")
print(f"\nPrediction Distribution:")
print(predicted_distribution)

# Ground-truth counts for comparison
print(f"\nActual Distribution:")
print(actual_distribution)

In [None]:
# Plot prediction vs actual distribution
class_ids = [0, 1]
class_names = ['Sell (0)', 'Buy (1)']

# Reindex to both classes so each series always holds exactly two counts, in
# label order. Without this, a model that predicts only one class yields a
# single count and bar() fails against the two x labels.
actual_counts = y_test.value_counts().reindex(class_ids, fill_value=0)
pred_counts = pd.Series(best_predictions).value_counts().reindex(class_ids, fill_value=0)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: ground truth; right panel: predictions of the selected model.
panels = [
    (axes[0], actual_counts, 'Actual Distribution'),
    (axes[1], pred_counts, f'Predicted Distribution ({best_model_name})'),
]
for ax, counts, title in panels:
    ax.bar(class_names, counts.values, color=['red', 'green'])
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_ylabel('Count', fontsize=12)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 11. Model Comparison Summary

In [None]:
# One summary row per model: accuracy plus how many test rows were predicted
# as class 1 (Buy) and class 0 (Sell).
comparison_data = [
    {
        'Model': model_name,
        'Accuracy': result['accuracy'],
        'Buy Predictions': (result['predictions'] == 1).sum(),
        'Sell Predictions': (result['predictions'] == 0).sum()
    }
    for model_name, result in results.items()
]

# Most accurate model first.
comparison_df = pd.DataFrame(comparison_data).sort_values('Accuracy', ascending=False)

rule = "=" * 70
print("\n" + rule)
print("MODEL COMPARISON SUMMARY")
print(rule)
print(comparison_df.to_string(index=False))
print(rule)

In [None]:
# Bar chart of test accuracy per model, in the order of comparison_df.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(comparison_df['Model'], comparison_df['Accuracy'],
       color=['skyblue', 'lightcoral', 'lightgreen'])
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Model Accuracy Comparison', fontsize=14, fontweight='bold')
ax.set_ylim([0, 1])
ax.grid(True, alpha=0.3, axis='y')

# Write each accuracy just above its bar.
for bar_pos, acc in enumerate(comparison_df['Accuracy']):
    ax.text(bar_pos, acc + 0.02, f'{acc:.4f}', ha='center', fontsize=11, fontweight='bold')

fig.tight_layout()
plt.show()

## 12. Save Best Model

In [None]:
# Save the best model
# Persist the selected estimator together with the fitted scaler: inference
# must apply the same scaling that was used for training.
import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before anything is written into it.
os.makedirs('../models', exist_ok=True)

print(f"Saving best model: {best_model_name}")
save_model(best_model, 'best_model')
joblib.dump(scaler, '../models/scaler.pkl')

# NOTE(review): the model path printed below assumes save_model writes to
# ../models/best_model.pkl -- confirm against src.models.save_model.
print("\nModel and scaler saved successfully!")
print("  - Best Model: ../models/best_model.pkl")
print("  - Scaler: ../models/scaler.pkl")

## 13. Summary

### Models Trained:
1. **Logistic Regression** - Baseline linear model
2. **Random Forest** - Ensemble of decision trees
3. **XGBoost** - Gradient boosting model

### Key Findings:
- Best model identified based on accuracy
- Feature importance analyzed for tree-based models
- Confusion matrices show prediction patterns
- Models saved for future use

### Next Steps:
1. Generate trading signals using best model
2. Calculate PnL based on predictions
3. Analyze trading performance
4. Optimize strategy parameters