In [None]:
# Import libraries
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_loader import load_data, create_target_column
from src.feature_engineering import prepare_features
from sklearn.ensemble import RandomForestClassifier

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (the palette call must come second so it is not overridden).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Show every column when a DataFrame is displayed
pd.options.display.max_columns = None

print("Libraries imported successfully!")

## 1. Load and Prepare Data

In [None]:
# Read the raw intraday CSV and attach the target column in a single step
df = create_target_column(load_data('../data/raw/nifty_intraday.csv'))

column_names = [*df.columns]
print(f"Initial shape: {df.shape}")
print(f"Columns: {column_names}")

## 2. Apply Feature Engineering

In [None]:
# Run the project's feature-engineering pipeline on the labelled frame
df_features = prepare_features(df)

shape_after = df_features.shape
print(f"\nFinal shape: {shape_after}")
# The second entry of the shape is the column count
print(f"Total features: {shape_after[1]}")

In [None]:
# Numbered listing of every column in the engineered frame
print("\nAll Feature Names:")
print("=" * 70)
numbered_columns = enumerate(df_features.columns, start=1)
for position, column_name in numbered_columns:
    print(f"{position:2d}. {column_name}")

## 3. Check for Missing Values

In [None]:
# Count NaN entries column by column
missing = df_features.isna().sum()

# Only columns that actually contain gaps are worth listing
columns_with_gaps = missing.loc[missing.gt(0)]
print("Missing values per column:")
print(columns_with_gaps)
print(f"\nTotal missing values: {missing.sum()}")

## 4. Feature Statistics

In [None]:
# Summary statistics for a handful of headline features; any name that the
# pipeline did not produce is silently skipped.
key_features = ['Close', 'returns', 'RSI_14', 'MACD', 'SMA_20', 'ATR_14', 'body_size']
present_columns = set(df_features.columns)
available_features = [name for name in key_features if name in present_columns]

print("Statistics for key features:")
df_features.loc[:, available_features].describe()

## 5. Correlation Analysis

In [None]:
# Numeric predictors only: select_dtypes leaves out non-numeric columns
# (e.g. Timestamp), and the target itself is removed from the predictor list.
numeric_cols = [
    col
    for col in df_features.select_dtypes(include=[np.number]).columns
    if col != 'target'
]

# Correlate every predictor with the target, then rank by absolute strength
corr_matrix = df_features[numeric_cols + ['target']].corr()
target_corr = corr_matrix['target'].drop('target')
correlations = target_corr.abs().sort_values(ascending=False)

print("Top 20 features correlated with target:")
print(correlations.head(20))

In [None]:
# Horizontal bar chart of the 20 strongest absolute correlations
top_corr = correlations.head(20)
bar_positions = range(len(top_corr))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_corr.values)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_corr.index)
ax.set_xlabel('Absolute Correlation with Target', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('Top 20 Features by Correlation with Target', fontsize=14, fontweight='bold')
# Flip the axis so the strongest feature is drawn at the top
ax.invert_yaxis()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 6. Feature Importance using Random Forest

In [None]:
# Prepare data for Random Forest
# Rows holding NaN or +/-inf in any predictor or in the target are removed
# first: the missing-value check earlier in this notebook shows such rows can
# exist, and the estimator cannot be fitted on inf values or a NaN target.
# NOTE(review): numeric_cols is every numeric column except 'target'; if
# create_target_column adds forward-looking helper columns they would leak
# into X — confirm against src.data_loader.
model_frame = (
    df_features[numeric_cols + ['target']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
if model_frame.empty:
    raise ValueError(
        "No complete rows left after removing NaN/inf values; "
        "check the engineered features for all-NaN columns."
    )

rows_dropped = len(df_features) - len(model_frame)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows containing NaN/inf values before fitting")

X = model_frame[numeric_cols]
y = model_frame['target']

print("Training Random Forest for feature importance...")
print(f"X shape: {X.shape}, y shape: {y.shape}")

# Train Random Forest (fixed random_state so the importances are reproducible)
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X, y)

print("Random Forest trained successfully!")

In [None]:
# Pair each predictor name with the importance score from the fitted forest,
# then order the table from most to least important.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("Top 20 features by Random Forest importance:")
print(feature_importance.head(20))

In [None]:
# Horizontal bar chart of the 20 highest Random Forest importances
top_features = feature_importance.head(20)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance', fontsize=12)
ax.set_ylabel('Feature', fontsize=12)
ax.set_title('Top 20 Features by Random Forest Importance', fontsize=14, fontweight='bold')
# Flip the axis so the most important feature is drawn at the top
ax.invert_yaxis()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 7. Visualize Technical Indicators

In [None]:
# Work on a copy of the first rows only, so the four panels stay readable
sample_size = 500
df_sample = df_features.head(sample_size).copy()
x_values = df_sample.index

# One stacked panel per indicator family
fig, axes = plt.subplots(4, 1, figsize=(16, 14))
price_ax, rsi_ax, macd_ax, bands_ax = axes

# Panel 1: close price overlaid with the short and long simple moving averages
price_ax.plot(x_values, df_sample['Close'], label='Close', linewidth=2)
for column, legend_text in (('SMA_5', 'SMA 5'), ('SMA_20', 'SMA 20')):
    price_ax.plot(x_values, df_sample[column], label=legend_text, alpha=0.7)

# Panel 2: RSI with the 70 / 30 reference levels
rsi_ax.plot(x_values, df_sample['RSI_14'], color='purple', linewidth=2)
for level, colour, legend_text in (
    (70, 'red', 'Overbought (70)'),
    (30, 'green', 'Oversold (30)'),
):
    rsi_ax.axhline(y=level, color=colour, linestyle='--', linewidth=1, label=legend_text)

# Panel 3: MACD line, signal line and histogram bars
for column, legend_text in (('MACD', 'MACD'), ('MACD_signal', 'Signal')):
    macd_ax.plot(x_values, df_sample[column], label=legend_text, linewidth=2)
macd_ax.bar(x_values, df_sample['MACD_hist'], label='Histogram', alpha=0.3)

# Panel 4: close price inside the shaded Bollinger envelope
bands_ax.plot(x_values, df_sample['Close'], label='Close', linewidth=2)
for column, legend_text in (
    ('BB_upper', 'Upper Band'),
    ('BB_middle', 'Middle Band'),
    ('BB_lower', 'Lower Band'),
):
    bands_ax.plot(x_values, df_sample[column], label=legend_text, linestyle='--', alpha=0.7)
bands_ax.fill_between(x_values, df_sample['BB_lower'], df_sample['BB_upper'], alpha=0.1)

# Shared cosmetics: title, y-label, legend and a light grid on every panel
panel_text = (
    (price_ax, 'Close Price with Moving Averages', 'Price'),
    (rsi_ax, 'Relative Strength Index (RSI)', 'RSI'),
    (macd_ax, 'MACD (Moving Average Convergence Divergence)', 'MACD'),
    (bands_ax, 'Bollinger Bands', 'Price'),
)
for panel, heading, y_label in panel_text:
    panel.set_title(heading, fontsize=14, fontweight='bold')
    panel.set_ylabel(y_label, fontsize=12)
    panel.legend(loc='best')
    panel.grid(True, alpha=0.3)

# Only the bottom panel carries the x-axis label
bands_ax.set_xlabel('Index', fontsize=12)

fig.tight_layout()
plt.show()

## 8. Feature Distribution Analysis

In [None]:
# Plot distributions of key features (only those present in the engineered frame)
key_features = ['returns', 'RSI_14', 'body_size', 'ATR_14']
available = [f for f in key_features if f in df_features.columns]

# 2x2 grid, flattened so panels can be paired with features one-to-one
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.ravel()

for ax, feature in zip(axes, available):
    # NaNs are dropped so they do not end up in the histogram input
    ax.hist(df_features[feature].dropna(), bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(True, alpha=0.3)

# When some key features are missing, hide the leftover panels instead of
# rendering empty framed axes.
for unused_ax in axes[len(available):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 9. Save Feature-Engineered Data

In [None]:
# Save to processed folder
output_path = '../data/processed/features_engineered.csv'

# to_csv does not create missing parent directories, so make sure the
# processed folder exists (a fresh checkout may not have it yet).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# index=False: the row index is not written to the file
df_features.to_csv(output_path, index=False)
print(f"Feature-engineered data saved to: {output_path}")
print(f"Shape: {df_features.shape}")

## 10. Summary

### Features Created:
- **Technical Indicators**: RSI, MACD, Bollinger Bands, SMA, EMA, ATR, Stochastic, ADX
- **Candlestick Features**: Body size, wicks, ranges, ratios
- **Lag Features**: Previous closes, returns, highs, lows
- **Rolling Features**: Rolling mean, std, max, min, range
- **Temporal Features**: Hour, minute, day of week
- **Price Position Features**: Distance from MAs, BB position

### Key Insights:
- Total features generated: 70+
- The most informative features were identified via Random Forest importance and absolute correlation with the target
- All features ready for model training

### Next Steps:
1. Train multiple ML models
2. Compare model performance
3. Generate trading signals and calculate PnL