In [None]:
# Import libraries
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_loader import load_data, create_target_column

# Plot appearance: apply the matplotlib style sheet, then the seaborn palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# pandas display limits: no cap on columns shown, at most 100 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

print("Libraries imported successfully!")

## 1. Load Data

In [None]:
# Relative path of the raw intraday CSV
data_path = '../data/raw/nifty_intraday.csv'

# Read it with the project's loader (src.data_loader.load_data) and report
# the resulting frame's shape
df = load_data(data_path)
print("\nDataset shape: {}".format(df.shape))

## 2. Basic Data Inspection

In [None]:
# Preview the top of the dataset; the trailing expression is the cell's output
print("First 10 rows:")
df.head(n=10)

In [None]:
# Preview the bottom of the dataset; the trailing expression is the cell's output
print("Last 10 rows:")
df.tail(n=10)

In [None]:
# Data info: print a caption, then call DataFrame.info(), which writes its
# own summary of the frame straight to the cell output
print("Dataset Info:")
df.info()

In [None]:
# Statistical summary: print a caption, then leave df.describe() as the last
# expression so Jupyter renders the resulting table as the cell output
print("Statistical Summary:")
df.describe()

## 3. Check for Missing Values

In [None]:
# Count null entries column by column, then add them up for a grand total
print("Missing Values:")
missing = df.isna().sum()
print(missing)

total_missing = missing.sum()
print(f"\nTotal missing values: {total_missing}")

## 4. Time Range Analysis

In [None]:
# Span covered by the data: first and last timestamp, the time between them,
# and the number of distinct calendar dates present
print("Time Range:")
timestamps = df['Timestamp']
first_ts = timestamps.min()
last_ts = timestamps.max()

print(f"Start Date: {first_ts}")
print(f"End Date: {last_ts}")
print(f"Duration: {last_ts - first_ts}")
print(f"\nTotal trading days: {timestamps.dt.date.nunique()}")

## 5. Price Analysis

In [None]:
# Headline price figures: extremes of High/Low, their spread, and the
# central tendency of Close
print("Price Statistics:")
highest_high = df['High'].max()
lowest_low = df['Low'].min()
close_prices = df['Close']

print(f"Highest High: {highest_high:.2f}")
print(f"Lowest Low: {lowest_low:.2f}")
print(f"Price Range: {highest_high - lowest_low:.2f}")
print(f"\nAverage Close: {close_prices.mean():.2f}")
print(f"Median Close: {close_prices.median():.2f}")

In [None]:
# Line chart of Close against Timestamp, built on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(16, 6))
ax.plot(df['Timestamp'], df['Close'], linewidth=1)

ax.set_title('NIFTY Closing Price Over Time', fontsize=16, fontweight='bold')
ax.set_xlabel('Timestamp', fontsize=12)
ax.set_ylabel('Close Price', fontsize=12)
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 6. Create and Analyze Target Variable

In [None]:
# Create target variable
# create_target_column comes from src.data_loader; the cells below read a
# 'target' column from the frame it returns.
# NOTE(review): `df` is rebound to the helper's result, so re-running this cell
# feeds the already-processed frame back in — confirm the helper is safe to
# apply twice, or re-run the load cell first.
df = create_target_column(df)

In [None]:
# Target distribution: rows per class, printed raw and as a share of all rows
target_counts = df['target'].value_counts()
print("\nTarget Distribution:")
print(target_counts)

# Look each class up with a default of 0 so a label that never occurs is
# reported as 0 rows instead of raising KeyError; an empty frame reports 0%
# rather than dividing by zero.
total_rows = len(df)
sell_count = target_counts.get(0, 0)
buy_count = target_counts.get(1, 0)
sell_pct = sell_count / total_rows * 100 if total_rows else 0.0
buy_pct = buy_count / total_rows * 100 if total_rows else 0.0

print(f"\nClass 0 (Sell): {sell_count} ({sell_pct:.2f}%)")
print(f"Class 1 (Buy): {buy_count} ({buy_pct:.2f}%)")

In [None]:
# Visualize target distribution
# value_counts() orders its result by frequency, not by class label, so the
# counts are re-aligned to the fixed label order [0, 1] before being paired
# with the positional labels and colours below. A class with no rows is
# plotted as 0.
class_counts = target_counts.reindex([0, 1], fill_value=0)
class_labels = ['Sell (0)', 'Buy (1)']
class_colors = ['red', 'green']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bar plot: absolute row count per class
axes[0].bar(class_labels, class_counts.values, color=class_colors)
axes[0].set_title('Target Distribution', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Count', fontsize=12)
axes[0].grid(True, alpha=0.3)

# Pie chart: the same counts as percentage shares
axes[1].pie(class_counts.values, labels=class_labels, autopct='%1.1f%%',
            colors=class_colors, startangle=90)
axes[1].set_title('Target Distribution (%)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 7. OHLC Candlestick Analysis (Sample)

In [None]:
# Sample 100 candles for visualization (copied so the new column below does
# not touch the full frame)
sample_df = df.head(100).copy()

# Candle colour, computed for the whole column at once: green when the candle
# closed above its open, red otherwise (a flat candle is therefore red)
sample_df['color'] = np.where(sample_df['Close'] > sample_df['Open'], 'green', 'red')

candle_x = sample_df.index
candle_colors = sample_df['color'].tolist()

fig, ax = plt.subplots(figsize=(16, 8))

# Wicks: one thin vertical line per candle spanning Low to High
ax.vlines(candle_x, sample_df['Low'], sample_df['High'],
          colors=candle_colors, linewidth=1)
# Bodies: a thicker vertical line from Open to Close, drawn over the wick
ax.vlines(candle_x, sample_df['Open'], sample_df['Close'],
          colors=candle_colors, linewidth=3)

ax.set_title('OHLC Candlestick Chart (First 100 Candles)', fontsize=16, fontweight='bold')
ax.set_xlabel('Candle Index', fontsize=12)
ax.set_ylabel('Price', fontsize=12)
ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 8. Volume Analysis (if available)

In [None]:
# Volume is optional in this dataset, so deal with its absence first
if 'Volume' not in df.columns:
    print("Volume column not found in dataset.")
else:
    volume = df['Volume']
    print("Volume Statistics:")
    print(volume.describe())

    # One unit-width bar per row, positioned by row number
    fig, ax = plt.subplots(figsize=(16, 6))
    ax.bar(range(len(df)), volume, width=1, alpha=0.6)
    ax.set_title('Trading Volume Over Time', fontsize=16, fontweight='bold')
    ax.set_xlabel('Candle Index', fontsize=12)
    ax.set_ylabel('Volume', fontsize=12)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

## 9. Returns Analysis

In [None]:
# Row-over-row percentage change of Close, stored as a new 'returns' column
df['returns'] = df['Close'].pct_change()

print("Returns Statistics:")
returns = df['returns']
print(returns.describe())

# Counts of strictly positive / strictly negative returns; the percentages
# are taken over every row of df
n_rows = len(df)
n_positive = (returns > 0).sum()
n_negative = (returns < 0).sum()
print(f"\nPositive returns: {n_positive} ({n_positive/n_rows*100:.2f}%)")
print(f"Negative returns: {n_negative} ({n_negative/n_rows*100:.2f}%)")

In [None]:
# Two views of the returns: their distribution (left) and their path over
# time (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
hist_ax, series_ax = axes

# Left panel: histogram of the non-null returns with a reference line at zero
hist_ax.hist(df['returns'].dropna(), bins=100, edgecolor='black', alpha=0.7)
hist_ax.axvline(x=0, color='red', linestyle='--', linewidth=2)
hist_ax.set_title('Distribution of Returns', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Returns', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)

# Right panel: returns against Timestamp with a reference line at zero
series_ax.plot(df['Timestamp'], df['returns'], linewidth=0.5, alpha=0.7)
series_ax.axhline(y=0, color='red', linestyle='--', linewidth=1)
series_ax.set_title('Returns Over Time', fontsize=14, fontweight='bold')
series_ax.set_xlabel('Timestamp', fontsize=12)
series_ax.set_ylabel('Returns', fontsize=12)

# Same light grid on both panels
for panel in (hist_ax, series_ax):
    panel.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## 10. Data Quality Checks

In [None]:
# Sanity checks on the OHLC rows: count every row that breaks an ordering
# rule, count repeated timestamps, then print one line per check and a verdict
separator = "=" * 50
print("Data Quality Checks:")
print(separator)

# High must not be below Low
invalid_high_low = (df['High'] < df['Low']).sum()

# Close must lie between Low and High
invalid_high = (df['High'] < df['Close']).sum()
invalid_low = (df['Low'] > df['Close']).sum()

# Open must lie between Low and High
invalid_high_open = (df['High'] < df['Open']).sum()
invalid_low_open = (df['Low'] > df['Open']).sum()

ohlc_checks = [
    ("Invalid High < Low", invalid_high_low),
    ("Invalid High < Close", invalid_high),
    ("Invalid Low > Close", invalid_low),
    ("Invalid High < Open", invalid_high_open),
    ("Invalid Low > Open", invalid_low_open),
]
for label, count in ohlc_checks:
    print(f"{label}: {count}")

# Rows whose Timestamp already appeared earlier in the frame
duplicates = df['Timestamp'].duplicated().sum()
print(f"\nDuplicate timestamps: {duplicates}")

print(separator)
total_issues = sum(count for _, count in ohlc_checks) + duplicates
if total_issues == 0:
    print("✅ All data quality checks passed!")
else:
    print("⚠️ Some data quality issues detected!")

## 11. Summary

### Key Findings:
- Dataset contains intraday OHLC data for NIFTY
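- Target variable is balanced (approximately 50-50 split) — verify against the class percentages printed in Section 6
- Data quality checks passed — verify against the verdict printed in Section 10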
- Ready for feature engineering and model training

### Next Steps:
1. Feature engineering (technical indicators)
2. Model training
3. Evaluation and PnL calculation