# Data Exploration & Alpha Factor Demo

Two sections:
1. **Database sanity check** — verify that `stock_data.db` was built correctly and preview sample data (OHLCV + amount, adj_factor, market cap, industry, missing values).
2. **Alpha computation** — compute Alpha#6, #12, #38, #41, #101 via `Alpha101`.

> **Prerequisites**: run `engine.download_data()` first so that `data/stock_data.db` exists.

In [None]:
import sys
sys.path.insert(0, '../src')

import numpy as np
import pandas as pd
from data_loader import DataEngine
from alphas import Alpha101

In [None]:
# Load the dataset through the project's DataEngine.
engine = DataEngine()
data = engine.load_data()

# Unpack the four frames that the rest of the notebook works with.
df_price, df_mv, df_industry, df_adj = (
    data[key] for key in ('df_price', 'df_mv', 'df_industry', 'df_adj')
)

# Report the shape of each table, one line per table.
print(
    'Tables loaded.',
    f'  daily_price : {df_price.shape}  (rows = date×code combinations, cols include amount)',
    f'  df_mv       : {df_mv.shape}',
    f'  stock_info  : {df_industry.shape}',
    f'  adj_factor  : {df_adj.shape}',
    sep='\n',
)

---
## Part 1 — Database Sanity Check

### 1.1 Daily Price (OHLCV + Amount)

In [None]:
# Coverage summary for the price table: time span, number of stocks, columns.
price_index = df_price.index
dates = price_index.get_level_values('date')
codes_idx = price_index.get_level_values('code')
print(
    f'Date range   : {dates.min()}  ->  {dates.max()}',
    f'Unique stocks: {codes_idx.nunique()}',
    f'Columns      : {df_price.columns.tolist()}',
    sep='\n',
)
# Last expression of the cell: rendered as the preview table.
df_price.head(10)

### 1.2 Adjustment Factor (adj_factor)

In [None]:
# adj_factor overview: shape, date span and number of distinct stocks.
adj_date_level = df_adj.index.get_level_values("date")
adj_code_level = df_adj.index.get_level_values("code")
print(f'adj_factor shape: {df_adj.shape}')
print(f'Date range: {adj_date_level.min()}  ->  {adj_date_level.max()}')
print(f'Unique stocks: {adj_code_level.nunique()}')
# Last expression of the cell: rendered as the preview table.
df_adj.head(10)

In [None]:
# Show adj_factor history for a few sample stocks (the first four codes in
# the index), pivoted to wide form: dates × codes.
code_level = df_adj.index.get_level_values('code')
sample_codes = code_level.unique()[:4].tolist()
adj_wide = df_adj['adj_factor'].unstack('code').loc[:, sample_codes]
print('adj_factor (wide form) — last 5 trading dates:')
adj_wide.iloc[-5:]

### 1.3 Market Cap (total_mv)

In [None]:
# Preview the first 10 rows of the market-cap table.
df_mv.head(10)

### 1.4 Industry Distribution

In [None]:
# Preview the first 10 rows of the industry table.
df_industry.head(10)

In [None]:
# Count how many rows fall into each industry (most frequent first).
df_industry['industry'].value_counts()

### 1.5 Missing Data Check

In [None]:
# Per-column null counts for each table, each report followed by a blank line.
for label, frame in (
    ('daily_price', df_price),
    ('df_mv', df_mv),
    ('adj_factor', df_adj),
):
    print(f'=== {label} null counts ===')
    print(frame.isnull().sum())
    print()

# Stocks present in daily_price but missing from adj_factor.
price_codes = set(df_price.index.get_level_values('code').unique())
adj_codes = set(df_adj.index.get_level_values('code').unique())
missing_adj = price_codes.difference(adj_codes)
print(f'Stocks in daily_price but missing adj_factor: {len(missing_adj)}')

---
## Part 2 — Alpha Factor Computation

Using `Alpha101` from `src/alphas.py`, which implements alphas from  
*'101 Formulaic Alphas'* (Kakushadze, 2015).  
Raw values (NaN, inf) are preserved — data cleaning comes later.

In [None]:
# Build the alpha calculator from the loaded tables and report the shape of
# its close-price matrix.
alpha = Alpha101(data)
print(
    'Alpha101 initialized.',
    f'  Price matrix shape (dates × codes): {alpha.close.shape}',
    sep='\n',
)

### 2.1 Individual Alphas (wide form: dates × codes)

In [None]:
# Alpha#6: -1 * correlation(open, volume, 10)
a6 = alpha.alpha006()
print(f'Alpha#6  shape: {a6.shape}')
# Preview window: last 5 dates, first 6 stocks.
a6.iloc[-5:, :6]

In [None]:
# Alpha#12: sign(delta(volume, 1)) * (-1 * delta(close, 1))
a12 = alpha.alpha012()
print(f'Alpha#12 shape: {a12.shape}')
# Preview window: last 5 dates, first 6 stocks.
a12.iloc[-5:, :6]

In [None]:
# Alpha#38: (-1 * rank(ts_rank(close, 10))) * rank(close / open)
a38 = alpha.alpha038()
print(f'Alpha#38 shape: {a38.shape}')
# Preview window: last 5 dates, first 6 stocks.
a38.iloc[-5:, :6]

In [None]:
# Alpha#41: sqrt(high * low) - vwap
a41 = alpha.alpha041()
print(f'Alpha#41 shape: {a41.shape}')
# Preview window: last 5 dates, first 6 stocks.
a41.iloc[-5:, :6]

In [None]:
# Alpha#101: (close - open) / (high - low + 0.001)
a101 = alpha.alpha101()
print(f'Alpha#101 shape: {a101.shape}')
# Preview window: last 5 dates, first 6 stocks.
a101.iloc[-5:, :6]

### 2.2 Combined Alpha DataFrame  (MultiIndex: date × code)

In [None]:
# Gather every alpha into a single frame and report its shape and columns.
df_alphas = alpha.get_all_alphas()
print(f'Combined alpha DataFrame shape: {df_alphas.shape}')
print(f'Columns: {df_alphas.columns.tolist()}')
# Last expression of the cell: rendered as the preview table.
df_alphas.head(10)

### 2.3 Descriptive Statistics

In [None]:
# Summary statistics for each alpha column.
df_alphas.describe()

### 2.4 NaN Coverage per Alpha

In [None]:
# Share of missing values per alpha column, as a percentage of all rows.
total = len(df_alphas)
null_counts = df_alphas.isna().sum()
null_pct = null_counts.div(total).mul(100)
null_pct.to_frame(name='NaN %')

### 2.5 Cross-Sectional Snapshot on the Latest Date

In [None]:
# Cross-section on the most recent date, keeping only rows with no missing
# alpha values.
alpha_dates = df_alphas.index.get_level_values('date')
latest_date = alpha_dates.max()
print(f'Latest date: {latest_date}')
latest_slice = df_alphas.loc[latest_date]
latest_slice.dropna().head(10)