# Data Exploration

Verify that `stock_data.db` has been built correctly and preview sample data.

> **Prerequisites**: build the database first, either by running `python src/data_loader.py` or by calling `engine.download_data()` on a `DataEngine` instance.

In [1]:
import sys
# Put the project's src/ directory first on the import path so that
# `data_loader` can be imported. The path is relative, so it resolves against
# the kernel's current working directory (start the kernel from this
# notebook's folder).
sys.path.insert(0, '../src')

from data_loader import DataEngine

In [2]:
# Load everything the engine exposes in a single call, then pull out the
# three frames the rest of the notebook works with.
engine = DataEngine()
data = engine.load_data()

df_price, df_mv, df_industry = data['df_price'], data['df_mv'], data['df_industry']

# Report the shape of each frame; labels are left-padded to a common width so
# the colons line up in the printed summary.
print('Tables loaded.')
for label, frame in (
    ('daily_price', df_price),
    ('df_mv', df_mv),
    ('stock_info', df_industry),
):
    print(f'  {label:<11} : {frame.shape}')

Tables loaded.
  daily_price : (223275, 5)
  df_mv       : (222518, 1)
  stock_info  : (300, 2)


## 1. Daily Price (OHLCV)

In [3]:
# Coverage summary for the price table: first/last date and the number of
# distinct stock codes, followed by a preview of the first 10 rows.
price_index = df_price.index
dates = price_index.get_level_values('date')
codes = price_index.get_level_values('code')

first_date, last_date = dates.min(), dates.max()
n_stocks = codes.nunique()

print(f'Date range  : {first_date}  ->  {last_date}')
print(f'Unique stocks: {n_stocks}')
df_price.head(10)

Date range  : 20220104  ->  20250221
Unique stocks: 299


Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,vol
date,code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
20220104,000001.SZ,16.48,16.66,16.18,16.66,1169259.33
20220104,000002.SZ,19.49,20.65,19.36,20.49,1947202.02
20220104,000063.SZ,33.58,33.64,33.13,33.42,290034.38
20220104,000100.SZ,6.18,6.26,6.14,6.24,1612641.47
20220104,000157.SZ,7.17,7.22,7.14,7.21,442457.79
20220104,000166.SZ,5.13,5.18,5.09,5.13,714768.81
20220104,000301.SZ,19.35,19.48,18.86,19.11,375623.65
20220104,000333.SZ,74.0,75.5,73.6,75.36,310408.01
20220104,000338.SZ,17.93,18.0,17.52,17.69,1214435.55
20220104,000408.SZ,41.6,41.77,36.99,38.28,401552.96


## 2. Market Cap (total_mv)

In [4]:
# Preview the first 10 market-cap rows as a quick sanity check of the table.
df_mv.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_mv
date,code,Unnamed: 2_level_1
20220104,000001.SZ,32330260.0
20220104,000002.SZ,23820410.0
20220104,000063.SZ,15806060.0
20220104,000100.SZ,8755121.0
20220104,000157.SZ,6256832.0
20220104,000166.SZ,12845490.0
20220104,000301.SZ,9239500.0
20220104,000333.SZ,52640910.0
20220104,000338.SZ,15437280.0
20220104,000408.SZ,7544719.0


## 3. Industry Distribution

In [5]:
# Preview the first 10 rows of the per-stock name/industry table.
df_industry.head(10)

Unnamed: 0_level_0,name,industry
code,Unnamed: 1_level_1,Unnamed: 2_level_1
000001.SZ,平安银行,银行
000002.SZ,万科Ａ,全国地产
000063.SZ,中兴通讯,通信设备
000100.SZ,TCL科技,元器件
000157.SZ,中联重科,工程机械
000301.SZ,东方盛虹,化纤
000408.SZ,藏格矿业,农药化肥
000425.SZ,徐工机械,工程机械
000538.SZ,云南白药,中成药
000568.SZ,泸州老窖,白酒


In [6]:
# Number of stocks in each industry; value_counts() orders the result from the
# most to the least populated industry.
df_industry['industry'].value_counts()

industry
银行      24
证券      22
半导体     19
电气设备    16
元器件     16
        ..
广告包装     1
乳制品      1
医药商业     1
供气供热     1
服饰       1
Name: count, Length: 66, dtype: int64

## 4. Missing Data Check

In [None]:
# --- 1. NaN values inside rows that exist ----------------------------------
print('=== daily_price null counts ===')
print(df_price.isnull().sum())
print()
print('=== df_mv null counts ===')
print(df_mv.isnull().sum())
print()
print('=== stock_info null counts ===')
print(df_industry.isnull().sum())
print()

# --- 2. Rows that are absent altogether ------------------------------------
# isnull() cannot see a (date, code) pair that was never written, so the
# indexes are compared directly. The row counts printed when the tables were
# loaded differ between daily_price and df_mv, so a non-zero gap is expected.
# NOTE(review): assumes df_price and df_mv share a (date, code) MultiIndex and
# df_industry is indexed by code, as the previews above suggest -- confirm.
price_keys = df_price.index
mv_keys = df_mv.index
price_codes = price_keys.get_level_values('code').unique()
info_codes = df_industry.index

print('=== coverage gaps ===')
print(f'price rows without market cap        : {len(price_keys.difference(mv_keys))}')
print(f'market cap rows without price        : {len(mv_keys.difference(price_keys))}')
print(f'stock_info codes without any price   : {list(info_codes.difference(price_codes))}')
print(f'priced codes missing from stock_info : {list(price_codes.difference(info_codes))}')