In [None]:
import pandas as pd
import os

# Folder with one parquet file per stock; file names look like '<ticker>_<name>.parquet'.
data_folder = 'data'
# os.listdir returns entries in arbitrary order, so sort them to make the load order
# (and therefore the row order of df_all) identical on every run.
parquet_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.parquet'))

print(f"Loading {len(parquet_files)} files...")
all_stocks = []

for file in parquet_files:
    stock_df = pd.read_parquet(os.path.join(data_folder, file))

    # The ticker code is the part of the file name before the first underscore.
    ticker_code = file.split('_')[0]
    stock_df['Ticker'] = ticker_code

    # Index every frame by its trading dates. The index stored in the files is only a
    # row counter, and pd.to_datetime on integers reads them as nanoseconds since
    # 1970-01-01, which made every later date-based filter useless.
    stock_df.index = pd.to_datetime(stock_df['Date'].to_numpy())

    all_stocks.append(stock_df)

# Long-format panel: one row per (date, ticker); dates repeat across tickers.
df_all = pd.concat(all_stocks)
# NOTE(review): the 'Date' column is kept as well, so label-based calls such as
# df_all.sort_values('Date') are ambiguous to pandas; address the index or
# df_all['Date'] explicitly.
df_all.index.name = 'Date'

Loading 200 files...


In [None]:
# Dataset overview: size, column names, and missing-value counts per column.
row_count = len(df_all)
column_labels = df_all.columns

print("Data Summary:")
print(f"Total rows: {row_count:,}")
print(f"Total columns: {len(column_labels)}")

print(f"\nColumn list:")
for position, label in enumerate(column_labels, start=1):
    print(f"{position}. {label}")

# isna() is the same check as isnull(); summing counts the missing cells per column.
missing_per_column = df_all.isna().sum()
print(f"\nMissing values:")
print(missing_per_column)

Data Summary:
Total rows: 471,300
Total columns: 24

Column list:
1. Date
2. Open
3. High
4. Low
5. Close
6. Volume
7. Change
8. Ticker
9. Return_1d
10. Return_5d
11. Return_20d
12. Return_30d
13. Return_50d
14. Return_60d
15. Return_100d
16. Return_120d
17. Return_200d
18. vol_20
19. vol_60
20. vol_60_sqrt252
21. log_vol
22. vol_ratio_60
23. avg_log_vol_ratio_60
24. std_log_vol_ratio_60

Missing values:
Date                    0
Open                    0
High                    0
Low                     0
Close                   0
Volume                  0
Change                  0
Ticker                  0
Return_1d               0
Return_5d               0
Return_20d              0
Return_30d              0
Return_50d              0
Return_60d              0
Return_100d             0
Return_120d             0
Return_200d             0
vol_20                  0
vol_60                  0
vol_60_sqrt252          0
log_vol                 0
vol_ratio_60            0
avg_log_vol_ratio_60

In [None]:
# Date & ticker filter
start_date, end_date = '2024-01-01', '2024-12-31'
# Leave the list empty to select every ticker.
# NOTE(review): the files loaded in this notebook are keyed by six-digit codes such as
# '005930'; these US symbols would presumably match none of them — confirm.
tickers_to_find = ['AAPL', 'TSLA', 'NVDA']

In [None]:
# Samsung Electronics example: load a single ticker's file and inspect it.
ticker = '005930'

# Take the first file whose name starts with the ticker code, and fail with a clear
# message (instead of a bare IndexError) when there is none.
matching_files = [f for f in parquet_files if f.startswith(ticker)]
if not matching_files:
    raise FileNotFoundError(f"No parquet file for ticker {ticker!r} in {data_folder!r}")
stock_file = matching_files[0]
df = pd.read_parquet(os.path.join(data_folder, stock_file))

# Use the 'Date' column as the index. The index stored in the file is only a row
# counter, so pd.to_datetime(df.index) turned it into 1970-01-01 nanosecond offsets.
df.index = pd.to_datetime(df['Date'].to_numpy())

print(f"Loaded: {stock_file}")
print(f"Shape: {df.shape}")
print(f"Index type: {type(df.index)}")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"\nColumns ({len(df.columns)}):")
print(list(df.columns))
print(f"\nFirst 10 rows:")
print(df.head(10))
print(f"\nLast 10 rows:")
print(df.tail(10))

Loaded: 005930_삼성전자.parquet
Shape: (2458, 24)
Index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
Date range: 1970-01-01 00:00:00 to 1970-01-01 00:00:00.000002457

Columns (24):
['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Change', 'Ticker', 'Return_1d', 'Return_5d', 'Return_20d', 'Return_30d', 'Return_50d', 'Return_60d', 'Return_100d', 'Return_120d', 'Return_200d', 'vol_20', 'vol_60', 'vol_60_sqrt252', 'log_vol', 'vol_ratio_60', 'avg_log_vol_ratio_60', 'std_log_vol_ratio_60']

First 10 rows:
                                    Date   Open   High    Low  Close  Volume  \
1970-01-01 00:00:00.000000000 2015-01-02  26799  26799  26540  26600  175499   
1970-01-01 00:00:00.000000001 2015-01-05  26720  26720  26260  26660  202790   
1970-01-01 00:00:00.000000002 2015-01-06  26300  26340  25760  25900  304710   
1970-01-01 00:00:00.000000003 2015-01-07  25880  26220  25640  26140  286455   
1970-01-01 00:00:00.000000004 2015-01-08  26780  26780  26199  26280  289552   
1

In [None]:
# Multiple tickers example: load a few tickers into a dict keyed by ticker code.
tickers_to_load = ['005930', '000660', '005380']  # Samsung, SK Hynix, Hyundai

stocks = {}
for ticker in tickers_to_load:
    # First file whose name starts with the ticker code; report a missing ticker by
    # name rather than with a bare IndexError.
    matching_files = [f for f in parquet_files if f.startswith(ticker)]
    if not matching_files:
        raise FileNotFoundError(f"No parquet file for ticker {ticker!r} in {data_folder!r}")
    file = matching_files[0]

    df_temp = pd.read_parquet(os.path.join(data_folder, file))
    # Index by the 'Date' column. The index stored in the file is only a row counter,
    # so converting it with pd.to_datetime gave 1970-01-01 nanosecond offsets.
    df_temp.index = pd.to_datetime(df_temp['Date'].to_numpy())
    stocks[ticker] = df_temp
    print(f"Loaded {ticker}: {file} - {stocks[ticker].shape}")

samsung = stocks['005930']
print(f"\nSamsung data shape: {samsung.shape}")
print(f"Index type: {type(samsung.index)}")
print(samsung.head())

Loaded 005930: 005930_삼성전자.parquet - (2458, 24)
Loaded 000660: 000660_SK하이닉스.parquet - (2458, 24)
Loaded 005380: 005380_현대차.parquet - (2458, 24)

Samsung data shape: (2458, 24)
Index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
                                    Date   Open   High    Low  Close  Volume  \
1970-01-01 00:00:00.000000000 2015-01-02  26799  26799  26540  26600  175499   
1970-01-01 00:00:00.000000001 2015-01-05  26720  26720  26260  26660  202790   
1970-01-01 00:00:00.000000002 2015-01-06  26300  26340  25760  25900  304710   
1970-01-01 00:00:00.000000003 2015-01-07  25880  26220  25640  26140  286455   
1970-01-01 00:00:00.000000004 2015-01-08  26780  26780  26199  26280  289552   

                                 Change  Ticker  Return_1d  Return_5d  ...  \
1970-01-01 00:00:00.000000000  0.002261  005930   0.000000        0.0  ...   
1970-01-01 00:00:00.000000001  0.002256  005930   0.225564        0.0  ...   
1970-01-01 00:00:00.000000002 -0.028507  0

In [None]:
# All tickers example: load every file and stack them into one long DataFrame.
print("Loading all stocks...\n")

all_stocks = []
for file in parquet_files:
    stock_df = pd.read_parquet(os.path.join(data_folder, file))
    # The ticker code is the part of the file name before the first underscore.
    ticker_code = file.split('_')[0]
    stock_df['Ticker'] = ticker_code
    # Ensure the index is the real trading date (important for date filtering!).
    # The index stored in the files is only a row counter, so pd.to_datetime on it
    # produced 1970-01-01 nanosecond offsets; build the index from 'Date' instead.
    stock_df.index = pd.to_datetime(stock_df['Date'].to_numpy())
    all_stocks.append(stock_df)

# One row per (date, ticker); dates repeat across tickers.
df_all = pd.concat(all_stocks)

print(f"✓ All stocks loaded!")
print(f"Total shape: {df_all.shape}")
print(f"Unique stocks: {df_all['Ticker'].nunique()}")
print(f"Date range: {df_all.index.min()} to {df_all.index.max()}")
print(f"Index type: {type(df_all.index)}")
print(f"\nFirst 10 rows:")
print(df_all.head(10))

Loading all stocks...

✓ All stocks loaded!
Total shape: (471300, 24)
Unique stocks: 200
Date range: 1970-01-01 00:00:00 to 1970-01-01 00:00:00.000002457
Index type: <class 'pandas.core.indexes.datetimes.DatetimeIndex'>

First 10 rows:
                                    Date   Open   High    Low  Close  Volume  \
1970-01-01 00:00:00.000000000 2015-01-02  27916  28017  27415  27617   27714   
1970-01-01 00:00:00.000000001 2015-01-05  27615  27715  26378  26916   34948   
1970-01-01 00:00:00.000000002 2015-01-06  26715  27050  26481  26850   18115   
1970-01-01 00:00:00.000000003 2015-01-07  26848  27216  26581  26783   20686   
1970-01-01 00:00:00.000000004 2015-01-08  26981  27049  26681  26716   11595   
1970-01-01 00:00:00.000000005 2015-01-09  26746  27514  26746  27383   11730   
1970-01-01 00:00:00.000000006 2015-01-12  27115  27682  27049  27217   14146   
1970-01-01 00:00:00.000000007 2015-01-13  27215  27315  26881  27150   16861   
1970-01-01 00:00:00.000000008 2015-01-14  26

In [None]:
# Pull one ticker's rows back out of the combined frame.
ticker = '005930'
samsung_df = df_all.loc[df_all['Ticker'].eq(ticker)]

print("Samsung data from combined DataFrame:")
print(f"Shape: {samsung_df.shape}")
print(samsung_df.iloc[:5])

Samsung data from combined DataFrame:
Shape: (2458, 24)
                                    Date   Open   High    Low  Close  Volume  \
1970-01-01 00:00:00.000000000 2015-01-02  26799  26799  26540  26600  175499   
1970-01-01 00:00:00.000000001 2015-01-05  26720  26720  26260  26660  202790   
1970-01-01 00:00:00.000000002 2015-01-06  26300  26340  25760  25900  304710   
1970-01-01 00:00:00.000000003 2015-01-07  25880  26220  25640  26140  286455   
1970-01-01 00:00:00.000000004 2015-01-08  26780  26780  26199  26280  289552   

                                 Change  Ticker  Return_1d  Return_5d  ...  \
1970-01-01 00:00:00.000000000  0.002261  005930   0.000000        0.0  ...   
1970-01-01 00:00:00.000000001  0.002256  005930   0.225564        0.0  ...   
1970-01-01 00:00:00.000000002 -0.028507  005930  -2.850713        0.0  ...   
1970-01-01 00:00:00.000000003  0.009266  005930   0.926641        0.0  ...   
1970-01-01 00:00:00.000000004  0.005356  005930   0.535578        0.0  ..