In [18]:
# Import libraries
import pandas as pd
import FinanceDataReader as fdr
from datetime import datetime
from tqdm import tqdm
import warnings
import time
import os
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


In [19]:
# Load ticker list from list.csv
import os

# Direct path to list.csv
list_path = '/home/dave/projects/dev_backend/데이터/README/list.csv'

print(f"Loading tickers from: {list_path}")

# Load the CSV with cp949 encoding for Korean characters
ticker_df = pd.read_csv(list_path, encoding='cp949')

print(f"✓ Successfully loaded {len(ticker_df)} stocks")
print("\nFirst few rows:")
print(ticker_df.head())

# Extract ticker codes (첫 번째 컬럼) and pad with zeros to 6 digits
tickers = ticker_df.iloc[:, 0].astype(str).str.zfill(6).tolist()

print(f"\n✓ Extracted {len(tickers)} tickers")
print(f"First 5 tickers: {tickers[:5]}")

Loading tickers from: /home/dave/projects/dev_backend/데이터/README/list.csv
✓ Successfully loaded 200 stocks

First few rows:
    종목코드     종목명       종가    대비   등락률       상장시가총액
0   5930    삼성전자  1330000  3000  0.23  195908118.0
1   5380     현대차   169000     0  0.00   37226725.0
2    660  SK하이닉스    47750     0  0.00   34762113.0
3  15760    한국전력    42700     0  0.00   27411866.0
4   5490   POSCO   283500  8000  2.90   24717468.0

✓ Extracted 200 tickers
First 5 tickers: ['005930', '005380', '000660', '015760', '005490']


In [20]:
# Set date range
start_date = '2015-01-01'
end_date = '2024-12-31'

print(f"Data collection period: {start_date} to {end_date}")
print(f"Number of stocks to download: {len(tickers)}")

Data collection period: 2015-01-01 to 2024-12-31
Number of stocks to download: 200


In [21]:
# Download OHLCV data for all stocks
all_data = []
failed_tickers = []

print("Starting data collection...\n")

for ticker in tqdm(tickers):
    try:
        # Download stock data using FinanceDataReader
        df = fdr.DataReader(ticker, start_date, end_date)
        
        if not df.empty:
            # Add Ticker column
            df['Ticker'] = ticker
            df = df.reset_index()
            
            all_data.append(df)
        else:
            failed_tickers.append(ticker)
            
    except Exception as e:
        failed_tickers.append(ticker)
        print(f"Error: {ticker} - {e}")

print(f"\n✓ Downloaded: {len(all_data)} stocks")
print(f"✗ Failed: {len(failed_tickers)} stocks")

Starting data collection...



100%|██████████| 200/200 [00:18<00:00, 10.86it/s]


✓ Downloaded: 200 stocks
✗ Failed: 0 stocks





In [22]:
# Combine all data into one DataFrame
if all_data:
    combined_df = pd.concat(all_data, ignore_index=True)
    
    # Sort by Ticker and Date
    combined_df = combined_df.sort_values(['Ticker', 'Date']).reset_index(drop=True)
    
    print(f"Combined DataFrame shape: {combined_df.shape}")
    print(f"\nDate range: {combined_df['Date'].min()} to {combined_df['Date'].max()}")
    print(f"Number of unique stocks: {combined_df['Ticker'].nunique()}")
    print(f"\nFirst few rows:")
    print(combined_df.head(10))
    print(f"\nData types:")
    print(combined_df.dtypes)
else:
    print("No data collected!")

Combined DataFrame shape: (471300, 8)

Date range: 2015-01-02 00:00:00 to 2024-12-30 00:00:00
Number of unique stocks: 200

First few rows:
        Date   Open   High   Low  Close   Volume    Change  Ticker
0 2015-01-02  10000  10000  9760   9790  2918197 -0.021000  000030
1 2015-01-05   9750   9760  9530   9630  3563551 -0.016343  000030
2 2015-01-06   9520   9580  9410   9440  2619991 -0.019730  000030
3 2015-01-07   9380   9510  9370   9400  1679442 -0.004237  000030
4 2015-01-08   9420   9620  9410   9530  2166767  0.013830  000030
5 2015-01-09   9630   9660  9510   9560  1737936  0.003148  000030
6 2015-01-12   9500   9540  9360   9380  1449448 -0.018828  000030
7 2015-01-13   9400   9490  9320   9390  1380551  0.001066  000030
8 2015-01-14   9390   9420  9190   9220  2399519 -0.018104  000030
9 2015-01-15   9190   9200  9000   9050  2884385 -0.018438  000030

Data types:
Date      datetime64[ns]
Open               int64
High               int64
Low                int64
Close     

In [23]:
# Check for missing values
print("Missing values per column:")
print(combined_df.isnull().sum())
print(f"\nTotal missing values: {combined_df.isnull().sum().sum()}")

# Basic statistics
print("\n" + "="*50)
print("Basic Statistics:")
print("="*50)
print(combined_df[['Open', 'High', 'Low', 'Close', 'Volume']].describe())

Missing values per column:
Date      0
Open      0
High      0
Low       0
Close     0
Volume    0
Change    0
Ticker    0
dtype: int64

Total missing values: 0

Basic Statistics:
               Open          High           Low         Close        Volume
count  4.713000e+05  4.713000e+05  4.713000e+05  4.713000e+05  4.713000e+05
mean   8.339571e+04  8.467511e+04  8.212201e+04  8.355240e+04  5.737874e+05
std    1.477938e+05  1.499914e+05  1.456154e+05  1.477575e+05  2.069512e+06
min    0.000000e+00  0.000000e+00  0.000000e+00  9.000000e+01  0.000000e+00
25%    1.340000e+04  1.365000e+04  1.320000e+04  1.362425e+04  4.904875e+04
50%    3.657350e+04  3.710000e+04  3.605000e+04  3.670000e+04  1.501130e+05
75%    8.103100e+04  8.220000e+04  7.990000e+04  8.120000e+04  4.404748e+05
max    2.124000e+06  2.407000e+06  1.756000e+06  2.000000e+06  1.745905e+08


In [24]:
# Save individual parquet files for each stock
import os

# Create data folder
os.makedirs('data', exist_ok=True)

print("Saving individual stock files...\n")

for ticker in tqdm(combined_df['Ticker'].unique(), desc="Saving files"):
    # Get data for this ticker
    stock_data = combined_df[combined_df['Ticker'] == ticker].copy()
    
    # Get stock name from ticker_df
    stock_name = ticker_df[ticker_df.iloc[:, 0].astype(str).str.zfill(6) == ticker].iloc[:, 1].values[0] if len(ticker_df[ticker_df.iloc[:, 0].astype(str).str.zfill(6) == ticker]) > 0 else "Unknown"
    
    # Create filename: ticker_name.parquet
    filename = f"{ticker}_{stock_name}.parquet"
    filepath = os.path.join('data', filename)
    
    # Save to parquet
    stock_data.to_parquet(filepath, index=False, compression='snappy')

print(f"\n✓ Saved {len(combined_df['Ticker'].unique())} individual stock files to 'data/' folder")

# List first 10 files as confirmation
files = sorted(os.listdir('data'))
print(f"\nFirst 10 files:")
for f in files[:10]:
    print(f"  - {f}")

Saving individual stock files...



Saving files: 100%|██████████| 200/200 [00:02<00:00, 83.44it/s]


✓ Saved 200 individual stock files to 'data/' folder

First 10 files:
  - 000030_우리은행.parquet
  - 000070_삼양홀딩스.parquet
  - 000080_하이트진로.parquet
  - 000100_유한양행.parquet
  - 000120_CJ대한통운.parquet
  - 000140_하이트진로홀딩스.parquet
  - 000150_두산.parquet
  - 000210_대림산업.parquet
  - 000240_한국타이어월드와이드.parquet
  - 000270_기아차.parquet





In [25]:
# Test: Load all parquet files and verify
import os
import pandas as pd

data_folder = 'data'
parquet_files = [f for f in os.listdir(data_folder) if f.endswith('.parquet')]

print(f"Found {len(parquet_files)} parquet files in 'data/' folder")
print("\nLoading all files to verify...")

# Load all files
all_dfs = []
for file in tqdm(parquet_files[:5], desc="Loading sample files"):  # Load first 5 as sample
    filepath = os.path.join(data_folder, file)
    df = pd.read_parquet(filepath)
    all_dfs.append(df)
    print(f"\n{file}:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print(f"  Date range: {df['Date'].min()} to {df['Date'].max()}")

# Combine sample and show
sample_combined = pd.concat(all_dfs, ignore_index=True)
print(f"\n{'='*60}")
print(f"Sample combined data (first 5 stocks):")
print(f"{'='*60}")
print(f"Total rows: {len(sample_combined):,}")
print(f"Unique stocks: {sample_combined['Ticker'].nunique()}")
print(f"\nFirst 10 rows:")
print(sample_combined.head(10))

print(f"\n✓ All parquet files loaded successfully!")

Found 200 parquet files in 'data/' folder

Loading all files to verify...


Loading sample files: 100%|██████████| 5/5 [00:00<00:00, 327.04it/s]


002240_고려제강.parquet:
  Shape: (2458, 8)
  Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Change', 'Ticker']
  Date range: 2015-01-02 00:00:00 to 2024-12-30 00:00:00

042660_대우조선해양.parquet:
  Shape: (2458, 8)
  Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Change', 'Ticker']
  Date range: 2015-01-02 00:00:00 to 2024-12-30 00:00:00

064960_S&T모티브.parquet:
  Shape: (2458, 8)
  Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Change', 'Ticker']
  Date range: 2015-01-02 00:00:00 to 2024-12-30 00:00:00

009290_광동제약.parquet:
  Shape: (2458, 8)
  Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Change', 'Ticker']
  Date range: 2015-01-02 00:00:00 to 2024-12-30 00:00:00

088350_한화생명.parquet:
  Shape: (2458, 8)
  Columns: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Change', 'Ticker']
  Date range: 2015-01-02 00:00:00 to 2024-12-30 00:00:00

Sample combined data (first 5 stocks):
Total rows: 12,290
Unique stocks: 5

First 10 rows:
    


