# Data Crawler

## Install & import libraries

In [1]:
%%capture
!pip install yfinance

In [2]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

## Crawl data

In [3]:
# Single source of truth pairing each dataset label with its ticker symbol.
# Deriving both lists from one mapping guarantees they share length and order,
# so zipping TICKER_SYMBOLS with DATASETS can never drop or mispair an entry.
TICKER_BY_DATASET = {
    'BIDV': 'BID.VN',
    'VCB': 'VCB.VN',
    'EIB': 'EIB.VN',
}
DATASETS = list(TICKER_BY_DATASET)
TICKER_SYMBOLS = list(TICKER_BY_DATASET.values())

# Requested date range as ISO date strings (also reused in the output file names).
START_DATE = '2019-01-01'
END_DATE = '2024-06-01'

# Sampling interval handed to the history download.
INTERVAL = '1d'
# Time zone name used to localize the date range bounds.
LOCALIZE = 'Asia/Ho_Chi_Minh'

In [4]:
# Localize both range bounds to the LOCALIZE time zone. The end bound is then
# pushed one day later (presumably so END_DATE itself is covered by the
# download; confirm against the data source's end-bound semantics).
adjusted_start_date, localized_end_date = (
    pd.to_datetime(bound).tz_localize(LOCALIZE) for bound in (START_DATE, END_DATE)
)
adjusted_end_date = localized_end_date + timedelta(days=1)

In [5]:
%%time
raw_data_dict = {}

for ticker_symbol, dataset in zip(TICKER_SYMBOLS, DATASETS):
    ticker_data = yf.Ticker(ticker_symbol)
    historical_data = ticker_data.history(start=adjusted_start_date, end=adjusted_end_date, interval=INTERVAL)

    raw_data_dict[dataset] = historical_data
    

CPU times: total: 46.9 ms
Wall time: 5.8 s


In [6]:
# Toggle for the diagnostic report below; when False only a notice is printed.
manual_run = True

if not manual_run:
    print("Manual Run Only!")
else:
    line_length = 24
    heavy_rule = '=' * line_length
    light_rule = '-' * line_length

    # For each downloaded frame, print its shape and the per-column null counts.
    for dataset, data in raw_data_dict.items():
        null_counts = data.isnull().sum()
        print(
            heavy_rule,
            f'{dataset}: Shape {data.shape}',
            light_rule,
            'Null checking',
            light_rule,
            f'{null_counts}',
            sep='\n',
        )

BIDV: Shape (1349, 7)
------------------------
Null checking
------------------------
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock Splits    0
dtype: int64
VCB: Shape (1344, 7)
------------------------
Null checking
------------------------
Open            1
High            1
Low             1
Close           1
Volume          0
Dividends       0
Stock Splits    0
dtype: int64
EIB: Shape (1348, 7)
------------------------
Null checking
------------------------
Open            0
High            0
Low             0
Close           0
Volume          0
Dividends       0
Stock Splits    0
dtype: int64


## Save datasets to the `raw-data/` folder

In [7]:
import os

In [8]:
# Output folder for the raw CSV files, relative to the current working directory.
directory = 'raw-data'

# exist_ok=True makes the creation idempotent and removes the gap between a
# separate existence check and the creation call. If a regular file with this
# name already exists, this raises instead of being skipped silently.
os.makedirs(directory, exist_ok=True)

In [9]:
# Write every downloaded frame to its own CSV inside `directory`; the file name
# encodes the dataset label and the requested date range.
for dataset in raw_data_dict:
    file_name = f"{dataset}_{START_DATE}_{END_DATE}.raw-data.csv"
    raw_data_dict[dataset].to_csv(os.path.join(directory, file_name))
    print(f"Saved {file_name}")

Saved BIDV_2019-01-01_2024-06-01.raw-data.csv
Saved VCB_2019-01-01_2024-06-01.raw-data.csv
Saved EIB_2019-01-01_2024-06-01.raw-data.csv
