# Data pre-processing

## Import libraries

In [1]:
import pandas as pd
import os

## Read datasets

In [2]:
# Names of the datasets to process; each name is used to build the raw and
# processed CSV file names.
DATASETS = ["BIDV", "VCB", "EIB"]

# Date bounds embedded in the CSV file names and passed to process_data.
START_DATE, END_DATE = "2019-01-01", "2024-06-01"

In [3]:
raw_directory = 'raw-data'

# Load every dataset's raw CSV into a DataFrame, keyed by dataset name.
# Files are expected at <raw_directory>/<name>_<start>_<end>.raw-data.csv.
raw_data_dict = {
    dataset: pd.read_csv(
        os.path.join(raw_directory, f"{dataset}_{START_DATE}_{END_DATE}.raw-data.csv")
    )
    for dataset in DATASETS
}

## Process datasets

In [4]:
def process_data(stock_data, start_date, end_date):
    """Align price data to a gap-free Monday-Friday calendar.

    Rows containing any null value and fully duplicated rows are discarded.
    The remaining rows are left-joined onto every weekday between
    ``start_date`` and ``end_date`` (both inclusive). Weekdays without an
    observation take the values of the previous available row; weekdays
    before the first observation take the values of the first available row.

    The input frame is not modified.

    Args:
        stock_data: DataFrame with at least the columns ``Date``, ``Open``,
            ``High``, ``Low``, ``Close`` and ``Volume``. ``Date`` must be
            parseable by ``pd.to_datetime``.
        start_date: First day of the output range; a timezone-naive value
            accepted by ``pd.to_datetime`` (it is localized to the data's
            timezone, if any).
        end_date: Last day of the output range; same requirements as
            ``start_date``.

    Returns:
        A new DataFrame with one row per weekday in the range and the columns
        ``Date``, ``Open``, ``High``, ``Low``, ``Close``, ``Volume``. The
        value columns are left as NaN only when no observation falls inside
        the range at all.
    """
    columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume']

    # Build new frames at every step so the caller's DataFrame stays untouched.
    cleaned = stock_data.dropna().drop_duplicates()
    cleaned = cleaned.assign(Date=pd.to_datetime(cleaned['Date']))
    cleaned = cleaned.sort_values(by='Date')

    # Localize the bounds to the data's timezone (None for naive data) so the
    # calendar and the observations share the same merge-key dtype.
    tz = cleaned['Date'].dt.tz
    first_day = pd.to_datetime(start_date).tz_localize(tz)
    last_day = pd.to_datetime(end_date).tz_localize(tz)

    all_days = pd.date_range(start=first_day, end=last_day, freq='D')
    weekdays = all_days[all_days.weekday < 5]  # 0-4: Monday-Friday
    calendar = pd.DataFrame({'Date': weekdays})

    data = pd.merge(calendar, cleaned[columns], on='Date', how='left')

    # Forward-fill gaps from the previous observation, then back-fill any
    # leading gap from the first observation. Unlike patching row 0 by hand,
    # this does not raise when the range is empty or holds no observations.
    return data.ffill().bfill()

In [5]:
# Run every raw frame through process_data, keeping the same dataset keys.
processed_data_dict = {
    dataset: process_data(data, START_DATE, END_DATE)
    for dataset, data in raw_data_dict.items()
}

## Save processed datasets

In [6]:
processed_directory = 'processed-data'

# exist_ok=True makes creation idempotent and avoids the race between a
# separate existence check and the directory creation.
os.makedirs(processed_directory, exist_ok=True)

In [7]:
# Write each processed frame to
# <processed_directory>/<name>_<start>_<end>.processed-data.csv.
# NOTE: to_csv is called with its defaults, so the integer row index is
# written as the first (unnamed) column of every file.
for dataset, data in processed_data_dict.items():
    file_name = f"{dataset}_{START_DATE}_{END_DATE}.processed-data.csv"
    data.to_csv(os.path.join(processed_directory, file_name))
    print(f"Saved {file_name}")

Saved BIDV_2019-01-01_2024-06-01.processed-data.csv
Saved VCB_2019-01-01_2024-06-01.processed-data.csv
Saved EIB_2019-01-01_2024-06-01.processed-data.csv
