# ETL Pipeline

In [None]:
import importlib
from pathlib import Path
import time

import pandas as pd

import helpers
importlib.reload(helpers)
from helpers import create_df, match_string

import utils
importlib.reload(utils)
from utils import logger

# Start the wall-clock timer; the last pipeline cell subtracts start_time to report the run duration.
start_time = time.time()
logger.info('ETL pipeline started...')

# Show floats with 4 decimal places. This only affects how pandas displays values;
# the data held in the dataframes is not rounded.
pd.set_option('display.float_format', lambda value: '%.4f' % value)

# All input files sit in the 'data' directory beside the current working directory.
data_dir = Path.cwd().parent / 'data'
account_positions_csv_file = data_dir / 'account_positions.csv'
accounts_csv_file = data_dir / 'accounts.csv'
price_history_csv_file = data_dir / 'price_history.csv'
transactions_csv_file = data_dir / 'transactions.csv'

# Create dataframes from csv files (read in the same order as the paths above).
account_positions, accounts, price_history, transactions = [
    pd.read_csv(csv_file)
    for csv_file in (
        account_positions_csv_file,
        accounts_csv_file,
        price_history_csv_file,
        transactions_csv_file,
    )
]
logger.info('Dataframes created from .csv files.')

In [None]:
#====================================
# Dimension account dataframe.
#====================================
# Columns carried over from the raw accounts data.
dim_account_columns = [
    'account_id',
    'user_name',
    'first_name',
    'last_name',
    'account_type',
    'created_at',
]

# Drop rows where account ID is NaN since no positions or transactions can be linked to them.
# NOTE(review): presumably the trailing 'account_id' argument is what requests that drop;
# create_df is defined in helpers.py, so confirm against its signature.
dim_account = create_df('dim_account', accounts, dim_account_columns, 'account_id')

In [None]:
# Fix typos in account types column.
# Each raw value is handed to match_string (helpers.py) together with the list of
# accepted spellings, and whatever it returns replaces the value in the column.
correct_account_types = ['joint', 'individual', 'retirement']

# Map the whole column in one pass instead of reading and writing one cell at a time
# through .loc. This avoids a label lookup per row and always passes match_string a
# single value, even if the index happens to contain duplicate labels.
dim_account['account_type'] = dim_account['account_type'].map(
    lambda account_type: match_string(account_type, correct_account_types)
)

logger.info('Account type errors fixed.')

In [None]:
#=================================
# Dimension date dataframe.
#=================================
# Extract all dates from all dataframes. Each source names its date column
# differently, so every result is renamed to a common 'date' column.
account_position_dates = (
    create_df('account_position_dates', account_positions, ['last_updated'])
    .rename(columns={'last_updated': 'date'})
)
account_dates = (
    create_df('account_dates', accounts, ['created_at'])
    .rename(columns={'created_at': 'date'})
)
# Price history already calls its column 'date', so no rename is needed.
price_history_dates = create_df('price_history_dates', price_history, ['date'])
transaction_dates = (
    create_df('transaction_dates', transactions, ['trade_date'])
    .rename(columns={'trade_date': 'date'})
)

In [None]:
# Stack the per-source date dataframes and drop repeated rows to get the master date dataframe.
date_frames = [account_position_dates, account_dates, price_history_dates, transaction_dates]
unique_dates = pd.concat(date_frames).drop_duplicates()

# Create dimension date dataframe, then replace the inherited index with a fresh 0..n-1 one.
dim_date = create_df('dim_date', unique_dates, ['date'], 'date', 'date')
dim_date = dim_date.reset_index(drop=True)

# Reset date ID so it counts 1..n in the current row order.
dim_date['date_id'] = range(1, len(dim_date) + 1)

In [None]:
#===================================
# Dimension ticker dataframe.
#===================================
# Built from the ticker symbols in price history, passing None as create_df's fourth
# argument and 'ticker' as its fifth; the index is then renumbered from 0.
dim_ticker = create_df('dim_ticker', price_history, ['ticker_symbol'], None, 'ticker')
dim_ticker = dim_ticker.reset_index(drop=True)

In [None]:
#===================================
# Fact account position dataframe.
#===================================
# Columns carried over from the raw account positions data.
fact_account_position_columns = ['account_id', 'ticker_symbol', 'shares_held', 'last_updated']

# Intent: drop rows where account ID is NaN since no accounts or transactions can be linked to them.
# NOTE(review): unlike the dim_account call, no 'account_id' argument is passed to create_df here,
# so confirm in helpers.py whether NaN account IDs are actually dropped by this call.
fact_account_position = create_df(
    'fact_account_position',
    account_positions,
    fact_account_position_columns,
)

In [None]:
# Drop rows with invalid tickers and save them to csv.
# A ticker is kept only if it is exactly 'stk' followed by digits and that number is in 101-500.

# Extract numeric part and ensure it matches the exact format (NaN where it does not).
numeric_part = fact_account_position['ticker_symbol'].str.extract(r'^stk(\d+)$', expand=False)

# Convert to numeric (handles NaN), then check the range.
valid_mask = (
    numeric_part.notna() &
    (pd.to_numeric(numeric_part, errors='coerce').between(101, 500))
)

# Separate valid/invalid tickers.
valid_account_positions = fact_account_position[valid_mask]
invalid_account_positions = fact_account_position[~valid_mask]

# Save invalids to csv. to_csv does not create missing directories, so make sure
# the 'invalid' output directory exists before writing.
invalid_account_positions_csv = Path().cwd().parent / 'data' / 'invalid' / 'invalid_account_positions.csv'
invalid_account_positions_csv.parent.mkdir(parents=True, exist_ok=True)
invalid_account_positions.to_csv(invalid_account_positions_csv, index=False)

# Set fact account position to valid positions.
fact_account_position = valid_account_positions

logger.info(f'{len(invalid_account_positions)} rows with invalid ticker symbols dropped from fact account position and saved to new csv file.')

In [None]:
#===================================
# Fact price_history dataframe.
#===================================
# Every column of the raw price history is carried over.
price_history_columns = price_history.columns.tolist()
fact_price_history = create_df('fact_price_history', price_history, price_history_columns)

In [None]:
#===================================
# Fact transaction dataframe.
#===================================
# Every column of the raw transactions is carried over; None is passed as create_df's
# fourth argument and 'transaction' as its fifth.
transaction_columns = transactions.columns.tolist()
fact_transaction = create_df('fact_transaction', transactions, transaction_columns, None, 'transaction')

In [None]:
# Fix typos in trade type column by defining regex and applying mask for each trade type.
# NOTE(review): both patterns are character classes, not literal words. r'[buy]' matches any
# value containing a 'b', 'u' or 'y', and r'[sell]' any value containing an 's', 'e' or 'l'
# (case-insensitive). That tolerates misspellings, but it also relabels unrelated values
# such as 'unknown' as 'buy' - confirm this loose matching is intended.
buy_regex = r'[buy]'
sell_regex = r'[sell]'

# Missing trade types match neither pattern (na=False) and are left untouched.
trade_types = fact_transaction['trade_type']
buy_mask = trade_types.str.contains(buy_regex, case=False, na=False)
sell_mask = trade_types.str.contains(sell_regex, case=False, na=False)

# 'buy' takes precedence: a row becomes 'sell' only when it matched the sell pattern alone.
sell_only_mask = sell_mask & ~buy_mask
fact_transaction.loc[sell_only_mask, 'trade_type'] = 'sell'
fact_transaction.loc[buy_mask, 'trade_type'] = 'buy'

logger.info('Typos in trade type column fixed.')

In [None]:
# Convert trade type to 'b' for 'buy' or 's' for 'sell'. Any other value is left as it is.
trade_type_codes = {'buy': 'b', 'sell': 's'}
fact_transaction['trade_type'] = fact_transaction['trade_type'].replace(trade_type_codes)
logger.info('Trade types replaced with single characters.')

In [None]:
# Convert columns to proper dtypes.
# NOTE(review): casting to a NumPy integer dtype raises if the column still holds NaN -
# confirm transaction_id and account_id are complete by this point.
fact_transaction_dtypes = dict(
    transaction_id='int32',
    account_id='int32',
    ticker_symbol='string',
    trade_type='category',
    shares='float64',
    price='float64',
)
fact_transaction = fact_transaction.astype(fact_transaction_dtypes)

In [None]:
# Stop the timer started at the top of the pipeline and log the total run time.
end_time = time.time()
execution_time = end_time - start_time
# The message starts with the 'therefore' sign (U+2234). It replaces the mojibake that
# appears when that character's UTF-8 bytes are decoded as Windows-1252.
logger.info(f'∴ ETL pipeline finished.\nExecution time: {execution_time:.4f} seconds.')

---

In [None]:
#TODO:
# Create csv files of bad/missing data.
# Change account type in dim account to i: individual, j: joint or r: retirement.
# Map values to IDs in dimension dataframes.
# Rearrange columns to match schema.
# Convert to correct dtypes.