# ETL Pipeline

In [None]:
import difflib
import importlib
from pathlib import Path

import pandas as pd

import helpers
importlib.reload(helpers)
from helpers import create_df, match_string

import utils
importlib.reload(utils)
from utils import logger

logger.info('ETL pipeline started...')

# Show floats with 4 decimal places so small differences stay visible during analysis.
pd.set_option('display.float_format', lambda value: '%.4f' % value)

# All input files live in the `data` directory that sits beside the current working directory.
data_dir = Path.cwd().parent / 'data'
account_positions_csv_file = data_dir / 'account_positions.csv'
accounts_csv_file = data_dir / 'accounts.csv'
price_history_csv_file = data_dir / 'price_history.csv'
transactions_csv_file = data_dir / 'transactions.csv'

# Load each csv file into its own dataframe.
account_positions = pd.read_csv(account_positions_csv_file)
accounts = pd.read_csv(accounts_csv_file)
price_history = pd.read_csv(price_history_csv_file)
transactions = pd.read_csv(transactions_csv_file)
logger.info('Dataframes created from .csv files.')

In [None]:
#====================================
# Create dimension account dataframe.
#====================================
# Columns of `accounts` handed to create_df for the account dimension.
dim_account_columns = [
    'account_id',
    'user_name',
    'first_name',
    'last_name',
    'account_type',
    'created_at',
]
dim_account = create_df('dim_account', accounts, dim_account_columns)

In [None]:
# Fix typos in the account types column.
# Every value is passed through match_string together with the list of valid
# account types, and the column is replaced with the returned values.
correct_account_types = ['joint', 'individual', 'retirement']

# Series.map applies the helper once per value. This avoids a per-row
# `.loc` read/write loop, which is slow and would hand match_string a whole
# Series (instead of a single value) whenever the index has duplicate labels.
dim_account['account_type'] = dim_account['account_type'].map(
    lambda raw_type: match_string(raw_type, correct_account_types)
)

logger.info('Account type errors fixed.')

In [None]:
# Keep only rows that have an account ID; rows without one cannot be linked to any transactions.
has_account_id = dim_account['account_id'].notna()
dim_account = dim_account.loc[has_account_id]
logger.info('Rows with NaN account IDs dropped.')

In [None]:
#===================================
# Create dimension ticker dataframe.
#===================================
# Single column of `price_history` handed to create_df for the ticker dimension.
dim_ticker_columns = ['ticker_symbol']
dim_ticker = create_df('dim_ticker', price_history, dim_ticker_columns, 'ticker')

In [None]:
#===================================
# Create fact transaction dataframe.
#===================================
# The fact table is built from every column of `transactions`.
fact_transaction_columns = transactions.columns.tolist()
fact_transaction = create_df('fact_transaction', transactions, fact_transaction_columns, 'transaction')

In [None]:
# Drop rows that contain any NaN or null value and log how many were removed.
# NOTE(review): only missing values are removed here; duplicate rows are not dropped by this cell.
fact_transaction_before = fact_transaction.shape[0]
fact_transaction = fact_transaction.dropna()
fact_transaction_after = fact_transaction.shape[0]
rows_dropped = fact_transaction_before - fact_transaction_after
logger.info(f'{rows_dropped} rows with nan or null values dropped.')

In [None]:
# Fix typos in the trade type column by fuzzy-matching every distinct value
# against the canonical trade types.
#
# The previous patterns r'[buy]' and r'[sell]' were regex character classes,
# so they matched any value containing a single one of those letters: every
# value with a 'b', 'u' or 'y' became 'buy', and any other value with an
# 's', 'e' or 'l' became 'sell', regardless of what it actually said.
canonical_trade_types = ['buy', 'sell']

# Build the correction table from the distinct values only, so the fuzzy
# matcher runs once per spelling rather than once per row.
trade_type_fixes = {}
for raw_value in fact_transaction['trade_type'].dropna().unique():
    # Compare case-insensitively and ignore surrounding whitespace.
    normalized = str(raw_value).strip().lower()
    # get_close_matches returns the best candidates whose similarity score is
    # at least its default cutoff (0.6), or an empty list when nothing is close.
    closest = difflib.get_close_matches(normalized, canonical_trade_types, n=1)
    if closest:
        trade_type_fixes[raw_value] = closest[0]

# Values without a close match are left untouched rather than being guessed.
fact_transaction['trade_type'] = fact_transaction['trade_type'].map(
    lambda value: trade_type_fixes.get(value, value)
)

logger.info('Typos in trade type column fixed.')

In [None]:
# Abbreviate trade types to a single character: 'buy' -> 'b', 'sell' -> 's'.
trade_type_codes = {'buy': 'b', 'sell': 's'}
fact_transaction['trade_type'] = fact_transaction['trade_type'].replace(trade_type_codes)
logger.info('Trade types replaced with single characters.')

In [None]:
# Cast each fact_transaction column to its intended dtype.
fact_transaction_dtypes = {
    'transaction_id': 'int32',
    'account_id': 'int32',
    'ticker_symbol': 'string',
    'trade_type': 'category',
    'shares': 'float64',
    'price': 'float64',
}
fact_transaction = fact_transaction.astype(fact_transaction_dtypes)

In [None]:
# TODO: Create dimension ticker and date tables, then map their ids to these columns.

# Preview the first five rows of the fact table.
fact_transaction.head(n=5)

In [None]:
# dim_account has already been created from `accounts` and cleaned in the
# cells above (account type typos fixed, rows with NaN account IDs dropped).
# It is deliberately NOT re-created here: calling create_df on the raw
# `accounts` dataframe again would overwrite the cleaned dataframe and
# silently discard those fixes.

In [None]:
# Remove rows that exactly repeat an earlier row, keeping the first occurrence.
is_repeated_row = dim_account.duplicated()
dim_account = dim_account[~is_repeated_row]

In [None]:
# Show summary statistics for dim_account; include='all' covers every column, not only the numeric ones.
dim_account.describe(include='all')