# EDA - Transactions

In [1]:
from pathlib import Path
transactions_csv_file = Path().cwd().parent / 'data' / 'transactions.csv'

import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
# Create dataframe from csv.
transactions = pd.read_csv(transactions_csv_file)

In [3]:
# Quick look at the data.
transactions.head()

Unnamed: 0,account_id,ticker_symbol,trade_date,trade_type,shares,price
0,1.0,STK007,2024-07-25,SELL,88.6,546.75
1,1.0,STK261,2024-09-13,BUY,178.19,80.29
2,1.0,STK345,2024-09-11,BUY,54.42,273.24
3,1.0,STK432,2024-11-12,BUY,78.74,391.8
4,1.0,STK082,2024-07-09,BUY,18.21,397.14


In [4]:
# Display all unique ticker symbols.
print(f'Unique ticker symbols: {transactions['ticker_symbol'].unique()}')

Unique ticker symbols: ['STK007' 'STK261' 'STK345' ... 'STK4Q94' 'STKg14' 'STo330']


In [5]:
# Display all unique values for trade type.
print(f'Unique trade types: {transactions["trade_type"].unique()}')

Unique trade types: ['SELL' 'BUY' 'BUx' 'BU' 'buy' 'BUy' ' SELL ' 'BUYo' 'BUY_' 'BUYd' 'BUYB'
 'BrUY' ' BUY ' 'BUY5' 'BYU' nan 'UBY' 'ESLL' 'SELLi' 'UY' 'SLL' 'buY'
 'fUY' 'BUYp' 'BrY' 'BUYy' 'SELn' 'mUY' 'BY' 'SLEL' 'SELlL' 'SEwL' 'BnY'
 'BUYj' 'BUmY' 'BUm' 'SELLr' 'zUY' 'ELL' 'kSELL' 'SELL_' 'BUaY' 'SELLw'
 'SELLd' 'Buy' 'BUY-' 'BUYq' 'bUy' 'ByY' 'BUYg' ' BUi ' 'BUY1' 'BuY' 'BaY'
 'lBUY' 'SELL-' 'BUYa' 'BUY8' 'BcY' 'sell' 'BUUY' 'SELLz' 'BhY' 'SELt'
 'BiY' 'BUYA' 'SEhL' 'BUY9' 'BUt' 'BUY7' 'SELLB' 'selL' 'bUY' 'seLl' 'SEL'
 'rUY' 'BUa' 'SELPL' 'sElL' 'SEzL' 'JBUY' 'SELx' 'BYUY' 'BUYR' 'SElL'
 'xUY' 'BqUY' 'BUYe' 'BUj' 'BUYc' 'BUlY' 'lUY' 'BUYv' 'mBUY' 'SELLs'
 'sELl' 'BUYX' 'BUdY' 'BUq' 'VBUY' 'SELLl' 'SEdL' 'BUY2' 'PBUY' 'BeY'
 'pUY' 'SELL4' 'lELL' 'BUd' 'BvUY' 'SoLL' 'SxELL' 'BpY' 'SELL8' 'SEnL'
 'BjY' 'BUr' 'wUY' 'aUY' 'iUY' 'BUYE' 'gUY' 'BmUY' 'SNELL' 'oUY' 'SEELL'
 'SjELL' 'BlY' 'SELwL' 'SxLL' 'sEll' 'BLUY' 'SEll' 'kUY' 'BgY' 'BUeY'
 'Sell' 'SpELL' 'SCELL' 'SELl' 'QBUY' 'SETLL' 

In [6]:
# Non-null count and Dtypes for each column.
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4582246 entries, 0 to 4582245
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   account_id     float64
 1   ticker_symbol  object 
 2   trade_date     object 
 3   trade_type     object 
 4   shares         float64
 5   price          float64
dtypes: float64(3), object(3)
memory usage: 209.8+ MB


In [7]:
# Show dataframe stats.
print(f'Shape of transactions dataframe: {transactions.shape}')
transactions.describe(include='all')

Shape of transactions dataframe: (4582246, 6)


Unnamed: 0,account_id,ticker_symbol,trade_date,trade_type,shares,price
count,4574630.0,4574557,4574611,4574744,4574656.0,4574749.0
unique,,15543,909,744,,
top,,STK111,2024-11-26,BUY,,
freq,,10825,18354,3170197,,
mean,15972.58,,,,778.62,268.41
std,32846.59,,,,1556.25,165.58
min,1.0,,,,-29718.8,-1026.22
25%,7510.0,,,,93.13,131.16
50%,15020.0,,,,267.85,257.9
75%,22407.0,,,,809.35,382.49


In [8]:
# Remove leading/trailing whitespace from strings & convert to lowercase and recheck counts, unique values and freq.
transactions = transactions.map(lambda x: x.strip().lower() if isinstance(x, str) else x)
transactions.columns = transactions.columns.str.strip().str.lower()
transactions.describe(include='all')

Unnamed: 0,account_id,ticker_symbol,trade_date,trade_type,shares,price
count,4574630.0,4574557,4574611,4574744,4574656.0,4574749.0
unique,,12833,618,438,,
top,,stk111,2024-11-26,buy,,
freq,,10887,18445,3189012,,
mean,15972.58,,,,778.62,268.41
std,32846.59,,,,1556.25,165.58
min,1.0,,,,-29718.8,-1026.22
25%,7510.0,,,,93.13,131.16
50%,15020.0,,,,267.85,257.9
75%,22407.0,,,,809.35,382.49


In [9]:
# Check for missing values.
print(f'Missing values in price history:\n{transactions.isna().sum()}')

Missing values in price history:
account_id       7616
ticker_symbol    7689
trade_date       7635
trade_type       7502
shares           7590
price            7497
dtype: int64


## Notes
- date column should be converted to datetime, extracted to dimension date
- fix typos in trade type (should be either buy or sell)
- drop rows where ticker symbols are not in price history
- trade type should be converted to single character (b: buy and s: sell) and dtype('category')
- account_id should be converted to int
- transaction_id should be created after dropping rows