# EDA - Account Positions

In [1]:
# Imports grouped as: standard library, third-party, project-local.
import importlib
from pathlib import Path

import pandas as pd

import helpers

# Re-execute the helpers module so the name imported below is bound to its
# most recently loaded definition.
importlib.reload(helpers)
from helpers import validate_ticker_format

# Input file: data/account_positions.csv under the parent of the current working directory.
account_positions_csv_file = Path.cwd().parent / 'data' / 'account_positions.csv'

# Show floats with two decimal places when frames are displayed.
pd.set_option('display.float_format', lambda value: '%.2f' % value)

In [2]:
# Create dataframe from csv.
# No dtype or date-parsing options are passed, so pandas infers each column's type
# (see the .info() output further down for the resulting dtypes).
account_positions = pd.read_csv(account_positions_csv_file)

In [3]:
# Quick look at the data: preview the first rows (head() returns 5 by default)
# to check the column names and typical values.
account_positions.head()

Unnamed: 0,account_id,ticker_symbol,shares_held,last_updated
0,1.0,STK325,166.2,2024-12-31
1,1.0,STK104,152.63,2024-12-31
2,1.0,STK159,242.38,2024-12-31
3,1.0,STK405,80.87,2024-12-31
4,1.0,STK094,81.08,2024-12-31


In [4]:
# Non-null count and dtype for each column, plus the total row count and memory usage.
# A non-null count below the total number of entries means that column has missing values.
account_positions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724449 entries, 0 to 724448
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   account_id     722692 non-null  float64
 1   ticker_symbol  722684 non-null  object 
 2   shares_held    722551 non-null  float64
 3   last_updated   722613 non-null  object 
dtypes: float64(2), object(2)
memory usage: 22.1+ MB


In [5]:
# Report the (rows, columns) shape, then summary statistics for every column
# (numeric and non-numeric alike via include='all').
frame_shape = account_positions.shape
print(f'Shape of account_positions dataframe: {frame_shape}')
account_positions.describe(include='all')

Shape of account_positions dataframe: (724449, 4)


Unnamed: 0,account_id,ticker_symbol,shares_held,last_updated
count,722692.0,722684,722551.0,722613
unique,,4032,,370
top,,STK069,,2024-12-31
freq,,1526,,717407
mean,15887.42,,98.75,
std,31959.52,,136.76,
min,1.0,,-1768.62,
25%,7361.0,,25.27,
50%,14897.0,,53.16,
75%,22535.0,,116.71,


In [6]:
# Normalise text so that values differing only by case or surrounding whitespace
# collapse together, then recheck counts, unique values and freq.
def _strip_lower(value):
    """Return ``value`` stripped and lowercased if it is a str; otherwise return it unchanged."""
    if isinstance(value, str):
        return value.strip().lower()
    return value


# Apply the normalisation to every cell, then to the column labels.
cleaned = account_positions.map(_strip_lower)
cleaned.columns = cleaned.columns.str.strip().str.lower()
account_positions = cleaned
account_positions.describe(include='all')

Unnamed: 0,account_id,ticker_symbol,shares_held,last_updated
count,722692.0,722684,722551.0,722613
unique,,2986,,359
top,,stk069,,2024-12-31
freq,,1538,,721140
mean,15887.42,,98.75,
std,31959.52,,136.76,
min,1.0,,-1768.62,
25%,7361.0,,25.27,
50%,14897.0,,53.16,
75%,22535.0,,116.71,


In [7]:
# Count missing (NaN) entries in each column and print the per-column totals.
missing_per_column = account_positions.isna().sum()
print(f'Missing values in account positions:\n{missing_per_column}')

Missing values in account positions:
account_id       1757
ticker_symbol    1765
shares_held      1898
last_updated     1836
dtype: int64


In [8]:
# Check if ticker_symbol values match pattern: 3 letters + 3 digits (eg 'STK069'),
# storing the helper's result in a new 'is_valid' column on account_positions.
# NOTE(review): string values were lowercased in an earlier cell, so the helper presumably
# matches case-insensitively (the summary below reports 99.4% valid) - confirm in helpers.py.
# The helper also appears to print the summary shown below (totals and the list of invalid
# symbols), since an assignment statement displays nothing by itself.
account_positions['is_valid'] = validate_ticker_format(account_positions)

Total rows: 724449
Valid format: 720426 (99.4%)
Invalid format: 4023 (0.6%)
Null values: 1765

Invalid ticker symbols (unique values): 1935
 - nan
 - stk03
 - stk383_
 - st0k44
 - stk1w8
 - stk47
 - stk3014
 - stk0263
 - stvk273
 - stk21
 - stk1j9
 - stk383-
 - stk32
 - stk31y2
 - stk16w
 - st214
 - stk434x
 - stk045-
 - stnk216
 - stk045y
 - stks50
 - stke15
 - stk0742
 - stfk042
 - stk077d
 - stkb073
 - stkg308
 - stk354_
 - stk36a
 - stk189-
 - stk124_
 - stk08
 - stkh33
 - stk14i4
 - stk46m0
 - stkd36
 - stk403m
 - stk4k22
 - stk36f4
 - stk218s
 - stk1z00
 - st484
 - stk36
 - stk085_
 - ystk456
 - tk109
 - stk46f
 - stk05
 - tk159
 - stfk285
 - stk60
 - stk159w
 - stk196_
 - stk435_
 - stk0x57
 - st262
 - stk200e
 - stk0v7
 - stk73
 - sk053
 - sk173
 - stk03v
 - stk252_
 - stqk052
 - st4k29
 - stk460-
 - stjk108
 - stk056i
 - stk278_
 - fstk058
 - stk01
 - st2k32
 - stk1z3
 - stk1v89
 - stk1118
 - stk0b2
 - stk150r
 - svtk282
 - stk0n07
 - stk238-
 - stk330s
 - st2k34
 - stky08
 - 

## Notes:
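- account_id should be converted to integer (it currently contains nulls, so either handle those rows first or use pandas' nullable `Int64` dtype)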
- last_updated should be converted to datetime
- drop rows with any null or invalid values
- create a separate df containing all rows with null or invalid values (capture these rows before dropping them)