# EDA - Account Positions

In [1]:
from pathlib import Path
account_positions_csv_file = Path().cwd().parent / 'data' / 'account_positions.csv'

import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
# Create dataframe from csv.
account_positions = pd.read_csv(account_positions_csv_file)

In [3]:
# Quick look at the data.
account_positions.head()

Unnamed: 0,account_id,ticker_symbol,shares_held,last_updated
0,1.0,STK325,166.2,2024-12-31
1,1.0,STK104,152.63,2024-12-31
2,1.0,STK159,242.38,2024-12-31
3,1.0,STK405,80.87,2024-12-31
4,1.0,STK094,81.08,2024-12-31


In [4]:
# Non-null count and Dtypes for each column.
account_positions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 724449 entries, 0 to 724448
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   account_id     722692 non-null  float64
 1   ticker_symbol  722684 non-null  object 
 2   shares_held    722551 non-null  float64
 3   last_updated   722613 non-null  object 
dtypes: float64(2), object(2)
memory usage: 22.1+ MB


In [5]:
# Show dataframe stats.
print(f'Shape of account_positions dataframe: {account_positions.shape}')
account_positions.describe(include='all')

Shape of account_positions dataframe: (724449, 4)


Unnamed: 0,account_id,ticker_symbol,shares_held,last_updated
count,722692.0,722684,722551.0,722613
unique,,4032,,370
top,,STK069,,2024-12-31
freq,,1526,,717407
mean,15887.42,,98.75,
std,31959.52,,136.76,
min,1.0,,-1768.62,
25%,7361.0,,25.27,
50%,14897.0,,53.16,
75%,22535.0,,116.71,


In [6]:
# Remove leading/trailing whitespace from strings & convert to lowercase and recheck counts, unique values and freq.
account_positions = account_positions.map(lambda x: x.strip().lower() if isinstance(x, str) else x)
account_positions.columns = account_positions.columns.str.strip().str.lower()
account_positions.describe(include='all')

Unnamed: 0,account_id,ticker_symbol,shares_held,last_updated
count,722692.0,722684,722551.0,722613
unique,,2986,,359
top,,stk069,,2024-12-31
freq,,1538,,721140
mean,15887.42,,98.75,
std,31959.52,,136.76,
min,1.0,,-1768.62,
25%,7361.0,,25.27,
50%,14897.0,,53.16,
75%,22535.0,,116.71,


In [9]:
# Check for missing values.
print(f'Missing values in account positions:\n{account_positions.isna().sum()}')

Missing values in account positions:
account_id       1757
ticker_symbol    1765
shares_held      1898
last_updated     1836
is_valid            0
dtype: int64


In [8]:
# Check if ticker_symbol values match pattern: 3 uppercase letters + 3 digits (eg 'STK069')
def validate_ticker_format(df):
    """
    Validate ticker_symbol column values against pattern: 3 letters + 3 digits
    Returns summary and invalid values
    """
    pattern = r'^[A-Z]{3}\d{3}$'

    # Check pattern match (handle NaN)
    is_valid = df['ticker_symbol'].str.match(pattern, na=False)

    # Summary
    total = len(df)
    valid = is_valid.sum()
    invalid = (~is_valid).sum()
    null = df['ticker_symbol'].isna().sum()

    print(f"Total rows: {total}")
    print(f"Valid format: {valid} ({valid/total*100:.1f}%)")
    print(f"Invalid format: {invalid} ({invalid/total*100:.1f}%)")
    print(f"Null values: {null}")

    # Show invalid values
    if invalid > 0:
        invalid_values = df.loc[~is_valid, 'ticker_symbol'].unique()
        print(f"\nInvalid ticker symbols (unique values): {len(invalid_values)}")
        for val in invalid_values:
            print(f" - {val}")

    return is_valid

account_positions['is_valid'] = validate_ticker_format(account_positions)

Total rows: 724449
Valid format: 0 (0.0%)
Invalid format: 724449 (100.0%)
Null values: 1765

Invalid ticker symbols (unique values): 2987
 - stk325
 - stk104
 - stk159
 - stk405
 - stk094
 - stk236
 - stk420
 - stk011
 - stk051
 - stk017
 - stk001
 - stk261
 - stk146
 - stk204
 - stk189
 - stk221
 - stk130
 - stk018
 - stk338
 - stk345
 - stk029
 - stk399
 - stk118
 - stk460
 - stk102
 - stk113
 - stk413
 - stk173
 - stk319
 - stk137
 - stk429
 - stk473
 - stk145
 - stk231
 - stk206
 - stk179
 - stk207
 - stk432
 - stk147
 - stk081
 - stk216
 - stk080
 - stk181
 - stk449
 - stk088
 - stk332
 - stk250
 - stk264
 - stk214
 - stk351
 - stk301
 - stk162
 - stk381
 - stk350
 - stk082
 - stk481
 - stk079
 - stk215
 - stk335
 - stk197
 - stk424
 - stk494
 - stk367
 - stk151
 - stk012
 - stk365
 - stk007
 - stk406
 - stk302
 - stk062
 - stk384
 - stk208
 - stk066
 - stk363
 - stk057
 - stk015
 - stk371
 - stk010
 - stk217
 - stk346
 - stk343
 - stk144
 - stk149
 - stk469
 - stk143
 - stk498
 -

## Notes:
- account_id should be converted to integer
- last_updated should be converted to datetime
- drop rows with any null or invalid values
- create separate df to contain all rows null and invalid values