# EDA - Account Positions

In [None]:
from pathlib import Path

import pandas as pd

# The CSV lives in the 'data' folder next to the current working directory's parent.
account_positions_csv_file = Path.cwd().parent / 'data' / 'account_positions.csv'

# Render floats with two decimal places in dataframe output.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Load the account positions CSV into a dataframe (default read_csv settings).
account_positions = pd.read_csv(account_positions_csv_file)

In [None]:
# Quick look at the data: preview the first rows (head() shows 5 by default).
account_positions.head()

In [None]:
# Print the non-null count and dtype of each column, plus overall memory usage.
account_positions.info()

In [None]:
# Report the (rows, columns) shape, then summarise every column
# (include='all' covers non-numeric columns as well as numeric ones).
frame_shape = account_positions.shape
print(f'Shape of account_positions dataframe: {frame_shape}')
account_positions.describe(include='all')

In [None]:
# Normalise text: trim surrounding whitespace and lowercase every string cell
# and every column name, then re-run describe() to recheck counts, unique values and freq.
def _normalise_text(value):
    """Strip and lowercase strings; return any other value unchanged."""
    if isinstance(value, str):
        return value.strip().lower()
    return value


account_positions = account_positions.map(_normalise_text)
account_positions.columns = account_positions.columns.str.strip().str.lower()
account_positions.describe(include='all')

In [None]:
# Count the missing entries in each column and print the per-column totals.
missing_per_column = account_positions.isna().sum()
print(f'Missing values in account positions:\n{missing_per_column}')

In [None]:
# Check if ticker_symbol values match pattern: 3 letters + 3 digits (eg 'STK069').
# Letter case is ignored by default because string values are lowercased earlier in this notebook.
def validate_ticker_format(df, *, case_sensitive=False):
    """
    Validate ticker_symbol column values against pattern: 3 letters + 3 digits.

    Prints a summary (total, valid, invalid and null counts) followed by the
    unique non-null values that fail the pattern.

    Args:
        df: DataFrame with a 'ticker_symbol' column holding string values.
        case_sensitive: When True, only uppercase letters are accepted.
            Defaults to False so lowercased tickers such as 'stk069' validate.

    Returns:
        Boolean Series aligned with df's index: True where the ticker matches
        the pattern, False otherwise (null tickers are False).
    """
    pattern = r'^[A-Z]{3}\d{3}$'
    tickers = df['ticker_symbol']

    is_null = tickers.isna()
    # na=False turns null entries into False instead of propagating NaN.
    is_valid = tickers.str.match(pattern, case=case_sensitive, na=False)
    # Nulls get their own line in the summary, so keep them out of the invalid count.
    is_invalid = ~is_valid & ~is_null

    # Summary
    total = len(df)
    valid = is_valid.sum()
    invalid = is_invalid.sum()
    null = is_null.sum()

    def pct(count):
        # Guard against an empty dataframe (avoids dividing by zero).
        return count / total * 100 if total else 0.0

    print(f"Total rows: {total}")
    print(f"Valid format: {valid} ({pct(valid):.1f}%)")
    print(f"Invalid format: {invalid} ({pct(invalid):.1f}%)")
    print(f"Null values: {null}")

    # Show invalid values
    if invalid > 0:
        invalid_values = tickers[is_invalid].unique()
        print(f"\nInvalid ticker symbols (unique values): {len(invalid_values)}")
        for val in invalid_values:
            print(f" - {val}")

    return is_valid

# Store the per-row result of the ticker format check in a new 'is_valid' column.
account_positions['is_valid'] = validate_ticker_format(account_positions)

## Notes:
- account_id should be converted to integer
- last_updated should be converted to datetime
- drop rows with any null or invalid values
- create a separate df containing all rows with null or invalid values