In [None]:
from pathlib import Path
# The accounts CSV lives in the `data` directory next to the current working directory.
accounts_csv_file = Path.cwd().parent.joinpath('data', 'accounts.csv')

from matplotlib import pyplot as plt
import pandas as pd
# Render floats with exactly two decimal places in pandas output.
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Load the accounts CSV into a dataframe (default parsing options).
accounts = pd.read_csv(filepath_or_buffer=accounts_csv_file)

In [None]:
# Preview the first five rows of the raw data.
accounts.head(n=5)

In [None]:
# Print a structural summary of the frame: the non-null count and dtype of each
# column. Useful for spotting columns with missing values or unexpected dtypes.
accounts.info()

In [None]:
# Show dataframe stats: print the frame's shape, then display summary statistics.
# include='all' asks describe() to cover every column, not only the numeric ones.
print(f'Shape of accounts dataframe: {accounts.shape}')
accounts.describe(include='all')

In [None]:
# Normalise text so that values differing only by case or surrounding whitespace
# compare equal: every string cell is stripped and lowercased, non-string cells
# pass through unchanged, and the column labels get the same treatment.
accounts = accounts.map(lambda cell: cell if not isinstance(cell, str) else cell.strip().lower())
accounts = accounts.set_axis(accounts.columns.str.strip().str.lower(), axis='columns')

# Re-run the summary to compare counts, unique values and freq against the raw data.
accounts.describe(include='all')

In [None]:
# Check for dupes: for each column, count the entries that repeat a value
# already seen earlier in that same column.
print(f'Duplicate values in accounts:\n{accounts.apply(lambda column: column.duplicated().sum())}')

In [None]:
# Check for missing values: count the null entries in each column of `accounts`.
print(f'Missing values in accounts:\n{accounts.isna().sum()}')

In [None]:
# Account types comparison (before fix).
# List every distinct label first so that any value outside the three plotted
# below is visible in the output.
account_types = accounts['account_type'].unique().tolist()
print(account_types)

# Count only the three plotted account types, in a fixed order. reindex() is used
# instead of label indexing so that a type absent from the data is reported as 0
# rather than raising KeyError and aborting the cell.
counts = (
    accounts['account_type']
    .value_counts()
    .reindex(['individual', 'joint', 'retirement'], fill_value=0)
)
counts.plot(kind='bar')
plt.title('Account Type Distribution')
plt.xlabel('Account Type')
plt.ylabel('Count')
plt.xticks(rotation=45)  # Rotate labels if needed
plt.tight_layout()  # Prevents label cutoff
plt.show()

## Notes
- account_id should be converted to integer
- created_at should be converted to datetime
- account types need to be fixed
- drop rows with any null or invalid values
- create a separate df containing all rows with null or invalid values