# Data Audit and Problem Discovery Notebook 

This notebook provides a reusable set of checks for:
- Structural sanity (shape, dtypes, duplicates)
- Missingness patterns 
- Distributions 
- Type consistency 
- Outliers 
- Basic time-series checks (if date column is present)

Configure:
- `CSV_PATH`
- `DATE_COLS`
- `INDEX_COL`
- `DATE_COL_FOR_TS`

## Config

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 120)

sns.set()

CSV_PATH = "testing.csv"             # <- change this
INDEX_COL = None                     # or e.g. "id"
DATE_COLS = ["date"]                 # e.g. ["date"]
DATE_COL_FOR_TS = None               # e.g. "date" if time-series

READ_CSV_KWARGS = {
    "header": "infer",   # or None if your file has no header row
    # "sep": ",",        # set if not comma
    # "encoding": "utf-8",
}

## Data 

In [2]:
df = pd.read_csv(
    CSV_PATH,
    index_col=INDEX_COL,
    **READ_CSV_KWARGS
)

# Parse date columns if specified
for col in DATE_COLS:
    df[col] = pd.to_datetime(df[col], format="%d/%m/%Y", errors="coerce")

df.head()

Unnamed: 0,date,transaction_amount,transaction_info,account_balance
0,NaT,-17.0,9 DEGREES WATERLOO TARINGA QL AUS Card xx6601 ...,191.36
1,2026-01-01,-17.0,9 DEGREES WATERLOO TARINGA QL AUS Card xx6601 ...,191.36
2,2026-01-01,-33.99,CHEMIST WAREHOUSE SYDNEY NS AUS Card xx6601 Va...,208.36
3,2026-01-01,-10.0,TRANSPORTFORNSW OPAL CHIPPENDALE AUS Card xx66...,242.35
4,2025-12-31,-4.0,MCDONALDS SYD CTRL P SYDNEY NS AUS Card xx6601...,252.35


## Audit

In [None]:
def structural_report(df: pd.DataFrame) -> dict:
    # Returns basic structure of df, missing values, duplicates, dtypes 
    return {
        "n_rows": len(df),
        "n_cols": df.shape[1],
        "n_missing": df.isna().mean().sort_values(),
        "n_duplicates": df.duplicated().sum(),
        "dtypes": df.dtypes.sort_index()
    }

def iqr_outlier_fraction(s: pd.Series) -> float:
    # Returns values that are probably outliers 1.5* IQR
    s = s.dropna()
    if s.empty:
        return np.nan 
    q1, q3 = np.percentile(s, [25,75])
    iqr = q3 - q1
    if iqr == 0:
        return 0.0
    mask = (s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)
    return mask.mean()

In [4]:
struct = structural_report(df)
print(struct)

{'n_rows': 601, 'n_cols': 4, 'n_missing': transaction_amount    0.000000
transaction_info      0.000000
account_balance       0.000000
date                  0.001664
dtype: float64, 'n_duplicates': np.int64(0), 'dtypes': account_balance              float64
date                  datetime64[ns]
transaction_amount           float64
transaction_info              object
dtype: object}
