In [1]:
from pathlib import Path
import pandas as pd

BASE_DIR = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
DATA_PATH = BASE_DIR / "data" / "creditcard.csv"

df = pd.read_csv(
    DATA_PATH,
    parse_dates=["trans_date_trans_time"],
    low_memory=False
)


In [2]:
df = df.sort_values("trans_date_trans_time")


In [3]:
DROP_COLS = [
    "Unnamed: 0",
    "first", "last",
    "street", "zip",
    "trans_num",
    "merch_zipcode"
]

df = df.drop(columns=DROP_COLS)


In [4]:
labels = df["is_fraud"]
df = df.drop(columns=["is_fraud"])


In [5]:
num_cols = [
    "amt", "unix_time",
    "lat", "long",
    "merch_lat", "merch_long",
    "city_pop"
]

X_num = df[num_cols]


In [6]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_num)


In [7]:
split_idx = int(len(X_scaled) * 0.7)

X_train = X_scaled[:split_idx]
X_val = X_scaled[split_idx:]

y_val = labels.iloc[split_idx:]


In [9]:
from pathlib import Path
import numpy as np
import joblib

# Resolve project root safely
BASE_DIR = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()

PROCESSED_DIR = BASE_DIR / "data" / "processed"
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Save arrays
np.save(PROCESSED_DIR / "X_train.npy", X_train)
np.save(PROCESSED_DIR / "X_val.npy", X_val)
np.save(PROCESSED_DIR / "y_val.npy", y_val.values)

# Save scaler
joblib.dump(scaler, PROCESSED_DIR / "scaler.joblib")


['/home/caleb/transaction-anomaly-ml/data/processed/scaler.joblib']