# Week 10 — Day 1: Feature Engineering

### Imports and Load Splits

In [1]:
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, average_precision_score

In [2]:
# Project-relative folders: model artifacts are read from / written to
# ARTIFACTS_DIR, generated reports go to REPORTS_DIR.
ARTIFACTS_DIR = Path("..", "models")
REPORTS_DIR = Path("..", "reports")
REPORTS_DIR.mkdir(exist_ok=True)  # no error if the folder is already there

# The split artifact unpacks into four objects, in this exact order.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# were produced by this project.
X_train, X_test, y_train, y_test = joblib.load(
    ARTIFACTS_DIR.joinpath("split_v1.joblib")
)

# Quick shape check of features and labels for each split.
print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)

Train: (227845, 30) (227845,)
Test: (56962, 30) (56962,)


### Feature engineering function

In [3]:
def add_features(df):
    """Return a copy of ``df`` with four engineered columns appended.

    Reads the ``Amount`` and ``Time`` columns and adds, in this order:

    * ``log_amount`` -- ``log(Amount + 1)``, to tame large / skewed amounts.
    * ``hour``       -- ``Time`` (seconds) floor-divided into whole-hour
      buckets, cast to ``int``.
    * ``hour_sin`` / ``hour_cos`` -- sine and cosine of the hour bucket on a
      24-hour cycle, so the same hour of different days maps to the same point.

    The input frame is left untouched.
    """
    # Whole hours elapsed since the start of the recording window.
    hour_bucket = (df["Time"] // 3600).astype(int)

    # Angle of the hour bucket on a 24-hour circle.
    angle = 2 * np.pi * hour_bucket / 24

    # assign() works on a new frame and appends the columns in keyword order.
    return df.assign(
        log_amount=np.log1p(df["Amount"]),
        hour=hour_bucket,
        hour_sin=np.sin(angle),
        hour_cos=np.cos(angle),
    )

### Apply Features to Train and Test

In [4]:
# Run both splits through the same feature function (train first, then test)
# so they end up with identical columns.
X_train_fe, X_test_fe = (add_features(split) for split in (X_train, X_test))

print("Before:", X_train.shape, "After:", X_train_fe.shape)

# Preview each source column next to the features derived from it.
X_train_fe.loc[
    :, ["Amount", "log_amount", "Time", "hour", "hour_sin", "hour_cos"]
].head()

Before: (227845, 30) After: (227845, 34)


Unnamed: 0,Amount,log_amount,Time,hour,hour_sin,hour_cos
265518,7.32,2.118662,161919.0,44,-0.8660254,0.5
180305,2.99,1.383791,124477.0,34,0.5,-0.866025
42664,175.1,5.171052,41191.0,11,0.258819,-0.965926
198723,6.1,1.960095,132624.0,36,3.67394e-16,-1.0
82325,86.1,4.467057,59359.0,16,-0.8660254,-0.5


### New Preprocessing Pipeline

In [5]:
# One-step pipeline that standardises every column of the engineered frame.
preprocess_fe = Pipeline([("scaler", StandardScaler())])

# Fit on the training split only; the test split is transformed with the
# statistics learned from train.
X_train_scaled = preprocess_fe.fit_transform(X_train_fe)
X_test_scaled = preprocess_fe.transform(X_test_fe)

# Shapes of the two scaled matrices, shown as the cell output.
(X_train_scaled.shape, X_test_scaled.shape)

((227845, 34), (56962, 34))

### Train Logistic Regression

In [6]:
# Fit a class-weighted logistic regression on the scaled, feature-engineered
# training data. class_weight="balanced" re-weights the classes by their
# frequency; max_iter=1000 gives the solver a larger iteration budget.
# fit() returns the estimator itself, so construction and fitting are chained.
logreg_fe = LogisticRegression(class_weight="balanced", max_iter=1000).fit(
    X_train_scaled, y_train
)
print("Trained: LogReg + Feature Engineering + Class Weights")

Trained: LogReg + Feature Engineering + Class Weights


### Evaluate

In [7]:
# Scores for the positive class (column 1) and hard class predictions on the test split.
y_prob = logreg_fe.predict_proba(X_test_scaled)[:, 1]
y_pred = logreg_fe.predict(X_test_scaled)

# Confusion matrix rows are the true class, columns the predicted class,
# so the first row unpacks to (tn, fp) and the second to (fn, tp).
cm = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = cm

# Threshold-based metrics use the hard predictions ...
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ... while PR-AUC (average precision) is computed from the probabilities.
pr_auc = average_precision_score(y_test, y_prob)

print("Confusion matrix (tn, fp, fn, tp):", tn, fp, fn, tp)
print(f"Precision: {precision:.6f}")
print(f"Recall:    {recall:.6f}")
print(f"F1:        {f1:.6f}")
print(f"PR-AUC:    {pr_auc:.6f}")

Confusion matrix (tn, fp, fn, tp): 55450 1414 8 90
Precision: 0.059840
Recall:    0.918367
F1:        0.112360
PR-AUC:    0.720672


In [12]:
# Reference rows for the two Week 9 models; their metric values are hard-coded here.
week9_baseline = dict(
    model="LogReg Baseline",
    precision=0.826667,
    recall=0.632653,
    f1=0.716763,
    pr_auc=0.741382,
)
week9_weighted = dict(
    model="LogReg + Class Weights",
    precision=0.060976,
    recall=0.918367,
    f1=0.114358,
    pr_auc=0.718971,
)

# This notebook's model, filled in from the metrics computed in the evaluation cell.
week10_fe = dict(
    model="LogReg + FE + Class Weights",
    precision=precision,
    recall=recall,
    f1=f1,
    pr_auc=pr_auc,
)

# One row per model, in chronological order; displayed as the cell output.
compare_df = pd.DataFrame([week9_baseline, week9_weighted, week10_fe])
compare_df

Unnamed: 0,model,precision,recall,f1,pr_auc
0,LogReg Baseline,0.826667,0.632653,0.716763,0.741382
1,LogReg + Class Weights,0.060976,0.918367,0.114358,0.718971
2,LogReg + FE + Class Weights,0.05984,0.918367,0.11236,0.720672


In [9]:
# save results: write the comparison table (without the index column) and echo the destination
compare_df.to_csv(
    REPORTS_DIR.joinpath("week10_day1_feature_engineering_results.csv"),
    index=False,
)
print("Saved:", REPORTS_DIR.joinpath("week10_day1_feature_engineering_results.csv"))

Saved: ..\reports\week10_day1_feature_engineering_results.csv


In [11]:
# Save the fitted preprocessing pipeline and the trained model next to the
# other model artifacts.
# The comparison CSV is already written by the results-saving cell above, so
# the duplicate write that used to sit here has been dropped.
# NOTE(review): preprocess_fe only contains the scaler; inputs must go through
# add_features() before this pipeline is applied to new data.
joblib.dump(preprocess_fe, ARTIFACTS_DIR / "preprocess_fe_v1.joblib")
joblib.dump(logreg_fe, ARTIFACTS_DIR / "logreg_fe_weighted_v1.joblib")
print("Saved preprocess_fe_v1.joblib and logreg_fe_weighted_v1.joblib")

Saved preprocess_fe_v1.joblib and logreg_fe_weighted_v1.joblib
