In [1]:
# -*- coding: utf-8 -*-
# Импорт и пути

from pathlib import Path
from typing import Dict, Tuple
from collections import Counter
import json
import math

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
from IPython.display import display

BASE_DIR = Path("")
TX_PATH = BASE_DIR / "transaction_fraud_data.parquet"
FX_PATH = BASE_DIR / "historical_currency_exchange.parquet"
OUT_DIR = BASE_DIR / "eda_outputs"
PLOTS_DIR = OUT_DIR / "plots"
OUT_DIR.mkdir(parents=True, exist_ok=True)
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

print("TX_PATH:", TX_PATH)
print("FX_PATH:", FX_PATH)
print("OUT_DIR:", OUT_DIR)


TX_PATH: transaction_fraud_data.parquet
FX_PATH: historical_currency_exchange.parquet
OUT_DIR: eda_outputs


In [2]:
# Утилиты: проверки, нормализация, разворот вложенных полей

def ensure_files_exist():
    missing = [p.name for p in [TX_PATH, FX_PATH] if not p.exists()]
    if missing:
        raise FileNotFoundError(
            f"Отсутствуют файлы: {', '.join(missing)}. "
            "Проверь пути в TX_PATH/FX_PATH."
        )

def normalize_timestamp(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)
    df["timestamp"] = df["timestamp"].dt.tz_convert(None)
    return df

def expand_last_hour_activity(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "last_hour_activity" in df.columns:
        expanded = pd.json_normalize(df["last_hour_activity"].dropna())
        for col in ["num_transactions", "total_amount", "unique_merchants", "unique_countries", "max_single_amount"]:
            if col not in expanded.columns:
                expanded[col] = np.nan
        expanded = expanded.reindex(df.index)
        expanded.columns = [f"lha_{c}" for c in expanded.columns]
        df = pd.concat([df.drop(columns=["last_hour_activity"]), expanded], axis=1)
    return df

def basic_qc(df: pd.DataFrame) -> Dict[str, int]:
    info = {}
    if "transaction_id" in df.columns:
        info["duplicate_transaction_id"] = int(df["transaction_id"].duplicated().sum())
    info["rows"] = int(len(df))
    info["na_timestamp"] = int(df["timestamp"].isna().sum()) if "timestamp" in df.columns else None
    info["na_amount"] = int(df["amount"].isna().sum()) if "amount" in df.columns else None
    return info


In [3]:
# Валюты: подготовка FX и конвертация сумм в USD

def melt_fx_long(fx: pd.DataFrame) -> pd.DataFrame:
    fx = fx.copy()
    fx["date"] = pd.to_datetime(fx["date"]).dt.normalize()
    long = fx.melt(id_vars=["date"], var_name="currency", value_name="rate")
    long["rate"] = pd.to_numeric(long["rate"], errors="coerce")
    long = long.sort_values(["currency", "date"])
    return long

def convert_to_usd(df: pd.DataFrame, fx: pd.DataFrame, tolerance_days: int = 7):
    df = df.copy()

    ts = pd.to_datetime(df["timestamp"], errors="coerce", utc=True).dt.tz_convert(None)
    df["timestamp"] = ts
    df["date"] = ts.dt.normalize().astype("datetime64[ns]")
    df["amount"] = pd.to_numeric(df["amount"], errors="coerce")
    df["currency"] = df["currency"].astype("string")
    df["_orig_idx"] = np.arange(len(df))

    fx = fx.copy()
    fx["date"] = pd.to_datetime(fx["date"], errors="coerce").dt.normalize().astype("datetime64[ns]")
    fx_long = fx.melt(id_vars=["date"], var_name="currency", value_name="rate")
    fx_long["currency"] = fx_long["currency"].astype("string")
    fx_long["rate"] = pd.to_numeric(fx_long["rate"], errors="coerce")

    known_ccy = set(fx_long["currency"].dropna().unique().tolist())
    df_ccy = df["currency"].dropna().unique().tolist()
    present_ccy = set([str(c) for c in df_ccy if c is not None])

    intersect_ccy = list(present_ccy & known_ccy)
    unknown_ccy = list(present_ccy - known_ccy)

    parts = []

    for cur in intersect_ccy:
        sub = df[(df["currency"] == cur)].copy()
        sub_with_date = sub[sub["date"].notna()].copy()
        sub_no_date = sub[sub["date"].isna()].copy()  # позже вернём

        fx_cur = fx_long[(fx_long["currency"] == cur) & (fx_long["date"].notna())].copy()

        sub_with_date.sort_values("date", kind="mergesort", inplace=True)
        fx_cur.sort_values("date", kind="mergesort", inplace=True)

        if cur == "USD":
            sub_with_date["rate"] = 1.0
            sub_with_date["amount_usd"] = sub_with_date["amount"]
        else:
            if fx_cur.empty:
                sub_with_date["rate"] = np.nan
                sub_with_date["amount_usd"] = np.nan
            else:
                merged_cur = pd.merge_asof(
                    sub_with_date,
                    fx_cur[["date", "rate"]].copy(),
                    on="date",
                    direction="nearest",
                    tolerance=pd.Timedelta(days=tolerance_days),
                    allow_exact_matches=True,
                )
                merged_cur["rate"] = merged_cur["rate"].replace({0: np.nan})
                merged_cur["amount_usd"] = merged_cur["amount"] / merged_cur["rate"]
                sub_with_date = merged_cur

        if not sub_no_date.empty:
            if cur == "USD":
                sub_no_date["rate"] = 1.0
                sub_no_date["amount_usd"] = sub_no_date["amount"]
            else:
                sub_no_date["rate"] = np.nan
                sub_no_date["amount_usd"] = np.nan

        parts.append(sub_with_date)
        if not sub_no_date.empty:
            parts.append(sub_no_date)

    if unknown_ccy:
        for cur in unknown_ccy:
            sub = df[(df["currency"] == cur)].copy()
            if cur == "USD":
                sub["rate"] = 1.0
                sub["amount_usd"] = sub["amount"]
            else:
                sub["rate"] = np.nan
                sub["amount_usd"] = np.nan
            parts.append(sub)

    sub_nan_ccy = df[df["currency"].isna()].copy()
    if not sub_nan_ccy.empty:
        sub_nan_ccy["rate"] = np.nan
        sub_nan_ccy["amount_usd"] = np.where(sub_nan_ccy["currency"] == "USD", sub_nan_ccy["amount"], np.nan)
        parts.append(sub_nan_ccy)

    if parts:
        merged_all = pd.concat(parts, ignore_index=True)
    else:
        merged_all = df.copy()
        merged_all["rate"] = np.nan
        merged_all["amount_usd"] = np.where(merged_all["currency"] == "USD", merged_all["amount"], np.nan)

    merged_all.sort_values("_orig_idx", inplace=True)
    merged_all.drop(columns=["_orig_idx"], inplace=True)

    unknown_mask = ~df["currency"].isin(list(known_ccy)) & df["currency"].notna()
    summary = {
        "total_rows": int(len(df)),
        "unknown_currency_rows": int(unknown_mask.sum()),
        "unknown_currency_unique": int(df.loc[unknown_mask, "currency"].nunique()),
        "rows_without_rate_after_join": int(merged_all["amount_usd"].isna().sum()),
    }

    return merged_all, summary



In [4]:
# Метрики и сегменты

def compute_core_kpis(df: pd.DataFrame) -> Dict[str, float]:
    kpis = {}
    kpis["rows"] = int(len(df))
    kpis["customers"] = int(df["customer_id"].nunique()) if "customer_id" in df.columns else None
    kpis["vendors"] = int(df["vendor"].nunique()) if "vendor" in df.columns else None
    kpis["countries"] = int(df["country"].nunique()) if "country" in df.columns else None
    kpis["cities"] = int(df["city"].nunique()) if "city" in df.columns else None
    kpis["fraud_rate_overall"] = float(df["is_fraud"].mean()) if "is_fraud" in df.columns else None
    if "amount_usd" in df.columns:
        col = "amount_usd"
    else:
        col = "amount"
    kpis["median_amount"] = float(pd.to_numeric(df[col], errors="coerce").median())
    kpis["p95_amount"] = float(pd.to_numeric(df[col], errors="coerce").quantile(0.95))
    kpis["p99_amount"] = float(pd.to_numeric(df[col], errors="coerce").quantile(0.99))
    return kpis

def segment_tables(df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
    seg = {}

    def agg_rate(g):
        return pd.Series({
            "count": len(g),
            "fraud_count": int(g["is_fraud"].sum()),
            "fraud_rate": float(g["is_fraud"].mean() if len(g) else np.nan),
            "avg_amount": float(pd.to_numeric(g["amount_usd"] if "amount_usd" in g else g["amount"], errors="coerce").mean())
        })

    if "country" in df.columns:
        seg["by_country"] = df.groupby("country", dropna=False).apply(agg_rate).sort_values("fraud_count", ascending=False)

    if "city" in df.columns:
        seg["by_city"] = df.groupby("city", dropna=False).apply(agg_rate).sort_values("avg_amount", ascending=False)

    if "vendor_category" in df.columns:
        seg["by_vendor_category"] = df.groupby("vendor_category", dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)

    if "device" in df.columns:
        seg["by_device"] = df.groupby("device", dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)

    if "channel" in df.columns:
        seg["by_channel"] = df.groupby("channel", dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)

    if "is_card_present" in df.columns:
        seg["by_card_present"] = df.groupby("is_card_present", dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)

    for col in ["is_outside_home_country", "is_weekend", "is_high_risk_vendor"]:
        if col in df.columns:
            seg[f"by_{col}"] = df.groupby(col, dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)

    if "vendor" in df.columns:
        seg["top_vendors_by_fraud"] = (
            df.groupby("vendor")["is_fraud"]
            .agg(["count", "sum"])
            .rename(columns={"sum": "fraud_count"})
            .sort_values("fraud_count", ascending=False)
            .head(20)
        )
    return seg

def per_customer_hour_rate(df: pd.DataFrame) -> float:
    tmp = df[["customer_id", "timestamp"]].dropna().copy()
    tmp["hour"] = tmp["timestamp"].dt.floor("H")
    grp = tmp.groupby(["customer_id", "hour"]).size().rename("tx_count").reset_index()
    return float(grp["tx_count"].mean()) if len(grp) else float("nan")


In [5]:
from typing import Any


# Стриминговый EDA: KPI/сегменты без загрузки всего файла в память

def streaming_kpis_and_segments(tx_path: Path, out_dir: Path, topk: int = 20, batch_size: int = 200_000) -> tuple[Any, dict[str, Any]]:
    pf = pq.ParquetFile(tx_path)
    cols = ["customer_id","vendor","vendor_category","country","city","is_fraud","timestamp"]

    total_rows = 0
    fraud_count = 0
    uniq_customers, uniq_vendors, uniq_countries, uniq_cities = set(), set(), set(), set()

    by_country_count, by_country_fraud = Counter(), Counter()
    by_vcat_count, by_vcat_fraud = Counter(), Counter()
    by_vendor_count, by_vendor_fraud = Counter(), Counter()

    per_cust_hour = Counter()

    for batch in pf.iter_batches(columns=cols, batch_size=batch_size):
        tb = pa.Table.from_batches([batch])
        df = tb.to_pandas(types_mapper=pd.ArrowDtype)

        df["is_fraud"] = pd.to_numeric(df["is_fraud"], errors="coerce").fillna(0.0)

        n = len(df)
        total_rows += n
        fraud_count += int(df["is_fraud"].sum())

        uniq_customers.update(df["customer_id"].dropna().unique().tolist())
        uniq_vendors.update(df["vendor"].dropna().unique().tolist())
        uniq_countries.update(df["country"].dropna().unique().tolist())
        uniq_cities.update(df["city"].dropna().unique().tolist())

        grp_c = df.groupby("country")["is_fraud"].agg(["size","sum"])
        for k, row in grp_c.iterrows():
            by_country_count[k] += int(row["size"])
            by_country_fraud[k] += int(row["sum"])

        grp_vc = df.groupby("vendor_category")["is_fraud"].agg(["size","sum"])
        for k, row in grp_vc.iterrows():
            by_vcat_count[k] += int(row["size"])
            by_vcat_fraud[k] += int(row["sum"])

        grp_v = df.groupby("vendor")["is_fraud"].agg(["size","sum"])
        for k, row in grp_v.iterrows():
            by_vendor_count[k] += int(row["size"])
            by_vendor_fraud[k] += int(row["sum"])

        # среднее число транзакций на клиента-час
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=True).dt.tz_convert(None)
        df["hour"] = df["timestamp"].dt.floor("H")
        grp_hour = df.groupby(["customer_id","hour"]).size()
        for key, val in grp_hour.items():
            per_cust_hour[key] += int(val)

    df_country = pd.DataFrame({
        "country": list(by_country_count.keys()),
        "count": [by_country_count[k] for k in by_country_count.keys()],
        "fraud_count": [by_country_fraud.get(k, 0) for k in by_country_count.keys()],
    })
    df_country["fraud_rate"] = np.where(df_country["count"]>0, df_country["fraud_count"]/df_country["count"], np.nan)
    df_country = df_country.sort_values("fraud_count", ascending=False)

    df_vcat = pd.DataFrame({
        "vendor_category": list(by_vcat_count.keys()),
        "count": [by_vcat_count[k] for k in by_vcat_count.keys()],
        "fraud_count": [by_vcat_fraud.get(k, 0) for k in by_vcat_count.keys()],
    })
    df_vcat["fraud_rate"] = np.where(df_vcat["count"]>0, df_vcat["fraud_count"]/df_vcat["count"], np.nan)
    df_vcat = df_vcat.sort_values("fraud_rate", ascending=False)

    df_vendor = pd.DataFrame({
        "vendor": list(by_vendor_count.keys()),
        "count": [by_vendor_count[k] for k in by_vendor_count.keys()],
        "fraud_count": [by_vendor_fraud.get(k, 0) for k in by_vendor_count.keys()],
    })
    df_vendor = df_vendor.sort_values("fraud_count", ascending=False).head(topk)

    if len(per_cust_hour):
        avg_tx_per_customer_hour = float(np.mean(list(per_cust_hour.values())))
    else:
        avg_tx_per_customer_hour = float("nan")

    kpis = {
        "rows": int(total_rows),
        "customers": int(len(uniq_customers)),
        "vendors": int(len(uniq_vendors)),
        "countries": int(len(uniq_countries)),
        "cities": int(len(uniq_cities)),
        "fraud_rate_overall": float(fraud_count / total_rows) if total_rows else None,
        "avg_tx_per_customer_hour": avg_tx_per_customer_hour,
    }

    df_country.to_csv(OUT_DIR / "seg_by_country.csv", index=False)
    df_vcat.to_csv(OUT_DIR / "seg_by_vendor_category.csv", index=False)
    df_vendor.to_csv(OUT_DIR / "top_vendors_by_fraud.csv", index=False)
    (OUT_DIR / "kpis_streaming.json").write_text(json.dumps(kpis, ensure_ascii=False, indent=2), encoding="utf-8")

    return kpis, {"by_country": df_country, "by_vendor_category": df_vcat, "top_vendors_by_fraud": df_vendor}


In [6]:
# Визуализации

def plot_amount_histogram(df: pd.DataFrame, out_path: Path):
    plt.figure()
    if "amount_usd" in df.columns and df["amount_usd"].notna().any():
        vals = pd.to_numeric(df["amount_usd"], errors="coerce").dropna()
        xlabel = "Amount in USD"
    else:
        vals = pd.to_numeric(df["amount"], errors="coerce").dropna()
        xlabel = "Amount (raw)"
    plt.hist(vals, bins=50)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.title("Distribution of transaction amounts")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()

def plot_fraud_rate_by_vendor_category(df: pd.DataFrame, out_path: Path):
    if "vendor_category" not in df.columns:
        return
    fr = (
        df.groupby("vendor_category")["is_fraud"]
        .mean()
        .sort_values(ascending=False)
        .reset_index()
    )
    plt.figure()
    plt.bar(fr["vendor_category"].astype(str), fr["is_fraud"])
    plt.xlabel("Vendor category")
    plt.ylabel("Fraud rate")
    plt.title("Fraud rate by vendor category")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


In [8]:
use_cols = None
df_full = pd.read_parquet(TX_PATH, columns=use_cols)
df_full = normalize_timestamp(df_full)
df_full = expand_last_hour_activity(df_full)

fx = pd.read_parquet(FX_PATH)
df_full, fx_summary = convert_to_usd(df_full, fx)
print("FX summary:", fx_summary)

kpis_mem = compute_core_kpis(df_full)
avg_tx_per_hour = per_customer_hour_rate(df_full)
segments = segment_tables(df_full)

print("KPI (in-memory):")
display(pd.Series({**kpis_mem, "avg_tx_per_customer_hour": avg_tx_per_hour}))

if "by_country" in segments:
    print("\nFraud по странам (top 10):")
    display(segments["by_country"].head(10))
if "by_vendor_category" in segments:
    print("\nFraud по категориям мерчантов (top 15 по fraud_rate):")
    display(segments["by_vendor_category"].head(15))
if "top_vendors_by_fraud" in segments:
    print("\nTop-20 мерчантов по числу мошенничеств:")
    display(segments["top_vendors_by_fraud"])

hist_path = PLOTS_DIR / "amount_hist.png"
plot_amount_histogram(df_full, hist_path)
print("Гистограмма сумм сохранена в:", hist_path)

fr_by_cat_path = PLOTS_DIR / "fraud_rate_by_vendor_category.png"
plot_fraud_rate_by_vendor_category(df_full, fr_by_cat_path)
print("График fraud_rate по категориям мерчантов сохранён в:", fr_by_cat_path)


FX summary: {'total_rows': 7483766, 'unknown_currency_rows': 0, 'unknown_currency_unique': 0, 'rows_without_rate_after_join': 0}


  tmp["hour"] = tmp["timestamp"].dt.floor("H")
  seg["by_country"] = df.groupby("country", dropna=False).apply(agg_rate).sort_values("fraud_count", ascending=False)
  seg["by_city"] = df.groupby("city", dropna=False).apply(agg_rate).sort_values("avg_amount", ascending=False)
  seg["by_vendor_category"] = df.groupby("vendor_category", dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)
  seg["by_device"] = df.groupby("device", dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)
  seg["by_channel"] = df.groupby("channel", dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)
  seg["by_card_present"] = df.groupby("is_card_present", dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)
  seg[f"by_{col}"] = df.groupby(col, dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)
  seg[f"by_{col}"] = df.groupby(col, dropna=False).apply(agg_rate).sort_values("fraud_rate", ascending=False)
  seg[f"b

KPI (in-memory):


rows                        7.483766e+06
customers                   4.869000e+03
vendors                     1.050000e+02
countries                   1.200000e+01
cities                      1.100000e+01
fraud_rate_overall          1.997282e-01
median_amount               3.592600e+02
p95_amount                  1.735918e+03
p99_amount                  2.885956e+03
avg_tx_per_customer_hour    2.452776e+00
dtype: float64


Fraud по странам (top 10):


Unnamed: 0_level_0,count,fraud_count,fraud_rate,avg_amount
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Russia,793730.0,299425.0,0.377238,590.245348
Mexico,785704.0,298841.0,0.380348,772.656684
Brazil,804800.0,298629.0,0.37106,676.081344
Nigeria,849840.0,298600.0,0.35136,187.987569
Australia,496695.0,37652.0,0.075805,532.699657
Japan,527393.0,37592.0,0.071279,437.522896
France,541287.0,37426.0,0.069143,529.359772
Singapore,588668.0,37414.0,0.063557,616.041699
UK,538493.0,37345.0,0.069351,550.406134
USA,500060.0,37312.0,0.074615,565.539111



Fraud по категориям мерчантов (top 15 по fraud_rate):


Unnamed: 0_level_0,count,fraud_count,fraud_rate,avg_amount
vendor_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Travel,935790.0,187477.0,0.200341,1040.826006
Grocery,934029.0,186987.0,0.200194,416.534478
Gas,935401.0,186829.0,0.199731,513.259443
Restaurant,936178.0,186951.0,0.199696,323.328116
Entertainment,936173.0,186890.0,0.199632,350.314504
Education,933542.0,186203.0,0.199459,514.491554
Retail,935883.0,186613.0,0.199398,667.305228
Healthcare,936770.0,186769.0,0.199376,514.887331



Top-20 мерчантов по числу мошенничеств:


Unnamed: 0_level_0,count,fraud_count
vendor,Unnamed: 1_level_1,Unnamed: 2_level_1
Local Gas Station,155977,31205
Truck Stop,155945,31192
Chegg,156105,31171
University Bookstore,155827,31136
Highway Gas Stop,155980,31016
Barnes & Noble,154833,30933
DuaneReade,116938,23513
CVS Pharmacy,117377,23508
Walmart Grocery,116349,23475
Medical Center,117149,23464


Гистограмма сумм сохранена в: eda_outputs\plots\amount_hist.png
График fraud_rate по категориям мерчантов сохранён в: eda_outputs\plots\fraud_rate_by_vendor_category.png


In [9]:
# Метрики: threshold под целевой FPR и Expected Cost

from typing import Tuple, Dict
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, precision_recall_curve, average_precision_score, roc_auc_score

def threshold_at_fpr(y_true: np.ndarray, y_score: np.ndarray, target_fpr: float = 0.005) -> Dict[str, float]:
    """
    Возвращает порог, при котором FPR <= target_fpr (0.5% по умолчанию),
    а Recall (TPR) максимален. Если таких точек нет — берёт ближайшую сверху.
    """
    fpr, tpr, thr = roc_curve(y_true, y_score)
    ok = np.where(fpr <= target_fpr)[0]
    if len(ok) > 0:
        i = ok[np.argmax(tpr[ok])]
    else:
        # нет ни одной точки с FPR ниже порога — берём минимальный FPR
        i = np.argmin(fpr)
    return {
        "threshold": float(thr[i]),
        "fpr": float(fpr[i]),
        "tpr_recall": float(tpr[i]),
    }

def confusion_at_threshold(y_true: np.ndarray, y_score: np.ndarray, thr: float) -> Dict[str, int]:
    y_pred = (y_score >= thr).astype(int)
    tp = int(((y_true == 1) & (y_pred == 1)).sum())
    fp = int(((y_true == 0) & (y_pred == 1)).sum())
    fn = int(((y_true == 1) & (y_pred == 0)).sum())
    tn = int(((y_true == 0) & (y_pred == 0)).sum())
    return {"TP": tp, "FP": fp, "FN": fn, "TN": tn}

def expected_cost(y_true: np.ndarray, y_score: np.ndarray, thr: float, c_fp: float, c_fn: float) -> Dict[str, float]:
    cm = confusion_at_threshold(y_true, y_score, thr)
    cost = cm["FP"] * c_fp + cm["FN"] * c_fn
    return {
        "expected_cost": float(cost),
        "TP": cm["TP"], "FP": cm["FP"], "FN": cm["FN"], "TN": cm["TN"]
    }

def compute_all_core_metrics(y_true: np.ndarray, y_score: np.ndarray) -> Dict[str, float]:
    roc = roc_auc_score(y_true, y_score)
    pr = average_precision_score(y_true, y_score)  # PR AUC (AP)
    return {"roc_auc": float(roc), "pr_auc": float(pr)}


In [10]:
import matplotlib.pyplot as plt

def _bar_fraud_rate(df: pd.DataFrame, col: str, out_path: Path):
    if col not in df.columns:
        return
    tmp = df.copy()
    grp = tmp.groupby(col)["is_fraud"].mean().sort_values(ascending=False)
    plt.figure()
    plt.bar(grp.index.astype(str), grp.values)
    plt.xlabel(col)
    plt.ylabel("Fraud rate")
    plt.title(f"Fraud rate by {col}")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()

def plot_fraud_by_channel_device(df: pd.DataFrame):
    _bar_fraud_rate(df, "channel", PLOTS_DIR / "fraud_rate_by_channel.png")
    _bar_fraud_rate(df, "device", PLOTS_DIR / "fraud_rate_by_device.png")

def plot_fraud_by_hour_day(df: pd.DataFrame):
    if "timestamp" not in df.columns:
        return
    tmp = df.copy()
    tmp["hour"] = tmp["timestamp"].dt.hour
    tmp["dow"] = tmp["timestamp"].dt.dayofweek  # 0=Mon .. 6=Sun

    # Hour-of-day
    fr_hour = tmp.groupby("hour")["is_fraud"].mean()
    plt.figure()
    plt.plot(fr_hour.index.values, fr_hour.values, marker="o")
    plt.xlabel("Hour of day")
    plt.ylabel("Fraud rate")
    plt.title("Fraud rate by hour of day")
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "fraud_rate_by_hour.png")
    plt.close()

    # Day-of-week
    fr_dow = tmp.groupby("dow")["is_fraud"].mean()
    plt.figure()
    plt.plot(fr_dow.index.values, fr_dow.values, marker="o")
    plt.xlabel("Day of week (0=Mon)")
    plt.ylabel("Fraud rate")
    plt.title("Fraud rate by day of week")
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "fraud_rate_by_dow.png")
    plt.close()

def plot_daily_timeseries(df: pd.DataFrame):
    if "timestamp" not in df.columns:
        return
    tmp = df.copy()
    tmp["day"] = tmp["timestamp"].dt.normalize()
    series = tmp.groupby("day")["is_fraud"].mean()
    plt.figure()
    plt.plot(series.index.values, series.values)
    plt.xlabel("Date")
    plt.ylabel("Fraud rate")
    plt.title("Daily fraud rate")
    plt.tight_layout()
    plt.savefig(PLOTS_DIR / "daily_fraud_rate.png")
    plt.close()


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

def build_feature_space(df: pd.DataFrame):
    """
    Гибко собираем списки признаков на основе наличия колонок.
    - numeric: amount_usd (если есть) или amount, lha_* фичи, бинарные флаги как 0/1
    - categorical: vendor_category, channel, device, country, city (если уместно)
    """
    df = df.copy()

    for b in ["is_card_present", "is_outside_home_country", "is_weekend", "is_high_risk_vendor"]:
        if b in df.columns:
            df[b] = df[b].astype(float)  # приведём к числу (0/1/NaN)

    numeric = []
    if "amount_usd" in df.columns:
        numeric.append("amount_usd")
    elif "amount" in df.columns:
        numeric.append("amount")
    # lha_* (из last_hour_activity)
    lha_cols = [c for c in df.columns if c.startswith("lha_")]
    numeric += lha_cols
    for b in ["is_card_present", "is_outside_home_country", "is_weekend", "is_high_risk_vendor"]:
        if b in df.columns:
            numeric.append(b)

    categorical = []
    for c in ["vendor_category", "channel", "device", "country", "city"]:
        if c in df.columns:
            categorical.append(c)

    drop_cols = {"is_fraud", "timestamp", "date", "transaction_id", "_orig_idx"}
    feature_cols = [c for c in df.columns if c in (numeric + categorical) and c not in drop_cols]

    numeric_step = SimpleImputer(strategy="median")
    cat_step = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_step, [c for c in feature_cols if c in numeric]),
            ("cat", cat_step, [c for c in feature_cols if c in categorical]),
        ],
        remainder="drop",
        sparse_threshold=1.0,
    )

    return feature_cols, numeric, categorical, preprocessor

from datetime import timedelta

def time_block_split_no_leak_smart(
    df: pd.DataFrame,
    target_val_frac_customers: float = 0.20,   # хотим ~20% "новых" клиентов в валидации
    min_val_rows: int = 50_000,                # минимальный размер вал-части
    min_val_positives: int = 50,               # минимум мошеннических транзакций в вал
    max_val_frac_customers: float = 0.50,      # верхняя граница расширения валидации
    step_frac: float = 0.05,                   # шаг расширения, если мало данных
    fallback_last_days: int = 14
):
    df = df.copy()
    if "timestamp" not in df.columns:
        raise ValueError("Нужен столбец 'timestamp' (datetime).")
    if "customer_id" not in df.columns:
        raise ValueError("Нужен столбец 'customer_id'.")
    if "is_fraud" not in df.columns:
        raise ValueError("Нужен столбец 'is_fraud'.")

    cust_first = df.groupby("customer_id")["timestamp"].min().sort_values()
    unique_customers = cust_first.index.to_numpy()
    n_cust = len(unique_customers)
    if n_cust == 0:
        raise ValueError("Нет клиентов для разреза.")

    def _split_by_fraction(frac: float):
        k = max(1, int(round(n_cust * frac)))
        val_customers = set(unique_customers[-k:])
        train_customers = set(unique_customers[:-k])
        train_df = df[df["customer_id"].isin(train_customers)].copy()
        val_df = df[df["customer_id"].isin(val_customers)].copy()
        train_df.sort_values("timestamp", inplace=True)
        val_df.sort_values("timestamp", inplace=True)
        return train_df, val_df, float(frac)

    current_frac = target_val_frac_customers
    best = None
    while current_frac <= max_val_frac_customers + 1e-9:
        tr, va, used_frac = _split_by_fraction(current_frac)
        n_val = len(va)
        n_pos = int(va["is_fraud"].sum()) if n_val else 0
        if n_val >= min_val_rows and n_pos >= min_val_positives:
            best = (tr, va, used_frac, "customer-first_seen")
            break
        if best is None or (n_val > len(best[1])) or (n_pos > int(best[1]["is_fraud"].sum())):
            best = (tr, va, used_frac, "customer-first_seen")
        current_frac += step_frac

    train_df, val_df, used_frac, strategy = best

    if len(val_df) == 0 or val_df["is_fraud"].sum() < min_val_positives:
        max_ts = df["timestamp"].max()
        cutoff = max_ts - timedelta(days=fallback_last_days)
        val_df_fb = df[df["timestamp"] >= cutoff].copy()
        train_df_fb = df[df["timestamp"] < cutoff].copy()

        seen_train_customers = set(train_df_fb["customer_id"].unique())
        val_df_fb = val_df_fb[~val_df_fb["customer_id"].isin(seen_train_customers)].copy()

        if len(val_df_fb) > 0 and val_df_fb["is_fraud"].sum() >= min_val_positives:
            train_df, val_df = train_df_fb, val_df_fb
            strategy = f"time-window-last-{fallback_last_days}d (no-leak)"
    print(f"[split] Strategy: {strategy}, customers_val_frac≈{used_frac:.2f}")
    print(f"[split] Train rows: {len(train_df):,}, Val rows: {len(val_df):,}, Val positives: {int(val_df['is_fraud'].sum()):,}")

    return train_df, val_df


In [17]:
# Обучим 2 модели: LogisticRegression (One-Hot, sparse) и HistGradientBoosting (числовые фичи)
# Посчитаем ROC AUC, PR AUC, Recall@FPR=0.5%, Expected Cost и построим ROC/PR графики для лучшей модели.

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay

C_FP = 1.0    # стоимость FP
C_FN = 20.0   # стоимость FN (примерно)
TARGET_FPR = 0.005  # 0.5%

train_df, val_df = time_block_split_no_leak_smart(df_full)
print("Train rows:", len(train_df), "Val rows:", len(val_df))

feature_cols, numeric_cols, cat_cols, preproc = build_feature_space(train_df)
X_train = preproc.fit_transform(train_df[feature_cols])
y_train = train_df["is_fraud"].astype(int).to_numpy()

X_val = preproc.transform(val_df[feature_cols])
y_val = val_df["is_fraud"].astype(int).to_numpy()

results = []

logreg = LogisticRegression(
    solver="saga",
    penalty="l2",
    class_weight="balanced",
    max_iter=2000,
    n_jobs=None,
    verbose=0,
)
pipe_lr = Pipeline([("model", logreg)])
pipe_lr.fit(X_train, y_train)
proba_lr = pipe_lr.predict_proba(X_val)[:, 1]

m_lr = compute_all_core_metrics(y_val, proba_lr)
thr_lr = threshold_at_fpr(y_val, proba_lr, TARGET_FPR)
ec_lr = expected_cost(y_val, proba_lr, thr_lr["threshold"], C_FP, C_FN)

results.append({
    "model": "LogisticRegression",
    "roc_auc": m_lr["roc_auc"],
    "pr_auc": m_lr["pr_auc"],
    "thr_at_fpr": thr_lr["threshold"],
    "fpr@thr": thr_lr["fpr"],
    "recall@thr": thr_lr["tpr_recall"],
    "expected_cost": ec_lr["expected_cost"],
    "TP": ec_lr["TP"], "FP": ec_lr["FP"], "FN": ec_lr["FN"], "TN": ec_lr["TN"]
})

num_only_cols = [c for c in numeric_cols if c in feature_cols]
if len(num_only_cols) == 0:
    print("Предупреждение: нет числовых признаков для HGB — пропускаю модель.")
    proba_hgb = None
else:
    num_imputer = SimpleImputer(strategy="median")
    X_train_hgb = num_imputer.fit_transform(train_df[num_only_cols])
    X_val_hgb = num_imputer.transform(val_df[num_only_cols])

    hgb = HistGradientBoostingClassifier(
        max_depth=None,
        learning_rate=0.1,
        max_iter=300,
        l2_regularization=0.0,
        early_stopping=True,
        validation_fraction=0.1,
        random_state=42,
    )
    hgb.fit(X_train_hgb, y_train)
    proba_hgb = hgb.predict_proba(X_val_hgb)[:, 1]

    m_hgb = compute_all_core_metrics(y_val, proba_hgb)
    thr_hgb = threshold_at_fpr(y_val, proba_hgb, TARGET_FPR)
    ec_hgb = expected_cost(y_val, proba_hgb, thr_hgb["threshold"], C_FP, C_FN)

    results.append({
        "model": "HistGradientBoosting(num-only)",
        "roc_auc": m_hgb["roc_auc"],
        "pr_auc": m_hgb["pr_auc"],
        "thr_at_fpr": thr_hgb["threshold"],
        "fpr@thr": thr_hgb["fpr"],
        "recall@thr": thr_hgb["tpr_recall"],
        "expected_cost": ec_hgb["expected_cost"],
        "TP": ec_hgb["TP"], "FP": ec_hgb["FP"], "FN": ec_hgb["FN"], "TN": ec_hgb["TN"]
    })

res_df = pd.DataFrame(results).sort_values(["pr_auc","roc_auc"], ascending=False)
display(res_df)

best_row = res_df.iloc[0]
best_name = best_row["model"]

if best_name == "LogisticRegression":
    y_score_best = proba_lr
elif best_name == "HistGradientBoosting(num-only)":
    y_score_best = proba_hgb
else:
    y_score_best = proba_lr

# ROC
plt.figure()
RocCurveDisplay.from_predictions(y_val, y_score_best)
plt.title(f"ROC curve — {best_name}")
plt.tight_layout()
plt.savefig(PLOTS_DIR / f"roc_{best_name.replace(' ','_')}.png")
plt.close()

# PR
plt.figure()
PrecisionRecallDisplay.from_predictions(y_val, y_score_best)
plt.title(f"PR curve — {best_name}")
plt.tight_layout()
plt.savefig(PLOTS_DIR / f"pr_{best_name.replace(' ','_')}.png")
plt.close()

# Графики по каналам/устройствам/времени для df_full (после конверсии/обогащения)
plot_fraud_by_channel_device(df_full)
plot_fraud_by_hour_day(df_full)
plot_daily_timeseries(df_full)

print("Графики сохранены в:", PLOTS_DIR)


[split] Strategy: customer-first_seen, customers_val_frac≈0.20
[split] Train rows: 6,108,639, Val rows: 1,375,127, Val positives: 274,636
Train rows: 6108639 Val rows: 1375127




Unnamed: 0,model,roc_auc,pr_auc,thr_at_fpr,fpr@thr,recall@thr,expected_cost,TP,FP,FN,TN
1,HistGradientBoosting(num-only),0.978166,0.942687,0.658568,0.005,0.741891,1423222.0,203750,5502,70886,1094989
0,LogisticRegression,0.543659,0.226259,0.50931,0.004986,0.011004,5437767.0,3022,5487,271614,1095004


Графики сохранены в: eda_outputs\plots


<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>