In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from joblib import dump

import matplotlib

matplotlib.use("Agg")  # select the Agg backend in case plots are generated somewhere



# Locate the dataset relative to this file. `__file__` is not defined in
# interactive environments (like Jupyter), so fall back to the current
# working directory there.
try:
    base_dir = Path(__file__).resolve().parent
except NameError:
    base_dir = Path.cwd()

# The CSV is expected one level above the base directory, under data/.
file_path = base_dir.parent / "data" / "business_dataset.csv"
df = pd.read_csv(file_path)

# `date_post` is parsed with the YYYYMMDD format so it sorts and diffs as a date.
df['date_post'] = pd.to_datetime(df['date_post'], format='%Y%m%d')

# Order rows by account, then chronologically; the per-account rolling and
# diff features computed in the following cell rely on this row order.
df = df.sort_values(["bank_account_uuid", "date_post"])

# Preview as the cell's last expression so it is actually rendered
# (a bare `df.head()` in the middle of a cell is evaluated and discarded).
df.head()



In [3]:
# Number of transactions in the per-account rolling window.
ROLLING_WINDOW = 5
# Gap (in days) assigned when an (account, receiver IBAN) pair has no earlier
# transaction. NOTE(review): this is a fixed placeholder, not a mean computed
# from the data -- confirm that 30 is the intended default.
DEFAULT_DAYS_SINCE_LAST_TX = 30

# --- Per-account amount features -------------------------------------------
# Requires `df` to be sorted by account and date already (done in the load cell).
amount_by_account = df.groupby('bank_account_uuid')['amount']

# Rolling mean over the last ROLLING_WINDOW transactions (current one included).
df['amount_mean_5'] = amount_by_account.transform(
    lambda s: s.rolling(ROLLING_WINDOW, min_periods=1).mean()
)
# Rolling std; a window holding a single observation yields NaN, stored as 0.
df['amount_std_5'] = amount_by_account.transform(
    lambda s: s.rolling(ROLLING_WINDOW, min_periods=1).std()
).fillna(0)
# Difference to the previous transaction of the same account (0 for the first).
df['amount_change'] = amount_by_account.diff().fillna(0)

# --- Calendar features -----------------------------------------------------
df['month'] = df['date_post'].dt.month
df['dayofweek'] = df['date_post'].dt.dayofweek

# Days since the previous transaction of the same account to the same
# receiver IBAN; pairs without an earlier transaction get the default gap.
df['time_since_last_tx'] = (
    df.groupby(['bank_account_uuid', 'ref_iban'])['date_post']
      .diff()
      .dt.days
      .fillna(DEFAULT_DAYS_SINCE_LAST_TX)
)

# The raw date is no longer needed once the derived features exist.
# Reassign instead of mutating in place. Note that this cell still cannot be
# re-run on its own afterwards, because it reads `date_post` above.
df = df.drop(columns="date_post")

print(df.columns)

# Rows that carry an anomaly label.
anomalies = df[df['anomaly_description'].notna()]

# Number of distinct values per column.
for col in df.columns:
    print(col, df[col].nunique())

Index(['bank_account_uuid', 'business_partner_name', 'amount', 'currency',
       'ref_name', 'ref_iban', 'ref_swift', 'ref_bank', 'paym_note',
       'trns_type', 'pay_method', 'channel', 'anomaly_description',
       'amount_mean_5', 'amount_std_5', 'amount_change', 'month', 'dayofweek',
       'time_since_last_tx'],
      dtype='object')
bank_account_uuid 9
business_partner_name 9
amount 2193
currency 1
ref_name 221
ref_iban 224
ref_swift 222
ref_bank 194
paym_note 2201
trns_type 2
pay_method 3
channel 3
anomaly_description 11
amount_mean_5 2201
amount_std_5 2193
amount_change 2194
month 12
dayofweek 7
time_since_last_tx 19


In [8]:
# Show how many rows fall into each transaction type.
df["trns_type"].value_counts()

trns_type
DEBIT       2201
REVERSAL       1
Name: count, dtype: int64

In [11]:
# List the distinct anomalous rows with their multiplicity.
# dropna=False keeps anomalous rows that have a missing value in some other
# column; with the default (dropna=True) such rows silently vanish from the result.
df[df["anomaly_description"].notna()].value_counts(dropna=False)

bank_account_uuid                 business_partner_name  amount     currency  ref_name                   ref_iban                ref_swift    ref_bank  paym_note                                                     trns_type  pay_method  channel     anomaly_description                                                                                                                         amount_mean_5  amount_std_5   amount_change  month  dayofweek  time_since_last_tx
00FD7FDA41DE4E809EFAC0962FB82C7E  Gill-Krause            107742.87  USD       Allen and Sons Solutions   GB56DLMD20680949530000  IZDRGBJFXU7  TD        Payment for INV-2022-01-1820 (Term 45d)                       DEBIT      ACH         API         IBAN_MISMATCH_ANOMALY: IBAN changed from GB08RVAB77... to GB56DLMD20...                                                                     45295.898      42939.387010   52668.29       2      2          30.0                  1
3D412B89F2AD44E7B7493E7502EDDD23  Cooper-Marquez     

In [1]:
# Drop the `customer_uuid` column if it exists. errors="ignore" avoids a
# KeyError when the column is absent (it is not among the columns printed by
# the feature-engineering cell). This cell needs `df` from the data-loading
# cell, so it must run after that cell, not first on a fresh kernel.
df = df.drop(columns=["customer_uuid"], errors="ignore")

NameError: name 'df' is not defined