In [34]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from joblib import dump
from sklearn.feature_extraction import FeatureHasher
import matplotlib

matplotlib.use("Agg")  # falls irgendwo Plots erzeugt werden



# __file__ is not defined in interactive environments (like Jupyter).
# Try to use it when available, otherwise fall back to the current working directory.
try:
	base_dir = Path(__file__).resolve().parent
except NameError:
	base_dir = Path.cwd()

file_path = base_dir.parent / "data" / "1311dataset.csv"

df = pd.read_csv(file_path)

df.head()

# Sort by account and date first
df['date_post'] = pd.to_datetime(df['date_post'], format='%Y%m%d')
df = df.sort_values(["bank_account_uuid", "date_post"])



In [35]:
# Calculate rolling features
df['amount_mean_5'] = df.groupby('bank_account_uuid')['amount'] \
                        .transform(lambda x: x.rolling(5, min_periods=1).mean())
df['amount_std_5'] = df.groupby('bank_account_uuid')['amount'] \
                       .transform(lambda x: x.rolling(5, min_periods=1).std()).fillna(0)
df['amount_change'] = df.groupby('bank_account_uuid')['amount'].diff()
df['amount_change'] = df['amount_change'].fillna(0)
# ✅ Correct receiver change flag (no MultiIndex)


df['month'] = df['date_post'].dt.month
df['dayofweek'] = df['date_post'].dt.dayofweek
df["year"] = df['date_post'].dt.year
# Time delta since last transaction
# Abstand berechnen
df['time_since_last_tx'] = (
    df.groupby(['bank_account_uuid', 'ref_iban'])['date_post']
      .diff().dt.days
)
df.drop("date_post", axis=1, inplace=True)
# globalen Mittelwert über alle gültigen Werte berechnen

# NaN durch den Mittelwert ersetzen
df['time_since_last_tx'] = df['time_since_last_tx'].fillna(30)


print(df.columns)
#gib mir die anomalien aus
anomalies = df[df['anomaly_description'].notna()]
#wie viele unique values hat jede spalte
for col in df.columns:
    print(col, df[col].nunique())

Index(['bank_account_uuid', 'business_partner_name', 'amount', 'currency',
       'ref_name', 'ref_iban', 'ref_swift', 'ref_bank', 'paym_note',
       'trns_type', 'pay_method', 'channel', 'anomaly_description',
       'amount_mean_5', 'amount_std_5', 'amount_change', 'month', 'dayofweek',
       'year', 'time_since_last_tx'],
      dtype='object')
bank_account_uuid 9
business_partner_name 9
amount 1804
currency 1
ref_name 196
ref_iban 204
ref_swift 196
ref_bank 194
paym_note 1800
trns_type 1
pay_method 3
channel 3
anomaly_description 18
amount_mean_5 1806
amount_std_5 1798
amount_change 1797
month 12
dayofweek 7
year 3
time_since_last_tx 22


In [36]:
df["anomaly_description"].value_counts()
df.head()

Unnamed: 0,bank_account_uuid,business_partner_name,amount,currency,ref_name,ref_iban,ref_swift,ref_bank,paym_note,trns_type,pay_method,channel,anomaly_description,amount_mean_5,amount_std_5,amount_change,month,dayofweek,year,time_since_last_tx
1009,1C447D4F1599450AB2AB4DFC163B529F,White Group,42516.45,USD,Ray Inc,GB33XJMF87452642213857,VCKQGB1T3DI,GM,New Equipment Purchase - PO#5858,DEBIT,WIRE,ONLINE_BANKING_PORTAL,,42516.45,0.0,0.0,1,3,2022,30.0
1077,1C447D4F1599450AB2AB4DFC163B529F,White Group,34306.0,USD,Thompson-Perez,GB24CEYO61531523398575,SUPQGBRSBLR,HT,Legal Fees - PO#9502,DEBIT,WIRE,ONLINE_BANKING_PORTAL,,38411.225,5805.664872,-8210.45,1,4,2022,30.0
269,1C447D4F1599450AB2AB4DFC163B529F,White Group,2170.9,USD,Butler LLC Logistics,GB24PNVP62636341275547,KHFOGBIVO5J,BY,Payment for INV-2022-01-1567 (Term 15d),DEBIT,WIRE,ONLINE_BANKING_PORTAL,,26331.116667,21322.287033,-32135.1,1,0,2022,30.0
1766,1C447D4F1599450AB2AB4DFC163B529F,White Group,3039.55,USD,Perry Inc Logistics,GB41ZIWZ00768587878792,ZSYAGBWNT1E,GN,Payment for INV-2022-01-3885 (Term 15d),DEBIT,WIRE,ONLINE_BANKING_PORTAL,,20508.225,20945.58551,868.65,2,3,2022,30.0
61,1C447D4F1599450AB2AB4DFC163B529F,White Group,18408.99,USD,Lopez Ltd,GB36UFFR27318410616772,WEBDGBJHZGS,KI,Legal Fees - PO#8377,DEBIT,WIRE,ONLINE_BANKING_PORTAL,,20088.378,18163.686899,15369.44,2,6,2022,30.0


In [37]:
columns_to_drop = ["bank_account_uuid", "ref_bank", "currency", "trns_type"]

df = df.drop(columns=columns_to_drop)

In [38]:
df.head()

Unnamed: 0,business_partner_name,amount,ref_name,ref_iban,ref_swift,paym_note,pay_method,channel,anomaly_description,amount_mean_5,amount_std_5,amount_change,month,dayofweek,year,time_since_last_tx
1009,White Group,42516.45,Ray Inc,GB33XJMF87452642213857,VCKQGB1T3DI,New Equipment Purchase - PO#5858,WIRE,ONLINE_BANKING_PORTAL,,42516.45,0.0,0.0,1,3,2022,30.0
1077,White Group,34306.0,Thompson-Perez,GB24CEYO61531523398575,SUPQGBRSBLR,Legal Fees - PO#9502,WIRE,ONLINE_BANKING_PORTAL,,38411.225,5805.664872,-8210.45,1,4,2022,30.0
269,White Group,2170.9,Butler LLC Logistics,GB24PNVP62636341275547,KHFOGBIVO5J,Payment for INV-2022-01-1567 (Term 15d),WIRE,ONLINE_BANKING_PORTAL,,26331.116667,21322.287033,-32135.1,1,0,2022,30.0
1766,White Group,3039.55,Perry Inc Logistics,GB41ZIWZ00768587878792,ZSYAGBWNT1E,Payment for INV-2022-01-3885 (Term 15d),WIRE,ONLINE_BANKING_PORTAL,,20508.225,20945.58551,868.65,2,3,2022,30.0
61,White Group,18408.99,Lopez Ltd,GB36UFFR27318410616772,WEBDGBJHZGS,Legal Fees - PO#8377,WIRE,ONLINE_BANKING_PORTAL,,20088.378,18163.686899,15369.44,2,6,2022,30.0


In [39]:
#alle ausgeben lasse wo anomaly != nan ist
df.columns

Index(['business_partner_name', 'amount', 'ref_name', 'ref_iban', 'ref_swift',
       'paym_note', 'pay_method', 'channel', 'anomaly_description',
       'amount_mean_5', 'amount_std_5', 'amount_change', 'month', 'dayofweek',
       'year', 'time_since_last_tx'],
      dtype='object')

In [40]:
#wie viele unique values hat jede spalte
for col in df.columns:
    print(col, df[col].nunique())

business_partner_name 9
amount 1804
ref_name 196
ref_iban 204
ref_swift 196
paym_note 1800
pay_method 3
channel 3
anomaly_description 18
amount_mean_5 1806
amount_std_5 1798
amount_change 1797
month 12
dayofweek 7
year 3
time_since_last_tx 22


In [41]:
hasher_iban = FeatureHasher(input_type='string', n_features=10)
hashed_features = hasher_iban.transform(df['ref_iban'].astype(str).values.reshape(-1,1))
hashed_df = pd.DataFrame(hashed_features.toarray(), columns=[f'iban_hash_{i}' for i in range(10)])
df = pd.concat([hashed_df, df], axis=1)
df = df.drop("ref_iban", axis=1)

hasher_swift = FeatureHasher(input_type='string', n_features=10)
hashed_features_swift = hasher_swift.transform(df['ref_swift'].astype(str).values.reshape(-1,1))
hashed_df_swift = pd.DataFrame(hashed_features_swift.toarray(), columns=[f'swift_hash_{i}' for i in range(10)])
df = pd.concat([hashed_df_swift, df], axis=1)
df = df.drop("ref_swift", axis=1)

In [42]:
hasher_paym_note = FeatureHasher(input_type='string', n_features=10)
hashed_paym_note = hasher_paym_note.transform(df['paym_note'].astype(str).values.reshape(-1,1))
hashed_df_payment_node = pd.DataFrame(hashed_paym_note.toarray(), columns=[f'paym_note{i}' for i in range(10)])
df = pd.concat([hashed_df_payment_node, df], axis=1)
df = df.drop("paym_note", axis=1)

In [43]:
df

Unnamed: 0,paym_note0,paym_note1,paym_note2,paym_note3,paym_note4,paym_note5,paym_note6,paym_note7,paym_note8,paym_note9,swift_hash_0,swift_hash_1,swift_hash_2,swift_hash_3,swift_hash_4,swift_hash_5,swift_hash_6,swift_hash_7,swift_hash_8,swift_hash_9,iban_hash_0,iban_hash_1,iban_hash_2,iban_hash_3,iban_hash_4,iban_hash_5,iban_hash_6,iban_hash_7,iban_hash_8,iban_hash_9,business_partner_name,amount,ref_name,pay_method,channel,anomaly_description,amount_mean_5,amount_std_5,amount_change,month,dayofweek,year,time_since_last_tx
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Cook Ltd,1567.90,Oliver and Sons Logistics,WIRE,ONLINE_BANKING_PORTAL,,2454.034,1285.530742,-1684.41,1,0,2024,32.0
1,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Martinez Ltd,2903.04,Fields LLC Logistics,WIRE,ONLINE_BANKING_PORTAL,,3950.452,1122.033471,-2810.80,2,0,2023,33.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Carpenter Inc,2793.36,Smith-Kramer Logistics,WIRE,ONLINE_BANKING_PORTAL,,2982.630,976.011437,-701.17,10,2,2022,30.0
3,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,Cook Ltd,1212.31,Oliver and Sons Logistics,WIRE,ONLINE_BANKING_PORTAL,,2282.722,1362.287499,-2775.02,5,0,2022,28.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Williams Inc,3110.65,Knox and Sons Logistics,WIRE,ONLINE_BANKING_PORTAL,,26886.630,53642.616650,329.93,5,3,2022,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cook Ltd,47000.08,Allen and Sons,WIRE,ONLINE_BANKING_PORTAL,,11565.776,19825.061149,44307.30,3,2,2022,30.0
1802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,Williams Inc,3017.75,Knox and Sons Logistics,WIRE,ONLINE_BANKING_PORTAL,,19523.596,39068.577010,129.32,1,0,2024,34.0
1803,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,White Group,3737.33,Wilson-Conner Logistics,WIRE,ONLINE_BANKING_PORTAL,,41177.542,83740.920464,-1034.86,5,2,2023,28.0
1804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,Carroll-Taylor,2449.64,Taylor-Morales Logistics,WIRE,ONLINE_BANKING_PORTAL,,35768.144,55172.436109,-1763.59,3,0,2023,28.0
