In [1]:
# Baseline-Modelle
import pandas as pd

# Load the raw transaction data; discard the stray index column if the export carries one
df = pd.read_excel("../data/raw/PSP_Jan_Feb_2019.xlsx")
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# Observed success rate per PSP (mean of the 'success' column within each PSP group)
erfolgsraten = df.groupby('PSP')['success'].mean()
print("Erfolgsraten je PSP:", erfolgsraten, "", sep="\n")

# Fee tables: amount charged per transaction when it succeeds / when it fails
kosten_erfolg = {
    'Moneycard': 5,
    'Goldcard': 10,
    'UK_Card': 3,
    'Simplecard': 1,
}
kosten_fail = {
    'Moneycard': 2,
    'Goldcard': 5,
    'UK_Card': 1,
    'Simplecard': 0.5,
}

# Baseline A: what-if scenario in which every transaction is routed to a single PSP.
# Expected fee = P(success) * success fee + P(failure) * failure fee.
print("Baseline A: Immer nur einen PSP wählen")
for psp, avg_success in erfolgsraten.items():
    fee_on_success = kosten_erfolg[psp]
    fee_on_failure = kosten_fail[psp]
    avg_cost = avg_success * fee_on_success + (1 - avg_success) * fee_on_failure
    print(f"PSP: {psp:10} | Erfolgsrate: {avg_success:.3f} | Ø Kosten: {avg_cost:.2f} €")
print()

# Baseline B: Regelbasiert nach Betrag
def regelbasiert_psp(amount):
    """Pick a PSP from the transaction amount using fixed thresholds.

    Amounts below 100 map to 'Simplecard', amounts below 300 map to
    'UK_Card', and everything else (including values that compare false
    against both bounds, such as NaN) maps to 'Goldcard'.
    """
    # Ordered (exclusive upper bound, PSP) pairs; the first matching bound wins
    for upper_bound, psp_name in ((100, 'Simplecard'), (300, 'UK_Card')):
        if amount < upper_bound:
            return psp_name
    return 'Goldcard'

# Route every transaction through the amount rule
df['baseline_PSP'] = df['amount'].map(regelbasiert_psp)

# Expected success of the chosen PSP = that PSP's overall observed success rate
df['baseline_success'] = df['baseline_PSP'].map(erfolgsraten)

# Expected fee per transaction: success fee weighted by P(success) plus
# failure fee weighted by P(failure). The rule only returns PSPs that are
# present in both fee tables, so the dict lookups always resolve.
fee_on_success = df['baseline_PSP'].map(kosten_erfolg)
fee_on_failure = df['baseline_PSP'].map(kosten_fail)
df['baseline_cost'] = (
    fee_on_success * df['baseline_success']
    + fee_on_failure * (1 - df['baseline_success'])
)

# Aggregate over all transactions
regel_success = df['baseline_success'].mean()
regel_cost = df['baseline_cost'].mean()
print(f"Baseline B (regelbasiert): Erfolgsrate: {regel_success:.3f} | Ø Kosten: {regel_cost:.2f} €")


Erfolgsraten je PSP:
PSP
Goldcard      0.406172
Moneycard     0.218754
Simplecard    0.158123
UK_Card       0.194338
Name: success, dtype: float64

Baseline A: Immer nur einen PSP wählen
PSP: Goldcard   | Erfolgsrate: 0.406 | Ø Kosten: 7.03 €
PSP: Moneycard  | Erfolgsrate: 0.219 | Ø Kosten: 2.66 €
PSP: Simplecard | Erfolgsrate: 0.158 | Ø Kosten: 0.58 €
PSP: UK_Card    | Erfolgsrate: 0.194 | Ø Kosten: 1.39 €

Baseline B (regelbasiert): Erfolgsrate: 0.223 | Ø Kosten: 2.17 €


In [2]:
#Datenaufbereitung für das ML-Modell inkl. Speicherung als CSV

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

# 1. Load the raw dataset and drop the stray index column if present
df = pd.read_excel("../data/raw/PSP_Jan_Feb_2019.xlsx")
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# 2. Derive payment-attempt features.
# Rows sharing the same minute, country and amount are treated as repeated
# attempts of one purchase.
# 'min' is the supported minute alias; the previous 'T' alias is deprecated
# since pandas 2.2.
df['minute'] = pd.to_datetime(df['tmsp']).dt.floor('min')
df['attempt_group'] = df.groupby(['minute', 'country', 'amount']).ngroup()
# Position of the row within its group, 1-based.
# NOTE(review): cumcount follows row order — assumes the file is sorted by
# 'tmsp'; TODO confirm.
df['attempt_number'] = df.groupby('attempt_group').cumcount() + 1
# Group size.
# NOTE(review): this counts all rows of the group, including later attempts,
# so it may not be known when an earlier attempt is scored — confirm this
# feature is available at prediction time.
df['total_attempts'] = df.groupby('attempt_group')['amount'].transform('count')

# 3. Choose model inputs and the target column
features = [
    'amount', '3D_secured', 'card', 'country', 'PSP',
    'attempt_number', 'total_attempts',
]
X = df.loc[:, features]
y = df.loc[:, 'success']

# 4. One-hot encode the categorical columns; drop_first removes one dummy
# level per column
categorical_columns = ['card', 'country', 'PSP']
X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# 5. 80/20 train/test split, stratified on the target, with a fixed seed
split_parts = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# 6. Make sure the output directory exists
os.makedirs("../data/processed/", exist_ok=True)

# 7. Write each split to its CSV file (the pandas index is not written)
for split_data, csv_path in (
    (X_train, "../data/processed/X_train.csv"),
    (X_test, "../data/processed/X_test.csv"),
    (y_train, "../data/processed/y_train.csv"),
    (y_test, "../data/processed/y_test.csv"),
):
    split_data.to_csv(csv_path, index=False)

# 8. Overview and sanity check
for shape_label, split_data in (
    ("Shape X_train:", X_train),
    ("Shape X_test :", X_test),
    ("Shape y_train:", y_train),
    ("Shape y_test :", y_test),
):
    print(shape_label, split_data.shape)
print("Dateien wurden erfolgreich im Ordner '../data/processed/' gespeichert.")
print("Beispiel für neue Features (erste Zeilen):")
attempt_preview = X_train[['attempt_number', 'total_attempts']].head()
print(attempt_preview)


Shape X_train: (40328, 11)
Shape X_test : (10082, 11)
Shape y_train: (40328,)
Shape y_test : (10082,)
Dateien wurden erfolgreich im Ordner '../data/processed/' gespeichert.
Beispiel für neue Features (erste Zeilen):
       attempt_number  total_attempts
4325                1               1
49061               1               1
417                 1               2
49426               2               2
16767               1               1


In [3]:
# Vorhersagemodell

import pandas as pd
import numpy as np
import os
import datetime
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, precision_recall_curve
import matplotlib.pyplot as plt

# Timestamp used in every report file name produced by this run
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Make sure the reports directory exists
os.makedirs("../reports/", exist_ok=True)

# Load the persisted train/test splits; targets are flattened to 1-D arrays
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test = pd.read_csv("../data/processed/X_test.csv")
y_train = pd.read_csv("../data/processed/y_train.csv").to_numpy().ravel()
y_test = pd.read_csv("../data/processed/y_test.csv").to_numpy().ravel()

# 1. scale_pos_weight = number of negatives / number of positives in the training target
positive_count = sum(y_train)
ratio = (len(y_train) - positive_count) / positive_count
# Mode "w" starts a fresh report file
with open(f"../reports/ergebnisse_{timestamp}.txt", "w") as f:
    f.write(f"scale_pos_weight = {ratio:.2f}\n")

# 2. Train XGBoost with the class-imbalance weight
model = XGBClassifier(
    scale_pos_weight=ratio,
    eval_metric='logloss',
    random_state=42,
)
model.fit(X_train, y_train)

# 3. Predict on the test set and compute the standard metrics
y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Append metrics, confusion matrix and classification report to the run's report file
with open(f"../reports/ergebnisse_{timestamp}.txt", "a") as f:
    f.writelines([
        "\nXGBoost (mit scale_pos_weight) auf Testdaten:\n",
        f"Accuracy  : {acc:.3f}\n",
        f"Precision : {prec:.3f}\n",
        f"Recall    : {rec:.3f}\n",
        f"F1-Score  : {f1:.3f}\n",
        f"\nKonfusionsmatrix:\n{cm}\n",
        f"\nClassification Report:\n{classification_report(y_test, y_pred)}\n",
    ])

# Cost-sensitive evaluation: each confusion-matrix cell gets a fixed payoff
costs = {'TP': +1, 'FP': -5, 'FN': -3, 'TN': 0}
# Flattening the 2x2 matrix row by row yields TN, FP, FN, TP
TN, FP, FN, TP = cm.ravel()
outcome_counts = {'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN}
total_benefit = sum(count * costs[outcome] for outcome, count in outcome_counts.items())
with open(f"../reports/ergebnisse_{timestamp}.txt", "a") as f:
    f.writelines([
        f"\nGesamtnutzen (Kostensensitive Bewertung): {total_benefit}\n",
        f"TP: {TP}, FP: {FP}, FN: {FN}, TN: {TN}\n",
    ])

# Predicted probability of the positive class and the resulting precision/recall curve
y_proba = model.predict_proba(X_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)

# Feature importances, sorted descending, saved as CSV
importances = model.feature_importances_
features = X_train.columns
feature_importance_df = pd.DataFrame({'feature': features, 'importance': importances}).sort_values(by='importance', ascending=False)
feature_importance_df.to_csv(f"../reports/feature_importance_{timestamp}.csv", index=False)

# Bar chart of the ten most important features (saved to disk, not displayed)
top_features = feature_importance_df.head(10)
fig_importance, ax_importance = plt.subplots(figsize=(8, 6))
ax_importance.barh(top_features['feature'], top_features['importance'])
ax_importance.invert_yaxis()  # largest importance at the top
ax_importance.set_title("Top 10 Feature Importances (XGBoost)")
ax_importance.set_xlabel("Importance")
fig_importance.tight_layout()
fig_importance.savefig(f"../reports/feature_importance_plot_{timestamp}.png")
plt.close(fig_importance)

# Precision and recall as functions of the decision threshold.
# The curve arrays are sliced with [:-1] so they line up with `thresholds`.
fig_pr, ax_pr = plt.subplots(figsize=(8, 6))
ax_pr.plot(thresholds, precisions[:-1], label='Precision')
ax_pr.plot(thresholds, recalls[:-1], label='Recall')
ax_pr.set_xlabel('Schwellenwert')
ax_pr.set_ylabel('Metrik')
ax_pr.legend()
ax_pr.set_title('Precision und Recall vs. Schwellenwert')
fig_pr.tight_layout()
fig_pr.savefig(f"../reports/precision_recall_curve_{timestamp}.png")
# Show before closing: calling show() after close() has no open figure left to display
plt.show()
plt.close(fig_pr)

# Pick the threshold with the highest F1 along the precision/recall curve.
# The 1e-8 term keeps the denominator non-zero.
pr_product = precisions * recalls
pr_sum = precisions + recalls + 1e-8
f1_scores = 2 * pr_product / pr_sum
best_index = f1_scores.argmax()
best_threshold = thresholds[best_index]
# NOTE(review): the threshold is selected on the same test data that the
# metrics below are reported on, so those metrics are likely optimistic.
with open(f"../reports/ergebnisse_{timestamp}.txt", "a") as f:
    f.write(f"\nBester Threshold für maximalen F1-Score: {best_threshold:.2f}\n")

# Re-classify the test set with the selected threshold and recompute the metrics
y_pred_opt = (y_proba >= best_threshold).astype(int)
acc_opt = accuracy_score(y_test, y_pred_opt)
prec_opt = precision_score(y_test, y_pred_opt)
rec_opt = recall_score(y_test, y_pred_opt)
f1_opt = f1_score(y_test, y_pred_opt)
cm_opt = confusion_matrix(y_test, y_pred_opt)
with open(f"../reports/ergebnisse_{timestamp}.txt", "a") as f:
    f.writelines([
        "\nMetriken mit optimiertem Threshold:\n",
        f"Accuracy  : {acc_opt:.3f}\n",
        f"Precision : {prec_opt:.3f}\n",
        f"Recall    : {rec_opt:.3f}\n",
        f"F1-Score  : {f1_opt:.3f}\n",
        f"Konfusionsmatrix:\n{cm_opt}\n",
    ])

# Day / night evaluation
# Imported here so this section also runs in a kernel that only executed this cell.
from sklearn.model_selection import train_test_split

# Reload the raw data to get the timestamps (they are not part of the feature CSVs)
df = pd.read_excel("../data/raw/PSP_Jan_Feb_2019.xlsx")

# X_test was written with index=False and read back, so X_test.index is just
# 0..n-1 and does NOT identify the raw rows of the test set. Re-derive the
# test row positions by repeating the stratified split with the parameters
# used when the splits were created (test_size=0.2, random_state=42,
# stratified on 'success').
_, test_positions = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42, stratify=df['success']
)
# Guard against silent misalignment: the recovered rows must reproduce y_test exactly
if not np.array_equal(df['success'].to_numpy()[test_positions], y_test):
    raise ValueError(
        "Recovered test rows do not match y_test; the raw data or the split "
        "parameters differ from those used to create the processed CSV files."
    )

# .copy() so that adding a column does not write into a slice of df
df_test = df.iloc[test_positions].copy()
df_test['hour'] = pd.to_datetime(df_test['tmsp']).dt.hour

# Day = hours 8 through 18 inclusive; everything else counts as night
mask_day = df_test['hour'].between(8, 18)
mask_night = ~mask_day

with open(f"../reports/ergebnisse_{timestamp}.txt", "a") as f:
    for label, mask in zip(['Tag', 'Nacht'], [mask_day, mask_night]):
        # y_test / y_pred_opt are positional arrays, so index them with a plain boolean array
        selected = mask.to_numpy()
        recall_val = recall_score(y_test[selected], y_pred_opt[selected])
        prec_val = precision_score(y_test[selected], y_pred_opt[selected])
        f1_val = f1_score(y_test[selected], y_pred_opt[selected])
        f.write(f"\nAuswertung für {label}-Transaktionen:\n")
        f.write(f"Recall: {recall_val:.3f}, Precision: {prec_val:.3f}, F1-Score: {f1_val:.3f}\n")
