In [3]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the raw PSP transaction log
data = pd.read_excel('../data/raw/PSP_Jan_Feb_2019.xlsx')

# Data cleaning and feature engineering.
# Work on an explicit copy of the NaN-free rows: the column assignments below then
# never write into a (potential) view of `data`, which avoids pandas'
# SettingWithCopyWarning and leaves the raw frame untouched.
data_cleaned = data.dropna().copy()
data_cleaned['tmsp'] = pd.to_datetime(data_cleaned['tmsp'])

# Calendar features derived from the transaction timestamp
data_cleaned['hour'] = data_cleaned['tmsp'].dt.hour
data_cleaned['day'] = data_cleaned['tmsp'].dt.day
data_cleaned['month'] = data_cleaned['tmsp'].dt.month
data_cleaned['day_of_week'] = data_cleaned['tmsp'].dt.dayofweek

# Number of *other* rows that share the same country, amount and timestamp.
# NOTE(review): the group key contains the exact `tmsp` value, so only rows with an
# identical timestamp are counted as repeated attempts - confirm that this is the
# intended definition of "previous attempts".
data_cleaned['previous_attempts'] = (
    data_cleaned.groupby(['country', 'amount', 'tmsp'])['tmsp'].transform('count') - 1
)

# One-hot encode the payment service provider column (creates `PSP_<name>` columns)
data_cleaned = pd.get_dummies(data_cleaned, columns=['PSP'])

# Feature list: the fixed base features followed by every PSP dummy column
features = [
    'amount',
    '3D_secured',
    'hour',
    'day',
    'month',
    'day_of_week',
    'previous_attempts',
    *(name for name in data_cleaned.columns if name.startswith('PSP_')),
]

# Feature matrix and target vector
X = data_cleaned[features]
y = data_cleaned['success']

# Hold out 20% of the rows as a test set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training split (`fit` returns the estimator)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = rf_classifier.predict(X_test)

# Score the predictions against the true test labels
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric rounded to two decimals
print("Model Evaluation Metrics (ohne Kostenfunktion):")
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value:.2f}")

# Fee schedule per PSP (in euros), keyed by the outcome of the transaction
transaction_costs = {
    'Moneycard': {'success': 5, 'failure': 2},
    'Goldcard': {'success': 10, 'failure': 5},
    'UK_Card': {'success': 3, 'failure': 1},
    'Simplecard': {'success': 1, 'failure': 0.5}
}

# Keep predictions and ground truth next to the test features.
# X_test was produced by train_test_split; assigning new columns to it directly can
# trigger pandas' SettingWithCopyWarning, so the columns are added to an explicit copy.
X_test = X_test.copy()
X_test['predicted_success'] = y_pred
X_test['actual_success'] = y_test.values

# PSP of each test row = name of its active one-hot column (first maximum per row),
# resolved for the whole frame at once instead of one `.iloc` lookup per row.
psp_columns = [col for col in X_test.columns if col.startswith('PSP_')]
psp_per_row = [name.replace('PSP_', '') for name in X_test[psp_columns].idxmax(axis=1)]

# Sum the fee of every test transaction.
# NOTE(review): the fee is selected by the *predicted* outcome, not by
# `actual_success` - confirm that this is the intended cost definition.
total_cost = sum(
    transaction_costs[psp]['success' if predicted == 1 else 'failure']
    for psp, predicted in zip(psp_per_row, X_test['predicted_success'])
)

print(f"Gesamte Transaktionskosten (ohne Kostenfunktion): {total_cost:.2f} Euro")

Model Evaluation Metrics (ohne Kostenfunktion):
Accuracy: 0.76
Precision: 0.27
Recall: 0.14
F1 Score: 0.18
Gesamte Transaktionskosten (ohne Kostenfunktion): 15601.00 Euro


In [8]:
from sklearn.metrics import make_scorer

# Total PSP fees of a set of transactions, based on their actual outcomes
def cost_function(y_true, y_pred, X, costs=None):
    """Sum the PSP fees incurred by the transactions described by ``X``.

    Every row is charged the ``'success'`` fee of its PSP when the matching entry
    of ``y_true`` equals 1 and the ``'failure'`` fee otherwise. Rows of ``X`` and
    entries of ``y_true`` are paired by position, not by index label.

    Args:
        y_true: Actual outcomes (1 = success), one per row of ``X``. Any
            array-like is accepted (Series, list, NumPy array).
        y_pred: Predicted outcomes. Not used by the computation; the parameter
            only keeps the ``(y_true, y_pred, ...)`` argument order.
        X: DataFrame containing one-hot ``PSP_<name>`` columns. The PSP of a row
            is the ``PSP_`` column holding the row maximum (the first one on ties).
        costs: Optional fee table ``{psp: {'success': fee, 'failure': fee}}``.
            Defaults to the notebook-level ``transaction_costs``.

    Returns:
        The summed fee over all rows; ``0`` for empty input.

    Raises:
        ValueError: If ``y_true`` and ``X`` differ in length, or ``X`` has no
            ``PSP_`` column.
        KeyError: If a PSP found in ``X`` is missing from the fee table.
    """
    fee_table = transaction_costs if costs is None else costs

    # Positional view of the outcomes, independent of the input container type
    outcomes = pd.Series(y_true).to_numpy()
    if len(outcomes) != len(X):
        raise ValueError(
            f"y_true has {len(outcomes)} entries but X has {len(X)} rows"
        )
    if len(outcomes) == 0:
        return 0

    psp_columns = [col for col in X.columns if col.startswith('PSP_')]
    if not psp_columns:
        raise ValueError("X contains no 'PSP_' columns")

    # Resolve the PSP of all rows in one vectorised call instead of row by row
    psp_names = [name.replace('PSP_', '') for name in X[psp_columns].idxmax(axis=1)]

    return sum(
        fee_table[psp]['success' if outcome == 1 else 'failure']
        for psp, outcome in zip(psp_names, outcomes)
    )

# Scorer wrapper around the cost function; `greater_is_better=False` flips the sign
# because a lower cost is better.
# The former `needs_proba=False` / `needs_threshold=False` arguments only restated
# the defaults and are deprecated in recent scikit-learn releases, so they are omitted.
# NOTE(review): `cost_function` requires the feature frame `X` as third argument,
# which a `make_scorer` scorer does not appear to supply; `cost_scorer` is also not
# used anywhere in the visible cells - confirm how it is meant to be applied.
cost_scorer = make_scorer(cost_function, greater_is_better=False)

# Train a second random forest.
# NOTE(review): hyperparameters, seed and training data are the same as for
# `rf_classifier`, and neither `cost_function` nor `cost_scorer` takes part in
# training - confirm whether cost-sensitive training was intended here.
rf_classifier_with_cost = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_with_cost.fit(X_train, y_train)

# Predict on the model features only: drop the helper columns an earlier cell may
# have added to X_test (`errors='ignore'` keeps this working if they are absent).
X_test_for_prediction = X_test.drop(columns=['predicted_success', 'actual_success'], errors='ignore')
y_pred_with_cost = rf_classifier_with_cost.predict(X_test_for_prediction)

# Evaluate: accuracy of the predictions and total fee of the test transactions
accuracy_with_cost = accuracy_score(y_test, y_pred_with_cost)
total_cost_with_cost = cost_function(y_test, y_pred_with_cost, X_test_for_prediction)

print(f"\nModel Evaluation Metrics (mit Kostenfunktion):")
print(f"Accuracy: {accuracy_with_cost:.2f}")
print(f"Total cost of transactions: {total_cost_with_cost:.2f}")

# Baseline: transaction costs before the model is put in place (current state)

# Outcome counts over the complete raw data set
total_transactions = len(data)
total_successful_transactions = data['success'].sum()
total_failed_transactions = total_transactions - total_successful_transactions

# Per PSP: average fee per transaction if every transaction in `data` were charged
# with that PSP's fees.
# NOTE(review): the PSP actually used by each transaction is not taken into account.
average_transaction_costs_before_model = {
    psp: (
        transaction_costs[psp]['success'] * total_successful_transactions
        + transaction_costs[psp]['failure'] * total_failed_transactions
    ) / total_transactions
    for psp in ('Moneycard', 'Goldcard', 'UK_Card', 'Simplecard')
}

# Unweighted mean of the per-PSP averages
per_psp_averages = average_transaction_costs_before_model.values()
total_average_transaction_cost_before_model = sum(per_psp_averages) / len(per_psp_averages)

print(f"\nTransaktionskosten vor Umsetzung des Modells (durchschnittlich): {total_average_transaction_cost_before_model:.2f} Euro")


Model Evaluation Metrics (mit Kostenfunktion):
Accuracy: 0.76
Total cost of transactions: 17466.00

Transaktionskosten vor Umsetzung des Modells (durchschnittlich): 2.66 Euro


In [10]:
# Per PSP: total fee if every transaction in `data` were charged with that PSP's
# fees (the dict keeps its "average" name but holds totals in this cell).
# NOTE(review): the grand total below adds the four per-PSP scenarios together, and
# it covers the whole data set while the model costs cover the test split only -
# confirm that this is the intended baseline for the comparison.
average_transaction_costs_before_model = {
    psp: (
        transaction_costs[psp]['success'] * total_successful_transactions
        + transaction_costs[psp]['failure'] * total_failed_transactions
    )
    for psp in ('Moneycard', 'Goldcard', 'UK_Card', 'Simplecard')
}

# Grand total over the four per-PSP totals
total_transaction_cost_before_model = sum(
    psp_total for psp_total in average_transaction_costs_before_model.values()
)

In [11]:
# Side-by-side summary of accuracy and total transaction cost for each variant
print("\nVergleich der Ergebnisse:")

# (label, value, unit suffix) for every line of the report
summary_rows = (
    ("Genauigkeit (ohne Kostenfunktion)", accuracy, ""),
    ("Genauigkeit (mit Kostenfunktion)", accuracy_with_cost, ""),
    ("Gesamte Transaktionskosten (ohne Kostenfunktion)", total_cost, " Euro"),
    ("Gesamte Transaktionskosten (mit Kostenfunktion)", total_cost_with_cost, " Euro"),
    ("Transaktionskosten vor Umsetzung des Modells", total_transaction_cost_before_model, " Euro"),
)
for label, value, unit in summary_rows:
    print(f"{label}: {value:.2f}{unit}")


Vergleich der Ergebnisse:
Genauigkeit (ohne Kostenfunktion): 0.76
Genauigkeit (mit Kostenfunktion): 0.76
Gesamte Transaktionskosten (ohne Kostenfunktion): 15601.00 Euro
Gesamte Transaktionskosten (mit Kostenfunktion): 17466.00 Euro
Transaktionskosten vor Umsetzung des Modells: 535879.00 Euro
