In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import pandas as pd
import pickle
import numpy as np
from sklearn.metrics import confusion_matrix

# Einlesen der Daten PSP_Jan_Feb_2019_preprocessed
df = pd.read_csv("./PSP_Jan_Feb_2019_preprocessed.csv")

# Feature Auswahl und Zielvariable
features = ['amount', 'PSP', '3D_secured', 'card', 'country', 'weekday', 'day', 'hour', 'minute']
target_variable = ['success']

X = df[features]
y = df[target_variable].values.ravel()

# Verschiedene Modelle testen
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Neural Network': MLPClassifier(random_state=42)
}

# Durchführung von Kreuzvalidierung und Vergleich der Modelle
score = 0
for model_name, model in models.items():
    # Verwenden Sie f1_score als Metrik für die cross_val_score
    scores = cross_val_score(model, X, y, cv=5, scoring=make_scorer(f1_score, average='weighted'))
    if scores.mean() > score:
        score = scores.mean()
        highest_score = scores.mean()
        model_name_highest_score = model_name
    print(f'{model_name} F1-Score: {scores.mean()}')

print(f'The highes score is {highest_score} of model {model_name_highest_score}')

model = RandomForestClassifier()

# Definieren der Hyperparameter-Räume für die zufällige Suche
param_dist = {
    'n_estimators': randint(50, 300),  # Beispiel für eine kontinuierliche Verteilung
    'max_depth': [10, 20, 30,40,50,60],
    'min_samples_split': [2, 3, 4,5,6,7],
    'min_samples_leaf': [1, 2, 3,4,5,6]
}

# Verwenden Sie RandomizedSearchCV anstelle von GridSearchCV
random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)

# Führen Sie die zufällige Suche durch
random_search.fit(X, y)

# Holen Sie sich die besten Hyperparameter
best_params = random_search.best_params_

# Zuweisen der Parameter
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']
best_min_samples_leaf = best_params['min_samples_leaf']

# Zeigen Sie die besten Hyperparameter an
print("Die besten Hyperparameter sind:", best_params)

# Aufteilung der Daten in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modell erstellen und trainieren
model = RandomForestClassifier(
    max_depth=best_max_depth,
    n_estimators=best_n_estimators,
    min_samples_split= best_min_samples_split,
    min_samples_leaf=best_min_samples_leaf,
    random_state=42)

model.fit(X_train, y_train)

# Vorhersagen
y_pred = model.predict(X_test)

# Auswertung des Modells
accuracy = round(accuracy_score(y_test, y_pred),2)
precision = round(precision_score(y_test, y_pred, zero_division=1.0, average='weighted'),2)
recall = round(recall_score(y_test, y_pred, zero_division=1.0, average='weighted'),2)
f1_score = round(f1_score(y_test,y_pred,zero_division=1.0,average='weighted'),2)

# Ergebnisse ausgeben
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1_score}')

unique_label = np.unique([y_test, y_pred])
cmtx = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=unique_label),
    index=["true:{:}".format(x) for x in unique_label],
    columns=["pred:{:}".format(x) for x in unique_label],
)
print(cmtx)

# Relevante Features ausgeben
feature_importances = pd.DataFrame(model.feature_importances_,
                                      index=X_train.columns,
                                        columns=['importance']).sort_values('importance', ascending=False)
# Runden der Werte auf zwei Nachkommastellen
feature_importances = feature_importances.round(2)

print(feature_importances)

# Modell speichern für die weitere Verwendung


Logistic Regression F1-Score: 0.7073058359946423
K-Nearest Neighbors F1-Score: 0.7082928146822705
Support Vector Machine F1-Score: 0.7073058359946423
Naive Bayes F1-Score: 0.7231326515141259
Decision Tree F1-Score: 0.6800890605742047
Random Forest F1-Score: 0.727088135472554
Gradient Boosting F1-Score: 0.7178462511699395
Neural Network F1-Score: 0.709435064896454
The highes score is 0.727088135472554 of model Random Forest
Die besten Hyperparameter sind: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 108}
Accuracy: 0.8
Precision: 0.75
Recall: 0.8
F1-Score: 0.72
        pred:0  pred:1
true:0    8022      57
true:1    1937      65
            importance
amount            0.24
PSP               0.21
minute            0.13
hour              0.12
day               0.11
weekday           0.05
card              0.05
3D_secured        0.04
country           0.03


In [13]:
import sklearn.metrics as metrics
# Auswertung des Modells
accuracy = round(accuracy_score(y_test, y_pred),2)
precision = round(precision_score(y_test, y_pred, zero_division=1.0),2)
recall = round(recall_score(y_test, y_pred, zero_division=1.0),2)
f1_score = round(metrics.f1_score(y_test,y_pred,zero_division=1),2)

# Ergebnisse ausgeben
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1_score}')

Accuracy: 0.8
Precision: 0.53
Recall: 0.03
F1-Score: 0.06


In [6]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix

# Einlesen der Daten PSP_Jan_Feb_2019_preprocessed
df = pd.read_csv("./PSP_Jan_Feb_2019_preprocessed.csv")

# Feature Auswahl und Zielvariable
features = ['amount', 'PSP', '3D_secured', 'card', 'country', 'weekday', 'day', 'hour', 'minute']
target_variable = ['success']

X = df[features]
y = df[target_variable].values.ravel()

# Baseline-Modell erstellen (am häufigsten auftretende Klasse verwenden)
baseline_model = DummyClassifier(strategy='most_frequent')

# Durchführung von Kreuzvalidierung und Bewertung der Baseline
baseline_scores = cross_val_score(baseline_model, X, y, cv=5, scoring=make_scorer(f1_score, average='weighted'))
print(f'Baseline Model F1-Score: {baseline_scores.mean()}')

# Bewertungsmetriken für das Baseline-Modell ausgeben
baseline_model.fit(X, y)
baseline_predictions = baseline_model.predict(X)

accuracy = accuracy_score(y, baseline_predictions)
precision = precision_score(y, baseline_predictions, pos_label=1, zero_division=1.0, average='weighted')
recall = recall_score(y, baseline_predictions, pos_label=1, zero_division=1.0, average='weighted')
f1_score = f1_score(y, baseline_predictions, pos_label=1, zero_division=1.0, average='weighted')

print('Baseline Model Metrics:')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1_score}')

unique_label = np.unique([y, baseline_predictions])
cmtx = pd.DataFrame(
    confusion_matrix(y, baseline_predictions, labels=unique_label),
    index=["true:{:}".format(x) for x in unique_label],
    columns=["pred:{:}".format(x) for x in unique_label],
)
print(cmtx)

Baseline Model F1-Score: 0.7073058359946423
Baseline Model Metrics:
Accuracy: 0.7972460863871749
Precision: 0.8383552358724917
Recall: 0.7972460863871749
F1-Score: 0.7073058353821238
        pred:0  pred:1
true:0   40182       0
true:1   10219       0


In [14]:
print("Hello")

Hello
