In [None]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np

import ipywidgets

import matplotlib
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.metrics import f1_score, make_scorer, recall_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from datetime import datetime
import itertools

# Create dataset and train/test split, including SMOTE oversampling

In [None]:
# Load the raw training transactions, remove the columns that are not used
# for the baseline (CurrencyCode and CountryCode hold an identical value
# across all entries) and key every row by its transaction id.
baseline = (
    pd.read_csv("../data/training.csv")
    .drop(columns=["CurrencyCode", "CountryCode", "TransactionStartTime"])
    .set_index("TransactionId")
)

In [None]:
# Categorical columns of the baseline data that are one-hot encoded.
cat_columns = [
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'PricingStrategy',
]

# drop_first=True leaves out the first level of every encoded column.
baseline_dummies = pd.get_dummies(baseline, columns=cat_columns, drop_first=True)

# Identifier columns are removed; the encoded frame replaces ``baseline``.
baseline = baseline_dummies.drop(
    columns=["BatchId", "AccountId", "SubscriptionId", "CustomerId"]
)

In [None]:
# Load the engineered training data, keyed by transaction id.
df = pd.read_csv("../data/training_final.csv").set_index("TransactionId")

# Categorical columns that are one-hot encoded.
cat_columns = [
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'PricingStrategy',
    'BookingType',
    'Interval',
]

# drop_first=True leaves out the first level of every encoded column.
df_dummies = pd.get_dummies(df, columns=cat_columns, drop_first=True)

# Identifier and timestamp columns are removed; the encoded frame replaces ``df``.
df = df_dummies.drop(
    columns=["BatchId", "AccountId", "SubscriptionId", "CustomerId",
             "TransactionStartTime", 'ProviderId-AccountId']
)

In [None]:
# Load the engineered submission data, keyed by transaction id.
test = pd.read_csv("../data/test_final.csv").set_index("TransactionId")

# Categorical columns that are one-hot encoded.
cat_columns = [
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'PricingStrategy',
    'BookingType',
    'Interval',
]

# drop_first=True leaves out the first level of every encoded column.
test_dummies = pd.get_dummies(test, columns=cat_columns, drop_first=True)

# Identifier and timestamp columns are removed; the encoded frame replaces ``test``.
test = test_dummies.drop(
    columns=["BatchId", "AccountId", "SubscriptionId", "CustomerId",
             "TransactionStartTime", 'ProviderId-AccountId']
)

In [None]:
def prepare_data(dataset=baseline, RSEED=0):
    """Split a dataset into train/test parts and oversample the training part.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``FraudResult`` target column; every other column is
        treated as a feature. Defaults to the module-level ``baseline`` frame
        as it exists when this function is defined.
    RSEED : int, default 0
        Random state handed to both ``train_test_split`` and ``SMOTE``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_train_balanced,
        y_train_balanced)``. The first four are a stratified 70/30 split;
        the last two are the training part after SMOTE resampling. The test
        part is never resampled.
    """
    # Define features X and target variable y
    X = dataset.loc[:, dataset.columns != 'FraudResult']
    y = dataset["FraudResult"]

    # Train/test split, stratified on the target. The seed comes from the
    # RSEED argument and is not overridden inside the function.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=RSEED
    )

    # Balancing with SMOTE (training part only)
    sm = SMOTE(random_state=RSEED)
    X_train_balanced, y_train_balanced = sm.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test, X_train_balanced, y_train_balanced

# Get best Hyperparameters for DecisionTree, RandomForest and KNN

In [None]:
#Getting best parameters regarding models
def best_hyperparameter(X_train, y_train, RSEED=0):
    """Search hyperparameters for a decision tree and a random forest.

    The decision tree is tuned with an exhaustive ``GridSearchCV``; the random
    forest with a 5-candidate, 3-fold ``RandomizedSearchCV``. Both searches
    optimise the F1 score. The duration of each search is printed. A KNN
    search is not part of this function.

    Parameters
    ----------
    X_train, y_train
        Training features and target the searches are fitted on.
    RSEED : int, default 0
        Random state for both estimators and for the randomized search.

    Returns
    -------
    tuple
        ``(best_params_dt, best_params_rf)``: the ``DecisionTreeClassifier``
        and ``RandomForestClassifier`` objects created here, with the best
        parameters found applied through ``set_params``. They are returned
        as configured estimators; the caller is expected to fit them.
    """
    # Scorer both searches optimise
    f1 = make_scorer(f1_score)

    # Decision Tree
    start_time = datetime.now()

    param_grid_dt = [{'criterion': ['entropy', 'gini'],
                      'max_depth': [3, 6, 9],
                      'min_samples_leaf': [2, 5, 10]}]
    estimator_dt = DecisionTreeClassifier(random_state=RSEED)
    rs_dt = GridSearchCV(estimator_dt, param_grid_dt, scoring=f1)
    rs_dt.fit(X_train, y_train)
    best_params_dt = estimator_dt.set_params(**rs_dt.best_params_)

    end_time = datetime.now()
    print('Duration DT: {}'.format(end_time - start_time))

    # Random Forest
    start_time = datetime.now()

    param_grid_rf = {
        'n_estimators': np.linspace(10, 200).astype(int),
        'max_depth': [None] + list(np.linspace(3, 20).astype(int)),
        # 'auto' is left out: for classifiers it was an alias of 'sqrt' and
        # newer scikit-learn releases reject it, which would make every
        # sampled candidate using it fail to fit.
        'max_features': ['sqrt', None] + list(np.arange(0.5, 1, 0.1)),
        'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
        'min_samples_split': [2, 5, 10],
        'bootstrap': [True, False]}

    estimator_rf = RandomForestClassifier(random_state=RSEED)
    rs_rf = RandomizedSearchCV(estimator_rf, param_grid_rf, n_jobs=-1, n_iter=5,
                               cv=3, verbose=1, scoring=f1, random_state=RSEED)
    rs_rf.fit(X_train, y_train)
    best_params_rf = estimator_rf.set_params(**rs_rf.best_params_)

    end_time = datetime.now()
    print('Duration RF: {}'.format(end_time - start_time))

    return best_params_dt, best_params_rf

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    With ``normalize=True`` every row of ``cm`` is divided by its row sum
    before printing and plotting. ``classes`` supplies the tick labels for
    both axes. The plot is drawn on the current matplotlib figure; nothing
    is returned.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells above half of the largest value get white text so the label
    # stays readable on the darker background.
    cell_format = 'd'
    if normalize:
        cell_format = '.6f'
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > cutoff else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compare different models (DecTree, RF, ...)

In [None]:
def _fit_and_report(model, X_fit, y_fit, X_test, y_test, context_lines=(),
                    confusion_template="\n Confusion Matrix: \n{}"):
    """Fit ``model``, predict the hold-out set and print the evaluation."""
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_test)

    print("\n\n")
    print("Results for model: \n {}".format(model))
    for line in context_lines:
        print(line)
    print(confusion_template.format(confusion_matrix(y_test, predictions)))
    print("\n Classification report: \n {}".format(classification_report(y_test, predictions)))
    print("\n Matthew Coefficient: \n {}".format(matthews_corrcoef(y_test, predictions)))


def compare(dataset=baseline, RSEED=0, param_search=False, smote=True):
    """Train several classifiers on ``dataset`` and print their test metrics.

    A decision tree, a random forest and an AdaBoost classifier are always
    fitted on the plain training split. With ``smote`` they are fitted a
    second time on the SMOTE-balanced training split. With ``param_search``
    a decision tree and a random forest tuned by ``best_hyperparameter`` are
    evaluated as well. The function only prints; it writes no files and
    returns ``None``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``FraudResult`` column; defaults to the module-level
        ``baseline`` frame as it exists when this function is defined.
    RSEED : int, default 0
        Random state used for the models, the train/test split, SMOTE and
        the hyperparameter search.
    param_search : bool, default False
        Run the hyperparameter search and evaluate the tuned models.
    smote : bool, default True
        Evaluate on SMOTE-balanced training data. Also selects which
        training data the hyperparameter search and the tuned models use.
    """
    print("Settings: \n")
    print("RSEED: {}".format(RSEED))
    print("Hyperparameter Search:{}".format(param_search))
    print("Smote: {}".format(smote))

    # Prepare data for given dataset: train/test split plus SMOTE oversampling,
    # using the same seed as the models.
    X_train, X_test, y_train, y_test, X_train_balanced, y_train_balanced = prepare_data(dataset, RSEED)

    # Defining models
    dtree_baseline = DecisionTreeClassifier(random_state=RSEED)
    RandomForest = RandomForestClassifier(random_state=RSEED)
    AdaBoost = AdaBoostClassifier(n_estimators=100, random_state=RSEED)

    models = [dtree_baseline, RandomForest, AdaBoost]

    # Results for models without SMOTE
    for model in models:
        _fit_and_report(model, X_train, y_train, X_test, y_test)

    smote_line = "Smote: {}".format(smote)

    if smote:
        print("All further results are oversampled via SMOTE:\n")

        # Same model objects, refitted on the balanced training data
        for model in models:
            _fit_and_report(model, X_train_balanced, y_train_balanced, X_test, y_test,
                            context_lines=[smote_line])

    if param_search:
        print("Hyperparameter-Search running for Decision Tree and Random Forest.")

        # The search and the final fit use the same training data, chosen by
        # the ``smote`` flag.
        if smote:
            X_fit, y_fit = X_train_balanced, y_train_balanced
        else:
            X_fit, y_fit = X_train, y_train

        best_params_dt, best_params_rf = best_hyperparameter(X_fit, y_fit, RSEED)
        models_sm_opt = [best_params_dt, best_params_rf]

        print("All further results are results of best hyperparameter search:\n")

        search_line = "Hyperparameter Search:{}".format(param_search)
        for model in models_sm_opt:
            _fit_and_report(model, X_fit, y_fit, X_test, y_test,
                            context_lines=[smote_line, search_line],
                            confusion_template="\n Confusion Matrix: \n {}")

    print("\n END")
        

In [None]:
# Compare the models on the engineered training data, without hyperparameter search.
compare(dataset=df, param_search=False)

# Preparing data for upload and checking submission score

In [None]:
# Seed for the SMOTE resampling at the end of this cell. It is assigned here
# so the cell runs on a fresh kernel; 0 matches the seed used elsewhere in
# this notebook.
RSEED = 0

# Categorical columns that are one-hot encoded in both files.
cat_columns = [
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'PricingStrategy',
    'BookingType',
    'Interval',
]

# Submission data: load, encode, drop identifier and timestamp columns.
test = pd.read_csv("../data/test_final.csv").set_index("TransactionId")
test_dummies = pd.get_dummies(test, columns=cat_columns, drop_first=True)
test = test_dummies.drop(
    columns=["BatchId", "AccountId", "SubscriptionId", "CustomerId",
             "TransactionStartTime", 'ProviderId-AccountId']
)

# Training data: same steps.
df = pd.read_csv("../data/training_final.csv").set_index("TransactionId")
df_dummies = pd.get_dummies(df, columns=cat_columns, drop_first=True)
df = df_dummies.drop(
    columns=["BatchId", "AccountId", "SubscriptionId", "CustomerId",
             "TransactionStartTime", 'ProviderId-AccountId']
)

# Features and target of the complete training data (no train/test split),
# plus a SMOTE-balanced version of them.
X = df.loc[:, df.columns != 'FraudResult']
y = df["FraudResult"]
sm = SMOTE(random_state=RSEED)
X_balanced, y_balanced = sm.fit_resample(X, y)

In [None]:
# Seed shared by every estimator below that accepts a random_state.
RSEED = 0

# Untuned candidate classifiers.
dtree = DecisionTreeClassifier(random_state=RSEED)
RandomForest = RandomForestClassifier(random_state=RSEED)
AdaBoost = AdaBoostClassifier(random_state=RSEED, n_estimators=100)
# Minkowski metric with p=2, i.e. the Euclidean distance.
KNN_euclidian = KNeighborsClassifier(p=2, metric='minkowski', n_neighbors=5)
logreg = LogisticRegression(random_state=RSEED)

## Re-arranging columns to fit with columns of submission data

In [None]:
# Feature columns, in the order used for both the training features and the
# submission features.
SUBMISSION_COLUMNS = [
 'Amount', 'Value', 'TimeDelta_ProductId', 'TimeDelta_ProductCategory', 'TimeDelta_ChannelId', 'TimeDelta_PricingStrategy',
 'NTransactions_AccountId', 'NTransactions_SubscriptionId', 'NTransactions_CustomerId', 'NTransactions_ProviderId', 'NTransactions_ProductId',
 'NTransactions_ProductCategory', 'NTransactions_ChannelId', 'NTransactions_PricingStrategy', 'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3', 'ProviderId_ProviderId_4',
 'ProviderId_ProviderId_5', 'ProviderId_ProviderId_6', 'ProductId_ProductId_10', 'ProductId_ProductId_11', 'ProductId_ProductId_13', 'ProductId_ProductId_14',
 'ProductId_ProductId_15', 'ProductId_ProductId_16', 'ProductId_ProductId_17', 'ProductId_ProductId_18', 'ProductId_ProductId_19', 'ProductId_ProductId_2',
 'ProductId_ProductId_20', 'ProductId_ProductId_21', 'ProductId_ProductId_22', 'ProductId_ProductId_23', 'ProductId_ProductId_24', 'ProductId_ProductId_25',
 'ProductId_ProductId_26', 'ProductId_ProductId_27', 'ProductId_ProductId_3', 'ProductId_ProductId_4', 'ProductId_ProductId_5', 'ProductId_ProductId_6',
 'ProductId_ProductId_7', 'ProductId_ProductId_8', 'ProductId_ProductId_9', 'ProductCategory_data_bundles', 'ProductCategory_financial_services', 'ProductCategory_movies',
 'ProductCategory_retail', 'ProductCategory_ticket', 'ProductCategory_transport', 'ProductCategory_tv', 'ProductCategory_utility_bill', 'ChannelId_ChannelId_2',
 'ChannelId_ChannelId_3', 'ChannelId_ChannelId_4', 'ChannelId_ChannelId_5', 'PricingStrategy_1', 'PricingStrategy_2', 'PricingStrategy_4', 'BookingType_1.0',
 'Interval_evening', 'Interval_midday', 'Interval_morning', 'Interval_night', 'ProductId_ProductId_12', 'ProductCategory_other']

# reindex adds every column a frame is missing (filled with 0) and puts the
# columns into the shared order. Columns that already exist keep their values,
# and re-running the cell gives the same result.
test = test.reindex(columns=SUBMISSION_COLUMNS, fill_value=0)
X_2 = X.reindex(columns=SUBMISSION_COLUMNS, fill_value=0)

In [None]:
# Create the SMOTE-balanced training split here so this cell does not rely
# on a later cell having been executed first.
X_train, X_test, y_train, y_test, X_train_balanced, y_train_balanced = prepare_data(df)

# Dummy columns the balanced training frame is expected to lack
# (presumably they occur only in the submission data -- TODO confirm).
# A column that already exists is left untouched instead of being zeroed.
for column in ['ProductId_ProductId_17',
               'ProductId_ProductId_18',
               'ProductId_ProductId_25',
               'ProductId_ProductId_26',
               'ChannelId_ChannelId_4',
               'ProductCategory_retail']:
    if column not in X_train_balanced.columns:
        X_train_balanced[column] = 0

In [None]:
# Column order for the balanced training features.
balanced_feature_order = [
 'Amount', 'Value', 'TimeDelta_ProductId', 'TimeDelta_ProductCategory', 'TimeDelta_ChannelId', 'TimeDelta_PricingStrategy',
 'NTransactions_AccountId', 'NTransactions_SubscriptionId', 'NTransactions_CustomerId', 'NTransactions_ProviderId', 'NTransactions_ProductId',
 'NTransactions_ProductCategory', 'NTransactions_ChannelId', 'NTransactions_PricingStrategy', 'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3', 'ProviderId_ProviderId_4',
 'ProviderId_ProviderId_5', 'ProviderId_ProviderId_6', 'ProductId_ProductId_10', 'ProductId_ProductId_11', 'ProductId_ProductId_13', 'ProductId_ProductId_14',
 'ProductId_ProductId_15', 'ProductId_ProductId_16', 'ProductId_ProductId_17', 'ProductId_ProductId_18', 'ProductId_ProductId_19', 'ProductId_ProductId_2',
 'ProductId_ProductId_20', 'ProductId_ProductId_21', 'ProductId_ProductId_22', 'ProductId_ProductId_23', 'ProductId_ProductId_24', 'ProductId_ProductId_25',
 'ProductId_ProductId_26', 'ProductId_ProductId_27', 'ProductId_ProductId_3', 'ProductId_ProductId_4', 'ProductId_ProductId_5', 'ProductId_ProductId_6',
 'ProductId_ProductId_7', 'ProductId_ProductId_8', 'ProductId_ProductId_9', 'ProductCategory_data_bundles', 'ProductCategory_financial_services', 'ProductCategory_movies',
 'ProductCategory_retail', 'ProductCategory_ticket', 'ProductCategory_transport', 'ProductCategory_tv', 'ProductCategory_utility_bill', 'ChannelId_ChannelId_2',
 'ChannelId_ChannelId_3', 'ChannelId_ChannelId_4', 'ChannelId_ChannelId_5', 'PricingStrategy_1', 'PricingStrategy_2', 'PricingStrategy_4', 'BookingType_1.0',
 'Interval_evening', 'Interval_midday', 'Interval_morning', 'Interval_night', 'ProductId_ProductId_12', 'ProductCategory_other']

# Select exactly these columns, in this order, from the balanced training frame.
X_train_balanced2 = X_train_balanced.loc[:, balanced_feature_order]

## Calculating predictions for submission data

In [None]:
# Fit the random forest on the complete, non-resampled training features.
RandomForest.fit(X_2, y)

# Pass the submission features in exactly the columns and order the model
# was fitted on. This also keeps the cell re-runnable when ``test`` carries
# additional columns.
predictions = RandomForest.predict(test[X_2.columns])

In [None]:
# Build the submission as a Series indexed like ``test`` (TransactionId).
# ``test`` itself is left unchanged, so the prediction cell can be re-run.
submission = pd.Series(np.asarray(predictions), index=test.index, name='FraudResult')

In [None]:
# Write the predictions of the random forest trained without SMOTE.
submission.to_csv(path_or_buf='../data/submission_RandomForest_withoutSMOTE.csv')

# Randomized search for Random Forest to find better hyperparameters

In [None]:
start_time = datetime.now()

# Random Forest search space
param_grid_rf = {
    'n_estimators': np.linspace(1000, 2000).astype(int),
    'max_depth': [None] + list(np.linspace(3, 20).astype(int)),
    # 'auto' is left out: for classifiers it was an alias of 'sqrt' and newer
    # scikit-learn releases reject it, so candidates using it would fail to fit.
    'max_features': ['sqrt', None] + list(np.arange(0.5, 1, 0.1)),
    'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]}

# F1 scorer; defined here but not used by the search below, which ranks
# candidates by ROC AUC.
f1 = make_scorer(f1_score)

# Seed the estimator as well, so that the fitted forests (and not only the
# sampled candidates) are reproducible.
estimator_rf = RandomForestClassifier(random_state=RSEED)
rs_rf = RandomizedSearchCV(estimator_rf, param_grid_rf, n_jobs=-1, n_iter=3,
                           cv=3, scoring='roc_auc', verbose=5, random_state=RSEED)
rs_rf.fit(X_balanced, y_balanced)

# The estimator created above, configured with the best parameters found.
best_params_rf = estimator_rf.set_params(**rs_rf.best_params_)

end_time = datetime.now()
print('Duration RF: {}'.format(end_time - start_time))

In [None]:
# Create the split inside this cell so that training and hold-out features
# come from the same ``prepare_data`` call and the cell runs on a fresh kernel.
X_train, X_test, y_train, y_test, X_train_balanced, y_train_balanced = prepare_data(df)

# NOTE(review): the hyperparameters were selected on X_balanced, which is
# built from the complete training data and therefore includes the rows of
# X_test -- the scores below are likely optimistic.
best_params_rf.fit(X_train_balanced, y_train_balanced)
predictions = best_params_rf.predict(X_test)

print("\n Confusion Matrix: \n {}".format(confusion_matrix(y_test, predictions)))
print("\n Classification report: \n {}".format(classification_report(y_test, predictions)))
print("\n Matthew Coefficient: \n {}".format(matthews_corrcoef(y_test, predictions)))

In [None]:
# Stratified split of the engineered training data plus its SMOTE-balanced training part.
(X_train, X_test,
 y_train, y_test,
 X_train_balanced, y_train_balanced) = prepare_data(dataset=df)

# Plotting results

# Visualize Feature Importance

In [None]:
from sklearn.inspection import permutation_importance

# Fit the forest on the balanced training split that belongs to X_test, so
# the model and the hold-out features share the same columns.
RandomForest = RandomForestClassifier(random_state=0)
RandomForest.fit(X_train_balanced, y_train_balanced)

result = permutation_importance(RandomForest, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)

# One row per feature, most important first; only features with a positive
# mean importance are plotted.
feature_importances = pd.DataFrame(
    {"Feature": X_test.columns.to_list(), "Importance": result.importances_mean}
).sort_values(by="Importance", ascending=False)
feature_importances = feature_importances[feature_importances["Importance"] > 0.0000]

plt.figure(figsize=(4, 4), dpi=120)
# A constant hue gives all bars one colour; the resulting legend is hidden below.
g = sns.barplot(data=feature_importances, x="Feature", y="Importance", hue=np.ones(len(feature_importances)))
g.set(title="Feature importance")
plt.xticks(rotation=90)
plt.legend([], [], frameon=False)
plt.savefig('../feature_importances.png', dpi=300, bbox_inches="tight")

# Visualize Confusion Matrix

In [None]:
# Default random forest trained on the SMOTE-balanced split and scored on
# the untouched hold-out part.
X_train, X_test, y_train, y_test, X_train_balanced, y_train_balanced = prepare_data(df)
RandomForest = RandomForestClassifier(random_state=0)
predictions = RandomForest.fit(X_train_balanced, y_train_balanced).predict(X_test)

# Raw counts first, then the row-normalised heat map.
cnf_matrix = confusion_matrix(y_test, predictions)
print(cnf_matrix)
print("\n Confusion Matrix: \n")
plt.figure()
plot_confusion_matrix(
    cnf_matrix,
    classes=['Non-fraud', 'Fraud'],
    normalize=True,
    title='Confusion matrix',
)

print("\n Classification report: \n {}".format(classification_report(y_test, predictions)))

# Visualize Decision Tree

In [None]:
from sklearn import tree

# Fit the untuned decision tree on the balanced training split.
dtree.fit(X_train_balanced, y_train_balanced)

# Draw only the top levels of the tree (max_depth=2) and save the figure as a PDF.
plt.figure(figsize=(12, 6), dpi=200)
tree.plot_tree(
    dtree,
    max_depth=2,
    filled=True,
    feature_names=X_test.columns.tolist(),
    fontsize=8,
    impurity=False,
)
plt.savefig('../decision_tree.pdf', dpi=300, bbox_inches="tight")