In [None]:
import pandas as pd
from pandas_profiling import ProfileReport
import numpy as np

import ipywidgets

import matplotlib
#matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.metrics import f1_score, make_scorer, recall_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

from datetime import datetime


# Create the dataset and the train/test split, including SMOTE oversampling

In [None]:
# Read the training CSV and use the TransactionId column as the index.
df = pd.read_csv("../data/training_final.csv").set_index("TransactionId")
#df = df.drop(["CurrencyCode","CountryCode"], axis=1) # identical value across all entries

In [None]:
# Show every column name of the loaded frame as a plain list.
df.columns.to_list()

In [None]:
# Columns that get one-hot encoded below.
cat_columns = [
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'PricingStrategy',
    'BookingType',
    'Interval',
]

# One-hot encode; drop_first=True omits the first level of each column.
df_dummies = pd.get_dummies(df, columns=cat_columns, drop_first=True)
df_dummies.head()  # NOTE: not the cell's last expression, so this preview is discarded

# Remove the columns listed here to obtain the frame used as the default modelling dataset.
baseline = df_dummies.drop(
    columns=[
        "BatchId",
        "AccountId",
        "SubscriptionId",
        "CustomerId",
        "TransactionStartTime",
        'ProviderId-AccountId',
    ]
)
# First 10000 rows of the baseline frame.
baseline_short = baseline.iloc[:10000]

In [None]:
# Display the baseline frame (encoded features plus the FraudResult target) as the cell output.
baseline

In [None]:
def prepare_data(dataset=baseline, RSEED=0):
    """Split ``dataset`` into train/test parts and SMOTE-balance the training part.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the feature columns and the ``FraudResult`` target.
    RSEED : int, default 0
        Seed used for both the train/test split and SMOTE. It is now honoured;
        previously it was overwritten with 0 inside the function.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_train_balanced, y_train_balanced)``.
        Only the training data is oversampled; the test split is left as is.
    """
    # Features are every column except the target.
    X = dataset.drop(columns="FraudResult")
    y = dataset["FraudResult"]

    # Stratified 70/30 split so both parts keep the original class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=RSEED
    )

    # Balance the training set only, so no synthetic samples end up in the test set.
    #sm = RandomOverSampler(sampling_strategy='minority')
    sm = SMOTE(random_state=RSEED)
    X_train_balanced, y_train_balanced = sm.fit_resample(X_train, y_train)

    return X_train, X_test, y_train, y_test, X_train_balanced, y_train_balanced

In [None]:
# Build the split and the SMOTE-balanced training set from the full baseline frame.
(
    X_train,
    X_test,
    y_train,
    y_test,
    X_train_balanced,
    y_train_balanced,
) = prepare_data(dataset=baseline)


# Get best Hyperparameters for DecisionTree, RandomForest and KNN

In [None]:
#Getting best parameters regarding models
def best_hyperparameter(X_train, y_train, RSEED=0):
    """Search hyperparameters for a decision tree and a random forest.

    The decision tree is tuned with an exhaustive ``GridSearchCV`` and the
    random forest with a 5-iteration, 3-fold ``RandomizedSearchCV``. Both
    searches are scored with F1. The wall-clock duration of each search is
    printed.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to each search's ``fit``.
    RSEED : int, default 0
        Seed for both estimators and for the randomized-search sampling.

    Returns
    -------
    tuple
        ``(decision_tree, random_forest)``: the estimator objects created
        here, configured with the best parameters found. The searches work on
        clones, so the returned estimators are NOT fitted; callers must call
        ``fit`` themselves.
    """
    # Scorer used to rank candidates in both searches.
    f1 = make_scorer(f1_score)

    # --- Decision Tree: exhaustive grid search -------------------------------
    start_time = datetime.now()

    param_grid_dt = [{'criterion': ['entropy', 'gini'],
                      'max_depth': [3, 6, 9],
                      'min_samples_leaf': [2, 5, 10]}]
    estimator_dt = DecisionTreeClassifier(random_state=RSEED)
    rs_dt = GridSearchCV(estimator_dt, param_grid_dt, scoring=f1)
    rs_dt.fit(X_train, y_train)
    # set_params returns the estimator itself, now carrying the best parameters.
    tuned_dt = estimator_dt.set_params(**rs_dt.best_params_)

    print('Duration DT: {}'.format(datetime.now() - start_time))

    # --- Random Forest: randomized search ------------------------------------
    start_time = datetime.now()

    param_grid_rf = {
        'n_estimators': np.linspace(10, 200).astype(int),
        'max_depth': [None] + list(np.linspace(3, 20).astype(int)),
        # 'auto' was dropped from this list: scikit-learn >= 1.3 rejects it, and
        # for classifiers it was only an alias for 'sqrt'.
        'max_features': ['sqrt', None] + list(np.arange(0.5, 1, 0.1)),
        'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
        'min_samples_split': [2, 5, 10],
        'bootstrap': [True, False]}

    estimator_rf = RandomForestClassifier(random_state=RSEED)
    rs_rf = RandomizedSearchCV(estimator_rf, param_grid_rf, n_jobs=-1, n_iter=5,
                               cv=3, verbose=1, scoring=f1, random_state=RSEED)
    rs_rf.fit(X_train, y_train)
    tuned_rf = estimator_rf.set_params(**rs_rf.best_params_)

    print('Duration RF: {}'.format(datetime.now() - start_time))

    # NOTE: a KNN grid search (n_neighbors=[5], metric='minkowski', p in [1, 2],
    # scored with the Matthews correlation coefficient) used to sit here inside a
    # string literal and never ran; it is intentionally not part of the result.

    return tuned_dt, tuned_rf

# Compare different models (DecTree, RF, ...)

In [None]:
def _fit_and_report(model, X_fit, y_fit, X_test, y_test, extra_lines=()):
    """Fit ``model`` on ``(X_fit, y_fit)`` and print its metrics on the test split.

    ``extra_lines`` are printed right after the model header, before the metrics.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_test)

    print("\n\n")
    print("Results for model: \n {}".format(model))
    for line in extra_lines:
        print(line)
    print("\n Confusion Matrix: \n {}".format(confusion_matrix(y_test, predictions)))
    print("\n Classification report: \n {}".format(classification_report(y_test, predictions)))
    print("\n Matthew Coefficient: \n {}".format(matthews_corrcoef(y_test, predictions)))


def compare(dataset=baseline, RSEED=0, param_search=False, smote=True):
    """Train several classifiers on ``dataset`` and print their test-set metrics.

    Every model is first trained on the plain training split. If ``smote`` is
    truthy, the same models are trained again on the SMOTE-balanced training
    split. If ``param_search`` is truthy, a tuned decision tree and a tuned
    random forest (from ``best_hyperparameter``) are evaluated as well.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame passed to ``prepare_data``.
    RSEED : int, default 0
        Seed forwarded to the data split, to every seedable model and to the
        hyperparameter search.
    param_search : bool, default False
        Whether to run the hyperparameter search and report the tuned models.
    smote : bool, default True
        Whether to also report models trained on SMOTE-balanced data. This also
        selects the training data used for the hyperparameter search and for
        fitting the tuned models.

    Returns
    -------
    None
        All results are printed.
    """
    print("Settings: \n")
    print("RSEED: {}".format(RSEED))
    print("Hyperparameter Search:{}".format(param_search))
    print("Smote: {}".format(smote))

    # Split the data; prepare_data always returns the SMOTE-balanced training
    # set as well, whether or not it ends up being used here.
    X_train, X_test, y_train, y_test, X_train_balanced, y_train_balanced = prepare_data(
        dataset, RSEED=RSEED
    )

    # Default-configured models; every seedable one shares RSEED.
    models = [
        DecisionTreeClassifier(random_state=RSEED),
        RandomForestClassifier(random_state=RSEED),
        KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
        AdaBoostClassifier(n_estimators=100, random_state=RSEED),
        LogisticRegression(random_state=RSEED),
    ]

    # Results on the original (imbalanced) training data.
    for model in models:
        _fit_and_report(model, X_train, y_train, X_test, y_test)

    if smote:
        print("All further results are oversampled via SMOTE:\n")

        # Re-fitting the same instances is fine: fit() discards the previous state.
        for model in models:
            _fit_and_report(
                model, X_train_balanced, y_train_balanced, X_test, y_test,
                extra_lines=["Smote: {}".format(smote)],
            )

    if param_search:
        print("Hyperparameter-Search running for Decision Tree and Random Forest.")
        #print("KNN also analyzed with Manhattan and Minkowski Metric...")

        # Search and final fit use the same training data, so smote=False
        # no longer trains the tuned models on the balanced set.
        if smote:
            X_fit, y_fit = X_train_balanced, y_train_balanced
        else:
            X_fit, y_fit = X_train, y_train

        tuned_dt, tuned_rf = best_hyperparameter(X_fit, y_fit, RSEED=RSEED)

        print("All further results are results of best hyperparameter search:\n")

        for model in (tuned_dt, tuned_rf):
            _fit_and_report(
                model, X_fit, y_fit, X_test, y_test,
                extra_lines=[
                    "Smote: {}".format(smote),
                    "Hyperparameter Search:{}".format(param_search),
                ],
            )

    print("\n END")
        

In [None]:
# Full comparison on the baseline frame, including the hyperparameter search.
compare(dataset=baseline, param_search=True)

In [None]:
# Load the test CSV into df2; no cell shown here uses it afterwards.
df2 = pd.read_csv("../data/test.csv")

In [None]:
#baseline.to_csv('../data/training_final.csv', index=False)