# Feature Engineering - Fraud Detection Problem

### Load, clean and prepare data

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from imblearn.over_sampling  import SMOTE
from plotly.subplots         import make_subplots
from sklearn.metrics         import classification_report, confusion_matrix
from sklearn.tree            import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes     import GaussianNB

# Shared random seed. Within this notebook it is referenced as `random_state`
# for `train_test_split` in the (currently disabled) baseline-model code.
RSEED = 0

In [None]:
# Load the raw training data.
raw_path = "../data/training.csv"
df = pd.read_csv(raw_path)

# CurrencyCode and CountryCode hold an identical value across all entries,
# so they are removed right away.
constant_columns = ["CurrencyCode", "CountryCode"]
df = df.drop(columns=constant_columns)

# Preview the first rows.
df.head(3)

### Features

In [None]:
# Feature "BookingType" (accounting type): 1 for a positive amount (payment),
# 0 for a negative amount (return?). Rows with Amount == 0 match neither mask
# and are left unassigned.
# Adding this feature improved the F1 score from 0.64 to 0.65.
is_payment = df['Amount'] > 0
is_return = df['Amount'] < 0
df.loc[is_payment, 'BookingType'] = 1
df.loc[is_return, 'BookingType'] = 0

# Cast to object dtype.
df['BookingType'] = df['BookingType'].astype('object')

In [None]:
# Feature "Interval": part of the day in which the transaction started.
# Adding this feature improved the F1 score from 0.65 to 0.74.

# Transform to pandas time series format. The 'T' separator and the 'Z'
# suffix are removed from the timestamp strings before parsing.
df['TransactionStartTime'] = (
    df['TransactionStartTime']
    .str.replace('T', ' ', regex=False)
    .str.replace('Z', '', regex=False)
)
# `infer_datetime_format` is deprecated since pandas 2.0 (a strict version of
# it is the default there), so it is no longer passed.
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# Hour of day of the transaction start.
df['Hour'] = df['TransactionStartTime'].dt.hour

# Map the hour to a daytime label using half-open bins [lower, upper):
#   [0, 8) night, [8, 12) morning, [12, 15) midday,
#   [15, 18) afternoon, [18, 24) evening
# Building the column in one step avoids first creating a float NaN column
# and then writing strings into it, which newer pandas versions reject as an
# incompatible-dtype assignment. The result is cast back to object dtype so
# the column holds plain strings (NaN where the hour is missing).
df['Interval'] = pd.cut(
    df['Hour'],
    bins=[0, 8, 12, 15, 18, 24],
    labels=['night', 'morning', 'midday', 'afternoon', 'evening'],
    right=False,
).astype('object')

# Inspect a random sample of rows.
df.sample(10)

In [None]:
# Row counts for every (BookingType, FraudResult, Interval) combination.
interval_keys = ['BookingType', 'FraudResult', 'Interval']
interval_group = df.groupby(interval_keys).agg({'Interval': 'count'})
interval_group

In [None]:
# Transaction counts per daytime interval, split by FraudResult, on a
# log-scaled y axis; the figure is also written to ../figures/daytime.pdf.
ax = sns.countplot(data=df, x="Interval", hue="FraudResult")
ax.set_yscale('log')
ax.set_xlabel('Interval', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_title('Transaction Counts per Daytime', fontsize=18)
# NOTE(review): the labels are assigned by position, which assumes the hue
# levels are drawn in the order 0, 1 -- confirm.
ax.legend(labels=["non-fraudulent", "fraudulent"])
ax.figure.savefig('../figures/daytime.pdf', dpi=300, bbox_inches="tight")

In [None]:
# Relative frequency of each daytime interval within each FraudResult group.
test = df.groupby('FraudResult')['Interval'].value_counts(normalize=True)
test

In [None]:
# Candidate features derived from the date attributes of the transaction
# timestamp: one new column per datetime attribute.
timestamp_parts = df['TransactionStartTime'].dt
for feature_name, dt_attribute in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Weekday', 'weekday'),
):
    df[feature_name] = getattr(timestamp_parts, dt_attribute)

# Cast Weekday to object dtype.
df['Weekday'] = df['Weekday'].astype('object')

In [None]:
# Weekday counts for the rows with FraudResult == 0.
df_p = df[df['FraudResult'] == 0]
sns.countplot(data=df_p, x="Weekday")

In [None]:
# Weekday counts for the rows with FraudResult == 1.
df_p = df[df['FraudResult'] == 1]
sns.countplot(data=df_p, x="Weekday")

In [None]:
# Weekday runs from Monday = 0 to Sunday = 6; using the day of week as a
# feature reduced the F1 score.
# BookingType: 1 for a positive amount (payment), otherwise 0 (return?).
weekday_keys = ['BookingType', 'FraudResult', 'Weekday']
weekday_group = df.groupby(weekday_keys).agg({'Weekday': 'count'})
print(df['BookingType'].value_counts())
weekday_group

In [None]:
# Column names collected for removal: identifier columns, the raw timestamp
# and the helper date parts derived from it.
drop_columns = [
    "TransactionId",
    "BatchId",
    "AccountId",
    "SubscriptionId",
    "CustomerId",
    "TransactionStartTime",
    "Hour",
    "Year",
    "Month",
    "Day",
]

In [None]:
# Number of rows per PricingStrategy value.
df['PricingStrategy'].value_counts()

In [None]:
# Feature: combination of ProviderId and AccountId, joined with an underscore.
# In the `describe` output, `top` is the most common value and `freq` is that
# value's frequency.
provider_account = df['ProviderId'] + '_' + df['AccountId']
df['ProviderId-AccountId'] = provider_account

provider_accid_group = (
    df.groupby(['FraudResult', 'ProviderId'])
    .agg({'ProviderId-AccountId': 'describe'})
)
# print(df['ProviderId-AccountId'].value_counts().head(10))
provider_accid_group

In [None]:
# Number of rows per AccountId value.
df['AccountId'].value_counts()

In [None]:
# Cross-tabulation: FraudResult (rows) against ChannelId (columns).
pd.crosstab(index=df['FraudResult'], columns=df['ChannelId'])

In [None]:
# Cross-tabulation: FraudResult (rows) against ProviderId (columns).
pd.crosstab(index=df['FraudResult'], columns=df['ProviderId'])

### Baseline Model Evaluation

In [None]:
# Baseline model evaluation -- currently DISABLED: the whole pipeline below is
# wrapped in a bare string literal, so none of it is executed.
# In order, the disabled code would: convert the listed categorical columns
# into dummy variables, drop identifier/timestamp columns plus Interval and
# Weekday, make a stratified train/test split, oversample the training part
# with SMOTE, fit a DecisionTreeClassifier, and print the confusion matrix and
# classification report for the test part.
# NOTE(review): inside the string, SMOTE uses a literal random_state=0 rather
# than RSEED, and DecisionTreeClassifier() is created without a random_state --
# confirm whether that is intended before re-enabling this cell.
'''
# Convert categorical variables into dummy/indicator variables
cat_columns = [
 'ProviderId',
 'ProductCategory',
 'ProductId',
 'ChannelId',
 'PricingStrategy',
 'BookingType',
 #'Interval',
 #'Weekday',
'ProviderId-AccountId'
]

df_dummies = pd.get_dummies(df, columns=cat_columns, drop_first = True)
baseline = df_dummies.drop(["TransactionId", "BatchId", "AccountId", "SubscriptionId", "CustomerId", "TransactionStartTime", "Interval", "Weekday"], axis=1)

# Generate synthetic samples
X = baseline.drop("FraudResult", axis = 1)
y = baseline["FraudResult"]

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=RSEED)

sm = SMOTE(random_state=0)
X_train, y_train = sm.fit_resample(X_train, y_train)

dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
predictions = dtree.predict(X_test)   

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
'''

# No-op statement; presumably placed here so the notebook cell does not echo
# the string literal above as its output -- TODO confirm.
pass