# Feature engineering

## Load libraries and data

### Libraries

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

from datetime                import datetime, date, time, timedelta
from plotly.subplots         import make_subplots
from sklearn.metrics         import classification_report, confusion_matrix
from sklearn.tree            import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes     import GaussianNB

%matplotlib inline

### Data

In [None]:
# Load the raw training data, discard the two constant columns, and index the
# frame by TransactionId while keeping a copy of the id as the last regular
# column (the copy is made under a temporary name and renamed afterwards).
df = (
    pd.read_csv("../data/training.csv")
    .drop(columns=["CurrencyCode", "CountryCode"])  # identical value across all entries
    .assign(TransactionId2=lambda frame: frame["TransactionId"])
    .set_index("TransactionId")
    .rename(columns={"TransactionId2": "TransactionId"})
)
df.head()

### Convert TransactionStartTime to timestamps

In [None]:
# Parse the raw strings (layout "%Y-%m-%dT%H:%M:%SZ") into pandas timestamps
parsed_start = pd.to_datetime(df['TransactionStartTime'], format="%Y-%m-%dT%H:%M:%SZ")
df['TransactionStartTime'] = parsed_start

# Optional sanity check; rows are looked up by ID so the result is the same even if df is resorted
#duration = df['TransactionStartTime']["TransactionId_73770"] - df['TransactionStartTime']["TransactionId_76871"]
#duration.seconds # Should be 19

# Put the rows in chronological order
df = df.sort_values(by="TransactionStartTime")

## Features

### Time between transactions, per categorical feature type and feature level

In [None]:
# Categorical features for which a "time since the previous transaction with
# the same feature level" column is built.
features = [
             #'AccountId', # Previous EDA showed poor discriminability between fraudulent and nonfraudulent for commented-out features
             #'SubscriptionId',
             #'CustomerId',
             #'ProviderId',
             'ProductId',
             'ProductCategory',
             'ChannelId',
             'PricingStrategy']

# Names of the columns created below, in creation order
new_feature_names = list()

for i, feat in enumerate(features):
    td_feat = "TimeDelta_" + feat
    new_feature_names.append(td_feat)

    # Difference to the previous row of the same feature level, in the current
    # row order of df. This relies on df already being sorted by
    # TransactionStartTime. The first row of every level (and any row whose
    # level is missing) has no predecessor and yields NaT.
    # A vectorised groupby/diff replaces the former row-by-row loop, which used
    # chained assignment (df[col][label] = ...) and positional [j] lookups on a
    # label-indexed Series -- both unreliable in current pandas.
    deltas = df.groupby(feat, sort=False)["TransactionStartTime"].diff()

    # Rows without a predecessor get a delta of 0, as before.
    # total_seconds() is used instead of .dt.seconds because .dt.seconds only
    # returns the seconds component (0..86399) and silently drops whole days.
    df[td_feat] = deltas.fillna(pd.Timedelta(0)).dt.total_seconds().astype("int64")

    print(f"{i+1} of {len(features)} time delta features computed.")

### Number of transactions, per categorical feature type and feature level

In [None]:
# Categorical features for which a "number of transactions sharing this
# feature level" column is built.
features = [
             'AccountId',
             'SubscriptionId',
             'CustomerId',
             'ProviderId',
             'ProductId',
             'ProductCategory',
             'ChannelId',
             'PricingStrategy']

# Names of the columns created below, in creation order
new_feature_names2 = list()

for feat in features:
    n_feat = "NTransactions_" + feat
    new_feature_names2.append(n_feat)

    # Per-level count of non-null FraudResult values, broadcast back onto every
    # row of that level. transform() keeps the row order and the existing
    # index, and only aggregates the one column that is needed (the previous
    # count-everything-then-merge approach reset the index and dropped rows
    # whose feature value was missing; such rows now get NaN instead).
    df[n_feat] = df.groupby(feat)["FraudResult"].transform("count")

### Booking Type

In [None]:
# Booking type encodes the sign of the amount: 1 for a positive amount,
# 0 for a negative one. Rows matching neither condition (e.g. an amount of
# exactly 0, or a missing amount) are left as NaN.
amount = df['Amount']
booking_flag = np.select([amount > 0, amount < 0], [1.0, 0.0], default=np.nan)

# Store the flag with object dtype
df['BookingType'] = pd.Series(booking_flag, index=df.index).astype('object')

### Day Intervals

In [None]:
# Hour of day (0-23) of each transaction; kept as a helper column here and
# removed again in the next cell.
df['Hour'] = df.TransactionStartTime.dt.hour

# Half-open hour ranges [start, end) and the label of the part of the day
# each one maps to:
#   [0, 8) night, [8, 12) morning, [12, 15) midday,
#   [15, 18) afternoon, [18, 24) evening
interval_edges = [0, 8, 12, 15, 18, 24]
interval_labels = ['night', 'morning', 'midday', 'afternoon', 'evening']

# pd.cut assigns all labels in one pass. The previous approach created a float
# NaN column and then wrote strings into it, which pandas deprecates as an
# incompatible-dtype assignment. The result is converted back to plain object
# dtype so the column holds ordinary strings (NaN where the hour is missing).
df['Interval'] = pd.cut(
    df['Hour'],
    bins=interval_edges,
    labels=interval_labels,
    right=False,
).astype(object)

In [None]:
# The helper column is no longer needed once the day interval has been derived
df = df.drop(columns='Hour')

### ProviderId + AccountId

In [None]:
# Combined key "<ProviderId>_<AccountId>" identifying a provider/account pair
provider_with_sep = df['ProviderId'].add('_')
df['ProviderId-AccountId'] = provider_with_sep.add(df['AccountId'])

# Optional exploration of the combined key per fraud class and provider.
# In the describe() output, "top" is the most common value and "freq" is that value's frequency.
#provider_accid_group = df.groupby(['FraudResult', 'ProviderId']).agg({'ProviderId-AccountId': 'describe'})
#provider_accid_group

### Save dataframe to csv

In [None]:
# Write the engineered dataset to disk; the index is not written to the file
output_path = '../data/training_final.csv'
df.to_csv(output_path, index=False)

In [None]:
# Show the final dataframe as the cell output
df