In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from models import dt as model

In [2]:
# Load the transactions CSV, asking pandas to parse the four date columns on read.
df = pd.read_csv(
    'data/transactions_processed_noreversed_method2_no_multi_swipe_3.csv',
    parse_dates=[
        'transactionDateTime',
        'currentExpDate',
        'accountOpenDate',
        'dateOfLastAddressChange',
    ],
)

# Feature Creation

Some ideas from https://github.com/Alisaahy/Fraud-Detection-Project/blob/74f569b2d13c2b408150b4175b3fef2f7cf01dd7/ML_project_Fraud_detection.ipynb 

In [3]:
# Make sure every date column is a datetime before doing datetime arithmetic.
for date_col in (
    'transactionDateTime',
    'currentExpDate',
    'accountOpenDate',
    'dateOfLastAddressChange',
):
    df[date_col] = pd.to_datetime(df[date_col])

# Card identifiers are dropped; rows are then ordered per customer by transaction time.
df = df.drop(columns=['cardCVV', 'cardLast4Digits'])
df = df.sort_values(by=['customerId', 'transactionDateTime'])

# 1 when the acquirer country equals the merchant country, else 0.
# NOTE(review): rows with a missing country on either side presumably compare
# unequal and get 0 -- confirm that is the intended encoding.
df['sameCountry'] = (df['acqCountry'] == df['merchantCountryCode']).astype(int)
df = df.drop(columns=['acqCountry', 'merchantCountryCode'])

# Express the date columns as time spans (in days) relative to the transaction time.
one_day = np.timedelta64(1, 'D')
df['expTime'] = (df['currentExpDate'] - df['transactionDateTime']) / one_day
df['openTime'] = (df['transactionDateTime'] - df['accountOpenDate']) / one_day
df['changeAddTime'] = (df['transactionDateTime'] - df['dateOfLastAddressChange']) / one_day
df = df.drop(columns=['currentExpDate', 'accountOpenDate', 'dateOfLastAddressChange'])


In [4]:
# Display the column names remaining after the feature-creation cell.
df.columns

Index(['customerId', 'creditLimit', 'availableMoney', 'transactionDateTime',
       'transactionAmount', 'merchantName', 'posEntryMode', 'posConditionCode',
       'merchantCategoryCode', 'transactionType', 'currentBalance',
       'cardPresent', 'expirationDateKeyInMatch', 'isFraud', 'sameCountry',
       'expTime', 'openTime', 'changeAddTime'],
      dtype='object')

In [5]:
# Print the per-column non-null counts and dtypes of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712790 entries, 477123 to 95524
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   customerId                712790 non-null  int64         
 1   creditLimit               712790 non-null  int64         
 2   availableMoney            712790 non-null  float64       
 3   transactionDateTime       712790 non-null  datetime64[ns]
 4   transactionAmount         712790 non-null  float64       
 5   merchantName              712790 non-null  object        
 6   posEntryMode              712790 non-null  int64         
 7   posConditionCode          712790 non-null  int64         
 8   merchantCategoryCode      712790 non-null  object        
 9   transactionType           712790 non-null  object        
 10  currentBalance            712790 non-null  float64       
 11  cardPresent               712790 non-null  bool          
 12

In [6]:
# Column groups; each group gets a different encoding below.
catcols = [  # one-hot encoded via get_dummies
    'creditLimit',
    'posEntryMode',
    'posConditionCode',
    'merchantCategoryCode',
    'transactionType',
]
labelencode = ['merchantName']  # replaced by integer labels
booltype = ['cardPresent', 'expirationDateKeyInMatch', 'isFraud']  # cast to int

# Replace each label-encoded column with integer codes (the encoder is refit per column).
le = LabelEncoder()
for col in labelencode:
    df[col] = le.fit_transform(df[col])

# Expand the categorical columns into indicator columns, then drop the raw timestamp.
df = pd.get_dummies(df, columns=catcols).drop(columns=['transactionDateTime'])

# Cast the boolean-type columns to integers.
df[booltype] = df[booltype].astype(int)


#### I use one-hot encoding for all the columns in `catcols`, for the following reasons:
1. `creditLimit` in this case is just a bin. So, if we normalized the raw values, the dominance of a particular credit-limit value might influence the normalization.
2. `posEntryMode`, `posConditionCode`, `merchantCategoryCode` and `transactionType` take only a few distinct values (`cardPresent` and `expirationDateKeyInMatch` are binary and are handled as described in point 4). Again, normalizing the raw values might not be correct, because the distribution of the values would then influence the normalization.
3. `merchantName` has too many unique values for one-hot encoding, which led me to try `LabelEncoder` on it instead.
4. The boolean columns (`cardPresent`, `expirationDateKeyInMatch`, `isFraud`), on the other hand, are converted to 0/1 values.

In [7]:
# Standardize the columns treated as continuous.
# merchantName holds the LabelEncoder integer codes at this point and is scaled
# together with the numeric columns.
cont_col = [
    'availableMoney',
    'transactionAmount',
    'merchantName',
    'currentBalance',
    'expTime',
    'openTime',
    'changeAddTime',
]
scaler = StandardScaler()
df[cont_col] = scaler.fit_transform(df[cont_col])

# Save the table while customerId is still present ...
df.to_csv(
    'data/feature_label_data.csv',
    index=False,
)
# ... then remove customerId from the in-memory frame.
# NOTE(review): because the frame is mutated in place, re-running this cell
# without re-running the earlier ones would rewrite the CSV above without
# customerId and then raise KeyError here -- confirm and restart the kernel
# before re-running.
df.drop(columns=['customerId'], inplace=True)

In [8]:
# Save the current frame (customerId was dropped in the previous cell).
# NOTE(review): the "_DT" suffix presumably refers to the decision-tree model
# imported at the top of the notebook -- confirm.
df.to_csv(
    'data/feature_label_data_DT.csv',
    index=False,
)