In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn')
%matplotlib inline

from statsmodels.stats.outliers_influence import variance_inflation_factor

#models
from sklearn.ensemble import RandomForestClassifier

#transformers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#evaluation metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import auc 
from sklearn.metrics import precision_recall_curve

import ppscore as pps

import re
import Levenshtein

### Reading in the data

In [42]:
df = pd.read_csv('data.csv',encoding='latin-1')

### Fraud Rate of dataset

In [92]:
df.groupby('is_fraud')['purchase_amount'].sum()[1] / df.groupby('is_fraud')['purchase_amount'].sum()[0] * 100

11.50165772696562

### Cleaning operations

In [43]:
#strip whitespace from column names
df = df.rename(columns=lambda x: x.strip())

#strip whitespace from values
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

#replace '-' in annual salary with 0
df['annual_salary'] = df['annual_salary'].replace('-',0)

#convert annual salary to int type
df['annual_salary'] = df['annual_salary'].astype('int64')

#replace NaNs in the IP country column, as missing values here may also be as valuable
df['ip_country'] = df['ip_country'].fillna('N/A')

### Feature Engineering

In [44]:
#Get the purchase amount as a percentage of the reported salary
df['purchase_div_salary'] = (df.query('purchase_amount > 0')['purchase_amount'] / df.query('annual_salary >0')['annual_salary']*100)

#Get the purchase hour from time_of_purchase 
df[['purchase_hour','minute','second']] = df['time_of_purchase'].str.split(':',expand=True)
df.drop(columns=['minute','second'],inplace=True)

#get full name from fname and lname
df['full_name'] = df.fname + ' ' + df.lname



#A method for calculating the similarity of emails to customer names
#This function strips out punctuation and numbers from emails, then calculates the Levenshtein distance between the name and email
#Levenshtein distance is the number of single character changes to get from one string to another

def compare_email_name(email, name):
    name = name
    email = email

    lower_name = str.lower(name)
    lower_email = str.lower(str.split(email,'@')[0])
    nopunc_email = re.sub('[!@#$%^&*()-=+.,]', ' ', lower_email)
    nonum_email = re.sub(r'[0-9]+', '', nopunc_email).strip()

    distance = round(Levenshtein.distance(lower_name,nonum_email) / len(email),1)
    #print(f'The name {lower_name} is {distance} characters different to the email {email}')
    return distance

full_names = []
emails = []
lev_distances = []

for i in range(len(df)):
    full_names.append(df['full_name'][i])
    emails.append(df['email'][i])
    lev_distances.append(compare_email_name(df['email'][i],df['full_name'][i]))

df['lev_distance'] = lev_distances

#fillna with mean for the num_vars df for the purpose of the VIF calculation
df['purchase_div_salary'] = df['purchase_div_salary'].fillna(df['purchase_div_salary'].mean())

In [45]:
#creating a Domain column from the email address column

df['domain'] = df['email'].str.split('@').str[1]
df['domain'] = df['domain'].str.replace('.','')

In [46]:
#converting has_paid_before from f and t to 0 and 1
df['has_paid_before'].replace('f',0, inplace=True)
df['has_paid_before'].replace('t',1, inplace=True)

#converting is_fraud from f and t to 0 and 1
df['is_fraud'].replace('f',0, inplace=True)
df['is_fraud'].replace('t',1, inplace=True)


### Severe class imbalance

After working through a few models and seeing where I can get with the data as is, I'll be revisiting this with a few methods of how to address the class imbalance

### Variance Inflation Factor

In [47]:
def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)


In [48]:
num_vars = df.select_dtypes(include=('int64','float64','int32'))
#adding a constant to num_vars
num_vars = num_vars.assign(const=1)

In [49]:
X = num_vars
calc_vif(X)

Unnamed: 0,variables,VIF
0,transaction_id,1.034264
1,purchase_amount,1.151451
2,customer_id,1.011201
3,age,1.004042
4,has_paid_before,1.305417
5,annual_salary,1.034812
6,is_fraud,1.43593
7,purchase_div_salary,1.136105
8,lev_distance,1.145989
9,const,2874.561104


In [50]:
df.head()

Unnamed: 0,transaction_id,time_of_purchase,purchase_amount,customer_id,fname,lname,age,addr_city,email,has_paid_before,annual_salary,type_of_goods,delivery_city,ip_country,is_fraud,purchase_div_salary,purchase_hour,full_name,lev_distance,domain
0,2001563,16:52:11,15249,3941,Agneta,Jansson,28,Stockholm,agneta.jansson@hotmail.com,1,0,Clothing,Stockholm,SE,0,14.079045,16,Agneta Jansson,0.0,hotmailcom
1,1997663,14:20:45,9334,122158,Agneta,Gustafsson,73,Malmo,agneta_gustafsson@gmail.com,0,340000,Dating,Malmo,SE,0,2.745294,14,Agneta Gustafsson,0.0,gmailcom
2,2013263,12:26:00,8744,25977,Agneta,Olofsson,60,Stockholm,agneta.olofsson@hotmail.com,1,304000,Clothing,Stockholm,SE,0,2.876316,12,Agneta Olofsson,0.0,hotmailcom
3,2009363,15:13:01,6779,31238,Agneta,Petersson,66,Stockholm,agneta.petersson@hotmail.com,1,76000,Electronics,Stockholm,SE,0,8.919737,15,Agneta Petersson,0.0,hotmailcom
4,1988563,16:08:04,4963,37499,Agneta,Karlsson,36,Lund,agn.karl@yahoo.se,0,228000,Tickets,Lund,FI,0,2.176754,16,Agneta Karlsson,0.4,yahoose


## Creating a new DF to train ML models

In [51]:
X_df = df[['purchase_amount',
           'customer_id',
           'age',
           'addr_city',
           'has_paid_before',
           'annual_salary',
           'delivery_city',
           'ip_country',
           'purchase_div_salary',
           'purchase_hour',
           'lev_distance',
           'domain']].copy()

y = df['is_fraud']

In [52]:
X_df['purchase_hour'] = X_df['purchase_hour'].astype('int32')

In [53]:
X_df.head()

Unnamed: 0,purchase_amount,customer_id,age,addr_city,has_paid_before,annual_salary,delivery_city,ip_country,purchase_div_salary,purchase_hour,lev_distance,domain
0,15249,3941,28,Stockholm,1,0,Stockholm,SE,14.079045,16,0.0,hotmailcom
1,9334,122158,73,Malmo,0,340000,Malmo,SE,2.745294,14,0.0,gmailcom
2,8744,25977,60,Stockholm,1,304000,Stockholm,SE,2.876316,12,0.0,hotmailcom
3,6779,31238,66,Stockholm,1,76000,Stockholm,SE,8.919737,15,0.0,hotmailcom
4,4963,37499,36,Lund,0,228000,Lund,FI,2.176754,16,0.4,yahoose


In [54]:
X = pd.get_dummies(X_df, columns=['addr_city','delivery_city','ip_country','domain'], drop_first=True)

# Models!

In [55]:
#creating train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25,
                                                    random_state = 0)

### Random Forest 

In [72]:
forest = RandomForestClassifier(n_estimators=1000)
forest.fit(X_train,y_train)
y_pred = forest.predict(X_test)

In [73]:
probs_forest = forest.predict_proba(X_test)[:, 1]

In [74]:
cm = confusion_matrix(y_test,y_pred)
forest_cm = pd.DataFrame(cm)
forest_cm.rename(index={0:'Not Fraud', 1:'Fraud'}, columns={0:'Predicted Not Fraud', 1:'Predicted Fraud'})

Unnamed: 0,Predicted Not Fraud,Predicted Fraud
Not Fraud,2435,14
Fraud,36,85


In [76]:
cm

array([[2435,   14],
       [  36,   85]], dtype=int64)

In [75]:
print(classification_report(y_test, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9854    0.9943    0.9898      2449
           1     0.8586    0.7025    0.7727       121

    accuracy                         0.9805      2570
   macro avg     0.9220    0.8484    0.8813      2570
weighted avg     0.9795    0.9805    0.9796      2570



In [83]:
true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_test,y_pred).ravel()

In [97]:
print('Fraud Rate after Random Forest: ',round(false_neg/true_neg * 100,2), '%', sep='')

Fraud Rate after Random Forest: 1.48%


In [98]:
print('Fraud Rate after Random Forest: ',round(false_pos/true_neg * 100,2), '%', sep='')

Fraud Rate after Random Forest: 0.57%
