Import dependencies

In [3]:
import numpy as np
import pandas as pd

Load the dataset from the CSV file: fraudTrain.csv

In [4]:
# Read the raw transactions CSV (expected in the working directory) into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer='fraudTrain.csv')

In [5]:
# Preview the first five rows of the raw data.
train_data.head(5)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [6]:
# Print the dataset summary: row count, column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

`is_fraud` is the class label of the data: 0 marks a legitimate transaction, while 1 marks a fraudulent transaction.

In [7]:
# Count the missing entries in every column (isna is the canonical name of isnull).
train_data.isna().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

**no missing values**

In [14]:
# Compare the total number of legitimate transactions and fraudulent transactions
# (rows per is_fraud value: 0 = legitimate, 1 = fraudulent).
train_data['is_fraud'].value_counts()

0    1289169
1       7506
Name: is_fraud, dtype: int64

In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_data.describe()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,648337.0,4.17192e+17,70.35104,48800.67,38.53762,-90.22634,88824.44,1349244000.0,38.53734,-90.22646,0.005788652
std,374318.0,1.308806e+18,160.316,26893.22,5.075808,13.75908,301956.4,12841280.0,5.109788,13.77109,0.07586269
min,0.0,60416210000.0,1.0,1257.0,20.0271,-165.6723,23.0,1325376000.0,19.02779,-166.6712,0.0
25%,324168.5,180042900000000.0,9.65,26237.0,34.6205,-96.798,743.0,1338751000.0,34.73357,-96.89728,0.0
50%,648337.0,3521417000000000.0,47.52,48174.0,39.3543,-87.4769,2456.0,1349250000.0,39.36568,-87.43839,0.0
75%,972505.5,4642255000000000.0,83.14,72042.0,41.9404,-80.158,20328.0,1359385000.0,41.95716,-80.2368,0.0
max,1296674.0,4.992346e+18,28948.9,99783.0,66.6933,-67.9503,2906700.0,1371817000.0,67.51027,-66.9509,1.0


So, there are 1,289,169 legitimate transactions but only 7,506 fraudulent ones, which account for only ~0.6% of all transactions.
The dataset is drastically imbalanced.

In [15]:
# Plot a donut chart depicting the class imbalance in the data.
import plotly.graph_objects as go

# Display name and slice colour per class code. Looking these up from the
# value_counts index ties each label/colour to its own class instead of relying
# on the frequency-sorted order that value_counts happens to return.
class_names = {0: 'Legit', 1: 'Fraud'}
class_colors = {0: 'orange', 1: 'white'}

# normalize=True returns each class's share of all rows directly.
values = train_data['is_fraud'].value_counts(normalize=True)
labels = [class_names[code] for code in values.index]
colors = [class_colors[code] for code in values.index]

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='white', width=0.1)))
fig.update_layout(
    title_text="Credit Card Fraud",
    title_font_color="white",
    legend_title_font_color="yellow",
    paper_bgcolor="black",
    plot_bgcolor='black',
    font_color="white",
)
fig.show()

In [18]:
# Split the training data into one frame per class label.
l_train_data = train_data.loc[train_data['is_fraud'].eq(0)]  # legitimate transactions
f_train_data = train_data.loc[train_data['is_fraud'].eq(1)]  # fraudulent transactions

In [17]:
# Print the summary of the two newly formed per-class DataFrames:
# legitimate transactions first, then fraudulent ones.
l_train_data.info()
f_train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1289169 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1289169 non-null  int64  
 1   trans_date_trans_time  1289169 non-null  object 
 2   cc_num                 1289169 non-null  int64  
 3   merchant               1289169 non-null  object 
 4   category               1289169 non-null  object 
 5   amt                    1289169 non-null  float64
 6   first                  1289169 non-null  object 
 7   last                   1289169 non-null  object 
 8   gender                 1289169 non-null  object 
 9   street                 1289169 non-null  object 
 10  city                   1289169 non-null  object 
 11  state                  1289169 non-null  object 
 12  zip                    1289169 non-null  int64  
 13  lat                    1289169 non-null  float64
 14  long              

In [20]:
# Build a de-duplicated frame without touching train_data: drop_duplicates()
# returns a new DataFrame, so no explicit copy or in-place mutation is needed.
train_data_copy = train_data.drop_duplicates()
print("Duplicated values dropped successfully")

Duplicated values dropped successfully


In [21]:
# Summarize the de-duplicated frame; comparing its entry count with the
# original frame's shows whether any duplicate rows were removed.
train_data_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

There are no duplicates in this dataset.

In [23]:
# Normalize a sample column, merch_lat.
# The column is passed as a single row, so the whole vector is rescaled to
# unit L2 norm (each value is divided by the column's Euclidean length).
# NOTE(review): if per-value feature scaling was intended, a scaler such as
# MinMaxScaler/StandardScaler may be what is wanted — confirm.
from sklearn import preprocessing
merch_lat = train_data['merch_lat'].tolist()
normalized_merch_lat = preprocessing.normalize([merch_lat], norm='l2', axis=1)
print(normalized_merch_lat)

[[0.0008135  0.00111051 0.00097478 ... 0.00075947 0.00096661 0.00105193]]


In [24]:
# Summary statistics of the transaction amount over all transactions.
train_data['amt'].describe()

count    1.296675e+06
mean     7.035104e+01
std      1.603160e+02
min      1.000000e+00
25%      9.650000e+00
50%      4.752000e+01
75%      8.314000e+01
max      2.894890e+04
Name: amt, dtype: float64

In [26]:
# Amount statistics restricted to the legitimate transactions.
print("Describe amount in legit dataframe")
l_train_data.loc[:, 'amt'].describe()

Describe amount in legit dataframe


count    1.289169e+06
mean     6.766711e+01
std      1.540080e+02
min      1.000000e+00
25%      9.610000e+00
50%      4.728000e+01
75%      8.254000e+01
max      2.894890e+04
Name: amt, dtype: float64

In [27]:
# Amount statistics restricted to the fraudulent transactions.
print("Describe amount in fraud dataframe")
f_train_data.loc[:, 'amt'].describe()

Describe amount in fraud dataframe


count    7506.000000
mean      531.320092
std       390.560070
min         1.060000
25%       245.662500
50%       396.505000
75%       900.875000
max      1376.040000
Name: amt, dtype: float64

In [32]:
# Show (rows, columns) of the legit frame, then of the fraud frame, one per line.
print(l_train_data.shape, f_train_data.shape, sep="\n")

(1289169, 23)
(7506, 23)


Under-sampling

In [33]:
# Under-sample the legitimate class down to the size of the fraud class.
# The sample size is derived from f_train_data rather than hard-coded, and a
# fixed random_state makes the drawn rows reproducible across kernel restarts.
l_train_data_sample = l_train_data.sample(n=len(f_train_data), random_state=42)

In [34]:
# Preview the first five rows of the under-sampled legitimate transactions.
l_train_data_sample.head(5)

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
70154,70154,2019-02-11 08:00:35,2286236465059468,"fraud_Huel, Hammes and Witting",grocery_pos,149.61,Morgan,Murray,F,2788 Brittney Island,...,38.5319,-93.9221,467,Agricultural consultant,1950-05-27,c81c08727d3985cba9e0d9e31e22e8a5,1328947235,38.988585,-94.096963,0
720897,720897,2019-11-03 22:12:31,4509922033272157,fraud_Daugherty LLC,kids_pets,53.29,Monica,West,F,22084 Smith Roads Suite 776,...,43.2893,-97.1904,811,Neurosurgeon,1972-03-28,bd48c2966be9f5b4110e9f7f99332be3,1351980751,43.103151,-97.929334,0
944693,944693,2020-01-12 18:13:53,3575540972310993,fraud_Abshire PLC,entertainment,118.82,Rachel,Villarreal,F,250 Carrie Throughway,...,34.3396,-89.5736,4198,Curator,2001-06-22,8d16322bbd07068d6e31b27e7a8d7829,1358014433,33.82233,-89.406917,0
711925,711925,2019-10-31 06:58:33,2266735643685262,fraud_O'Keefe-Hudson,grocery_pos,28.57,Carlos,Chung,M,8957 Russell Key,...,34.4959,-86.259,5901,Curator,1972-07-25,acd9d2c71ea65bebb38d7cbdf9751469,1351666713,35.212151,-86.316608,0
997995,997995,2020-02-12 22:42:13,36485887555770,fraud_Brown PLC,misc_net,69.14,Michael,Gross,M,230 Ryan Tunnel Apt. 025,...,40.4971,-82.8342,267,Facilities manager,2005-01-29,f6113959b7113fd2a5a188ecfb4797b5,1360708933,39.84428,-81.975934,0


In [35]:
# Confirm the sampled legit frame's (rows, columns).
print(l_train_data_sample.shape)

(7506, 23)


In [36]:
# Stack the sampled legit rows on top of the fraud rows to form the balanced
# dataset. pd.concat joins row-wise by default (axis=0); axis=1 would join
# column-wise instead.
sample_train_data = pd.concat(objs=[l_train_data_sample, f_train_data])

In [38]:
# Check the class balance of the combined dataset (rows per is_fraud value).
sample_train_data['is_fraud'].value_counts()

0    7506
1    7506
Name: is_fraud, dtype: int64

In [39]:
# Preview the first rows of the combined dataset.
sample_train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
70154,70154,2019-02-11 08:00:35,2286236465059468,"fraud_Huel, Hammes and Witting",grocery_pos,149.61,Morgan,Murray,F,2788 Brittney Island,...,38.5319,-93.9221,467,Agricultural consultant,1950-05-27,c81c08727d3985cba9e0d9e31e22e8a5,1328947235,38.988585,-94.096963,0
720897,720897,2019-11-03 22:12:31,4509922033272157,fraud_Daugherty LLC,kids_pets,53.29,Monica,West,F,22084 Smith Roads Suite 776,...,43.2893,-97.1904,811,Neurosurgeon,1972-03-28,bd48c2966be9f5b4110e9f7f99332be3,1351980751,43.103151,-97.929334,0
944693,944693,2020-01-12 18:13:53,3575540972310993,fraud_Abshire PLC,entertainment,118.82,Rachel,Villarreal,F,250 Carrie Throughway,...,34.3396,-89.5736,4198,Curator,2001-06-22,8d16322bbd07068d6e31b27e7a8d7829,1358014433,33.82233,-89.406917,0
711925,711925,2019-10-31 06:58:33,2266735643685262,fraud_O'Keefe-Hudson,grocery_pos,28.57,Carlos,Chung,M,8957 Russell Key,...,34.4959,-86.259,5901,Curator,1972-07-25,acd9d2c71ea65bebb38d7cbdf9751469,1351666713,35.212151,-86.316608,0
997995,997995,2020-02-12 22:42:13,36485887555770,fraud_Brown PLC,misc_net,69.14,Michael,Gross,M,230 Ryan Tunnel Apt. 025,...,40.4971,-82.8342,267,Facilities manager,2005-01-29,f6113959b7113fd2a5a188ecfb4797b5,1360708933,39.84428,-81.975934,0


In [41]:
# Parse the transaction timestamp and the date of birth into datetime64 columns,
# and derive the transaction's calendar date.
sample_train_data['trans_date_trans_time'] = pd.to_datetime(sample_train_data['trans_date_trans_time'])
# dt.normalize() resets the time-of-day to midnight while keeping the datetime
# dtype, replacing the former strftime -> to_datetime string round-trip.
sample_train_data['trans_date'] = sample_train_data['trans_date_trans_time'].dt.normalize()

sample_train_data['dob'] = pd.to_datetime(sample_train_data['dob'])

In [42]:
# Preview the data after the date columns were parsed and trans_date was added.
sample_train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,trans_date
70154,70154,2019-02-11 08:00:35,2286236465059468,"fraud_Huel, Hammes and Witting",grocery_pos,149.61,Morgan,Murray,F,2788 Brittney Island,...,-93.9221,467,Agricultural consultant,1950-05-27,c81c08727d3985cba9e0d9e31e22e8a5,1328947235,38.988585,-94.096963,0,2019-02-11
720897,720897,2019-11-03 22:12:31,4509922033272157,fraud_Daugherty LLC,kids_pets,53.29,Monica,West,F,22084 Smith Roads Suite 776,...,-97.1904,811,Neurosurgeon,1972-03-28,bd48c2966be9f5b4110e9f7f99332be3,1351980751,43.103151,-97.929334,0,2019-11-03
944693,944693,2020-01-12 18:13:53,3575540972310993,fraud_Abshire PLC,entertainment,118.82,Rachel,Villarreal,F,250 Carrie Throughway,...,-89.5736,4198,Curator,2001-06-22,8d16322bbd07068d6e31b27e7a8d7829,1358014433,33.82233,-89.406917,0,2020-01-12
711925,711925,2019-10-31 06:58:33,2266735643685262,fraud_O'Keefe-Hudson,grocery_pos,28.57,Carlos,Chung,M,8957 Russell Key,...,-86.259,5901,Curator,1972-07-25,acd9d2c71ea65bebb38d7cbdf9751469,1351666713,35.212151,-86.316608,0,2019-10-31
997995,997995,2020-02-12 22:42:13,36485887555770,fraud_Brown PLC,misc_net,69.14,Michael,Gross,M,230 Ryan Tunnel Apt. 025,...,-82.8342,267,Facilities manager,2005-01-29,f6113959b7113fd2a5a188ecfb4797b5,1360708933,39.84428,-81.975934,0,2020-02-12


In [43]:
# Age of the card holder on the transaction date, in whole years (float64).
# astype('timedelta64[Y]') is rejected by pandas >= 2.0, which only supports
# the s/ms/us/ns timedelta resolutions, so the elapsed days are converted
# explicitly: floor-divide by the mean Gregorian year length (365.2425 days).
sample_train_data['age'] = (
    (sample_train_data['trans_date'] - sample_train_data['dob']).dt.days // 365.2425
)

sample_train_data['age']

70154      68.0
720897     47.0
944693     18.0
711925     47.0
997995     15.0
           ... 
1295399    34.0
1295491    34.0
1295532    26.0
1295666    50.0
1295733    26.0
Name: age, Length: 15012, dtype: float64

In [44]:
# Calendar month and year of each transaction, read through the .dt accessor
# of the already-parsed trans_date column.
sample_train_data['trans_month'] = sample_train_data['trans_date'].dt.month
sample_train_data['trans_year'] = sample_train_data['trans_date'].dt.year

sample_train_data['trans_month'],sample_train_data['trans_year']

(70154       2
 720897     11
 944693      1
 711925     10
 997995      2
            ..
 1295399     6
 1295491     6
 1295532     6
 1295666     6
 1295733     6
 Name: trans_month, Length: 15012, dtype: int64,
 70154      2019
 720897     2019
 944693     2020
 711925     2019
 997995     2020
            ... 
 1295399    2020
 1295491    2020
 1295532    2020
 1295666    2020
 1295733    2020
 Name: trans_year, Length: 15012, dtype: int64)

In [45]:
# Offset between merchant and home location, per coordinate axis:
# difference rounded to 3 decimals, then made non-negative. These are
# differences of the raw lat/long values, not a geographic distance.
sample_train_data['latitudinal_distance'] = (
    sample_train_data['merch_lat'].sub(sample_train_data['lat']).round(3).abs()
)
sample_train_data['longitudinal_distance'] = (
    sample_train_data['merch_long'].sub(sample_train_data['long']).round(3).abs()
)

# Only the last bare expression of a cell is rendered, so just the
# longitudinal column appears in the output.
sample_train_data['latitudinal_distance']
sample_train_data['longitudinal_distance']

70154      0.175
720897     0.739
944693     0.167
711925     0.058
997995     0.858
           ...  
1295399    0.431
1295491    0.547
1295532    0.459
1295666    0.744
1295733    0.970
Name: longitudinal_distance, Length: 15012, dtype: float64

In [46]:
# Convert gender to a numerical column: 1 for "M", 0 for anything else.
# Matching an already-encoded 1 as well as "M" keeps the cell idempotent:
# re-running it no longer turns every row into 0. isin() is vectorised, so it
# also replaces the former row-by-row apply(lambda ...).
sample_train_data['gender'] = sample_train_data['gender'].isin(['M', 1]).astype('int64')

In [47]:
# Inspect the encoded gender column (1 = "M", 0 = otherwise).
sample_train_data.gender

70154      0
720897     0
944693     0
711925     1
997995     1
          ..
1295399    0
1295491    0
1295532    1
1295666    0
1295733    1
Name: gender, Length: 15012, dtype: int64

In [48]:
# Preview the data with the engineered columns (age, trans_month, trans_year,
# latitudinal_distance, longitudinal_distance) and the encoded gender.
sample_train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,unix_time,merch_lat,merch_long,is_fraud,trans_date,age,trans_month,trans_year,latitudinal_distance,longitudinal_distance
70154,70154,2019-02-11 08:00:35,2286236465059468,"fraud_Huel, Hammes and Witting",grocery_pos,149.61,Morgan,Murray,0,2788 Brittney Island,...,1328947235,38.988585,-94.096963,0,2019-02-11,68.0,2,2019,0.457,0.175
720897,720897,2019-11-03 22:12:31,4509922033272157,fraud_Daugherty LLC,kids_pets,53.29,Monica,West,0,22084 Smith Roads Suite 776,...,1351980751,43.103151,-97.929334,0,2019-11-03,47.0,11,2019,0.186,0.739
944693,944693,2020-01-12 18:13:53,3575540972310993,fraud_Abshire PLC,entertainment,118.82,Rachel,Villarreal,0,250 Carrie Throughway,...,1358014433,33.82233,-89.406917,0,2020-01-12,18.0,1,2020,0.517,0.167
711925,711925,2019-10-31 06:58:33,2266735643685262,fraud_O'Keefe-Hudson,grocery_pos,28.57,Carlos,Chung,1,8957 Russell Key,...,1351666713,35.212151,-86.316608,0,2019-10-31,47.0,10,2019,0.716,0.058
997995,997995,2020-02-12 22:42:13,36485887555770,fraud_Brown PLC,misc_net,69.14,Michael,Gross,1,230 Ryan Tunnel Apt. 025,...,1360708933,39.84428,-81.975934,0,2020-02-12,15.0,2,2020,0.653,0.858
