### Step 1. Importing Libraries and load dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv("data/fraudTest.csv")

### Step 2. Exploratory Data Analysis (EDA)

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [5]:
df.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [6]:
df.duplicated().sum()

np.int64(0)

In [7]:
df.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

#### the columns and their data type are messy let's clean and structure them properly 

In [8]:
df['Unnamed: 0'].value_counts()

Unnamed: 0
555718    1
0         1
1         1
2         1
3         1
         ..
10        1
11        1
12        1
13        1
14        1
Name: count, Length: 555719, dtype: int64

In [9]:
# these seems to be the same as the index, so we can drop it
df.drop(columns=['Unnamed: 0'], inplace=True)


In [10]:
df['trans_date_trans_time'].value_counts()

trans_date_trans_time
2020-10-05 19:37:49    4
2020-12-13 17:53:47    4
2020-12-17 20:36:39    4
2020-12-19 16:02:22    4
2020-10-12 22:31:39    3
                      ..
2020-12-31 23:44:51    1
2020-12-31 23:45:21    1
2020-12-31 23:46:22    1
2020-12-31 23:46:44    1
2020-06-21 12:20:42    1
Name: count, Length: 544760, dtype: int64

In [11]:
# it is a datetime, so we can convert it and let's extract hour and day of the week as that might be useful

In [12]:
# Convert to datetime
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

# Extract useful features
df['transaction_hour'] = df['trans_date_trans_time'].dt.hour  # this has to be encoded later
df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek  # this has to be encoded later


In [13]:
df['cc_num'].value_counts()

cc_num
6538441737335434       1474
4586810168620942       1466
4745996322265          1462
4587657402165341815    1458
2242542703101233       1428
                       ... 
4883407061576             9
4087542780207162          9
3540416671210051          8
4352307151555405069       7
4295296907373             6
Name: count, Length: 924, dtype: int64

In [14]:
# just like the index, this seems to be a unique identifier for each transaction, so we can drop it
df.drop(columns=['cc_num'], inplace=True)

In [15]:
df['merchant'].nunique()


693

In [16]:
df['category'].value_counts()

category
gas_transport     56370
grocery_pos       52553
home              52345
shopping_pos      49791
kids_pets         48692
shopping_net      41779
entertainment     40104
personal_care     39327
food_dining       39268
health_fitness    36674
misc_pos          34574
misc_net          27367
grocery_net       19426
travel            17449
Name: count, dtype: int64

In [17]:
# merchant and category are important categories, so we can keep them as they are and later we will encode them

In [18]:
df['amt'].describe()


count    555719.000000
mean         69.392810
std         156.745941
min           1.000000
25%           9.630000
50%          47.290000
75%          83.010000
max       22768.110000
Name: amt, dtype: float64

In [19]:
# now for the first name and last name, we can see that they are not very useful as they are not unique and do not provide much information

In [20]:
df.drop(columns=["first"], inplace=True)
df.drop(columns=["last"], inplace=True)

In [21]:
df['gender'].value_counts() #it may help to identify patterns in fraud, so we can keep it

gender
F    304886
M    250833
Name: count, dtype: int64

In [22]:
df['zip'].value_counts()

zip
82514    1589
48088    1518
34112    1495
16114    1474
73754    1470
         ... 
87417      10
73044       9
98118       9
56367       8
52658       6
Name: count, Length: 912, dtype: int64

In [23]:
df['street'].value_counts()

street
444 Robert Mews                  1474
6983 Carrillo Isle               1466
6114 Adams Harbor Suite 096      1462
6296 John Keys Suite 858         1458
43235 Mckenzie Views Apt. 837    1428
                                 ... 
6386 Bailey Hill Apt. 421           9
537 Brian Island                    9
44613 James Turnpike                8
610 Pacheco Parkway                 7
007 Tonya Isle Suite 299            6
Name: count, Length: 924, dtype: int64

In [24]:
df['city'].value_counts()

city
Birmingham     2423
Meridian       2229
Phoenix        2222
Utica          2204
San Antonio    2182
               ... 
Senatobia        10
Seattle           9
Guthrie           9
Rice              8
Wever             6
Name: count, Length: 849, dtype: int64

In [25]:
df['state'].value_counts()

state
TX    40393
NY    35918
PA    34326
CA    24135
OH    20147
MI    19671
IL    18960
FL    18104
AL    17532
MO    16501
MN    13719
AR    13484
NC    12868
SC    12541
VA    12506
KY    12506
WI    12370
IN    11959
IA    11819
OK    11379
GA    11277
MD    11152
WV    10838
NJ    10528
NE    10257
KS     9943
LA     8988
MS     8833
WY     8454
WA     8116
OR     7811
TN     7359
NM     7020
ME     6928
ND     6397
CO     5886
SD     5250
MA     5186
MT     5052
VT     5044
UT     4658
AZ     4592
NH     3449
CT     3277
ID     2490
NV     2451
DC     1517
HI     1090
AK      843
RI      195
Name: count, dtype: int64

In [26]:
# After seeing , street , city , state , zip , The street column is not very useful and the other columns have under
# 1000 unique categories so we will keep for now and see what happens after encoding them 

In [27]:
df.drop(columns=["street"], inplace=True)

In [28]:
# for lat , long  and city_pop
# No changes required at this point
# We'll keep: 'lat', 'long', 'city_pop'
 

In [29]:
df.dtypes


trans_date_trans_time    datetime64[ns]
merchant                         object
category                         object
amt                             float64
gender                           object
city                             object
state                            object
zip                               int64
lat                             float64
long                            float64
city_pop                          int64
job                              object
dob                              object
trans_num                        object
unix_time                         int64
merch_lat                       float64
merch_long                      float64
is_fraud                          int64
transaction_hour                  int32
day_of_week                       int32
dtype: object

In [30]:
df.job.value_counts()

job
Film/video editor                4119
Exhibition designer              3968
Surveyor, land/geomatics         3756
Naval architect                  3750
Designer, ceramics/pottery       3463
                                 ... 
Estate manager/land agent         195
Engineer, civil (consulting)      194
Operational investment banker      11
Software engineer                  11
Engineer, water                     8
Name: count, Length: 478, dtype: int64

In [31]:
# well can be encoded later as it has under 1000 unique categories

In [32]:
df.dob.value_counts()

dob
1977-03-23    2408
1988-09-15    1951
1981-08-29    1935
1997-09-22    1474
1997-07-05    1466
              ... 
1969-11-08       9
1936-12-23       9
1944-05-30       8
1932-05-09       7
1998-08-02       6
Name: count, Length: 910, dtype: int64

In [33]:
# Okay so the dob may not be useful but we can extract the age from it, so let's do that
df['dob'] = pd.to_datetime(df['dob'])

In [34]:
df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days // 365

In [35]:
# now we can drop the dob column
df.drop(columns=['dob'], inplace=True)

In [36]:
# now trans_num and unix_time are not useful as they are unique identifiers for each transaction
df.drop(columns=['trans_num', 'unix_time'], inplace=True)


In [37]:
# also i think let's just drop merchant, city , state and zip as they are not very useful and have many unique categories
df.drop(columns=['merchant','city', 'state', 'zip'], inplace=True)

In [38]:
df.amt.value_counts()

amt
1.10      239
1.14      237
1.08      229
1.31      227
1.03      227
         ... 
850.87      1
516.74      1
255.42      1
302.79      1
362.67      1
Name: count, Length: 37256, dtype: int64

In [39]:
df.head()

Unnamed: 0,trans_date_trans_time,category,amt,gender,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,transaction_hour,day_of_week,age
0,2020-06-21 12:14:25,personal_care,2.86,M,33.9659,-80.9355,333497,Mechanical engineer,33.986391,-81.200714,0,12,6,52
1,2020-06-21 12:14:33,personal_care,29.84,F,40.3207,-110.436,302,"Sales professional, IT",39.450498,-109.960431,0,12,6,30
2,2020-06-21 12:14:53,health_fitness,41.28,F,40.6729,-73.5365,34496,"Librarian, public",40.49581,-74.196111,0,12,6,49
3,2020-06-21 12:15:15,misc_pos,60.05,M,28.5697,-80.8191,54767,Set designer,28.812398,-80.883061,0,12,6,32
4,2020-06-21 12:15:17,travel,3.19,M,44.2529,-85.017,1126,Furniture designer,44.959148,-85.884734,0,12,6,65


In [40]:
df.drop(columns='trans_date_trans_time', inplace=True)

In [41]:
df.head()

Unnamed: 0,category,amt,gender,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,transaction_hour,day_of_week,age
0,personal_care,2.86,M,33.9659,-80.9355,333497,Mechanical engineer,33.986391,-81.200714,0,12,6,52
1,personal_care,29.84,F,40.3207,-110.436,302,"Sales professional, IT",39.450498,-109.960431,0,12,6,30
2,health_fitness,41.28,F,40.6729,-73.5365,34496,"Librarian, public",40.49581,-74.196111,0,12,6,49
3,misc_pos,60.05,M,28.5697,-80.8191,54767,Set designer,28.812398,-80.883061,0,12,6,32
4,travel,3.19,M,44.2529,-85.017,1126,Furniture designer,44.959148,-85.884734,0,12,6,65


In [42]:
#  i want to just make sure columns which are numeric should be numeric so i will convert them
numeric_cols = ['amt', 'lat', 'long', 'city_pop', 'age', 'transaction_hour', 'day_of_week', 'merch_lat', 'merch_long']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [43]:
df.dtypes

category             object
amt                 float64
gender               object
lat                 float64
long                float64
city_pop              int64
job                  object
merch_lat           float64
merch_long          float64
is_fraud              int64
transaction_hour      int32
day_of_week           int32
age                   int64
dtype: object

In [44]:
# let's check the which are the columns are object type
df.select_dtypes(include=['object']).columns

Index(['category', 'gender', 'job'], dtype='object')

#### Now finally all the unwanted columns has been removed and the required columns are structured as well 

### Step 3. Encoding and Feature Engineering

In [49]:
import category_encoders as ce

In [45]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])  # M=1, F=0



In [50]:
df.head()           

Unnamed: 0,category,amt,gender,lat,long,city_pop,job,merch_lat,merch_long,is_fraud,transaction_hour,day_of_week,age
0,personal_care,2.86,1,33.9659,-80.9355,333497,Mechanical engineer,33.986391,-81.200714,0,12,6,52
1,personal_care,29.84,0,40.3207,-110.436,302,"Sales professional, IT",39.450498,-109.960431,0,12,6,30
2,health_fitness,41.28,0,40.6729,-73.5365,34496,"Librarian, public",40.49581,-74.196111,0,12,6,49
3,misc_pos,60.05,1,28.5697,-80.8191,54767,Set designer,28.812398,-80.883061,0,12,6,32
4,travel,3.19,1,44.2529,-85.017,1126,Furniture designer,44.959148,-85.884734,0,12,6,65


In [51]:
df.dtypes

category             object
amt                 float64
gender                int64
lat                 float64
long                float64
city_pop              int64
job                  object
merch_lat           float64
merch_long          float64
is_fraud              int64
transaction_hour      int32
day_of_week           int32
age                   int64
dtype: object

In [52]:
df.shape

(555719, 13)

In [53]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

In [54]:
X.head()

Unnamed: 0,category,amt,gender,lat,long,city_pop,job,merch_lat,merch_long,transaction_hour,day_of_week,age
0,personal_care,2.86,1,33.9659,-80.9355,333497,Mechanical engineer,33.986391,-81.200714,12,6,52
1,personal_care,29.84,0,40.3207,-110.436,302,"Sales professional, IT",39.450498,-109.960431,12,6,30
2,health_fitness,41.28,0,40.6729,-73.5365,34496,"Librarian, public",40.49581,-74.196111,12,6,49
3,misc_pos,60.05,1,28.5697,-80.8191,54767,Set designer,28.812398,-80.883061,12,6,32
4,travel,3.19,1,44.2529,-85.017,1126,Furniture designer,44.959148,-85.884734,12,6,65


In [76]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: is_fraud, dtype: int64

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [57]:
encoder = ce.LeaveOneOutEncoder(return_df=True)
X_train_loo = encoder.fit_transform(X_train, y_train)
X_train_loo.dtypes

category            float64
amt                 float64
gender                int64
lat                 float64
long                float64
city_pop              int64
job                 float64
merch_lat           float64
merch_long          float64
transaction_hour      int32
day_of_week           int32
age                   int64
dtype: object

In [59]:
X_train_loo.describe()

Unnamed: 0,category,amt,gender,lat,long,city_pop,job,merch_lat,merch_long,transaction_hour,day_of_week,age
count,444575.0,444575.0,444575.0,444575.0,444575.0,444575.0,444575.0,444575.0,444575.0,444575.0,444575.0,444575.0
mean,0.00386,69.380533,0.45198,38.544704,-90.239258,87966.26,0.00386,38.54497,-90.238832,12.811712,2.728588,46.425339
std,0.00357,154.071049,0.497689,5.06273,13.726744,299926.9,0.009556,5.097005,13.738705,6.810791,2.17807,17.443916
min,0.001205,1.0,0.0,20.0271,-165.6723,23.0,0.0,19.027422,-166.671575,0.0,0.0,15.0
25%,0.001366,9.64,0.0,34.6902,-96.798,741.0,0.0,34.760641,-96.899752,7.0,1.0,33.0
50%,0.00217,47.33,0.0,39.3716,-87.4769,2408.0,0.0,39.376373,-87.450556,14.0,3.0,44.0
75%,0.004353,83.0,1.0,41.8948,-80.1752,19685.0,0.006024,41.957335,-80.270008,19.0,5.0,58.0
max,0.011773,21437.71,1.0,65.6899,-67.9503,2906700.0,1.0,66.679297,-66.952026,23.0,6.0,96.0


In [60]:
X_test_loo = encoder.transform(X_test)
X_test_loo.describe()

Unnamed: 0,category,amt,gender,lat,long,city_pop,job,merch_lat,merch_long,transaction_hour,day_of_week,age
count,111144.0,111144.0,111144.0,111144.0,111144.0,111144.0,111144.0,111144.0,111144.0,111144.0,111144.0,111144.0
mean,0.003876,69.44192,0.448913,38.537449,-90.199592,89244.4,0.00381,38.534109,-90.201574,12.79846,2.719544,46.417629
std,0.003576,167.018356,0.497386,5.055774,13.701921,302239.1,0.008927,5.091137,13.71053,6.811473,2.18112,17.438097
min,0.001231,1.0,0.0,20.0271,-165.6723,23.0,0.0,19.027849,-166.630922,0.0,0.0,15.0
25%,0.001366,9.57,0.0,34.6689,-96.8094,741.0,0.0,34.728067,-96.925437,7.0,1.0,33.0
50%,0.00217,47.16,0.0,39.3716,-87.4616,2408.0,0.0,39.37743,-87.419015,14.0,2.0,44.0
75%,0.004353,83.07,1.0,41.8948,-80.1752,20328.0,0.005801,41.940383,-80.248195,19.0,5.0,58.0
max,0.011772,22768.11,1.0,65.6899,-67.9503,2906700.0,1.0,66.646051,-66.957364,23.0,6.0,96.0


#### Now we will do Modeling , starting with the most basic to the most advanced 

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [62]:
# train the model 

lr=LogisticRegression(max_iter=1000)

In [63]:
lr.fit(X_train_loo, y_train)

#### Training has been done , let's predict and see the accuracy

In [64]:
# Predictions
y_pred = lr.predict(X_test_loo)

In [65]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9959062117613187


## BOOM! an accuracy of 99.6%

In [66]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    110715
           1       0.00      0.00      0.00       429

    accuracy                           1.00    111144
   macro avg       0.50      0.50      0.50    111144
weighted avg       0.99      1.00      0.99    111144



In [67]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Confusion Matrix:
 [[110689     26]
 [   429      0]]


## This suggests that the Cleaning and Feature engineering we did was spot on!

### let's also check for other models as well

In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [71]:
models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Naive Bayes": GaussianNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [72]:
print("Model Accuracies:\n" + "-"*50)

for name, model in models.items():
    model.fit(X_train_loo, y_train)
    y_pred = model.predict(X_test_loo)

    acc = accuracy_score(y_test, y_pred)
    print(f"{name:<25}: {acc:.4f}")

Model Accuracies:
--------------------------------------------------
Decision Tree            : 0.9971
Random Forest            : 0.9978
Gradient Boosting        : 0.9964
Naive Bayes              : 0.9925
K-Nearest Neighbors      : 0.9963
Support Vector Machine   : 0.9961


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost                  : 0.8092


## After training for every model , we can see that Random Forest is our best Model
## Accuracy - 99.78 % 🚀

#### now let's save the best model

In [73]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_loo, y_train)

In [74]:
best_model = rf_model


In [75]:
import joblib

joblib.dump(best_model, 'rf_best_model.pkl')


['rf_best_model.pkl']