In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score

In [2]:
df_train = pd.read_csv('Credit_Card_Transactions_Fraud_Detection_Dataset/FraudTrain.csv')
df_train.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
to_date_time = ['trans_date_trans_time', 'dob']
for col in to_date_time:
    df_train[col] = pd.to_datetime(df_train[col])
    df_train[col] = df_train[col].apply(lambda x: x.timestamp())

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  float64
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [4]:
df_train = df_train.drop(columns = ['Unnamed: 0', 'first', 'last', 'trans_num', 'merch_long', 'merch_lat', 'unix_time', 'street', 'gender'])
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 14 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  float64
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  object 
 3   category               1296675 non-null  object 
 4   amt                    1296675 non-null  float64
 5   city                   1296675 non-null  object 
 6   state                  1296675 non-null  object 
 7   zip                    1296675 non-null  int64  
 8   lat                    1296675 non-null  float64
 9   long                   1296675 non-null  float64
 10  city_pop               1296675 non-null  int64  
 11  job                    1296675 non-null  object 
 12  dob                    1296675 non-null  float64
 13  is_fraud               1296675 non-null  int64  
dtypes: float64(5), int

In [5]:
label = LabelEncoder()
cols = ['category', 'city', 'state', 'merchant', 'job']
for col in cols:
    df_train[col] = label.fit_transform(df_train[col])
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 14 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  float64
 1   cc_num                 1296675 non-null  int64  
 2   merchant               1296675 non-null  int32  
 3   category               1296675 non-null  int32  
 4   amt                    1296675 non-null  float64
 5   city                   1296675 non-null  int32  
 6   state                  1296675 non-null  int32  
 7   zip                    1296675 non-null  int64  
 8   lat                    1296675 non-null  float64
 9   long                   1296675 non-null  float64
 10  city_pop               1296675 non-null  int64  
 11  job                    1296675 non-null  int32  
 12  dob                    1296675 non-null  float64
 13  is_fraud               1296675 non-null  int64  
dtypes: float64(5), int

In [6]:
df_train.dtypes

trans_date_trans_time    float64
cc_num                     int64
merchant                   int32
category                   int32
amt                      float64
city                       int32
state                      int32
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                        int32
dob                      float64
is_fraud                   int64
dtype: object

In [7]:
df_train.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,city,state,zip,lat,long,city_pop,job,dob,is_fraud
0,1546301000.0,2703186189652095,514,8,4.97,526,27,28654,36.0788,-81.1781,3495,370,573868800.0,0
1,1546301000.0,630423337322,241,4,107.23,612,47,99160,48.8878,-118.2105,149,428,267235200.0,0
2,1546301000.0,38859492057661,390,0,220.11,468,13,83252,42.1808,-112.262,4154,307,-250905600.0,0
3,1546301000.0,3534093764340240,360,2,45.0,84,26,59632,46.2306,-112.1138,1939,328,-93744000.0,0
4,1546301000.0,375534208663984,297,9,41.96,216,45,24433,38.4207,-79.4629,99,116,512352000.0,0


In [8]:
df_train.describe()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,city,state,zip,lat,long,city_pop,job,dob,is_fraud
count,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0,1296675.0
mean,1570107000.0,4.17192e+17,342.8585,6.227787,70.35104,445.2633,26.67728,48800.67,38.53762,-90.22634,88824.44,251.1734,118523000.0,0.005788652
std,12855450.0,1.308806e+18,200.9519,3.913443,160.316,258.6001,14.33098,26893.22,5.075808,13.75908,301956.4,140.1094,548805600.0,0.07586269
min,1546301000.0,60416210000.0,0.0,0.0,1.0,0.0,0.0,1257.0,20.0271,-165.6723,23.0,0.0,-1425514000.0,0.0
25%,1559589000.0,180042900000000.0,165.0,3.0,9.65,224.0,15.0,26237.0,34.6205,-96.798,743.0,131.0,-233107200.0,0.0
50%,1570088000.0,3521417000000000.0,346.0,6.0,47.52,439.0,28.0,48174.0,39.3543,-87.4769,2456.0,251.0,186537600.0,0.0
75%,1580224000.0,4642255000000000.0,514.0,10.0,83.14,677.0,38.0,72042.0,41.9404,-80.158,20328.0,374.0,540950400.0,0.0
max,1592742000.0,4.992346e+18,692.0,13.0,28948.9,893.0,50.0,99783.0,66.6933,-67.9503,2906700.0,493.0,1106957000.0,1.0


In [9]:
X_train = df_train.drop(columns=['is_fraud'])
Y_train = df_train['is_fraud']

In [10]:
X_train

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,city,state,zip,lat,long,city_pop,job,dob
0,1.546301e+09,2703186189652095,514,8,4.97,526,27,28654,36.0788,-81.1781,3495,370,573868800.0
1,1.546301e+09,630423337322,241,4,107.23,612,47,99160,48.8878,-118.2105,149,428,267235200.0
2,1.546301e+09,38859492057661,390,0,220.11,468,13,83252,42.1808,-112.2620,4154,307,-250905600.0
3,1.546301e+09,3534093764340240,360,2,45.00,84,26,59632,46.2306,-112.1138,1939,328,-93744000.0
4,1.546301e+09,375534208663984,297,9,41.96,216,45,24433,38.4207,-79.4629,99,116,512352000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,1.592742e+09,30263540414123,499,0,15.56,330,44,84735,37.7175,-112.4777,258,215,-255744000.0
1296671,1.592742e+09,6011149206456997,2,1,51.70,813,20,21790,39.2667,-77.5101,100,360,313718400.0
1296672,1.592742e+09,3514865930894695,599,1,105.93,346,32,88325,32.9396,-105.8189,899,308,-73872000.0
1296673,1.592742e+09,2720012583106919,509,1,74.90,471,41,57756,43.3526,-102.5411,1126,485,335404800.0


In [11]:
Y_train

0          0
1          0
2          0
3          0
4          0
          ..
1296670    0
1296671    0
1296672    0
1296673    0
1296674    0
Name: is_fraud, Length: 1296675, dtype: int64

In [12]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [13]:
X_train

array([[-1.8518249 , -0.31669237,  0.85165441, ..., -0.28258876,
         0.848099  ,  0.82970364],
       [-1.85182288, -0.31875727, -0.50688018, ..., -0.29366984,
         1.26206149,  0.27097443],
       [-1.85182233, -0.31872806,  0.23459108, ..., -0.28040632,
         0.39845009, -0.67315042],
       ...,
       [ 1.76070971, -0.3160722 ,  1.27464137, ..., -0.29118603,
         0.40558738, -0.3505705 ],
       [ 1.76071469, -0.31667951,  0.82677283, ..., -0.29043427,
         1.6688867 ,  0.39518895],
       [ 1.76071477,  2.96125682,  0.13506474, ..., -0.29344133,
         1.54041558,  1.25729129]])

In [14]:
df_test = pd.read_csv('Credit_Card_Transactions_Fraud_Detection_Dataset/FraudTest.csv')
df_test.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [15]:
df_test.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [16]:
df_test.drop(columns=['Unnamed: 0', 'first', 'last', 'gender', 'street', 'trans_num', 'unix_time', 'merch_lat', 'merch_long'], inplace=True)
df_test.dtypes

trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
is_fraud                   int64
dtype: object

In [17]:
for col in cols:
    label.fit(df_test[col])
    df_test[col] = label.transform(df_test[col])
for col in to_date_time:
    df_test[col] = pd.to_datetime(df_test[col])
    df_test[col] = df_test[col].apply(lambda x: x.timestamp())

In [18]:
df_test.dtypes

trans_date_trans_time    float64
cc_num                     int64
merchant                   int32
category                   int32
amt                      float64
city                       int32
state                      int32
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                        int32
dob                      float64
is_fraud                   int64
dtype: object

In [19]:
df_test.describe()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,city,state,zip,lat,long,city_pop,job,dob,is_fraud
count,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0,555719.0
mean,1601604000.0,4.178387e+17,343.08643,6.237059,69.39281,422.283741,25.771627,48842.628015,38.543253,-90.231325,88221.89,241.75504,121883400.0,0.00386
std,5201104.0,1.309837e+18,201.284993,3.912874,156.745941,244.81295,14.116718,26855.283328,5.061336,13.72178,300390.9,136.327924,550071900.0,0.062008
min,1592742000.0,60416210000.0,0.0,0.0,1.0,0.0,0.0,1257.0,20.0271,-165.6723,23.0,0.0,-1425514000.0,0.0
25%,1596953000.0,180042900000000.0,166.0,3.0,9.63,212.0,14.0,26292.0,34.6689,-96.798,741.0,124.0,-229219200.0,0.0
50%,1601687000.0,3521417000000000.0,346.0,6.0,47.29,420.0,26.0,48174.0,39.3716,-87.4769,2408.0,239.0,186537600.0,0.0
75%,1606792000.0,4635331000000000.0,515.0,10.0,83.01,637.0,37.0,72011.0,41.8948,-80.1752,19685.0,362.0,546134400.0,0.0
max,1609459000.0,4.992346e+18,692.0,13.0,22768.11,848.0,49.0,99921.0,65.6899,-67.9503,2906700.0,477.0,1106957000.0,1.0


In [20]:
X_test = df_test.drop(columns=['is_fraud'])
Y_test = df_test['is_fraud']

In [21]:
X_test

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,city,state,zip,lat,long,city_pop,job,dob
0,1.592742e+09,2291163933867244,319,10,2.86,157,39,29209,33.9659,-80.9355,333497,275,-56419200.0
1,1.592742e+09,3573030041201292,591,10,29.84,16,43,84002,40.3207,-110.4360,302,392,632534400.0
2,1.592742e+09,3598215285024754,611,5,41.28,61,33,11710,40.6729,-73.5365,34496,259,25315200.0
3,1.592742e+09,3591919803438423,222,9,60.05,764,8,32780,28.5697,-80.8191,54767,407,554169600.0
4,1.592742e+09,3526826139003047,292,13,3.19,247,21,49632,44.2529,-85.0170,1126,196,-457315200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,1.609459e+09,30560609640617,507,5,43.77,443,23,63453,40.4931,-91.8912,519,460,-122515200.0
555715,1.609459e+09,3556613125071656,264,7,111.84,401,42,77566,29.0393,-95.4401,28739,198,946252800.0
555716,1.609459e+09,6011724471098086,496,7,86.88,104,46,99323,46.1966,-118.9017,3684,294,375840000.0
555717,1.609459e+09,4079773899158,75,13,7.99,476,12,83643,44.6255,-116.4493,129,58,-127699200.0


In [22]:
X_test= scaler.fit_transform(X_test)
X_test

array([[-1.70387057, -0.31725171, -0.11966343, ...,  0.81652054,
         0.24386046, -0.32414447],
       [-1.70386903, -0.31627306,  1.23165562, ..., -0.2926852 ,
         1.10208599,  0.92833581],
       [-1.70386518, -0.31625383,  1.33101731, ..., -0.17885341,
         0.12649628, -0.17555576],
       ...,
       [ 1.5103518 , -0.31441123,  0.75968757, ..., -0.28142652,
         0.38323042,  0.46167931],
       [ 1.51035353, -0.31899779, -1.3318761 , ..., -0.29326111,
        -1.34789117, -0.45372764],
       [ 1.51035546, -0.31581678, -1.08347186, ...,  0.09247663,
         0.25119572,  1.11823406]])

In [23]:
Y_test 

0         0
1         0
2         0
3         0
4         0
         ..
555714    0
555715    0
555716    0
555717    0
555718    0
Name: is_fraud, Length: 555719, dtype: int64

In [24]:
print(f"shape of X_train & Y_train: {X_train.shape} & {Y_train.shape}")
print(f"shape of X_test & Y_test: {X_test.shape} & {Y_test.shape}")

shape of X_train & Y_train: (1296675, 13) & (1296675,)
shape of X_test & Y_test: (555719, 13) & (555719,)


In [25]:
my_models = {
    "Logistic Regression" : LogisticRegression(random_state=42), 
    "Decision Tree" : DecisionTreeClassifier(random_state=42)
}

parameters = {
    "Logistic Regression" : {
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10],
        'class_weight': [None, 'balanced']
    }, 

    "Decision Tree" : {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 10, 20],
        'min_samples_leaf': [1, 5, 10],
        'max_features': [None, 'sqrt', 'log2'] 
    }
}

In [27]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for my_model_name, model in my_models.items():
    print(f"running for {my_model_name}")
    grid_search = GridSearchCV(estimator=model, param_grid=parameters[my_model_name], cv=3, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, Y_train)

    if my_model_name == 'Logistic Regression':
        best_model_lg = grid_search.best_estimator_

    elif my_model_name == 'Decision Tree':
        best_model_dt = grid_search.best_estimator_

running for Logistic Regression
running for Decision Tree


In [28]:
dt_model = best_model_dt
dt_model.fit(X_train, Y_train)

dt_y_pred = dt_model.predict(X_test)

In [29]:
print('classes in training set \n',Y_train.value_counts())
print('classes in test set \n',Y_test.value_counts())

classes in training set 
 is_fraud
0    1289169
1       7506
Name: count, dtype: int64
classes in test set 
 is_fraud
0    553574
1      2145
Name: count, dtype: int64


considering both the scores the accuracy_score and the roc_auc_score as the dataset contains highly imbalanced class
Almost all the examples has is_fraud value of 0, there are very few examples with is_fraud = 1

In [30]:
print('For the decision tree model:-')
print(f"the accuracy_score is: {accuracy_score(Y_test ,dt_y_pred)}")
print(f"the rou_auc_score : {roc_auc_score(Y_test, dt_y_pred)}")

For the decision tree model:-
the accuracy_score is: 0.997660688225524
the rou_auc_score : 0.8465045723688651


In [34]:
lg_model = best_model_lg
lg_model.fit(X_train, Y_train)
lg_y_pred = lg_model.predict(X_test)

In [35]:
print('Now for logistic regression model:-')
print('the accuracy score is: ', accuracy_score(Y_test, lg_y_pred))
print('the roc_auc_score is: ',roc_auc_score(Y_test, lg_y_pred))

Now for logistic regression model:-
the accuracy score is:  0.947604814663526
the roc_auc_score is:  0.8508686772792384
