# Credit Card Transactions Fraud Detection

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [2]:
# Load the pre-split training and test transaction files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("fraudTrain.csv", "fraudTest.csv")
)

In [3]:
# Show every column name of the raw training frame as a plain list.
print(list(train_data.columns))

['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud']


In [4]:
# Preview the first five rows of the training set.
train_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [5]:
# Preview the first five rows of the test set.
test_data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [6]:
# Preview the last five rows of the training set.
train_data.tail()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
1296670,1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,entertainment,15.56,Erik,Patterson,M,162 Jessica Row Apt. 072,...,37.7175,-112.4777,258,Geoscientist,1961-11-24,440b587732da4dc1a6395aba5fb41669,1371816728,36.841266,-111.690765,0
1296671,1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,food_dining,51.7,Jeffrey,White,M,8617 Holmes Terrace Suite 651,...,39.2667,-77.5101,100,"Production assistant, television",1979-12-11,278000d2e0d2277d1de2f890067dcc0a,1371816739,38.906881,-78.246528,0
1296672,1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,food_dining,105.93,Christopher,Castaneda,M,1632 Cohen Drive Suite 639,...,32.9396,-105.8189,899,Naval architect,1967-08-30,483f52fe67fabef353d552c1e662974c,1371816752,33.619513,-105.130529,0
1296673,1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",food_dining,74.9,Joseph,Murray,M,42933 Ryan Underpass,...,43.3526,-102.5411,1126,Volunteer coordinator,1980-08-18,d667cdcbadaaed3da3f4020e83591c83,1371816816,42.78894,-103.24116,0
1296674,1296674,2020-06-21 12:13:37,4292902571056973207,"fraud_Langosh, Wintheiser and Hyatt",food_dining,4.3,Jeffrey,Smith,M,135 Joseph Mountains,...,45.8433,-113.8748,218,"Therapist, horticultural",1995-08-16,8f7c8e4ab7f25875d753b422917c98c9,1371816817,46.565983,-114.18611,0


In [7]:
# Preview the last five rows of the test set.
test_data.tail()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
555714,555714,2020-12-31 23:59:07,30560609640617,fraud_Reilly and Sons,health_fitness,43.77,Michael,Olson,M,558 Michael Estates,...,40.4931,-91.8912,519,Town planner,1966-02-13,9b1f753c79894c9f4b71f04581835ada,1388534347,39.946837,-91.333331,0
555715,555715,2020-12-31 23:59:09,3556613125071656,fraud_Hoppe-Parisian,kids_pets,111.84,Jose,Vasquez,M,572 Davis Mountains,...,29.0393,-95.4401,28739,Futures trader,1999-12-27,2090647dac2c89a1d86c514c427f5b91,1388534349,29.661049,-96.186633,0
555716,555716,2020-12-31 23:59:15,6011724471098086,fraud_Rau-Robel,kids_pets,86.88,Ann,Lawson,F,144 Evans Islands Apt. 683,...,46.1966,-118.9017,3684,Musician,1981-11-29,6c5b7c8add471975aa0fec023b2e8408,1388534355,46.65834,-119.715054,0
555717,555717,2020-12-31 23:59:24,4079773899158,fraud_Breitenberg LLC,travel,7.99,Eric,Preston,M,7020 Doyle Stream Apt. 951,...,44.6255,-116.4493,129,Cartographer,1965-12-15,14392d723bb7737606b2700ac791b7aa,1388534364,44.470525,-117.080888,0
555718,555718,2020-12-31 23:59:34,4170689372027579,fraud_Dare-Marvin,entertainment,38.13,Samuel,Frey,M,830 Myers Plaza Apt. 384,...,35.6665,-97.4798,116001,Media buyer,1993-05-10,1765bb45b3aa3224b4cdcb6e7a96cee3,1388534374,36.210097,-97.036372,0


In [8]:
# Summarise the training frame: row count, per-column non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [9]:
# Summarise the test frame: row count, per-column non-null counts and dtypes.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [10]:
# Count missing values per column in the training set.
train_data.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [11]:
# Count missing values per column in the test set.
test_data.isnull().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [12]:
# Class balance of the target in the training set (0 = legitimate, 1 = fraud).
train_data['is_fraud'].value_counts()

is_fraud
0    1289169
1       7506
Name: count, dtype: int64

In [13]:
# Class balance of the target in the test set (0 = legitimate, 1 = fraud).
test_data['is_fraud'].value_counts()

is_fraud
0    553574
1      2145
Name: count, dtype: int64


0 - Normal Transaction

1 - Fraudulent Transaction


In [14]:
# Column dtypes of the training set; object columns need encoding before modelling.
train_data.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [15]:
# Column dtypes of the test set, to compare against the training set's dtypes.
test_data.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [16]:
# Split the training rows by target label so each class can be inspected separately.
is_fraud_train = train_data["is_fraud"]
legit_train = train_data[is_fraud_train == 0]
fraud_train = train_data[is_fraud_train == 1]

In [17]:
# (rows, columns) of the legitimate subset, then of the fraud subset.
for class_subset in (legit_train, fraud_train):
    print(class_subset.shape)

(1289169, 23)
(7506, 23)


In [18]:
# Split the test rows by target label, mirroring the split done for the training set.
is_fraud_test = test_data["is_fraud"]
legit_test = test_data[is_fraud_test == 0]
fraud_test = test_data[is_fraud_test == 1]

In [19]:
# (rows, columns) of the legitimate subset, then of the fraud subset.
for class_subset in (legit_test, fraud_test):
    print(class_subset.shape)

(553574, 23)
(2145, 23)


### Feature Engineering

In [20]:
# Identify the training columns that are not numeric yet; these must be
# encoded or dropped before they can be fed to a model.
non_numeric_dtypes = ['object', 'category', 'datetime']
non_numeric_cols = train_data.select_dtypes(include=non_numeric_dtypes).columns
print("Non-numeric columns:", non_numeric_cols)

Non-numeric columns: Index(['trans_date_trans_time', 'merchant', 'category', 'first', 'last',
       'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num'],
      dtype='object')


In [21]:
# Same check on the test set; note this rebinds `non_numeric_cols`.
non_numeric_dtypes = ['object', 'category', 'datetime']
non_numeric_cols = test_data.select_dtypes(include=non_numeric_dtypes).columns
print("Non-numeric columns:", non_numeric_cols)

Non-numeric columns: Index(['trans_date_trans_time', 'merchant', 'category', 'first', 'last',
       'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num'],
      dtype='object')


In [22]:
# Take the non-numeric columns straight from the frame that is already in
# memory instead of parsing the full CSV from disk a second time with a
# hand-copied column list. select_dtypes returns a new frame, so later
# in-place edits to train_data do not affect this snapshot.
non_numeric_traindata = train_data.select_dtypes(include=['object', 'category', 'datetime'])
non_numeric_traindata.head()

Unnamed: 0,trans_date_trans_time,merchant,category,first,last,gender,street,city,state,job,dob,trans_num
0,2019-01-01 00:00:18,"fraud_Rippin, Kub and Mann",misc_net,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9
1,2019-01-01 00:00:44,"fraud_Heller, Gutmann and Zieme",grocery_pos,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99
2,2019-01-01 00:00:51,fraud_Lind-Buckridge,entertainment,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95
3,2019-01-01 00:01:16,"fraud_Kutch, Hermiston and Farrell",gas_transport,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81
4,2019-01-01 00:03:06,fraud_Keeling-Crist,misc_pos,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46


In [23]:
# Take the non-numeric columns straight from the frame that is already in
# memory instead of parsing the full CSV from disk a second time with a
# hand-copied column list. select_dtypes returns a new frame, so later
# in-place edits to test_data do not affect this snapshot.
non_numeric_testdata = test_data.select_dtypes(include=['object', 'category', 'datetime'])
non_numeric_testdata.head()

Unnamed: 0,trans_date_trans_time,merchant,category,first,last,gender,street,city,state,job,dob,trans_num
0,2020-06-21 12:14:25,fraud_Kirlin and Sons,personal_care,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3
1,2020-06-21 12:14:33,fraud_Sporer-Keebler,personal_care,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7
2,2020-06-21 12:14:53,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be
3,2020-06-21 12:15:15,fraud_Haley Group,misc_pos,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c
4,2020-06-21 12:15:17,fraud_Johnston-Casper,travel,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b


In [24]:
# Cardinality of each non-numeric training column (NaN would count as a label).
train_label_counts = non_numeric_traindata.nunique(dropna=False)
for column_name, n_labels in train_label_counts.items():
    print(column_name, ': ', n_labels, 'labels')

trans_date_trans_time :  1274791 labels
merchant :  693 labels
category :  14 labels
first :  352 labels
last :  481 labels
gender :  2 labels
street :  983 labels
city :  894 labels
state :  51 labels
job :  494 labels
dob :  968 labels
trans_num :  1296675 labels


In [25]:
# Cardinality of each non-numeric test column (NaN would count as a label).
test_label_counts = non_numeric_testdata.nunique(dropna=False)
for column_name, n_labels in test_label_counts.items():
    print(column_name, ': ', n_labels, 'labels')

trans_date_trans_time :  544760 labels
merchant :  693 labels
category :  14 labels
first :  341 labels
last :  471 labels
gender :  2 labels
street :  924 labels
city :  849 labels
state :  50 labels
job :  478 labels
dob :  910 labels
trans_num :  555719 labels


In [26]:
# Remove columns that are not used as features: the row counter, the
# per-row-unique transaction id, the raw unix timestamp, and the card
# holder's name and street address.
train_drop_cols = ['trans_num', 'unix_time', 'Unnamed: 0', 'first', 'last', 'street']
train_data.drop(train_drop_cols, axis=1, inplace=True)


In [27]:
# Remove the same unused columns from the test set so both frames keep an
# identical layout.
test_drop_cols = ['trans_num', 'unix_time', 'Unnamed: 0', 'first', 'last', 'street']
test_data.drop(test_drop_cols, axis=1, inplace=True)


In [28]:
# Confirm which columns remain after the drop.
print(list(train_data.columns))

['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'gender', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'merch_lat', 'merch_long', 'is_fraud']


In [29]:
# One-hot encode gender; drop_first=True drops the first level so the
# two-valued column becomes a single indicator instead of two redundant ones.
train_data = pd.get_dummies(train_data, columns=['gender'], drop_first=True)

In [30]:
# Apply the same gender encoding to the test set.
# NOTE(review): the dummies are derived independently from the test data, so
# this only matches the training layout while both sets contain the same
# gender levels.
test_data = pd.get_dummies(test_data, columns=['gender'], drop_first=True)

In [31]:
# Parse the transaction timestamp, then expose its calendar components as
# separate numeric feature columns.
train_timestamps = pd.to_datetime(train_data['trans_date_trans_time'])
train_data['trans_date_trans_time'] = train_timestamps
for time_part in ('hour', 'day', 'weekday', 'month'):
    train_data[time_part] = getattr(train_timestamps.dt, time_part)


In [32]:
# Parse the transaction timestamp, then expose its calendar components as
# separate numeric feature columns (same derivation as for the training set).
test_timestamps = pd.to_datetime(test_data['trans_date_trans_time'])
test_data['trans_date_trans_time'] = test_timestamps
for time_part in ('hour', 'day', 'weekday', 'month'):
    test_data[time_part] = getattr(test_timestamps.dt, time_part)


In [33]:
# Replace the date of birth with an age feature.
# Age is the difference of calendar years only, so it can be one year higher
# than the holder's true age when the birthday has not yet occurred that year.
train_birth_dates = pd.to_datetime(train_data.pop('dob'))
train_data['age'] = train_data['trans_date_trans_time'].dt.year - train_birth_dates.dt.year


In [34]:
# Replace the date of birth with an age feature.
# Age is the difference of calendar years only, so it can be one year higher
# than the holder's true age when the birthday has not yet occurred that year.
test_birth_dates = pd.to_datetime(test_data.pop('dob'))
test_data['age'] = test_data['trans_date_trans_time'].dt.year - test_birth_dates.dt.year


In [35]:
# The raw timestamp is no longer needed once hour/day/weekday/month/age exist.
del train_data['trans_date_trans_time']

In [36]:
# The raw timestamp is no longer needed once hour/day/weekday/month/age exist.
del test_data['trans_date_trans_time']

In [37]:
# Frequency-encode the high-cardinality categoricals: each label is replaced
# by its relative frequency in the TRAINING data. The per-column mappings are
# kept in `freq_maps` so the test set can be encoded with the same values.
freq_maps = {
    col: train_data[col].value_counts(normalize=True)
    for col in ['city', 'state', 'job', 'merchant', 'category']
}
for col, freq in freq_maps.items():
    # pop() removes the raw column; the encoded one is appended in its place.
    train_data[col + '_encoded'] = train_data.pop(col).map(freq)

In [38]:
# Encode the test set with the frequencies learned from the training data.
# Labels that never appeared in training have no entry and come out as NaN.
for col in ('city', 'state', 'job', 'merchant', 'category'):
    encoded_values = test_data.pop(col).map(freq_maps[col])
    test_data[col + '_encoded'] = encoded_values

In [39]:
# Inspect the training frame after feature engineering and encoding.
train_data.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud,gender_M,hour,day,weekday,month,age,city_encoded,state_encoded,job_encoded,merchant_encoded,category_encoded
0,2703186189652095,4.97,28654,36.0788,-81.1781,3495,36.011293,-82.048315,0,False,0,1,1,1,31,0.001564,0.023341,0.002734,0.000977,0.048807
1,630423337322,107.23,99160,48.8878,-118.2105,149,49.159047,-118.186462,0,False,0,1,1,1,41,0.002734,0.014594,0.003932,0.00193,0.09535
2,38859492057661,220.11,83252,42.1808,-112.262,4154,43.150704,-112.154481,0,True,0,1,1,1,57,0.000388,0.004276,0.000394,0.001461,0.072504
3,3534093764340240,45.0,59632,46.2306,-112.1138,1939,47.034331,-112.561071,0,True,0,1,1,1,52,0.00038,0.009065,0.001951,0.002015,0.101536
4,375534208663984,41.96,24433,38.4207,-79.4629,99,38.674999,-78.632459,0,True,0,1,1,1,33,0.001556,0.022558,0.001556,0.001228,0.06143


In [40]:
# Inspect the test frame after feature engineering and encoding.
test_data.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud,gender_M,hour,day,weekday,month,age,city_encoded,state_encoded,job_encoded,merchant_encoded,category_encoded
0,2291163933867244,2.86,29209,33.9659,-80.9355,333497,33.986391,-81.200714,0,True,12,21,6,6,52,0.001204,0.022511,0.004343,0.001401,0.069993
1,3573030041201292,29.84,84002,40.3207,-110.436,302,39.450498,-109.960431,0,False,12,21,6,6,30,0.001611,0.008251,0.004255,0.001407,0.069993
2,3598215285024754,41.28,11710,40.6729,-73.5365,34496,40.49581,-74.196111,0,False,12,21,6,6,50,0.001987,0.064396,0.004771,0.001323,0.06623
3,3591919803438423,60.05,32780,28.5697,-80.8191,54767,28.812398,-80.883061,0,True,12,21,6,6,33,0.001177,0.032908,0.001192,0.001256,0.06143
4,3526826139003047,3.19,49632,44.2529,-85.017,1126,44.959148,-85.884734,0,True,12,21,6,6,65,0.001569,0.035594,0.001576,0.000641,0.031239


In [41]:
# Check that encoding introduced no missing values in the training set.
train_data.isnull().sum()

cc_num              0
amt                 0
zip                 0
lat                 0
long                0
city_pop            0
merch_lat           0
merch_long          0
is_fraud            0
gender_M            0
hour                0
day                 0
weekday             0
month               0
age                 0
city_encoded        0
state_encoded       0
job_encoded         0
merchant_encoded    0
category_encoded    0
dtype: int64

In [42]:
# Check the test set for missing values; labels absent from the training
# frequency maps show up here as NaN in the *_encoded columns.
test_data.isnull().sum()

cc_num                0
amt                   0
zip                   0
lat                   0
long                  0
city_pop              0
merch_lat             0
merch_long            0
is_fraud              0
gender_M              0
hour                  0
day                   0
weekday               0
month                 0
age                   0
city_encoded        124
state_encoded         0
job_encoded          30
merchant_encoded      0
category_encoded      0
dtype: int64

In [43]:
# Labels that never occurred in the training data have no entry in the
# frequency maps, so .map() left NaN for them; treat them as having a
# training frequency of 0.
# Fill every frequency-encoded column rather than a hand-picked pair, so the
# cell stays correct if a different column picks up unseen labels after a
# data refresh (any NaN left behind would otherwise reach the models).
encoded_cols = [c for c in test_data.columns if c.endswith('_encoded')]
test_data[encoded_cols] = test_data[encoded_cols].fillna(0)

In [44]:
# Inspect the test frame after filling the unseen-label gaps.
test_data.head()

Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud,gender_M,hour,day,weekday,month,age,city_encoded,state_encoded,job_encoded,merchant_encoded,category_encoded
0,2291163933867244,2.86,29209,33.9659,-80.9355,333497,33.986391,-81.200714,0,True,12,21,6,6,52,0.001204,0.022511,0.004343,0.001401,0.069993
1,3573030041201292,29.84,84002,40.3207,-110.436,302,39.450498,-109.960431,0,False,12,21,6,6,30,0.001611,0.008251,0.004255,0.001407,0.069993
2,3598215285024754,41.28,11710,40.6729,-73.5365,34496,40.49581,-74.196111,0,False,12,21,6,6,50,0.001987,0.064396,0.004771,0.001323,0.06623
3,3591919803438423,60.05,32780,28.5697,-80.8191,54767,28.812398,-80.883061,0,True,12,21,6,6,33,0.001177,0.032908,0.001192,0.001256,0.06143
4,3526826139003047,3.19,49632,44.2529,-85.017,1126,44.959148,-85.884734,0,True,12,21,6,6,65,0.001569,0.035594,0.001576,0.000641,0.031239


In [45]:
# Verify that no missing values remain in the test set.
test_data.isnull().sum()

cc_num              0
amt                 0
zip                 0
lat                 0
long                0
city_pop            0
merch_lat           0
merch_long          0
is_fraud            0
gender_M            0
hour                0
day                 0
weekday             0
month               0
age                 0
city_encoded        0
state_encoded       0
job_encoded         0
merchant_encoded    0
category_encoded    0
dtype: int64

In [46]:
# Separate the target from the feature matrix for training.
y_train = train_data["is_fraud"]
X_train = train_data.drop(columns=["is_fraud"])

In [47]:
# Separate the target from the feature matrix for evaluation.
y_test = test_data["is_fraud"]
X_test = test_data.drop(columns=["is_fraud"])

In [48]:
# Standardise the features. The scaler is fitted on the training data only
# and then applied unchanged to the test data, so no test statistics leak
# into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [49]:
# Candidate classifiers, keyed by display name. class_weight="balanced"
# re-weights the classes to compensate for how rare fraud is in the data.
logistic_regression = LogisticRegression(class_weight="balanced", max_iter=1000)
decision_tree = DecisionTreeClassifier(random_state=42, class_weight="balanced")
models = {
    "Logistic Regression": logistic_regression,
    "Decision Tree": decision_tree,
}

In [50]:
# Fit every candidate on the scaled training data, reporting progress by name.
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    print(f"{label} trained.")

Logistic Regression trained.
Decision Tree trained.


In [51]:
# Score every fitted model on the held-out test set.
for label, estimator in models.items():
    # Column 1 of predict_proba is the probability of the positive (fraud) class.
    fraud_scores = estimator.predict_proba(X_test)[:, 1]
    predicted_labels = estimator.predict(X_test)
    print(f"\n=== {label} ===")
    print(classification_report(y_test, predicted_labels))
    print("ROC-AUC:", roc_auc_score(y_test, fraud_scores))



=== Logistic Regression ===
              precision    recall  f1-score   support

           0       1.00      0.96      0.98    553574
           1       0.07      0.73      0.12      2145

    accuracy                           0.96    555719
   macro avg       0.53      0.84      0.55    555719
weighted avg       1.00      0.96      0.98    555719

ROC-AUC: 0.8614135979933506

=== Decision Tree ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    553574
           1       0.79      0.76      0.77      2145

    accuracy                           1.00    555719
   macro avg       0.90      0.88      0.89    555719
weighted avg       1.00      1.00      1.00    555719

ROC-AUC: 0.8772367019945483


### Analysis:
**Logistic Regression** achieved a fraud-class recall of 73% and a ROC-AUC score of 0.86, so it detects most fraud cases. However, it suffers from very low precision (7%), meaning a high number of false positives; its fraud-class F1-score is only 0.12 (overall accuracy 96%).

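**Decision Tree** provided a much better balance between precision (79%) and recall (76%), resulting in the highest F1-score (0.77) for the fraud class. It also showed the higher ROC-AUC score (0.88) and a very high overall accuracy (rounding to 100%), although accuracy is not very informative here because fewer than 0.4% of the test transactions are fraudulent.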


In [52]:
import pickle as pk

In [53]:
### Create a Pickle file using serialization
# Each trained model is written to "<display name>.pkl", lower-cased with
# spaces replaced by underscores (e.g. "logistic_regression.pkl").
for model_name, trained_model in models.items():
    file_name = model_name.lower().replace(" ", "_") + ".pkl"
    # The context manager closes the file even if pk.dump raises, which the
    # manual open()/close() pair did not guarantee.
    with open(file_name, "wb") as model_file:
        pk.dump(trained_model, model_file)


In [54]:
# Persist the fitted scaler alongside the models — presumably so new inputs
# can be standardised the same way outside this notebook (confirm with consumer).
with open("scaler.pkl", "wb") as f:
    pk.dump(scaler, f)

In [55]:
# Persist the training-set frequency maps used for the *_encoded columns —
# presumably so new inputs can be encoded identically (confirm with consumer).
with open('freq_maps.pkl', 'wb') as f:
    pk.dump(freq_maps, f)

In [56]:
import numpy as np

In [57]:
# Smoke test: run every model on the first (already scaled) test row.
# predict() expects a 2-D input, hence the single row is wrapped in a list.
first_scaled_row = X_test[0]
test_input = [first_scaled_row]
for label, estimator in models.items():
    prediction = estimator.predict(test_input)
    print(f"{label} prediction: {prediction}")

Logistic Regression prediction: [0]
Decision Tree prediction: [0]
