In [70]:
# Credit card fraud detection using random forest classifier

In [71]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
pd.set_option('display.max_columns', None)


In [72]:
df = pd.read_csv("fraudTrain.csv")
df.head()


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [73]:
df.shape

(1296675, 23)

In [74]:
df.dtypes

Unnamed: 0                 int64
trans_date_trans_time     object
cc_num                     int64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [75]:
df.isnull().sum()


Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [76]:
# 1- fraud , 0- legitimate
fraud_count = df["is_fraud"].value_counts()
fraud_rate = 100*fraud_count/df.shape[0]
fraud_data = pd.concat([fraud_count, fraud_rate],axis=1).reset_index()
fraud_data.columns = ['class','count','percentage']
 
fraud_data

Unnamed: 0,class,count,percentage
0,0,1289169,99.421135
1,1,7506,0.578865


In [77]:
# as the dataset is highly unstable we will balance it 
df_fraud = df[df['is_fraud']==1]
df_not_fraud = df[df['is_fraud']==0]
print(df_fraud.shape)
print(df_not_fraud.shape)

(7506, 23)
(1289169, 23)


In [78]:
df_not_fraud.amt.describe()


count    1.289169e+06
mean     6.766711e+01
std      1.540080e+02
min      1.000000e+00
25%      9.610000e+00
50%      4.728000e+01
75%      8.254000e+01
max      2.894890e+04
Name: amt, dtype: float64

In [79]:
df_not_fraud.amt.describe()

count    1.289169e+06
mean     6.766711e+01
std      1.540080e+02
min      1.000000e+00
25%      9.610000e+00
50%      4.728000e+01
75%      8.254000e+01
max      2.894890e+04
Name: amt, dtype: float64

Built a sample datasets containing sample no of values for fraud and legal transaction

In [80]:
legit_sample= df_not_fraud.sample(n=7506)
new_df = pd.concat([legit_sample, df_fraud] , axis=0)
new_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
972079,972079,2020-01-28 05:20:56,4998109455173950,fraud_Jaskolski-Vandervort,misc_net,2.09,Mary,Pruitt,F,2819 Luke Greens Suite 563,Kansas City,MO,64114,38.9621,-94.5959,545147,Counsellor,1987-11-18,9845676936859dde6542d69bff58565c,1359350456,38.048927,-94.593717,0
1009036,1009036,2020-02-19 07:06:27,4393520897625,fraud_Smitham-Boehm,grocery_net,42.13,Charles,Rodriguez,M,240 Tracy Forges,Easton,KS,66020,39.3391,-95.0999,1442,Air broker,1982-05-20,df6c5bbeb47e45efc0124637ea978bf5,1361257587,39.277697,-95.570665,0
153785,153785,2019-03-23 19:34:40,2233882705243596,fraud_Auer LLC,personal_care,14.31,Jamie,Robinson,F,67089 Caitlin Meadow Apt. 905,Sturgis,MS,39769,33.357,-89.0473,1923,Medical physicist,1960-01-16,8799385d174104b1e689673952f1abde,1332531280,33.381773,-88.90296,0
869363,869363,2019-12-20 00:29:07,3565423334076143,fraud_Hermann and Sons,shopping_pos,9.47,Nathan,Thomas,M,4923 Campbell Pines Suite 717,Carlisle,IN,47838,38.9763,-87.3667,4081,Energy engineer,1938-03-15,8f546d4ea4ba3b73e7e63e4e48e8b81d,1355963347,39.559486,-86.88999,0
1168060,1168060,2020-05-02 22:39:35,375534208663984,fraud_O'Hara-Wilderman,food_dining,53.1,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,c030c0afbbe341292ef0f31153d36bbf,1367534375,38.981525,-79.736159,0


In [81]:
new_df['is_fraud'].value_counts()

is_fraud
0    7506
1    7506
Name: count, dtype: int64

In [82]:
new_df=new_df.drop(['trans_date_trans_time','merchant','category','first','last','gender','street','city','state' ,'city_pop','job','dob','trans_num' ],axis=1)
new_df.groupby('is_fraud').mean()


Unnamed: 0_level_0,Unnamed: 0,cc_num,amt,zip,lat,long,unix_time,merch_lat,merch_long
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,646228.029843,4.015152e+17,69.008359,49517.993738,38.611351,-90.582971,1349168000.0,38.615721,-90.582495
1,624949.724354,4.003577e+17,531.320092,48038.714229,38.663609,-89.916041,1348389000.0,38.653901,-89.915808


In [83]:
# Splitting data into feartures and targets
x_train= new_df.drop(columns='is_fraud',axis=1)
y_train=new_df['is_fraud']

In [84]:
print(x_train)

         Unnamed: 0            cc_num      amt    zip      lat      long  \
972079       972079  4998109455173950     2.09  64114  38.9621  -94.5959   
1009036     1009036     4393520897625    42.13  66020  39.3391  -95.0999   
153785       153785  2233882705243596    14.31  39769  33.3570  -89.0473   
869363       869363  3565423334076143     9.47  47838  38.9763  -87.3667   
1168060     1168060   375534208663984    53.10  24433  38.4207  -79.4629   
...             ...               ...      ...    ...      ...       ...   
1295399     1295399  3524574586339330   977.01  32960  27.6330  -80.4031   
1295491     1295491  3524574586339330  1210.91  32960  27.6330  -80.4031   
1295532     1295532  4005676619255478    10.24  70726  30.4590  -90.9027   
1295666     1295666  3560725013359375    21.69  79759  31.8599 -102.7413   
1295733     1295733  4005676619255478    10.20  70726  30.4590  -90.9027   

          unix_time  merch_lat  merch_long  
972079   1359350456  38.048927  -94.593717

In [85]:
print(y_train)

972079     0
1009036    0
153785     0
869363     0
1168060    0
          ..
1295399    1
1295491    1
1295532    1
1295666    1
1295733    1
Name: is_fraud, Length: 15012, dtype: int64


In [86]:

df2 = pd.read_csv('fraudTest.csv')
df2_fraud = df2[df2['is_fraud']==1]
df2_not_fraud = df2[df2['is_fraud']==0]
legit_sample2= df2_not_fraud.sample(n=7506)
new_df2 = pd.concat([legit_sample2, df2_fraud] , axis=0)
new_df2.head()
new_df2=df2.drop(['trans_date_trans_time','merchant','category','first','last','gender','street','city','state' ,'city_pop','job','dob','trans_num' ],axis=1)
new_df2.groupby('is_fraud').mean()
x_test= new_df.drop(columns='is_fraud',axis=1)
y_test=new_df['is_fraud']

      


In [87]:
randomforestmodel= Pipeline ([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier())
])
randomforestmodel.fit(x_train ,y_train)

In [88]:
y_pred_log = randomforestmodel.predict(x_test)
y_pred_log

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [89]:
accuracy = accuracy_score(y_test ,y_pred_log)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 1.00


In [90]:
cr= classification_report(y_test , y_pred_log)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7506
           1       1.00      1.00      1.00      7506

    accuracy                           1.00     15012
   macro avg       1.00      1.00      1.00     15012
weighted avg       1.00      1.00      1.00     15012

