In [1]:
# Import dependencies
import pandas as pd
import numpy as np

pd.options.display.max_columns = 100
pd.options.display.max_rows = 900
pd.set_option('float_format', '{:f}'.format)

In [2]:
# Read train CSV
df_train = pd.read_csv("fraudTrain.csv", usecols = ['trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'dob',
       'merch_lat', 'merch_long', 'is_fraud'])

In [3]:
# Read test CSV
df_test = pd.read_csv("fraudTest.csv", usecols = ['trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'dob',
       'merch_lat', 'merch_long', 'is_fraud'])

In [4]:
# Concat CSVs
df = pd.concat([df_train,df_test])
df.reset_index(drop=True, inplace=True)

In [5]:
df.head()

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,dob,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,1988-03-09,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,1978-06-21,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,1962-01-19,43.150704,-112.154481,0
3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,1967-01-12,47.034331,-112.561071,0
4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,1986-03-28,38.674999,-78.632459,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1852394 entries, 0 to 1852393
Data columns (total 19 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   trans_date_trans_time  object 
 1   cc_num                 int64  
 2   merchant               object 
 3   category               object 
 4   amt                    float64
 5   first                  object 
 6   last                   object 
 7   gender                 object 
 8   street                 object 
 9   city                   object 
 10  state                  object 
 11  zip                    int64  
 12  lat                    float64
 13  long                   float64
 14  city_pop               int64  
 15  dob                    object 
 16  merch_lat              float64
 17  merch_long             float64
 18  is_fraud               int64  
dtypes: float64(5), int64(4), object(10)
memory usage: 268.5+ MB


In [7]:
df.nunique()

trans_date_trans_time    1819551
cc_num                       999
merchant                     693
category                      14
amt                        60616
first                        355
last                         486
gender                         2
street                       999
city                         906
state                         51
zip                          985
lat                          983
long                         983
city_pop                     891
dob                          984
merch_lat                1754157
merch_long               1809753
is_fraud                       2
dtype: int64

In [8]:
# Convert transaction datetime
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

In [9]:
# Notes
df['trans_hour'] = df['trans_date_trans_time'].dt.hour
df['trans_hour']

0           0
1           0
2           0
3           0
4           0
           ..
1852389    23
1852390    23
1852391    23
1852392    23
1852393    23
Name: trans_hour, Length: 1852394, dtype: int64

In [10]:
# Notes
df['day_of_week'] = df['trans_date_trans_time'].dt.day_name()
df['day_of_week']

0           Tuesday
1           Tuesday
2           Tuesday
3           Tuesday
4           Tuesday
             ...   
1852389    Thursday
1852390    Thursday
1852391    Thursday
1852392    Thursday
1852393    Thursday
Name: day_of_week, Length: 1852394, dtype: object

In [11]:
# Notes
df['year_month'] = df['trans_date_trans_time'].dt.to_period('M')
df['year_month']

0          2019-01
1          2019-01
2          2019-01
3          2019-01
4          2019-01
            ...   
1852389    2020-12
1852390    2020-12
1852391    2020-12
1852392    2020-12
1852393    2020-12
Name: year_month, Length: 1852394, dtype: period[M]

In [12]:
# Convert dob column to datetime
df['dob'] = pd.to_datetime(df['dob'])

In [13]:
# Create age column
df['age'] = np.round((df['trans_date_trans_time'] - df['dob'])/np.timedelta64(1,'Y'))

In [14]:
# One-hot encoding / get dummies
category_onehot = pd.get_dummies(df.category, prefix='category')
gender_onehot = pd.get_dummies(df.gender, prefix='gender')
day_of_week_onehot = pd.get_dummies(df.day_of_week, prefix='week')

In [15]:
# One-hot / get dummies cont.
df1 = pd.concat([df, category_onehot,gender_onehot,day_of_week_onehot], axis=1)

In [16]:
# Concat
df_final = pd.concat([df, category_onehot,gender_onehot,day_of_week_onehot], axis=1)

In [17]:
# Import scikitlearn
from sklearn.model_selection import train_test_split

In [18]:
# Balance by sampling
count_fraud = df_final[df_final['is_fraud'] == 1]['amt'].count()
df_fraud = df_final[df_final['is_fraud'] == 1]
df_legit = df_final[df_final['is_fraud'] == 0]

In [19]:
# Notes
df_legit_sample = df_legit.sample(count_fraud, replace=True)
df_undersample = pd.concat([df_fraud, df_legit_sample], axis=0)

print('Random over-sampling:')
print(df_undersample['is_fraud'].value_counts())

Random over-sampling:
0    9651
1    9651
Name: is_fraud, dtype: int64


In [20]:
df_undersample.shape

(19302, 46)

In [21]:
# Split train & test
X_cols = ['amt', 'trans_hour',
       'age', 'category_entertainment', 'category_food_dining',
       'category_gas_transport', 'category_grocery_net',
       'category_grocery_pos', 'category_health_fitness', 'category_home',
       'category_kids_pets', 'category_misc_net', 'category_misc_pos',
       'category_personal_care', 'category_shopping_net',
       'category_shopping_pos', 'category_travel', 'gender_F', 'gender_M',
       'week_Friday', 'week_Monday', 'week_Saturday', 'week_Sunday',
       'week_Thursday', 'week_Tuesday', 'week_Wednesday']

Y_cols = ['is_fraud']

In [22]:
# Split train & test
X_train, X_test, y_train, y_test = train_test_split(df_undersample[X_cols], df_undersample[Y_cols],
                                                    train_size=0.7, test_size=0.3)

In [23]:
# Split test data into validation and test set
X_test1, X_test2, y_test1, y_test2 = train_test_split(X_test, y_test, train_size=0.5, test_size=0.5)

In [24]:
# Import RandomForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [25]:
# Building Random Forest Model
rf_class = RandomForestClassifier(n_estimators = 50,max_depth = 20, verbose = 1)
rf_class.fit(X_train, y_train)

  rf_class.fit(X_train, y_train)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.4s finished


RandomForestClassifier(max_depth=20, n_estimators=50, verbose=1)

In [26]:
# Train Results
print("Train Results")
predict_train = rf_class.predict(X_train)

print(confusion_matrix(y_train, predict_train))
print(classification_report(y_train, predict_train))

Train Results
[[6719    5]
 [   9 6778]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6724
           1       1.00      1.00      1.00      6787

    accuracy                           1.00     13511
   macro avg       1.00      1.00      1.00     13511
weighted avg       1.00      1.00      1.00     13511



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.0s finished


In [27]:
# Test Results
print("Test Results")
pred_test = rf_class.predict(X_test1)

print(confusion_matrix(y_test1, pred_test))
print(classification_report(y_test1, pred_test))

Test Results


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.0s finished


[[1452   36]
 [  31 1376]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1488
           1       0.97      0.98      0.98      1407

    accuracy                           0.98      2895
   macro avg       0.98      0.98      0.98      2895
weighted avg       0.98      0.98      0.98      2895



In [28]:
# Export final X & y tests
X_test2.to_csv("X_test.csv")
y_test2.to_csv("y_test.csv")

In [29]:
# Export model
import joblib
joblib.dump(rf_class, "model.pkl", compress=9)

['model.pkl']