In [36]:
import pandas as pd

In [37]:
data = pd.read_csv('./fraudTrain.csv')

In [39]:
data = data.drop(['Unnamed: 0', 'city_pop', 'job', 'street', 'gender', 'first', 'last', 'dob', 'unix_time', 'city', 'state', 'zip', 'category'], axis=1)

In [40]:
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
data = data.sort_values(['trans_date_trans_time'])

In [41]:
data['customer_time_diff'] = list(map(lambda x : x.total_seconds(), data.groupby('cc_num')['trans_date_trans_time'].diff()))
data['merch_time_diff'] = list(map(lambda x : x.total_seconds(), data.groupby('merchant')['trans_date_trans_time'].diff()))

In [42]:
data['last_customer_lat'] = data.groupby('cc_num')['lat'].shift()
data['last_customer_long'] = data.groupby('cc_num')['long'].shift()

data['last_merchant_lat'] = data.groupby('merchant')['lat'].shift()
data['last_merchant_long'] = data.groupby('merchant')['long'].shift()

In [43]:
import math

def calculate_distance(lat_prev, long_prev, lat_current, long_current):
    R = 6371  
    lat_prev_rad = math.radians(lat_prev)
    long_prev_rad = math.radians(long_prev)
    lat_current_rad = math.radians(lat_current)
    long_current_rad = math.radians(long_current)

    delta_lat = lat_current_rad - lat_prev_rad
    delta_long = long_current_rad - long_prev_rad

    a = math.sin(delta_lat / 2) ** 2 + math.cos(lat_prev_rad) * math.cos(lat_current_rad) * math.sin(delta_long / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    distance = R * c
    return distance

In [44]:
data['customer_distance'] = data.apply(lambda x: calculate_distance(x['last_customer_lat'], x['last_customer_long'], x['lat'], x['long']), axis=1)
data['merchant_distance'] = data.apply(lambda x: calculate_distance(x['last_merchant_lat'], x['last_merchant_long'], x['merch_lat'], x['merch_long']), axis=1)

In [45]:
data['merch_score'] = data.groupby('merchant')['is_fraud'].transform('mean')
data['customer_score'] = data.groupby('cc_num')['is_fraud'].transform('mean')

In [47]:
data['trans_date_trans_time'] = data['trans_date_trans_time'].astype(str)

In [49]:
data = data.dropna()

In [51]:
data

Unnamed: 0,trans_date_trans_time,cc_num,merchant,amt,lat,long,trans_num,merch_lat,merch_long,is_fraud,customer_time_diff,merch_time_diff,last_customer_lat,last_customer_long,last_merchant_lat,last_merchant_long,customer_distance,merchant_distance,merch_score,customer_score
43,2019-01-01 00:32:15,30074693890476,fraud_Lockman Ltd,212.75,37.9931,-100.9893,d9dcde500bed2cad48ee41e44c362596,38.862183,-101.234087,0,1653.0,1519.0,37.9931,-100.9893,27.7898,-82.7243,0.0,2108.155687,0.014446,0.007778
93,2019-01-01 01:09:57,180048185037117,"fraud_Rippin, Kub and Mann",3.79,40.6152,-74.4150,7c63dd07b14aac7dc99f4587d83dd599,40.813669,-75.283465,0,3161.0,4179.0,40.6152,-74.4150,36.0788,-81.1781,0.0,734.980063,0.014207,0.004545
100,2019-01-01 01:16:52,6593250708747804,fraud_Brekke and Sons,55.18,26.7383,-80.2760,3c6158c556727d527f8b51cc03b30236,27.346033,-80.475563,0,117.0,501.0,26.7383,-80.2760,34.0326,-82.2027,0.0,761.591420,0.003769,0.003856
125,2019-01-01 01:33:51,3568736585751727,fraud_Kutch-Hegmann,55.60,41.8114,-93.4855,a85f17caa74a4eb0acaff9bae866b1d9,41.081282,-92.859090,0,3128.0,1130.0,41.8114,-93.4855,33.3398,-92.7442,0.0,860.873399,0.006486,0.006645
135,2019-01-01 01:37:20,3567879740649740,fraud_DuBuque LLC,113.19,44.0577,-76.0196,61aa9cb51f56910884272f434a55955c,44.465636,-75.525258,0,208.0,1345.0,44.0577,-76.0196,26.7383,-80.2760,0.0,2016.502491,0.012043,0.007253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,15.56,37.7175,-112.4777,440b587732da4dc1a6395aba5fb41669,36.841266,-111.690765,0,16781.0,47110.0,37.7175,-112.4777,34.5091,-92.4828,0.0,1751.229291,0.002622,0.005948
1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,51.70,39.2667,-77.5101,278000d2e0d2277d1de2f890067dcc0a,38.906881,-78.246528,0,7962.0,40189.0,39.2667,-77.5101,30.7148,-85.0210,0.0,1100.247329,0.002284,0.015066
1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,105.93,32.9396,-105.8189,483f52fe67fabef353d552c1e662974c,33.619513,-105.130529,0,29074.0,14743.0,32.9396,-105.8189,40.9918,-73.9800,0.0,2857.319826,0.001079,0.005314
1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",74.90,43.3526,-102.5411,d667cdcbadaaed3da3f4020e83591c83,42.788940,-103.241160,0,91018.0,44237.0,43.3526,-102.5411,37.6395,-97.1714,0.0,769.954633,0.002094,0.003953


In [52]:
merchant_history = data[['merchant', 'last_merchant_lat', 'last_merchant_long', 'merch_score']]
merchant_history = merchant_history.drop_duplicates()
customer_history = data[['cc_num', 'last_customer_lat', 'last_customer_long', 'customer_score']]
customer_history = customer_history.rename(columns={'cc_num': 'merchant', 'last_customer_lat': 'last_merchant_lat', 'last_customer_long': 'last_merchant_long', 'customer_score': 'merch_score'})
customer_history = customer_history.drop_duplicates()

combined_history = pd.concat([merchant_history, customer_history])
combined_history = combined_history.drop_duplicates()

In [53]:
combined_history.to_csv('combined_history.csv', index=False)

In [55]:
new = data[['customer_score', 'merch_score', 'customer_distance', 'merchant_distance', 'customer_time_diff', 'merch_time_diff', 'is_fraud']]

In [56]:
new.to_csv('new.csv', index=False)

In [59]:
data

Unnamed: 0,trans_date_trans_time,cc_num,merchant,amt,lat,long,trans_num,merch_lat,merch_long,is_fraud,customer_time_diff,merch_time_diff,last_customer_lat,last_customer_long,last_merchant_lat,last_merchant_long,customer_distance,merchant_distance,merch_score,customer_score
43,2019-01-01 00:32:15,30074693890476,fraud_Lockman Ltd,212.75,37.9931,-100.9893,d9dcde500bed2cad48ee41e44c362596,38.862183,-101.234087,0,1653.0,1519.0,37.9931,-100.9893,27.7898,-82.7243,0.0,2108.155687,0.014446,0.007778
93,2019-01-01 01:09:57,180048185037117,"fraud_Rippin, Kub and Mann",3.79,40.6152,-74.4150,7c63dd07b14aac7dc99f4587d83dd599,40.813669,-75.283465,0,3161.0,4179.0,40.6152,-74.4150,36.0788,-81.1781,0.0,734.980063,0.014207,0.004545
100,2019-01-01 01:16:52,6593250708747804,fraud_Brekke and Sons,55.18,26.7383,-80.2760,3c6158c556727d527f8b51cc03b30236,27.346033,-80.475563,0,117.0,501.0,26.7383,-80.2760,34.0326,-82.2027,0.0,761.591420,0.003769,0.003856
125,2019-01-01 01:33:51,3568736585751727,fraud_Kutch-Hegmann,55.60,41.8114,-93.4855,a85f17caa74a4eb0acaff9bae866b1d9,41.081282,-92.859090,0,3128.0,1130.0,41.8114,-93.4855,33.3398,-92.7442,0.0,860.873399,0.006486,0.006645
135,2019-01-01 01:37:20,3567879740649740,fraud_DuBuque LLC,113.19,44.0577,-76.0196,61aa9cb51f56910884272f434a55955c,44.465636,-75.525258,0,208.0,1345.0,44.0577,-76.0196,26.7383,-80.2760,0.0,2016.502491,0.012043,0.007253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,2020-06-21 12:12:08,30263540414123,fraud_Reichel Inc,15.56,37.7175,-112.4777,440b587732da4dc1a6395aba5fb41669,36.841266,-111.690765,0,16781.0,47110.0,37.7175,-112.4777,34.5091,-92.4828,0.0,1751.229291,0.002622,0.005948
1296671,2020-06-21 12:12:19,6011149206456997,fraud_Abernathy and Sons,51.70,39.2667,-77.5101,278000d2e0d2277d1de2f890067dcc0a,38.906881,-78.246528,0,7962.0,40189.0,39.2667,-77.5101,30.7148,-85.0210,0.0,1100.247329,0.002284,0.015066
1296672,2020-06-21 12:12:32,3514865930894695,fraud_Stiedemann Ltd,105.93,32.9396,-105.8189,483f52fe67fabef353d552c1e662974c,33.619513,-105.130529,0,29074.0,14743.0,32.9396,-105.8189,40.9918,-73.9800,0.0,2857.319826,0.001079,0.005314
1296673,2020-06-21 12:13:36,2720012583106919,"fraud_Reinger, Weissnat and Strosin",74.90,43.3526,-102.5411,d667cdcbadaaed3da3f4020e83591c83,42.788940,-103.241160,0,91018.0,44237.0,43.3526,-102.5411,37.6395,-97.1714,0.0,769.954633,0.002094,0.003953


In [60]:
data = data.drop(['cc_num', 'merchant', 'trans_num', 'trans_date_trans_time', 'last_merchant_lat', 'last_merchant_long', 'last_customer_lat', 'last_customer_long'], axis = 1)

In [61]:
data.to_csv('preprocessed_fraudTrain.csv')

In [62]:
from imblearn.over_sampling import SMOTE

X = new.iloc[:, :-1].values
y = new.iloc[:, -1].values

smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)


In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=0)

classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [80]:
y_pred = classifier.predict(X_test[:1000])

In [82]:
print(classification_report(y_test[:1000], y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       514
           1       0.98      0.99      0.98       486

    accuracy                           0.98      1000
   macro avg       0.98      0.98      0.98      1000
weighted avg       0.98      0.98      0.98      1000



In [77]:
import pickle

with open('fraud_detection_model_new.sav', 'wb') as file:
    pickle.dump(classifier, file)