In [1]:
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestRegressor

# Get the predicted Fraud labelled (for dropping once joined)

In [2]:
# load the rf
with open('../models/consumer_fraud_rfr.pickle', 'rb') as g:
    rfr = pickle.load(g)

In [3]:
# load the full data to undergo prediction
data = pd.read_csv('../data/curated/fraud/full_consumer_fraud_data.csv')

data = data.rename(columns = {'mean_#distinct_merchants': '#distinct_merchants', \
        'mean_#daily_orders': '#daily_orders', 'mean_transact_amount_perOrder': 'transact_amount_perOrder',\
        'mean_transact_amount_perOrder_sd': 'transact_amount_perOrder_sd'})

X = data[['transact_amount_perOrder_ratio', 'transact_amount_perOrder',
    'transact_amount_perOrder_sd', 'transact_amount_perOrder_sd_ratio']]

In [4]:
# make prediction
prediction = rfr.predict(X)

df = pd.DataFrame({'prediction': prediction})

Analysis

In [5]:
df.describe()[1:]

Unnamed: 0,prediction
mean,9.394051
std,0.920289
min,9.287148
25%,9.287148
50%,9.287148
75%,9.411906
max,85.362188


In [6]:
# inspect how many % of transactions get dropped off for each cutoff threshold rate
for i in range(10, 85, 5):
    print(i, ':', len(df[df['prediction']>i])/len(df))

10 : 0.011581764288277197
15 : 0.002210437233908996
20 : 0.0010706300587158878
25 : 0.0006338450769007805
30 : 0.0004225262524929105
35 : 0.00031057294804909946
40 : 0.00022702570592685248
45 : 0.00016141327178018119
50 : 0.00010772024417628378
55 : 7.675206642963757e-05
60 : 5.45841981865347e-05
65 : 3.7094975502277664e-05
70 : 2.1610886628954556e-05
75 : 9.357291117691663e-06
80 : 1.6709448424449398e-06


In [7]:
# create fraud label
data['fraud rate'] = prediction
final_fraud_prediction = data[['user_id', 'order_datetime', 'fraud rate']]
final_fraud_prediction['fraud'] = final_fraud_prediction['fraud rate'].apply(lambda x:1 if x >= 20 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_fraud_prediction['fraud'] = final_fraud_prediction['fraud rate'].apply(lambda x:1 if x >= 20 else 0)


In [8]:
final_fraud_prediction.to_csv('../data/curated/fraud/final_fraud_prediction.csv', index=False)

# Get the actual fraud labelled (for dropping once joined)

In [9]:
consumer_fraud_df = pd.read_csv('../data/tables/consumer_fraud_probability.csv')

In [10]:
# find threshold that drops 0.001 of labelled transactions
consumer_fraud_df.quantile(0.999)

user_id              24057.137000
fraud_probability       83.006761
Name: 0.999, dtype: float64

In [11]:
# create labels
consumer_fraud_df['fraud'] = consumer_fraud_df['fraud_probability'].apply(lambda x:1 if x >= 83 else 0)
consumer_fraud_df

Unnamed: 0,user_id,order_datetime,fraud_probability,fraud
0,6228,2021-12-19,97.629808,1
1,21419,2021-12-10,99.247380,1
2,5606,2021-10-17,84.058250,1
3,3101,2021-04-17,91.421921,1
4,22239,2021-10-19,94.703425,1
...,...,...,...,...
34859,18466,2022-02-22,8.679873,0
34860,17552,2021-12-26,8.349463,0
34861,17875,2021-06-27,8.288847,0
34862,10401,2021-09-18,8.842216,0


In [12]:
consumer_fraud_df.to_csv('../data/curated/fraud/final_fraud_real.csv', index=False)

# Join together and drop fraud rows

In [13]:
# import libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from statistics import mean, stdev, pstdev
import os

In [14]:
# setup spark
spark = (
    SparkSession.builder.appName("aggregate data for first 3 final model variables")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/22 13:13:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/22 13:13:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/09/22 13:13:27 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/09/22 13:13:27 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [15]:
# import transactions data
data1 = spark.read.parquet("../data/tables/transactions_20210228_20210827_snapshot/")
data2 = spark.read.parquet("../data/tables/transactions_20210828_20220227_snapshot/")
data3 = spark.read.parquet("../data/tables/transactions_20220228_20220828_snapshot/")

data = data1.union(data2)
data = data.union(data3)

                                                                                

In [16]:
# read both fraud data
final_fraud_prediction = spark.read.option("header",True).csv('../data/curated/fraud/final_fraud_prediction.csv', )
final_fraud_real = spark.read.option("header",True).csv('../data/curated/fraud/final_fraud_real.csv', )

In [17]:
# rename columns before join
final_fraud_real = final_fraud_real.withColumnRenamed("user_id", "user_id3")\
    .withColumnRenamed("order_datetime", "order_datetime3").withColumnRenamed("fraud", "fraud3")

final_fraud_prediction = final_fraud_prediction.withColumnRenamed("user_id", "user_id2")\
    .withColumnRenamed("order_datetime", "order_datetime2").withColumnRenamed("fraud", "fraud2")

In [18]:
# join up
data_withFraud = final_fraud_prediction.join(data, [final_fraud_prediction.user_id2 == data.user_id, \
    final_fraud_prediction.order_datetime2 == data.order_datetime], 'outer')

data_withFraud = data_withFraud.join(final_fraud_real, [data_withFraud.user_id == final_fraud_real.user_id3, \
    data_withFraud.order_datetime == final_fraud_real.order_datetime3], 'left').fillna(0)

In [19]:
data_withFraud.head()

                                                                                

Row(user_id2='1', order_datetime2='2021-03-11', fraud rate='9.287148398864032', fraud2='0', user_id=1, merchant_abn=86578477987, dollar_value=23.300617635489033, order_id='661ed6fb-c52f-4c5e-9f1d-73fb6006c088', order_datetime=datetime.date(2021, 3, 11), user_id3=None, order_datetime3=None, fraud_probability=None, fraud3=None)

In [20]:
# drop fraud (picked up by prediction)
data_withoutFraud = data_withFraud.filter(F.col('fraud2') == 0)

In [21]:

data_withoutFraud.count()

                                                                                

14179422

In [22]:
# drop fraud (picked up by observation)
data_withoutFraud = data_withoutFraud.filter((F.col('fraud3') == 0) | (F.col('fraud3').isNull()))

In [23]:
# realise that predicted fraud includes all the observed fraud, so this is good emperical evidence that our work is good
data_withoutFraud.count()

                                                                                

14179422

In [None]:
data_withoutFraud_correctDates = data_withoutFraud.filter(data_withoutFraud["order_datetime"] >= F.lit('2021-03-01')) \
       .filter(data_withoutFraud["order_datetime"] <= F.lit('2022-08-14'))

In [27]:
data_withoutFraud_correctDates.count()

                                                                                

12143162

In [28]:
data_withoutFraud_correctDates.write.mode('overwrite').parquet('../data/curated/fraud/transactions_withoutfraud')

                                                                                

In [29]:
data_withoutFraud_correctDates.columns

['user_id2',
 'order_datetime2',
 'fraud rate',
 'fraud2',
 'user_id',
 'merchant_abn',
 'dollar_value',
 'order_id',
 'order_datetime',
 'user_id3',
 'order_datetime3',
 'fraud_probability',
 'fraud3']

In [37]:
data_withoutFraud_correctDates.select(F.col('user_id'), F.col('merchant_abn'), 
    F.col('dollar_value'), F.col('order_id'), F.col('order_datetime'))

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime
1,86578477987,23.300617635489036,661ed6fb-c52f-4c5...,2021-03-11
1,72472909171,26.84427554025195,02627b89-1d59-476...,2021-03-21
1,86010199872,218.49116722954264,1554c198-01aa-4a8...,2021-03-21
1,83893827922,2848.168589428446,41fa3fa4-7988-4a6...,2021-05-03
1,46804135891,67.09721885094145,c3dcb1b2-54d4-47f...,2021-05-03
1,32234779638,77.68589068822394,0f8d252d-79e0-48e...,2021-05-12
1,58392414752,77.59241393534961,23a7b399-3399-400...,2021-05-12
1,80324045558,100.7884172212041,dd8e06be-113c-4b8...,2021-05-16
1,49505931725,79.89116533221068,faa38b1e-729d-490...,2021-05-20
1,83690644458,102.77148602868758,68195b1a-e3cb-4b8...,2021-05-20
