In [1]:
# import libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from statistics import mean, stdev, pstdev

In [2]:
# setup spark
spark = (
    SparkSession.builder.appName("aggregate data for first 3 final model variables")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

22/09/15 17:13:01 WARN Utils: Your hostname, modaxuexiweiyuanzhangde-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.120.37 instead (on interface en0)
22/09/15 17:13:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/15 17:13:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/15 17:13:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
data1 = spark.read.parquet("../data/tables/transactions_20210228_20210827_snapshot/")
data2 = spark.read.parquet("../data/tables/transactions_20210828_20220227_snapshot/")
data3 = spark.read.parquet("../data/tables/transactions_20220228_20220828_snapshot/")

data = data1.union(data2)
data = data.union(data3)

                                                                                

In [4]:
merchant_fraud = spark.read.option("header",True).csv('../data/tables/merchant_fraud_probability.csv', )
consumer_fraud = spark.read.option("header",True).csv('../data/tables/consumer_fraud_probability.csv', )

In [5]:
merchant_fraud.head()

Row(merchant_abn='19492220327', order_datetime='2021-11-28', fraud_probability='44.403658647495355')

In [6]:
merchant_fraud_join_data = merchant_fraud.join(data, [merchant_fraud.merchant_abn == data.merchant_abn, \
    merchant_fraud.order_datetime == data.order_datetime], 'left')

In [7]:
consumer_fraud_join_data = consumer_fraud.join(data, [consumer_fraud.user_id == data.user_id, \
    consumer_fraud.order_datetime == data.order_datetime], 'left')

In [32]:
merchant_fraud_join_data_df = merchant_fraud_join_data.toPandas()

                                                                                

In [33]:
consumer_fraud_join_data_df = consumer_fraud_join_data.toPandas()

                                                                                

In [34]:
consumer_fraud_join_data_df = consumer_fraud_join_data_df.iloc[:, 2:]

In [35]:
merchant_fraud_join_data_df = merchant_fraud_join_data_df.iloc[:, 2:]

In [36]:
merchant_fraud_join_data_df['dollar_value2'] = merchant_fraud_join_data_df['dollar_value']
consumer_fraud_join_data_df['dollar_value2'] = consumer_fraud_join_data_df['dollar_value']

In [37]:
consumer_fraud_join_data_df

Unnamed: 0,fraud_probability,user_id,merchant_abn,dollar_value,order_id,order_datetime,dollar_value2
0,10.58055311139687,9,77990807356,2295.010789,5a85b720-1a32-481e-bc1c-3654d5e320c2,2021-12-13,2295.010789
1,10.58055311139687,9,24852446429,10.251999,dd9f50e9-e9cf-4fdb-8e74-eed0da81ab10,2021-12-13,10.251999
2,9.213002123639797,13,27440079026,2133.187970,6f4377b6-9fa0-4cc8-b354-60d0288af55c,2021-12-06,2133.187970
3,21.67558802682104,21,11439466003,402.082402,d01441bf-9da6-46de-8567-63fdd1651679,2021-06-08,402.082402
4,21.67558802682104,21,55092047467,12298.652944,9d048426-40f6-4d44-a5da-0cfd53645bff,2021-06-08,12298.652944
...,...,...,...,...,...,...,...
80555,32.702645569452784,24067,49891706470,20.597043,d2fa99f8-a50e-4e86-a0c9-d67452bb6703,2021-09-04,20.597043
80556,32.702645569452784,24067,38049816588,9674.595193,c04878b3-3c76-4656-9d5c-5ffc748653f0,2021-09-04,9674.595193
80557,14.254816637840468,24070,43833568675,2934.275292,e280debe-3b62-42b3-a75d-3545fc5784d9,2021-11-16,2934.275292
80558,14.254816637840468,24070,30372130473,137.421301,336644b0-5f13-4a4f-842c-8d8dfe26fa1b,2021-11-16,137.421301


In [38]:
merchant_fraud_instance_agg = merchant_fraud_join_data_df.groupby(['merchant_abn', 'order_datetime']).agg({'dollar_value': mean, \
    'dollar_value2': pstdev, 'order_id':'count', 'user_id':'nunique', 'fraud_probability': 'first'})

In [39]:
consumer_fraud_instance_agg = consumer_fraud_join_data_df.groupby(['user_id', 'order_datetime']).agg({'dollar_value': mean, \
    'dollar_value2': pstdev, 'order_id':'count', 'merchant_abn':'nunique', 'fraud_probability': 'first'})

In [40]:
tmp = merchant_fraud_instance_agg['order_id']
tmp2 = [x-1 if x != 1 else 1 for x in tmp ]
merchant_fraud_instance_agg['count**'] = tmp2

In [41]:
tmp = consumer_fraud_instance_agg['order_id']
tmp2 = [x-1 if x != 1 else 1 for x in tmp ]
consumer_fraud_instance_agg['count**'] = tmp2

In [42]:
merchant_fraud_instance_agg['sd_transact'] = merchant_fraud_instance_agg['dollar_value2']*merchant_fraud_instance_agg['order_id']/merchant_fraud_instance_agg['count**']
consumer_fraud_instance_agg['sd_transact'] = consumer_fraud_instance_agg['dollar_value2']*consumer_fraud_instance_agg['order_id']/consumer_fraud_instance_agg['count**']

In [45]:
merchant_fraud_instance_agg = merchant_fraud_instance_agg.drop(['dollar_value2', 'count**'], axis=1)
consumer_fraud_instance_agg = consumer_fraud_instance_agg.drop(['dollar_value2', 'count**'], axis=1)

In [50]:
merchant_fraud_instance_agg = merchant_fraud_instance_agg.rename(columns = {'order_id': 'count', 'dollar_value': 'mean_transact', 'merchant_abn': 'distinct_merchants'})
consumer_fraud_instance_agg = consumer_fraud_instance_agg.rename(columns = {'order_id': 'count', 'dollar_value': 'mean_transact', 'merchant_abn': 'distinct_merchants'})

In [51]:
consumer_fraud_instance_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_transact,count,distinct_merchants,fraud_probability,sd_transact
user_id,order_datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2022-02-20,2479.076338,1,1,9.805431136520959,0.000000
2,2021-08-30,710.667418,3,3,9.599513915425788,1023.234306
2,2021-09-25,2236.461666,1,1,10.069850934775245,0.000000
3,2021-11-03,2334.493717,1,1,8.300636455314633,0.000000
4,2021-10-09,775.013087,3,3,9.633302411090419,1106.645341
...,...,...,...,...,...,...
24079,2021-10-12,4708.648444,1,1,14.948165055476418,0.000000
24079,2021-11-08,1408.125536,2,2,8.940524305562004,2185.655120
24079,2021-11-26,696.038011,4,4,8.838622117011905,1016.552311
24081,2021-10-08,2160.357725,2,2,14.343771755068074,4241.359534


In [52]:
merchant_fraud_instance_agg

Unnamed: 0_level_0,Unnamed: 1_level_0,mean_transact,count,user_id,fraud_probability,sd_transact
merchant_abn,order_datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11149063370,2021-08-28,47346.113374,1,1,56.43761254995139,0.000000
11149063370,2021-11-14,43965.463557,1,1,52.407803322764764,0.000000
11149063370,2022-02-25,42797.348359,1,1,51.01538421455241,0.000000
11470993597,2021-09-28,146.289984,5,5,63.37734364737917,88.726942
11590404675,2021-12-21,25903.860931,2,2,29.607818240092094,9171.975519
...,...,...,...,...,...,...
94493496784,2021-11-26,99.073302,455,451,30.57903215900633,63.697587
96680767841,2021-11-26,308.273051,149,148,29.555244690425017,203.101900
97217894162,2022-01-21,28906.525608,2,2,34.94582650821017,7625.255229
97884414539,2021-10-19,44615.022521,1,1,89.79919971536573,0.000000
