In [1]:
# import libraries
from gettext import npgettext
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np
from statistics import mean, stdev

import json

In [2]:
# Create (or reuse) the Spark session for this notebook.
#   - eagerEval: a bare DataFrame expression at the end of a cell is rendered as a table
#   - session time zone is pinned to UTC
#   - the driver is given 15g of memory
spark = SparkSession.builder \
    .appName("aggregate data for first 3 final model variables") \
    .config("spark.sql.repl.eagerEval.enabled", True) \
    .config("spark.sql.parquet.cacheMetadata", "true") \
    .config("spark.sql.session.timeZone", "Etc/UTC") \
    .config("spark.driver.memory", "15g") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/28 16:16:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/28 16:16:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the three transaction snapshots (parquet), one DataFrame per snapshot folder.
data1, data2, data3 = [
    spark.read.parquet(snapshot_path)
    for snapshot_path in (
        "../data/tables/transactions_20210228_20210827_snapshot/",
        "../data/tables/transactions_20210828_20220227_snapshot/",
        "../data/tables/transactions_20220228_20220828_snapshot/",
    )
]

                                                                                

In [4]:
# user_id <-> consumer_id lookup table (parquet).
consumer_lookup = spark.read.parquet("../data/tables/consumer_user_details.parquet")

# Consumer details: pipe-delimited CSV with a header row; column types are inferred.
consumer_data = (
    spark.read
    .options(header=True, inferSchema=True, delimiter='|')
    .csv("../data/tables/tbl_consumer.csv")
)

In [5]:
# Stack the three transaction snapshots into a single DataFrame.
# unionByName() aligns columns by name; plain union() aligns them by position
# and would silently mix up values if a snapshot stored its columns in a
# different order. The later cells address columns by name (user_id,
# order_datetime, ...), so name-based alignment is what they expect.
data = data1.unionByName(data2).unionByName(data3)

In [6]:
# Keep only orders dated from 2021-03-01 to 2022-08-14, both bounds inclusive.
data = data.filter(
    F.col("order_datetime").between(F.lit('2021-03-01'), F.lit('2022-08-14'))
)


In [7]:
# Preview consumer_data; the session's eager-evaluation setting renders it as a table.
consumer_data

name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975
Karen Chapman,2706 Stewart Oval...,NSW,2033,Female,407340
Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
Stephen Williams,6804 Wright Crest...,WA,6056,Male,448088
Stephanie Reyes,5813 Denise Land ...,NSW,2482,Female,650435
Jillian Gonzales,461 Ryan Common S...,VIC,3220,Female,1058499


In [8]:
# Enrich every transaction with its consumer's details:
#   transactions --user_id--> lookup --consumer_id--> consumer table.
# Both joins use the default (inner) join type, so transactions without a
# matching lookup or consumer row are dropped.
join_df = (
    data
    .join(consumer_lookup, on=["user_id"])
    .join(consumer_data, on=["consumer_id"])
)

In [9]:
# Preview the joined transaction + consumer DataFrame.
join_df

                                                                                

consumer_id,user_id,merchant_abn,dollar_value,order_id,order_datetime,name,address,state,postcode,gender
551,3471,68216911708,39.2325218595659,c921263e-489e-45a...,2021-08-21,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,77698107389,90.16967584866272,aaea14f5-48e8-4ef...,2021-08-21,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,46451548968,3.8405569898888423,7e8e7fca-04a1-4f2...,2021-08-19,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,35556933338,66.18325079406301,13b0be6a-20b7-439...,2021-08-22,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,91772719127,33.69274773234,3750984f-4313-47d...,2021-08-14,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,29216160692,171.76081688162495,f5917c53-caca-4b6...,2021-08-15,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,32234779638,80.46099153654205,b9423d5c-9083-4e6...,2021-07-15,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,68216911708,60.71366865869528,94fbe85c-434a-4e6...,2021-07-15,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,75034515922,5.4642431321898375,12e4b9c7-1738-481...,2021-07-15,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,24852446429,56.94751884265085,8e65c8ee-67f1-411...,2021-08-16,Todd Long,883 Patty Mountai...,NSW,2021,Male


In [38]:
# Summary statistics (count / mean / stddev / min / max) for the columns of join_df.
# NOTE(review): the saved output of this cell lists Week, Year, Month, Day and
# Fortnight columns that no visible cell adds to join_df, and its execution
# count (38) is out of sequence — the output looks stale. Re-run after a kernel
# restart to confirm.
join_df.describe()

                                                                                

summary,consumer_id,user_id,merchant_abn,dollar_value,order_id,Week,Year,Month,Day,Fortnight,name,address,state,postcode,gender
count,12157084.0,12157084.0,12157084,12157084.0,12157084,12157084.0,12157084.0,12157084.0,12157084.0,12157084.0,12157084,12157084,12157084,12157084.0,12157084
mean,753256.278102216,12039.659402287589,5.542397773206698...,166.35215026470863,,26.48216373268458,2021.4182861613851,6.562012897171723,15.814851571314303,13.491465469844576,,,,4043.303962858199,
stddev,435525.6477490294,6951.602269350454,2.532751109873882E10,520.5260587786956,,13.602001831795452,0.4932776789940705,3.1197910160833517,8.804069186278074,6.799815171682457,,,,1780.782568491656,
min,30.0,1.0,10023283211,9.756658099412162e-08,000000d4-d252-4e2...,1.0,2021.0,1.0,1.0,1.0,Aaron Adkins,000 Barnes Missio...,ACT,200.0,Female
max,1499867.0,24081.0,99990536339,105193.88578925544,fffffca5-42ab-49d...,52.0,2022.0,12.0,31.0,26.0,Zoe Randall,99994 Krystal Views,WA,9999.0,Undisclosed


In [10]:
# Load the cleaned ABS table. Some consumer postcodes do not appear in the ABS
# data, so transactions from those postcodes are removed further down.
abs_df = (
    spark.read
    .options(header='True')
    .csv("../data/curated/abs_cleaned.csv")
)

In [11]:
# Collect the first column of the ABS table into a Python list on the driver.
# The CSV's first header cell is empty, so Spark names that column '_c0'.
# NOTE(review): the file is read without inferSchema, so these values are
# presumably strings — confirm they compare as intended against `postcode`.
abs_postcode = [row[0] for row in abs_df.select('_c0').collect()]

22/09/28 16:17:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///Users/liyujie/Documents/GitHub/generic-buy-now-pay-later-project-group-8/data/curated/abs_cleaned.csv


In [12]:
# Number of values collected from the ABS table's first column.
len(abs_postcode)

2653

In [13]:
# Keep only the transactions whose consumer postcode appears in the ABS list.
from pyspark.sql.functions import col

new_join = join_df.filter(F.col("postcode").isin(abs_postcode))

In [15]:
# Preview the transactions that remain after the ABS postcode filter.
new_join

                                                                                

consumer_id,user_id,merchant_abn,dollar_value,order_id,order_datetime,name,address,state,postcode,gender
551,3471,68216911708,39.2325218595659,c921263e-489e-45a...,2021-08-21,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,77698107389,90.16967584866272,aaea14f5-48e8-4ef...,2021-08-21,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,46451548968,3.8405569898888423,7e8e7fca-04a1-4f2...,2021-08-19,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,35556933338,66.18325079406301,13b0be6a-20b7-439...,2021-08-22,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,91772719127,33.69274773234,3750984f-4313-47d...,2021-08-14,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,29216160692,171.76081688162495,f5917c53-caca-4b6...,2021-08-15,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,32234779638,80.46099153654205,b9423d5c-9083-4e6...,2021-07-15,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,68216911708,60.71366865869528,94fbe85c-434a-4e6...,2021-07-15,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,75034515922,5.4642431321898375,12e4b9c7-1738-481...,2021-07-15,Todd Long,883 Patty Mountai...,NSW,2021,Male
551,3471,24852446429,56.94751884265085,8e65c8ee-67f1-411...,2021-08-16,Todd Long,883 Patty Mountai...,NSW,2021,Male


# Group by merchant ABN and postcode

In [16]:
# Count the orders each merchant received from each postcode.
# (Despite the variable name, this grouping does not involve a fortnight column.)
data_by_fornight = (
    new_join
    .groupBy("merchant_abn", "postcode")
    .agg(F.count("order_id").alias("count_postcode_merchant"))
)

In [17]:
# Preview the per-merchant, per-postcode order counts.
data_by_fornight

                                                                                

merchant_abn,postcode,count_postcode_merchant
95508140753,4553,4
80324045558,4055,76
63290521567,4356,87
52310302169,7019,1
80785565526,3687,2
24471518053,3013,1
65674339048,5307,2
49891706470,3165,62
19237425345,3529,4
14430838529,7024,6


In [18]:
# Total number of orders per merchant across all postcodes; this is the
# denominator for the postcode proportions computed later.
totalorder_perfor_merchant = (
    new_join
    .groupBy("merchant_abn")
    .agg(F.count("order_id").alias("total_order_per_fornight"))
)

In [21]:
# Preview the per-merchant order totals.
totalorder_perfor_merchant

                                                                                

merchant_abn,total_order_per_fornight
38700038932,5117
15613631617,1309
19839532017,557
73256306726,3824
35344855546,1103
83412691377,10241
24406529929,3000
73841664453,692
78916025936,48
60654402457,128


In [23]:
# Attach each merchant's overall order count to its per-postcode rows.
# Caution: data_by_fornight is overwritten with itself joined to the totals, so
# running this cell a second time without re-running the groupBy cell joins the
# totals column in again.
cond = ["merchant_abn"]
data_by_fornight = data_by_fornight.join(
    totalorder_perfor_merchant,
    on=cond,
    how="inner",
)

In [24]:
# Share of a merchant's orders that came from each postcode:
#   count_postcode_merchant / total_order_per_fornight
data_by_fornight = data_by_fornight.withColumn(
    "proportion_of_postcode",
    F.col("count_postcode_merchant") / F.col("total_order_per_fornight"),
)

In [25]:
# Export the merchant/postcode proportions as a single CSV file.
# toPandas() pulls the entire result onto the driver before pandas writes it,
# so this step is bounded by driver memory.
(
    data_by_fornight
    .toPandas()
    .to_csv('../data/curated/Grouping_bytotal.csv', index = False)
)

                                                                                

22/09/28 21:10:32 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 943353 ms exceeds timeout 120000 ms
22/09/28 21:10:32 WARN SparkContext: Killing executors is not supported by current scheduler.


In [57]:
# Preview the final merchant/postcode proportion table.
# NOTE(review): the saved output of this cell includes a Fortnight column that
# no visible cell produces, and its execution count (57) is out of sequence —
# the output looks stale. Re-run after a kernel restart to confirm.
data_by_fornight

                                                                                

merchant_abn,Fortnight,postcode,count_postcode_merchant,total_order_per_fornight,proportion_of_postcode
10206519221,1,6068,1,151,0.0066225165562913
10206519221,1,2565,1,151,0.0066225165562913
10206519221,1,6411,1,151,0.0066225165562913
10206519221,1,5372,1,151,0.0066225165562913
10206519221,1,6438,1,151,0.0066225165562913
10206519221,1,2753,1,151,0.0066225165562913
10206519221,1,4072,1,151,0.0066225165562913
10206519221,1,4673,1,151,0.0066225165562913
10206519221,1,3920,1,151,0.0066225165562913
10206519221,1,6227,1,151,0.0066225165562913
