In [5]:
# import libraries
from gettext import npgettext
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np
from statistics import mean, stdev

import json

In [6]:
# setup spark
spark = (
    SparkSession.builder.appName("aggregate data for first 3 final model variables")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [7]:
# import data
data1 = spark.read.parquet("../data/tables/transactions_20210228_20210827_snapshot/")
data2 = spark.read.parquet("../data/tables/transactions_20210828_20220227_snapshot/")
data3 = spark.read.parquet("../data/tables/transactions_20220228_20220828_snapshot/")

                                                                                

In [8]:
consumer_lookup = spark.read.parquet("../data/tables/consumer_user_details.parquet")
consumer_data = spark.read.option("header",True) \
                         .option("inferSchema",True) \
                         .options(delimiter='|') \
                         .csv("../data/tables/tbl_consumer.csv")

In [9]:
data = data1.union(data2)
data = data.union(data3)

In [10]:
data = data.filter(data["order_datetime"] >= F.lit('2021-03-01')) \
       .filter(data["order_datetime"] <= F.lit('2022-08-14'))

data = data.withColumn("Week", F.weekofyear("order_datetime"))\
        .withColumn("Year", F.year("order_datetime"))\
        .withColumn("Month", F.month("order_datetime"))\
        .withColumn("Day", F.dayofmonth("order_datetime"))\
        .withColumn("Fortnight", ((F.col("Week")+1)/2).cast('int'))

In [11]:
#q
k = data.filter(data.Year == 2022)
k1 = data.filter(data.Year == 2021)

In [12]:
k.describe()

                                                                                

summary,user_id,merchant_abn,dollar_value,order_id,Week,Year,Month,Day,Fortnight
count,5085140.0,5085140,5085140.0,5085140,5085140.0,5085140,5085140.0,5085140.0,5085140.0
mean,12040.017127945346,5.542595241702891...,166.40679403485234,,17.863753210334426,2022.0,4.533074802266998,15.334098766208994,9.177738076041171
stddev,6952.176608707951,2.532974727501411...,520.88212597645,,9.439420452042802,2.057561698811217...,2.125801416515505,8.763953318114469,4.706971035095588
min,1.0,10023283211,7.37989805475434e-07,0000019e-1405-433...,1.0,2022,1.0,1.0,1.0
max,24081.0,99990536339,64309.577211019314,fffffbd0-f7da-416...,52.0,2022,8.0,31.0,26.0


In [13]:
from pyspark.sql.functions import when
data = data.withColumn("Fortnight",when(data.Year == 2022,data.Fortnight + 26).otherwise(data.Fortnight))

In [14]:
k1.describe()

                                                                                

summary,user_id,merchant_abn,dollar_value,order_id,Week,Year,Month,Day,Fortnight
count,7071944.0,7071944,7071944.0,7071944,7071944.0,7071944,7071944.0,7071944.0,7071944.0
mean,12039.402176685788,5.54225578183798E10,166.3128582084755,,32.679303314619006,2021.0,8.020937665795996,16.160540864011367,16.593286937792495
stddev,6951.189736707225,2.532590473135230...,520.2699083815513,,12.734465176550822,2.048415098073651...,2.897202636559719,8.816615786919016,6.36805129027306
min,1.0,10023283211,9.756658099412162e-08,000000d4-d252-4e2...,9.0,2021,3.0,1.0,5.0
max,24081.0,99990536339,105193.88578925544,fffffca5-42ab-49d...,52.0,2021,12.0,31.0,26.0


In [15]:
consumer_data

name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975
Karen Chapman,2706 Stewart Oval...,NSW,2033,Female,407340
Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
Stephen Williams,6804 Wright Crest...,WA,6056,Male,448088
Stephanie Reyes,5813 Denise Land ...,NSW,2482,Female,650435
Jillian Gonzales,461 Ryan Common S...,VIC,3220,Female,1058499


In [16]:
join_df = data.join(consumer_lookup,["user_id"])
join_df = join_df.join(consumer_data, ["consumer_id"])

In [17]:
join_df.select("merchant_abn","order_id","Week","Year","Month","Day","Fortnight","state","postcode")

                                                                                

merchant_abn,order_id,Week,Year,Month,Day,Fortnight,state,postcode
68216911708,c921263e-489e-45a...,33,2021,8,21,17,NSW,2021
77698107389,aaea14f5-48e8-4ef...,33,2021,8,21,17,NSW,2021
46451548968,7e8e7fca-04a1-4f2...,33,2021,8,19,17,NSW,2021
35556933338,13b0be6a-20b7-439...,33,2021,8,22,17,NSW,2021
91772719127,3750984f-4313-47d...,32,2021,8,14,16,NSW,2021
29216160692,f5917c53-caca-4b6...,32,2021,8,15,16,NSW,2021
32234779638,b9423d5c-9083-4e6...,28,2021,7,15,14,NSW,2021
68216911708,94fbe85c-434a-4e6...,28,2021,7,15,14,NSW,2021
75034515922,12e4b9c7-1738-481...,28,2021,7,15,14,NSW,2021
24852446429,8e65c8ee-67f1-411...,33,2021,8,16,17,NSW,2021


In [18]:
join_df.describe()

                                                                                

summary,consumer_id,user_id,merchant_abn,dollar_value,order_id,Week,Year,Month,Day,Fortnight,name,address,state,postcode,gender
count,12157084.0,12157084.0,12157084,12157084.0,12157084,12157084.0,12157084.0,12157084.0,12157084.0,12157084.0,12157084,12157084,12157084,12157084.0,12157084
mean,753256.278102216,12039.659402287589,5.542397773206698...,166.35215026470863,,26.48216373268458,2021.4182861613851,6.562012897171723,15.814851571314303,24.366905665865268,,,,4043.303962858199,
stddev,435525.6477490294,6951.602269350454,2.532751109873882E10,520.5260587786956,,13.602001831795452,0.4932776789940705,3.1197910160833517,8.804069186278074,10.81186243133092,,,,1780.782568491656,
min,30.0,1.0,10023283211,9.756658099412162e-08,000000d4-d252-4e2...,1.0,2021.0,1.0,1.0,5.0,Aaron Adkins,000 Barnes Missio...,ACT,200.0,Female
max,1499867.0,24081.0,99990536339,105193.88578925544,fffffca5-42ab-49d...,52.0,2022.0,12.0,31.0,52.0,Zoe Randall,99994 Krystal Views,WA,9999.0,Undisclosed


In [19]:
# Since some postcodes are not included in the ABS data, we decide to remove them 
abs_df = spark.read.csv("../data/curated/abs_cleaned.csv", header = 'True')

In [20]:
abs_postcode = abs_df.select('_c0').rdd.map(lambda x : x[0]).collect()

22/09/29 12:55:05 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///Users/liyujie/Documents/GitHub/generic-buy-now-pay-later-project-group-8/data/curated/abs_cleaned.csv


In [21]:
len(abs_postcode)

2653

In [22]:
# join_df.createOrReplaceTempView("joindf")
# spark.sql(f"""SELECT * FROM joindf 
# WHERE postcode IN abs_postcode""").show()

from pyspark.sql.functions import col
new_join = join_df.where((join_df.postcode).isin(abs_postcode))

In [23]:
new_join.describe()

                                                                                

summary,consumer_id,user_id,merchant_abn,dollar_value,order_id,Week,Year,Month,Day,Fortnight,name,address,state,postcode,gender
count,10195679.0,10195679.0,10195679,10195679.0,10195679,10195679.0,10195679.0,10195679.0,10195679.0,10195679.0,10195679,10195679,10195679,10195679.0,10195679
mean,753000.6175025714,12047.437289463507,5.541979538973208E10,166.4153065535396,,26.48272949746653,2021.418212558477,6.562021715277619,15.815977827469853,24.36525335880033,,,,4180.279968504305,
stddev,435169.9042534198,6950.053310956647,2.532609365166996...,520.7606355108969,,13.60169162878944,0.4932654845752156,3.119715775658305,8.804177163653394,10.811762190807832,,,,1521.0228576143002,
min,30.0,2.0,10023283211,9.756658099412162e-08,000000d4-d252-4e2...,1.0,2021.0,1.0,1.0,5.0,Aaron Adkins,000 Barnes Missio...,ACT,800.0,Female
max,1499861.0,24081.0,99990536339,105193.88578925544,fffffca5-42ab-49d...,52.0,2022.0,12.0,31.0,52.0,Zoe Randall,9999 Brenda Hills,WA,7470.0,Undisclosed


# Groupby forenightly and store the forenighlty order postcodes

In [24]:
data_by_fornight = new_join.groupBy("Fortnight","merchant_abn","postcode")\
    .agg(F.count("order_id").alias("count_postcode_merchant"))

In [25]:
data_by_fornight

                                                                                

Fortnight,merchant_abn,postcode,count_postcode_merchant
17,26925196872,4361,1
17,52959528548,6327,1
17,24852446429,4571,1
17,17324645993,5434,2
17,84088835754,4384,1
17,91848160033,3005,1
17,53003062892,4350,1
17,68289707002,3669,1
17,54291122944,4480,1
17,81761494572,5011,1


In [26]:
totalorder_perfor_merchant = new_join.groupBy("Fortnight","merchant_abn")\
    .agg(F.count("order_id").alias("total_order_per_fornight"))

In [27]:
totalorder_perfor_merchant

                                                                                

Fortnight,merchant_abn,total_order_per_fornight
17,47086412084,1055
17,27093785141,480
17,93558142492,425
17,26169574842,185
17,41170595493,131
17,63344521351,48
17,79830510987,51
17,12426913844,32
17,31003199153,4
17,47856542678,166


In [28]:
# data_by_fornight.withColumn("postcode_dict",F.to_json(F.struct("postcode", "count_postcode_merchant"))).show(truncate=False)
cond = ["merchant_abn","Fortnight"]
data_by_fornight = data_by_fornight.join(totalorder_perfor_merchant, cond, "inner")

In [29]:
data_by_fornight = data_by_fornight.withColumn("proportion_of_postcode", (data_by_fornight.count_postcode_merchant/data_by_fornight.total_order_per_fornight))

In [30]:
data_by_fornight.toPandas().to_csv('../data/curated/Grouping.csv', index = False)

                                                                                

In [31]:
data_by_fornight.describe()

                                                                                

summary,merchant_abn,Fortnight,postcode,count_postcode_merchant,total_order_per_fornight,proportion_of_postcode
count,8507906.0,8507906.0,8507906.0,8507906.0,8507906.0,8507906
mean,55059275743.63144,24.276046655898643,4176.667720588356,1.1983770154489248,815.7423222588496,0.01678109748745215
stddev,25691212144.3034,10.866091915184024,1520.8652282605633,0.6109136225974535,1325.4054174564658,0.061674211931999706
min,10023283211.0,5.0,800.0,1.0,1.0,1.028171910343409...
max,99990536339.0,52.0,7470.0,23.0,9726.0,1.0
