# Grouping transactions by postcode for each merchant (persona), by fortnight
## PySpark setup

In [1]:
# import libraries
from gettext import npgettext
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np
from statistics import mean, stdev
from pyspark.sql.functions import when
from pyspark.sql.functions import col
import json

In [2]:
# Create (or reuse) the Spark session for this notebook.
_spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "15g",
}

_builder = SparkSession.builder.appName("aggregate data for first 3 final model variables")
for _option, _setting in _spark_settings.items():
    # Apply each setting to the builder before the session is created.
    _builder = _builder.config(_option, _setting)

spark = _builder.getOrCreate()

22/10/06 11:40:31 WARN Utils: Your hostname, Lis-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.13.47.249 instead (on interface en0)
22/10/06 11:40:31 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/06 11:40:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/06 11:40:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/06 11:40:33 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/10/06 11:40:33 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


## Read in data

In [3]:
# Read the three transaction snapshot directories, one DataFrame per snapshot.
data1, data2, data3 = [
    spark.read.parquet(snapshot_dir)
    for snapshot_dir in (
        "../data/tables/transactions_20210228_20210827_snapshot/",
        "../data/tables/transactions_20210828_20220227_snapshot/",
        "../data/tables/transactions_20220228_20220828_snapshot/",
    )
]

                                                                                

In [4]:
# consumer_lookup is joined to the transactions on "user_id" further down;
# consumer_data is then joined on "consumer_id".
consumer_lookup = spark.read.parquet("../data/tables/consumer_user_details.parquet")

# tbl_consumer.csv is pipe-delimited with a header row; column types are inferred.
consumer_data = (
    spark.read
    .options(header=True, inferSchema=True, delimiter='|')
    .csv("../data/tables/tbl_consumer.csv")
)

### Concatenate all transaction parquet snapshots

In [5]:
# Stack the three snapshots into a single DataFrame.
# NOTE: DataFrame.union matches columns by position, so this assumes the
# three snapshots share the same column order.
data = data1.union(data2).union(data3)

### Choosing transactions from March 2021 to August 14th 2022 only

In [6]:
# Keep orders dated 2021-03-01 through 2022-08-14 (both bounds inclusive).
data = data.filter(
    F.col("order_datetime").between(F.lit('2021-03-01'), F.lit('2022-08-14'))
)

# Derive calendar columns from the order date. "Fortnight" pairs consecutive
# week numbers: weeks 1-2 -> 1, weeks 3-4 -> 2, and so on.
data = (
    data
    .withColumn("Week", F.weekofyear("order_datetime"))
    .withColumn("Year", F.year("order_datetime"))
    .withColumn("Month", F.month("order_datetime"))
    .withColumn("Day", F.dayofmonth("order_datetime"))
    .withColumn("Fortnight", ((F.col("Week") + 1) / 2).cast('int'))
)

## Separating fortnight records between 2021 and 2022 for each transaction

In [7]:
# Split the transactions by calendar year.
# NOTE(review): k and k1 are not referenced by any later cell visible in this notebook.
k = data.where(F.col("Year") == 2022)   # orders placed in 2022
k1 = data.where(F.col("Year") == 2021)  # orders placed in 2021

In [8]:
# Make "Fortnight" run continuously across both years: 2021 keeps fortnights
# 1..26 and 2022 continues from 27 (i.e. the 2022 value is offset by 26).
#
# Two details matter here:
#  * The base fortnight is recomputed from "Week" instead of being read from the
#    existing "Fortnight" column, so re-running this cell does not add 26 again.
#  * weekofyear() follows ISO-8601, so 2022-01-01 and 2022-01-02 have Week == 52
#    (they belong to the last ISO week of 2021) while Year == 2022. Offsetting
#    those rows would put them in a spurious fortnight 52, so January dates with
#    Week >= 52 keep the un-offset value and stay in the last fortnight of 2021.
_base_fortnight = ((F.col("Week") + 1) / 2).cast("int")
_in_prev_iso_year = (F.col("Month") == 1) & (F.col("Week") >= 52)

data = data.withColumn(
    "Fortnight",
    when((F.col("Year") == 2022) & ~_in_prev_iso_year, _base_fortnight + 26)
    .otherwise(_base_fortnight),
)

## Join transactions and consumer data for matching with persona data in later notebook

In [9]:
# Attach consumer details to each transaction:
# transactions -> consumer_lookup on "user_id", then -> consumer_data on "consumer_id".
join_df = (
    data
    .join(consumer_lookup, on=["user_id"])
    .join(consumer_data, on=["consumer_id"])
)

In [10]:
# Preview the columns that are used in the rest of this notebook.
join_df.select(
    [
        "merchant_abn",
        "order_id",
        "Week",
        "Year",
        "Month",
        "Day",
        "Fortnight",
        "state",
        "postcode",
    ]
)

                                                                                

merchant_abn,order_id,Week,Year,Month,Day,Fortnight,state,postcode
68216911708,c921263e-489e-45a...,33,2021,8,21,17,NSW,2021
77698107389,aaea14f5-48e8-4ef...,33,2021,8,21,17,NSW,2021
46451548968,7e8e7fca-04a1-4f2...,33,2021,8,19,17,NSW,2021
35556933338,13b0be6a-20b7-439...,33,2021,8,22,17,NSW,2021
91772719127,3750984f-4313-47d...,32,2021,8,14,16,NSW,2021
29216160692,f5917c53-caca-4b6...,32,2021,8,15,16,NSW,2021
32234779638,b9423d5c-9083-4e6...,28,2021,7,15,14,NSW,2021
68216911708,94fbe85c-434a-4e6...,28,2021,7,15,14,NSW,2021
75034515922,12e4b9c7-1738-481...,28,2021,7,15,14,NSW,2021
24852446429,8e65c8ee-67f1-411...,33,2021,8,16,17,NSW,2021


### Some postcodes are not included in the ABS data, so we remove the transactions with those postcodes
#### Find postcodes listed in ABS data

In [11]:
# Load the cleaned ABS table; its postcodes decide which transactions are kept.
abs_df = spark.read.option("header", 'True').csv("../data/curated/abs_cleaned.csv")

In [12]:
# Collect the first column of the ABS table ("_c0") into a Python list on the driver.
# NOTE(review): the CSV is read without inferSchema, so these values are strings —
# confirm they compare as intended against the transaction "postcode" column.
abs_postcode = [row[0] for row in abs_df.select('_c0').collect()]

22/10/06 11:40:55 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///Users/liyujie/Documents/GitHub/generic-buy-now-pay-later-project-group-8/data/curated/abs_cleaned.csv


#### Keep records in the joined transaction + consumer table only if their postcode is in `abs_postcode`

In [13]:
# Keep only the transactions whose postcode appears in the ABS postcode list.
new_join = join_df.filter(join_df["postcode"].isin(abs_postcode))

## Groupby merchant abn and postcode
### For each merchant, find number of orders for each postcode and total orders

In [14]:
# Number of orders per (fortnight, merchant, postcode).
data_by_fornight = (
    new_join
    .groupBy(["Fortnight", "merchant_abn", "postcode"])
    .agg(F.count(F.col("order_id")).alias("count_postcode_merchant"))
)

In [15]:
# Total number of orders per (fortnight, merchant), across all postcodes.
totalorder_perfor_merchant = (
    new_join
    .groupBy(["Fortnight", "merchant_abn"])
    .agg(F.count(F.col("order_id")).alias("total_order_per_fornight"))
)

### Create a lookup table for every postcode of all merchant
### Join table with number of orders for each postcode and table with total orders for each merchant


In [16]:
# Attach each merchant's fortnightly order total to its per-postcode counts.
# NOTE: this cell overwrites data_by_fornight with the joined result, so running it
# a second time without re-running the groupBy cell joins the totals in again.
cond = ["merchant_abn", "Fortnight"]
data_by_fornight = data_by_fornight.join(
    totalorder_perfor_merchant,
    on=cond,
    how="inner",
)

### For every merchant, calculate the proportion of each postcode over its overall orders

In [17]:
# Share of a merchant's fortnightly orders that came from each postcode.
data_by_fornight = data_by_fornight.withColumn(
    "proportion_of_postcode",
    F.col("count_postcode_merchant") / F.col("total_order_per_fornight"),
)

In [18]:
# Bring the aggregated result to the driver as a pandas DataFrame and write it
# out as a CSV file without the pandas index column.
_grouping_pdf = data_by_fornight.toPandas()
_grouping_pdf.to_csv('../data/curated/Grouping.csv', index=False)

                                                                                