# Grouping transactions by postcodes for each merchant (persona)

## PySpark setup

In [1]:
# import libraries
from gettext import npgettext
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np
from statistics import mean, stdev
from pyspark.sql.functions import col
import json

In [2]:
# Create (or reuse) the SparkSession that every later cell reads data through.
spark = (
    SparkSession.builder
    .appName("aggregate data for first 3 final model variables")
    # Render DataFrames eagerly when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Interpret and display timestamps in UTC regardless of the host's locale.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Driver heap size; results are later collected to the driver via toPandas().
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

22/10/05 16:52:59 WARN Utils: Your hostname, Lis-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.13.47.249 instead (on interface en0)
22/10/05 16:52:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 16:52:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/05 16:53:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/05 16:53:00 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Read in data

In [3]:
# Load the three transaction snapshots; each directory is read as a parquet dataset.
data1 = spark.read.format("parquet").load(
    "../data/tables/transactions_20210228_20210827_snapshot/"
)
data2 = spark.read.format("parquet").load(
    "../data/tables/transactions_20210828_20220227_snapshot/"
)
data3 = spark.read.format("parquet").load(
    "../data/tables/transactions_20220228_20220828_snapshot/"
)

                                                                                

In [4]:
# Table joined to transactions on user_id and to consumer_data on consumer_id below.
consumer_lookup = spark.read.parquet("../data/tables/consumer_user_details.parquet")

# Pipe-delimited consumer table; column types are inferred from the data.
consumer_data = (
    spark.read
    .options(header=True, inferSchema=True, delimiter='|')
    .csv("../data/tables/tbl_consumer.csv")
)

### Concatenate all transaction record parquet files together

In [5]:
# Stack the three snapshots into a single transactions DataFrame.
# NOTE(review): union() matches columns by position, not by name — this assumes
# all three snapshots share the same column order; confirm.
data = data1.union(data2).union(data3)

### Choosing transactions from March 2021 to August 14th 2022 only

In [6]:
# Keep only orders dated 2021-03-01 through 2022-08-14 (both bounds inclusive).
# NOTE(review): if order_datetime is a timestamp rather than a date, the upper
# bound only admits midnight of 2022-08-14 — confirm the column type.
data = data.where(
    data["order_datetime"].between(F.lit('2021-03-01'), F.lit('2022-08-14'))
)


## Join transactions and consumer data for matching with persona data in later notebook

In [7]:
# Attach consumer attributes to every transaction:
# transactions -> (user_id) -> consumer_lookup -> (consumer_id) -> consumer_data.
# Both joins use the default join type (inner), so unmatched rows are dropped.
join_df = (
    data
    .join(consumer_lookup, on=["user_id"])
    .join(consumer_data, on=["consumer_id"])
)

### Since some postcodes are not included in the ABS data, we remove the transactions from those postcodes
#### Find postcodes listed in ABS data

In [8]:
# Load the cleaned ABS table, treating the first line of the file as the header.
abs_df = (
    spark.read
    .option("header", 'True')
    .csv("../data/curated/abs_cleaned.csv")
)

In [9]:
# Pull the ABS postcodes onto the driver as a plain Python list.
# The first CSV column has an empty header, so it is addressed as '_c0'.
abs_postcode = [row[0] for row in abs_df.select('_c0').collect()]

22/10/05 16:53:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///Users/liyujie/Documents/GitHub/generic-buy-now-pay-later-project-group-8/data/curated/abs_cleaned.csv


#### Keep records in the transaction + consumer table only if their postcode is in abs_postcode

In [10]:
# Drop every transaction whose consumer postcode is absent from the ABS list.
new_join = join_df.filter(join_df["postcode"].isin(abs_postcode))

## Group by merchant ABN and postcode
### For each merchant, find number of orders for each postcode and total orders

In [11]:
# Orders per (merchant, postcode) pair; count() ignores rows with a null order_id.
total_ords_postcode_merchant = (
    new_join
    .groupBy("merchant_abn", "postcode")
    .agg(F.count(F.col("order_id")).alias("total_order_postcode_merchant"))
)

In [12]:
# Orders per merchant across all postcodes; count() ignores rows with a null order_id.
total_ords_merchant = (
    new_join
    .groupBy("merchant_abn")
    .agg(F.count(F.col("order_id")).alias("total_order_merchant"))
)

### Create a lookup table of every postcode for every merchant
### Join table with number of orders for each postcode and table with total orders for each merchant


In [13]:
# Attach each merchant's overall order count to its per-postcode rows.
# Joining on a list of column names keeps a single merchant_abn column in the result.
cond = ["merchant_abn"]
data_final = total_ords_postcode_merchant.join(
    other=total_ords_merchant,
    on=cond,
    how="inner",
)

### For every merchant, calculate each postcode's share of the merchant's total orders

In [14]:
# Share of a merchant's orders that come from each postcode:
# orders for (merchant, postcode) divided by the merchant's total orders.
data_final1 = data_final.withColumn(
    "proportion_of_postcode",
    F.col("total_order_postcode_merchant") / F.col("total_order_merchant"),
)

In [15]:
# Collect the lookup table to the driver as a pandas DataFrame and write it
# to the curated folder, omitting the pandas index column.
(
    data_final1
    .toPandas()
    .to_csv('../data/curated/Grouping_bytotal.csv', index=False)
)

                                                                                