# Data preparation for clustering method
# Objective: Generate features for a dataset used in take rate imputation through clustering

In [1]:
# import the libraries required to run the code in this notebook
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import DataFrameStatFunctions as statFunc

In [2]:
# create the clusters directories (input and output) if they do not exist
import os

dirs_to_create = ['../data/curated/clusters', '../data/curated/clusters/input', '../data/curated/clusters/output']

def create_dirs(dirs_to_create):
    """Create every directory in ``dirs_to_create``, including missing parents.

    Directories that already exist are left untouched, so the function can be
    re-run safely (e.g. on Restart & Run All).

    Args:
        dirs_to_create: iterable of directory paths (str or path-like).

    Raises:
        FileExistsError: if one of the paths exists but is not a directory.
    """
    for dir_to_create in dirs_to_create:
        # exist_ok=True replaces the separate os.path.exists() check and avoids
        # the race between checking for the directory and creating it.
        os.makedirs(dir_to_create, exist_ok=True)

create_dirs(dirs_to_create)

In [3]:
# Start (or reuse) the Spark session used by every cell below.
# The app name is what this job is listed under in the Spark UI and logs, so it
# describes this notebook rather than the unrelated "taxi data" leftover.
spark = (
    SparkSession.builder.appName("data preparation for clustering")
    # eager evaluation lets a bare DataFrame expression at the end of a cell render as a table
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # pin the session time zone to UTC instead of relying on the host machine's setting
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

22/10/12 17:08:12 WARN Utils: Your hostname, modaxuexiweiyuanzhangde-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.13.81.59 instead (on interface en0)
22/10/12 17:08:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/12 17:08:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/12 17:08:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Load the curated transactions (the output of the fraud step) and preview them.
transactions_path = "../data/curated/fraud/output/transactions_withoutfraud"
transaction_sdf = spark.read.parquet(transactions_path)
transaction_sdf

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime,merchant_name,prod_desc,revenue_level,take_rate,consumer_name,address,state,postcode,gender,consumer_id
2,38971488186,148.60906137492083,2826603b-9674-451...,2021-03-04,Primis In LLP,"books, periodical...",b,3.6,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,40515399744,1447.0521429027654,5e081812-fb3d-49c...,2021-03-13,,,,,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,68216911708,11.868321758401615,8e380a03-98ed-401...,2021-03-30,Placerat Eget Ven...,"computers, comput...",c,3.0,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,96161947306,72.35346983449212,57f9abcf-9a4b-453...,2021-04-08,Imperdiet Dictum ...,shoe shops,b,4.5,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,11566786699,3.2469512509425007,8b5c4f65-d366-465...,2021-04-09,Euismod Et Institute,"opticians, optica...",c,2.6,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,96880556465,107.29786246813728,1bb65738-3e4b-4af...,2021-04-15,At Lacus Quisque ...,"gift, card, novel...",a,5.7,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,63071746690,42.958957505765824,ef2983bc-b71e-400...,2021-04-15,Mollis Vitae LLC,"opticians, optica...",a,6.9,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,11237511112,83.322997064485,6113ee2b-aa70-438...,2021-04-15,Magna Institute,"opticians, optica...",c,2.1,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,32844138686,96.91526773380824,51dbde7d-0918-4c1...,2021-05-13,At Augue Id Assoc...,computer programm...,c,1.8,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
2,41974958954,138.8682159362684,162cb687-ede6-4f4...,2021-06-24,Sed Libero Proin ...,"cable, satellite,...",a,5.5,Mary Smith,3764 Amber Oval,NSW,2782,Female,179208


## Perform monthly aggregation to obtain monthly number of orders, mean revenue, and standard deviation of revenue

In [5]:
# Step 1: collapse transactions to one row per merchant per calendar month,
# holding that month's order count and total dollar value.
# A month in which a merchant has no transactions produces no row here.
orders_per_month = (
    transaction_sdf
    .groupBy("merchant_abn", F.year("order_datetime"), F.month("order_datetime"))
    .agg(
        F.count("order_id").alias("number_of_order"),
        F.sum("dollar_value").alias("total_dollar_value"),
    )
)

# Step 2: summarise the monthly rows into one row of features per merchant.
monthly_agg = (
    orders_per_month
    .groupBy("merchant_abn")
    .agg(
        F.mean("number_of_order").alias("monthly_average_number_of_orders"),
        F.mean("total_dollar_value").alias("monthly_average_bnpl_revenue"),
        F.stddev("total_dollar_value").alias("stddev_of_monthly_average_bnpl_revenue"),
    )
)

Get the aggregated transaction dataset with engineered features. This dataset contains the merchants with missing information that require imputation.

In [6]:
# Unknown merchants are identified here by a null merchant_name.
# For each of them, engineer per-merchant features: number of distinct customers
# and the standard deviation, mean and approximate median of dollar value.
# Then left-join the monthly features so every unknown merchant keeps its row.
#
# The Spark DataFrame is built first and exported in a separate statement:
# chaining .toPandas().to_csv(path) onto the assignment would bind the variable
# to None, because to_csv returns None when it is given a path.
#
# NOTE(review): the "stddev_of_dorllar_value" spelling is kept as-is because the
# column name is written to the CSV; confirm downstream readers before renaming.
agg_transaction_pred_sdf = (
    transaction_sdf
    .filter(F.col("merchant_name").isNull())
    .groupBy("merchant_abn")
    .agg(
        F.countDistinct("consumer_id").alias("total_number_of_distinct_customers"),
        F.stddev("dollar_value").alias("stddev_of_dorllar_value"),
        F.mean("dollar_value").alias("average_dollar_value"),
        F.percentile_approx("dollar_value", 0.5).alias("median_dollar_value"),
    )
    .join(monthly_agg, ["merchant_abn"], "left")
)

# Export the prediction (imputation) set for the clustering step.
agg_transaction_pred_sdf.toPandas().to_csv(
    "../data/curated/clusters/input/agg_transaction_pred.csv", index=False
)

                                                                                

Get the aggregated transaction dataset with engineered features. This dataset contains the known merchants, whose information is present, and is used to create the clusters.

In [7]:
# Known merchants are identified here by a non-null merchant_name.
# For each of them, engineer per-merchant features: number of distinct customers,
# the standard deviation, mean and approximate median of dollar value, and the
# mean take_rate over the merchant's transactions.
# Then left-join the monthly features so every known merchant keeps its row.
#
# The Spark DataFrame is built first and exported in a separate statement:
# chaining .toPandas().to_csv(path) onto the assignment would bind the variable
# to None, because to_csv returns None when it is given a path.
#
# NOTE(review): the "stddev_of_dorllar_value" spelling is kept as-is because the
# column name is written to the CSV; confirm downstream readers before renaming.
agg_transaction_train_sdf = (
    transaction_sdf
    .filter(F.col("merchant_name").isNotNull())
    .groupBy("merchant_abn")
    .agg(
        F.countDistinct("consumer_id").alias("total_number_of_distinct_customers"),
        F.stddev("dollar_value").alias("stddev_of_dorllar_value"),
        F.mean("dollar_value").alias("average_dollar_value"),
        F.percentile_approx("dollar_value", 0.5).alias("median_dollar_value"),
        F.mean("take_rate").alias("take_rate"),
    )
    .join(monthly_agg, ["merchant_abn"], "left")
)

# Export the training set used to build the clusters.
agg_transaction_train_sdf.toPandas().to_csv(
    "../data/curated/clusters/input/agg_transaction_train.csv", index=False
)

                                                                                