# Understand data
# Objective: Conduct a general inspection on the released datasets

In [26]:
# importing the require libraries to run the code in the this notebook
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import re
import pandas as pd

In [27]:
# running spark
spark = (
    SparkSession.builder.appName("preprocessing of taxi data")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)


In [28]:
# reading in the require dataset
merchant_sdf = spark.read.parquet("../data/tables/tbl_merchants.parquet")
consumer_detail_sdf = spark.read.parquet("../data/tables/consumer_user_details.parquet")
consumer_sdf = spark.read.option("header",True) \
                         .option("inferSchema",True) \
                         .options(delimiter='|') \
                         .csv("../data/tables/tbl_consumer.csv")
transaction_28_sdf = spark.read.parquet("../data/tables/transactions_20210228_20210827_snapshot")
transaction_82_sdf = spark.read.parquet("../data/tables/transactions_20210828_20220227_snapshot/")

# Transaction dataset
- should be our main focus in terms on analysing the number of transactions related to each customer and each merchant
- could be joined to consumer_detail by using user_id and consumer_id with the help of tbl_consumer
- join merchant on merchant abn (should be unique by personal knowledge)

In [29]:
transaction_28_sdf.orderBy(F.column("order_datetime"))

user_id,merchant_abn,dollar_value,order_id,order_datetime
11,20692490685,196.93916081228323,09bc8dd6-419f-4cb...,2021-02-28
5,83177825742,66.66426160206629,43e3b3fe-791b-47f...,2021-02-28
18495,66667026714,48.3228941934147,ca150cbf-6e34-489...,2021-02-28
18489,43186523025,98.14878546968934,9008a98e-1b02-4de...,2021-02-28
18491,64974914166,130.12601873970038,4bc15338-83eb-43d...,2021-02-28
18493,67979471799,53.03432496202065,dd61feff-5aa3-43d...,2021-02-28
11,98269572896,129.46280909485031,8f5d0cab-8055-435...,2021-02-28
18485,62191208634,79.13140006851712,9e18b913-0465-4fd...,2021-02-28
2,80779820715,48.12397733548124,cd09bdd6-f56d-489...,2021-02-28
18490,93558142492,232.83335268750145,2bda0665-796f-4f2...,2021-02-28


# Merchant Dataset

In [30]:
# quick look ar a random row
merchant_sdf.where(F.col("merchant_abn") == "20692490685")

name,tags,merchant_abn
Aliquet Molestie ...,"[[stationery, off...",20692490685


In [31]:
merchant_sdf.select(F.col("tags")).take(1)

[Row(tags='((furniture, home furnishings and equipment shops, and manufacturers, except appliances), (e), (take rate: 0.18))')]

# Consumer Key (conversion table)

In [32]:
consumer_detail_sdf

user_id,consumer_id
1,1195503
2,179208
3,1194530
4,154128
5,712975
6,407340
7,511685
8,448088
9,650435
10,1058499


# Consumer Dataset

In [33]:
consumer_sdf

name,address,state,postcode,gender,consumer_id
Yolanda Williams,413 Haney Gardens...,WA,6935,Female,1195503
Mary Smith,3764 Amber Oval,NSW,2782,Female,179208
Jill Jones MD,40693 Henry Greens,NT,862,Female,1194530
Lindsay Jimenez,00653 Davenport C...,NSW,2780,Female,154128
Rebecca Blanchard,9271 Michael Mano...,WA,6355,Female,712975
Karen Chapman,2706 Stewart Oval...,NSW,2033,Female,407340
Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
Stephen Williams,6804 Wright Crest...,WA,6056,Male,448088
Stephanie Reyes,5813 Denise Land ...,NSW,2482,Female,650435
Jillian Gonzales,461 Ryan Common S...,VIC,3220,Female,1058499


In [34]:
consumer_detail_sdf.printSchema()

root
 |-- user_id: long (nullable = true)
 |-- consumer_id: long (nullable = true)



# Check for null values in datasets (except merchant)

In [35]:
transaction_82_sdf.select(F.col("user_id")).count()

4508106

In [36]:
consumer_missing = consumer_sdf.select([F.count(F.when(F.col(c).contains('None') | \
                                                       F.col(c).contains('NULL') | \
                                                      (F.col(c) == '' ) | \
                                                       F.col(c).isNull() | \
                                                       F.isnan(c), c 
                                                      )).alias(c)
                                                       for c in consumer_sdf.columns])
consumer_missing.show()




+----+-------+-----+--------+------+-----------+
|name|address|state|postcode|gender|consumer_id|
+----+-------+-----+--------+------+-----------+
|   0|      0|    0|       0|     0|          0|
+----+-------+-----+--------+------+-----------+



                                                                                

In [37]:
consumer_detail_missing = consumer_detail_sdf.select([F.count(F.when(F.col(c).contains('None') | \
                                                       F.col(c).contains('NULL') | \
                                                      (F.col(c) == '' ) | \
                                                       F.col(c).isNull() | \
                                                       F.isnan(c), c 
                                                      )).alias(c)
                                                       for c in consumer_detail_sdf.columns])
consumer_detail_missing.show()

+-------+-----------+
|user_id|consumer_id|
+-------+-----------+
|      0|          0|
+-------+-----------+



In [38]:
transact_col = ['user_id', 'merchant_abn', 'dollar_value', 'order_id']
transaction_missing = transaction_28_sdf.select([F.count(F.when(F.col(c).contains('None') | \
                                                       F.col(c).contains('NULL') | \
                                                      (F.col(c) == '' ) | \
                                                       F.col(c).isNull() | \
                                                       F.isnan(c), c 
                                                      )).alias(c)
                                                       for c in transact_col])
transaction_missing.show()

[Stage 73:=====>                                                   (1 + 9) / 10]

+-------+------------+------------+--------+
|user_id|merchant_abn|dollar_value|order_id|
+-------+------------+------------+--------+
|      0|           0|           0|       0|
+-------+------------+------------+--------+



                                                                                

In [39]:
transact_col = ['user_id', 'merchant_abn', 'dollar_value', 'order_id']
transaction_missing = transaction_82_sdf.select([F.count(F.when(F.col(c).contains('None') | \
                                                       F.col(c).contains('NULL') | \
                                                      (F.col(c) == '' ) | \
                                                       F.col(c).isNull() | \
                                                       F.isnan(c), c 
                                                      )).alias(c)
                                                       for c in transact_col])
transaction_missing.show()



+-------+------------+------------+--------+
|user_id|merchant_abn|dollar_value|order_id|
+-------+------------+------------+--------+
|      0|           0|           0|       0|
+-------+------------+------------+--------+



                                                                                

# Fixed tags in Merchant
The tags in merchant are all grouped in the same column lets, split it to each respective columns

In [40]:
# regex to split the tags
split_col = F.split(merchant_sdf['tags'], '\]|\)', 3)
merchant_sdf = merchant_sdf.withColumn('prod_desc', split_col.getItem(0))
merchant_sdf = merchant_sdf.withColumn('revenue_level', split_col.getItem(1))
merchant_sdf = merchant_sdf.withColumn('take_rate', split_col.getItem(2))

In [41]:
merchant_sdf

name,tags,merchant_abn,prod_desc,revenue_level,take_rate
Felis Limited,"((furniture, home...",10023283211,"((furniture, home...",", (e",", (take rate: 0.18))"
Arcu Ac Orci Corp...,"([cable, satellit...",10142254217,"([cable, satellit...",", [b",", [take rate: 4.22])"
Nunc Sed Company,"([jewelry, watch,...",10165489824,"([jewelry, watch,...",", [b",", [take rate: 4.40])"
Ultricies Digniss...,"([wAtch, clock, a...",10187291046,"([wAtch, clock, a...",", [b",", [take rate: 3.29])"
Enim Condimentum PC,([music shops - m...,10192359162,([music shops - m...,", [a",", [take rate: 6.33])"
Fusce Company,"[(gift, card, nov...",10206519221,"[(gift, card, nov...",", (a",", (take rate: 6.34)]"
Aliquam Enim Inco...,"[(computers, comP...",10255988167,"[(computers, comP...",", (b",", (take rate: 4.32)]"
Ipsum Primis Ltd,"[[watch, clock, a...",10264435225,"[[watch, clock, a...",", [c",", [take rate: 2.39]]"
Pede Ultrices Ind...,([computer progra...,10279061213,([computer progra...,", [a",", [take rate: 5.71])"
Nunc Inc.,"[(furniture, home...",10323485998,"[(furniture, home...",", (a",", (take rate: 6.61)]"


In [42]:
merchant_sdf.select(F.col("take_rate")).take(5)

[Row(take_rate=', (take rate: 0.18))'),
 Row(take_rate=', [take rate: 4.22])'),
 Row(take_rate=', [take rate: 4.40])'),
 Row(take_rate=', [take rate: 3.29])'),
 Row(take_rate=', [take rate: 6.33])')]

In [50]:
@F.udf(returnType=StringType())
def clean_str(str):
    """ cleans string after extractions from TAGS columns, 
        remove brakcets.

    Args:
        str (str): 

    Returns:
        str: lowercase, with no leading or trailing parenthesis 
    """
    str = str.lstrip(' ,([')
    str = str.rstrip(')]')
    str = str.lower()
    return str

In [52]:
# cleaning the tags
merchant_sdf = merchant_sdf.withColumn('prod_desc', clean_str(F.col('prod_desc')))
merchant_sdf = merchant_sdf.withColumn('take_rate', clean_str(F.col('take_rate')))
merchant_sdf = merchant_sdf.withColumn('revenue_level', clean_str(F.col('revenue_level')))
merchant_sdf = merchant_sdf.withColumn('take_rate', F.regexp_extract('take_rate', '\\d*\\.\\d', 0))
merchant_sdf = merchant_sdf.withColumn('take_rate', F.col('take_rate').cast('double'))
merchant_sdf = merchant_sdf.drop("tags")

In [46]:
merchant_sdf

name,merchant_abn,prod_desc,revenue_level,take_rate
Felis Limited,10023283211,"furniture, home f...",e,0.1
Arcu Ac Orci Corp...,10142254217,"cable, satellite,...",b,4.2
Nunc Sed Company,10165489824,"jewelry, watch, c...",b,4.4
Ultricies Digniss...,10187291046,"watch, clock, and...",b,3.2
Enim Condimentum PC,10192359162,music shops - mus...,a,6.3
Fusce Company,10206519221,"gift, card, novel...",a,6.3
Aliquam Enim Inco...,10255988167,"computers, comput...",b,4.3
Ipsum Primis Ltd,10264435225,"watch, clock, and...",c,2.3
Pede Ultrices Ind...,10279061213,computer programm...,a,5.7
Nunc Inc.,10323485998,"furniture, home f...",a,6.6


## Check for missing

In [49]:
merchant_missing = merchant_sdf.select([F.count(F.when(F.col(c).contains('None') | \
                                                       F.col(c).contains('NULL') | \
                                                      (F.col(c) == '' ) | \
                                                       F.col(c).isNull() | \
                                                       F.isnan(c), c 
                                                      )).alias(c)
                                                       for c in merchant_sdf.columns])
merchant_missing.show()

+----+------------+---------+-------------+---------+
|name|merchant_abn|prod_desc|revenue_level|take_rate|
+----+------------+---------+-------------+---------+
|   0|           0|        0|            0|        0|
+----+------------+---------+-------------+---------+

