In [135]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [91]:
spark = (
    SparkSession.builder.appName("preprocessing of taxi data")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

#### **PlanA**: Cluster merchants
#### Attributes: average dollar amount, number of unique customers, total number of orders

#### **PlanB**: Classify merchants
#### Attributes: order history of customer who purchase product in the missing merchant, average dollar amount, number of unique customers, total number of orders
### **Classification pipeline**:
 1. Data Engineering
  * Prepare data into ready-to-use format
    * Create a new dataframe from the original dataframe which relates unknown merchants to order history of its customers who purchases from at least one known merchant stores.
    * Clean the prod_desc
    * Create a new dataframe which retains only the useful attributes for revenue level and take rate
  * Need one curated dataset for modeling product description and one dataset for modeling revenue level and take rate
 2. Feature Engineering
  * Aggregate data to produce more useful features for modeling revenue level and take rate
  * Recommended features for prod_desc: dollar value, consumer id/ user id and consumer gender
  * Recommended features for revenue level and take rate: average dollar amount per month/week/day, total number of orders per month/week/day and number of distinct customers per month/week/day
 3. Data Modeling
  * Discuss and finalize choice of model: XGBoost, MLP, NB(last resort)
  * Fitting and Tuning model to achieve optimal performance 
 4. Model Validation
 * Metrics:
    * Categorical(prod_desc and revenue_level): 
      * Accuracy
      * f1 score
    * Continuous(take_rate):
      * RMSE
 * Visualization:
    * Categorical:
      * learning curve
      * ROC curve
      * confusion matrix 
    * Continuous:
      * RMSE vs. fitted value
 5. Model deployment
   * Use the prediction to impute missing information

In [92]:
transaction_20210828_20220227_sdf = spark.read.parquet("../data/curated/transactions_20210828_20220227")
transaction_20210828_20220227_missings_sdf = spark.read.parquet("../data/curated/transactions_20210828_20220227_missing_counts")

In [123]:
num_of_unknown_merchants = transaction_20210828_20220227_sdf.where(F.col("merchant_name").isNull()) \
                            .select(F.col("merchant_abn")).distinct().count()
num_of_order_from_unknown_merchants = transaction_20210828_20220227_sdf.where(F.col("merchant_name").isNull()) \
                            .select(F.col("merchant_abn")).count()

print(f"num_of_unknown_merchants = {num_of_unknown_merchants}" + "\n" +
      f"num_of_order_from_unknown_merchants = {num_of_order_from_unknown_merchants}")

num_of_unknown_merchants = 378
num_of_order_from_unknown_merchants = 149228


### Checking if every missing merchant has at least one related known merchant, i.e. if there is at least one customer who buys from an unknown merchant also buy from other merchants

In [116]:
transaction_20210828_20220227_sdf.createOrReplaceTempView('transactions')

spark.sql("""
SELECT merchant_abn, user_id
FROM transactions
WHERE merchant_name IS NULL
""").createOrReplaceTempView('unknown_merchants')

spark.sql("""
SELECT *
FROM transactions
WHERE merchant_name IS NOT NULL
""").createOrReplaceTempView('orders_in_known_merchants')


joined_sdf = spark.sql("""
SELECT table1.merchant_abn AS unknown_merchant_abn, 
    COUNT(table1.merchant_abn) AS num_of_order_in_known_merchant,
    COUNT(DISTINCT table2.user_id) AS num_of_distinct_customers,
    COUNT(DISTINCT table2.merchant_abn) AS num_of_known_merchant,
    COUNT(DISTINCT table2.prod_desc) AS num_of_distinct_prod_desc,
    COUNT(DISTINCT table2.revenue_level) AS num_of_distinct_revenue_level
FROM unknown_merchants AS table1
LEFT JOIN orders_in_known_merchants AS table2 
ON table1.user_id=table2.user_id
GROUP BY table1.merchant_abn
""")

joined_sdf.limit(5)

unknown_merchant_abn,num_of_order_in_known_merchant,num_of_distinct_customers,num_of_known_merchant,num_of_distinct_prod_desc,num_of_distinct_revenue_level
24406529929,151201,1021,3233,119,5
56395390867,1287,9,587,45,5
28767881738,136,1,116,24,4
45925655949,3286,23,1000,67,5
87802246756,116776,778,3092,116,5
87921002735,17654,122,1981,89,5
92220967360,15037,103,1848,87,5
31507950402,2402,17,869,57,5
49369565194,137,1,121,29,4
18899854203,285,2,201,28,5


In [124]:
joined_sdf.filter(F.col("num_of_order_in_known_merchant") == 0)

unknown_merchant_abn,num_of_order_in_known_merchant,num_of_distinct_customers,num_of_known_merchant,num_of_distinct_prod_desc,num_of_distinct_revenue_level


In [128]:
# Need to clean the data
transaction_20210828_20220227_sdf.select(F.col("prod_desc")).distinct().orderBy(F.col("prod_desc")).take(5)

[Row(prod_desc=None),
 Row(prod_desc='antique shops -  sales, repairs, and restoration services'),
 Row(prod_desc='antique shops - sales,  repairs, and restoration services'),
 Row(prod_desc='antique shops - sales, repairs, and restoration services'),
 Row(prod_desc='art dealers and  galleries')]

In [131]:
# take_rate is the independent of the dollar amount of an order
transaction_20210828_20220227_sdf.orderBy(F.col("merchant_abn")).limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,merchant_name,prod_desc,revenue_level,take_rate,consumer_name,address,state,postcode,gender,consumer_id
1242,10023283211,165.99501221571956,4e70d489-1f67-4b0...,2021-04-22,Felis Limited,"furniture, home f...",e,0.1,Erica Watson,53564 Kelley Cape,TAS,7177,Undisclosed,684491
1810,10023283211,171.66432177797722,96675508-f728-484...,2021-07-09,Felis Limited,"furniture, home f...",e,0.1,Alexandra Rosales,321 Erin Trail Ap...,WA,6466,Female,360964
1252,10023283211,314.4575966294427,c45c6aaf-f340-4aa...,2021-03-20,Felis Limited,"furniture, home f...",e,0.1,Linda Parsons,299 Bennett Overp...,VIC,3158,Female,1142978
369,10023283211,203.80384686785308,64b70a7f-e09c-48c...,2021-07-27,Felis Limited,"furniture, home f...",e,0.1,Taylor Santos,18268 Mario Manors,SA,5311,Female,78143
1530,10023283211,404.4366951596358,d4bdf3fb-f62c-4f0...,2021-06-15,Felis Limited,"furniture, home f...",e,0.1,Lisa Wiley,01925 Jillian Str...,SA,5253,Undisclosed,680149


In [150]:
# Take rate and revenue level are highly negatively correlated
from pyspark.ml.stat import Correlation
from pyspark.sql.types import ByteType, FloatType
revenue_dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}

@F.udf(returnType=ByteType())
def revenue_level_converter(rl):
    return revenue_dict[rl]

transaction_20210828_20220227_sdf.filter(F.col("revenue_level").isNotNull()) \
                                 .withColumn("take_rate", F.col("take_rate").cast(FloatType())) \
                                 .withColumn("revenue_level_int", revenue_level_converter(F.col("revenue_level"))) \
                                 .corr('revenue_level_int', 'take_rate')

-0.95208568740324