# Creating a ranking feature (Loyalty rate)

# Objective: conduct feature engineering to create a loyalty rate for every merchant

In [38]:
# import the libraries required to run the code in this notebook
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Start (or reuse) the Spark session for this notebook.
# Each (key, value) pair below is applied to the session builder in order.
_spark_builder = SparkSession.builder.appName("preprocessing of taxi data")
for _conf_key, _conf_value in (
    ("spark.sql.repl.eagerEval.enabled", True),
    ("spark.sql.parquet.cacheMetadata", "true"),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.driver.memory", "15g"),
):
    _spark_builder = _spark_builder.config(_conf_key, _conf_value)

spark = _spark_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/08 11:07:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Feature Engineering for Ranking

#### 1. Creating features
- Loyalty rate <br>
&nbsp;  
Groupby each merchant abn, and calculate:  
    $\text{repeat purchase rate} = \frac {number \ of \ repeat \ customers}{total \ customers}$  
    code implementation  
    - number of repeat customers - the number of consumer ids that purchased from the merchant more than once
    - total customers - the number of distinct consumers who purchased from the merchant  

#### 2. Converting features to scores for ranking


In [4]:
# Load the transactions parquet and derive the training subset from it.
#
# The cut-off is written as a zero-padded ISO date (yyyy-MM-dd). The unpadded
# form "2022-1-16" only behaves as a date when Spark casts it against a
# date/timestamp column; against a string column it would be compared
# lexicographically and silently admit later months.
# NOTE(review): the type of order_datetime is not visible here - confirm it is
# a date/timestamp column.
TRAIN_CUTOFF_DATE = "2022-01-16"

transaction_sdf = spark.read.parquet("../data/curated/fraud/output/transactions_withoutfraud")

# training set: every transaction dated on or before the cut-off
train_transaction_sdf = transaction_sdf.where(F.col("order_datetime") <= TRAIN_CUTOFF_DATE)

### Conduct aggregation and calculate loyalty rate

In [5]:
# For every merchant, count how many distinct consumers purchased from it.
# The same aggregation is applied to the full set and to the training subset.
def _total_consumers_per_merchant(transactions):
    """Return one row per merchant_abn with its distinct consumer count."""
    distinct_consumers = F.countDistinct("consumer_id").alias("Total_consumers")
    return transactions.groupBy("merchant_abn").agg(distinct_consumers)


merchant_loyality_rate = _total_consumers_per_merchant(transaction_sdf)
train_merchant_loyality_rate = _total_consumers_per_merchant(train_transaction_sdf)

# display the full-set counts ordered by merchant
merchant_loyality_rate.orderBy("merchant_abn")

                                                                                

merchant_abn,Total_consumers
10023283211,2619
10142254217,2439
10165489824,1
10187291046,287
10192359162,317
10206519221,6902
10255988167,702
10264435225,3921
10279061213,472
10323485998,7412


In [7]:
# Within each merchant, keep only the consumers that purchased more than once.
def _repeat_purchase_pairs(transactions):
    """Return (merchant_abn, consumer_id) rows whose purchase count exceeds one."""
    purchase_count = F.count("merchant_abn").alias("customer_purchase_count")
    per_pair = transactions.groupBy("merchant_abn", "consumer_id").agg(purchase_count)
    return per_pair.where(F.col("customer_purchase_count") > 1)


customer_purchase_counts = _repeat_purchase_pairs(transaction_sdf)
train_customer_purchase_counts = _repeat_purchase_pairs(train_transaction_sdf)

# display the full-set repeat purchasers ordered by merchant
customer_purchase_counts.orderBy("merchant_abn")

                                                                                

merchant_abn,consumer_id,customer_purchase_count
10023283211,956479,2
10023283211,337772,2
10023283211,5807,2
10023283211,329389,2
10023283211,330651,2
10023283211,964186,2
10023283211,1202141,2
10023283211,1019655,2
10023283211,1417148,2
10023283211,1327376,2


In [8]:
# Count, per merchant, the consumers that made repeated purchases.
def _repeat_customers_per_merchant(purchase_counts):
    """Return one row per merchant with its number of repeat customers.

    The merchant key is renamed to "merchant_abn_del" so that it can be
    dropped after being joined back onto the per-merchant totals.
    """
    repeat_total = F.count("consumer_id").alias("repeated_customers")
    return (
        purchase_counts.groupBy("merchant_abn")
        .agg(repeat_total)
        .withColumnRenamed("merchant_abn", "merchant_abn_del")
    )


repeated_customers = _repeat_customers_per_merchant(customer_purchase_counts)
train_repeated_customers = _repeat_customers_per_merchant(train_customer_purchase_counts)

# display the full-set result ordered by merchant
repeated_customers.orderBy("merchant_abn_del")

                                                                                

merchant_abn_del,repeated_customers
10023283211,151
10142254217,139
10187291046,1
10192359162,1
10206519221,1143
10255988167,13
10264435225,329
10279061213,3
10323485998,1271
10342410215,7


In [9]:
# Merge the repeat-customer counts onto the total-customer counts per merchant.
def _attach_repeat_counts(totals, repeats):
    """Left-join `repeats` onto `totals` by merchant and drop the duplicate key."""
    same_merchant = totals.merchant_abn == repeats.merchant_abn_del
    return totals.join(repeats, same_merchant, "left").drop("merchant_abn_del")


merchant_loyality_rate = _attach_repeat_counts(merchant_loyality_rate, repeated_customers)
train_merchant_loyality_rate = _attach_repeat_counts(
    train_merchant_loyality_rate, train_repeated_customers
)

In [10]:
# A merchant with no match in the left join has a null repeat-customer count,
# i.e. it has no repeated customers, so the null is replaced with 0.
_repeat_column = ["repeated_customers"]
merchant_loyality_rate = merchant_loyality_rate.fillna(0, subset=_repeat_column)
train_merchant_loyality_rate = train_merchant_loyality_rate.fillna(0, subset=_repeat_column)

merchant_loyality_rate

                                                                                

merchant_abn,Total_consumers,repeated_customers
35344855546,1268,35
83412691377,9587,2267
38700038932,5416,655
73256306726,4155,404
15613631617,1499,49
73841664453,808,11
24406529929,3307,248
48214071373,449,3
28767881738,3,0
96946925998,112,0


In [11]:
# repeated purchase rate = repeated customers / total customers (per merchant)
_repeat_rate = F.col("repeated_customers") / F.col("Total_consumers")

merchant_loyality_rate = merchant_loyality_rate.withColumn(
    "repeated_purchase_rate", _repeat_rate
)
train_merchant_loyality_rate = train_merchant_loyality_rate.withColumn(
    "repeated_purchase_rate", _repeat_rate
)

In [12]:
# display merchants from the highest to the lowest repeated purchase rate
merchant_loyality_rate.orderBy(F.desc("repeated_purchase_rate"))

                                                                                

merchant_abn,Total_consumers,repeated_customers,repeated_purchase_rate
86578477987,24080,24070,0.9995847176079734
24852446429,24080,24070,0.9995847176079734
64203420245,24080,24064,0.9993355481727576
49891706470,24078,24054,0.9990032394717168
46804135891,24076,24042,0.9985878052832696
45629217853,24076,24015,0.9974663565376308
89726005175,24071,23984,0.9963856923268664
43186523025,24064,23930,0.9944315159574468
80324045558,24063,23904,0.993392345094128
63290521567,24038,23799,0.9900574091022548


In [13]:
# Check for null values: per column, count the rows that look missing.
def _missing_marker(column_name):
    """Return a column that is non-null only where `column_name` looks missing.

    A value looks missing when it contains 'None' or 'NULL', equals the empty
    string, is null, or is NaN. Rows that match none of these yield null and
    are therefore not counted by F.count.
    """
    value = F.col(column_name)
    looks_missing = (
        value.contains('None')
        | value.contains('NULL')
        | (value == '')
        | value.isNull()
        | F.isnan(column_name)
    )
    return F.when(looks_missing, column_name)


merchant_loyality_rate.select(
    [F.count(_missing_marker(name)).alias(name) for name in merchant_loyality_rate.columns]
)

                                                                                

merchant_abn,Total_consumers,repeated_customers,repeated_purchase_rate
0,0,0,0


In [41]:
# Standard deviation of dollar_value per merchant. The key is renamed to
# "merchant_abn_del" so it can be dropped after a later join.
_dollar_value_sd = F.stddev("dollar_value").alias("SD of merchant revenue")
agg_trans_sdf = (
    transaction_sdf.groupBy("merchant_abn")
    .agg(_dollar_value_sd)
    .withColumnRenamed("merchant_abn", "merchant_abn_del")
)

In [44]:
# Attach each merchant's dollar_value standard deviation to its loyalty figures.
_rate_with_sd_sdf = merchant_loyality_rate.join(
    agg_trans_sdf,
    merchant_loyality_rate.merchant_abn == agg_trans_sdf.merchant_abn_del,
    "left",
).drop("merchant_abn_del")

# Bring only the columns needed for the plot into pandas.
merchant_loyality_rate_vis = _rate_with_sd_sdf.select(
    "merchant_abn", "repeated_purchase_rate", "SD of merchant revenue"
).toPandas()

# The SD is plotted on a log10 scale. log10 is only defined for positive
# values, so zero or missing SDs are masked to NaN (and skipped by the plot)
# instead of turning into -inf with a runtime warning.
_sd_values = merchant_loyality_rate_vis["SD of merchant revenue"]
merchant_loyality_rate_vis["SD of merchant revenue"] = np.log10(
    _sd_values.where(_sd_values > 0)
)

# Plot log10(SD) against repeated purchase rate. The y axis and title state
# the log transform explicitly so the figure is not read as raw dollar values.
fig, ax = plt.subplots()
sns.scatterplot(
    data=merchant_loyality_rate_vis,
    x="repeated_purchase_rate",
    y="SD of merchant revenue",
    ax=ax,
)
ax.set_xlabel("Repeated Purchase Rate")
ax.set_ylabel("log10(SD of merchant revenue)")
ax.set_title("Repeated Purchase Rate against log10(SD of merchant revenue)")
fig.savefig("../plots/RPRvsSD.png")

[Stage 351:>                                                      (0 + 10) / 11]

In [None]:
# Write the full and training loyalty tables to CSV (collected through pandas).
_loyalty_exports = (
    (merchant_loyality_rate, '../data/curated/final_model/input/loyalty_full.csv'),
    (train_merchant_loyality_rate, '../data/curated/final_model/input/loyalty_train.csv'),
)
for _loyalty_sdf, _csv_path in _loyalty_exports:
    _loyalty_sdf.toPandas().to_csv(_csv_path, index=False)

Exception in thread "serve-DataFrame" java.net.SocketTimeoutException: Accept timed out
	at java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.ServerSocket.implAccept(ServerSocket.java:560)
	at java.net.ServerSocket.accept(ServerSocket.java:528)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)
