In [22]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [23]:
spark = (
    SparkSession.builder.appName("preprocessing of taxi data")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

## Classification of merchants
### **Objective**: Predict product description, revenue level and take rate of the missing merchants
### **Classification pipeline**:
 0. Preliminary Data Analysis
 1. Data Engineering
  * Mostly done in ETL
    * Encode revenue level into integer value, e.g. 1, 2, 3, 4, 5
    * Clean the prod_desc (has been updated in ETL)
  * Need one curated dataset for modeling product description and one dataset for modeling revenue level and take rate
 2. Feature Engineering
  * Aggregate data to produce more useful features for modeling revenue level and take rate
  * Recommended features for prod_desc: dollar value, user id and order datetime
  * Recommended features for revenue level and take rate: total revenue, total number of orders, number of distinct customers, average revenue per order, median revenue, variance of dollar amount
 3. Data Modeling
  * Choice of classification model: XGBClassifier, RandomForest, Naive Bayes(Last resort)
  * Choice of regression model: Linear regression, XGBregressor
  * Fitting and Tuning model to achieve optimal performance 
 4. Model Validation
 * Metrics:
    * Categorical(prod_desc and revenue_level): 
      * Accuracy
      * f1 score
    * Continuous(take_rate):
      * RMSE
 * Visualization:
    * Categorical:
      * learning curve
      * ROC curve
      * confusion matrix 
    * Continuous:
      * RMSE vs. fitted value
 5. Model deployment
   * Use the prediction to impute missing information


### Preliminary Data Analysis
#### Due to the limitation of my device, the PDA is performed on transaction between 2021/02/28 and 2021/08/27

In [24]:
transaction_20210228_20210827_sdf = spark.read.parquet("../data/curated/transactions_20210228_20210827")
transaction_20210828_20220227_sdf = spark.read.parquet("../data/curated/transactions_20210828_20220227")
transactions_20220228_20220828_sdf = spark.read.parquet("../data/curated/transactions_20220228_20220828")

In [25]:
num_of_unknown_merchants = transaction_20210228_20210827_sdf.where(F.col("merchant_name").isNull()) \
                            .select(F.col("merchant_abn")).distinct().count()
num_of_order_from_unknown_merchants = transaction_20210228_20210827_sdf.where(F.col("merchant_name").isNull()) \
                            .select(F.col("merchant_abn")).count()

print(f"num_of_unknown_merchants = {num_of_unknown_merchants}" + "\n" +
      f"num_of_order_from_unknown_merchants = {num_of_order_from_unknown_merchants}")

num_of_unknown_merchants = 378
num_of_order_from_unknown_merchants = 149228


#### Checking if every missing merchant has at least one related known merchant, <u>i.e. if there is at least one customer who buys from an unknown merchant also buy from other merchants</u>

In [26]:
transaction_20210828_20220227_sdf.createOrReplaceTempView('transactions')

spark.sql("""
SELECT merchant_abn, user_id
FROM transactions
WHERE merchant_name IS NULL
""").createOrReplaceTempView('unknown_merchants')

spark.sql("""
SELECT *
FROM transactions
WHERE merchant_name IS NOT NULL
""").createOrReplaceTempView('orders_in_known_merchants')


joined_sdf = spark.sql("""
SELECT table1.merchant_abn AS unknown_merchant_abn, 
    COUNT(table1.merchant_abn) AS num_of_order_in_known_merchant,
    COUNT(DISTINCT table2.user_id) AS num_of_distinct_customers,
    COUNT(DISTINCT table2.merchant_abn) AS num_of_known_merchant,
    COUNT(DISTINCT table2.prod_desc) AS num_of_distinct_prod_desc,
    COUNT(DISTINCT table2.revenue_level) AS num_of_distinct_revenue_level
FROM unknown_merchants AS table1
LEFT JOIN orders_in_known_merchants AS table2 
ON table1.user_id=table2.user_id
GROUP BY table1.merchant_abn
""")

joined_sdf.limit(5)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Check if there is at least one customer for every unknown merchant who purchase from a known merchant
joined_sdf.filter(F.col("num_of_order_in_known_merchant") == 0)

unknown_merchant_abn,num_of_order_in_known_merchant,num_of_distinct_customers,num_of_known_merchant,num_of_distinct_prod_desc,num_of_distinct_revenue_level


In [None]:
# number of distinct product description
transaction_20210828_20220227_sdf.select(F.col("prod_desc")).distinct().orderBy(F.col("prod_desc")).count()

26

In [None]:
# take_rate is the independent of the dollar amount of an order
transaction_20210828_20220227_sdf.orderBy(F.col("merchant_abn")).limit(5)

user_id,merchant_abn,dollar_value,order_id,order_datetime,merchant_name,prod_desc,revenue_level,take_rate,consumer_name,address,state,postcode,gender,consumer_id
833,10023283211,21.775346128615805,9847e779-c5de-457...,2022-02-01,Felis Limited,"furniture, home f...",e,0.1,Robin Raymond,385 Mark Fords Ap...,NSW,2036,Female,920584
1695,10023283211,148.82340223562522,2bcea090-f737-455...,2021-10-09,Felis Limited,"furniture, home f...",e,0.1,Kayla Wallace,1396 Martinez Tunnel,NSW,2799,Female,92026
1126,10023283211,218.24789332638656,e292ab21-ca4b-408...,2022-02-18,Felis Limited,"furniture, home f...",e,0.1,Brian Weaver,804 Silva Rue Apt...,NSW,2152,Male,276072
146,10023283211,431.8016693626082,554224af-d0ee-40e...,2021-12-20,Felis Limited,"furniture, home f...",e,0.1,Nathan Barr,2778 Wayne Route,QLD,4467,Male,40487
1367,10023283211,521.8924051338662,93687f9c-e045-46b...,2021-11-17,Felis Limited,"furniture, home f...",e,0.1,Christine Jones,522 Douglas Green...,NSW,1163,Female,1170023


In [None]:
# Take rate and revenue level are highly negatively correlated
from pyspark.ml.stat import Correlation
from pyspark.sql.types import ByteType

revenue_dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}

@F.udf(returnType=ByteType())
def revenue_level_converter(rl):
    if rl:
        return revenue_dict[rl]
    else:
        return None

pc = transaction_20210828_20220227_sdf.filter(F.col("revenue_level").isNotNull()) \
                                      .withColumn("revenue_level_int", revenue_level_converter(F.col("revenue_level"))) \
                                      .corr('revenue_level_int', 'take_rate')
print("The Pearson's correlation coefficient between revenue level and take rate is " + str(pc))

The Pearson's correlation coefficient between revenue level and take rate is -0.9520944347764044


In [None]:
# 需要改的地方
transaction_sdf = transaction_20210228_20210827_sdf \
                    .union(transaction_20210828_20220227_sdf) \
                    .union(transactions_20220228_20220828_sdf)
merchant_df = transaction_sdf.withColumn("revenue_level_int", 
                                          revenue_level_converter(F.col("revenue_level"))) \
                             .groupBy("merchant_abn") \
                             .agg(
                                    F.mean("revenue_level_int").alias("revenue_level_int")
                             ).toPandas()

### Modeling 

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
import xgboost as xgb
rmse = make_scorer(mean_squared_error, squared=False)

In [None]:
# 需要改的地方
train_set = pd.read_csv("agg_transaction_train_sdf.csv")
pred_set = pd.read_csv("agg_transaction_pred_sdf.csv")
train_set = train_set.rename(columns={"first(take_rate)": "take_rate"})
pred_set = pred_set.rename(columns={"first(take_rate)": "take_rate"})
train_set = train_set.merge(merchant_df, on="merchant_abn", how="left")
train_set = train_set.iloc[:, 2:] # drop merchant_abn
train_set["revenue_level_int"] = train_set["revenue_level_int"].astype("int")
pred_set = pred_set.iloc[:, 2:] # drop merchant_abn
mse = make_scorer(mean_squared_error, squared=False)
train_set = train_set.fillna(0)

In [None]:
# 需要改的地方
X = train_set.iloc[:, :-2]
y_revenue_level = train_set.iloc[:, -1]
y_revenue_level = LabelEncoder().fit_transform(y_revenue_level)
y_take_rate = train_set.iloc[:, -2]

#### Classification of revenue level

##### Baseline Classifier

In [None]:
dummy_clf = DummyClassifier(strategy="most_frequent")
cross_val_score(dummy_clf, X, y_revenue_level, cv=5).mean()

0.3980609375482834

##### XGB Classifier

In [None]:
xgb_clf = xgb.XGBClassifier(
 eta=0.1,
 n_estimators=1000,
 max_depth=3,
 min_child_weight=1,
 gamma=0.1,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'multi:softmax',
 nthread=4,
 num_class = 5,
 seed=2022)

cross_val_score(xgb_clf, X, y_revenue_level, cv=5).mean()

0.35430332808009646

##### Random Forest Classifier

In [None]:
rf_clf = RandomForestClassifier(max_depth=5, random_state=2022)
cross_val_score(rf_clf, X, y_revenue_level, cv=5).mean()

0.41322734155310403

#### Regression of take rate

##### Baseline Regressor

In [None]:
dummy_reg = DummyRegressor(strategy="mean")
cross_val_score(dummy_reg, X, y_revenue_level, scoring = rmse, cv=5).mean()

0.9144926857012308

##### XGB regressor

In [None]:
xgb_reg = xgb.XGBRegressor(
    eta=0.1,
    n_estimators=1000,
    max_depth=3,
    min_child_weight=1,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    seed=2022)

cross_val_score(xgb_reg, X, y_revenue_level, scoring = rmse, cv=5).mean()


0.9853465222115819

##### Linear Regression

In [None]:
lin_reg = LinearRegression()
cross_val_score(lin_reg, X, y_revenue_level, scoring = rmse, cv=5).mean()

0.913372486055068

#### Classification of prod_desc

In [None]:
# Naive Bayes Classification model is choosen because of its high efficiency
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, IndexToString
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
def col_indexer(col, data):
    """
    return transformed data where col(s) is mapped to an index
    """
    data = StringIndexer(inputCol=col[0], outputCol=col[0]+"_idx").fit(data).transform(data)
    if len(col) > 1:
        return col_indexer(col[1:], data)
    else:
        return data

def col_ohe(col, data):
    """
    return transformed data where col(s) is turned into a k-dimension vector, 
    k is the number of unique value of col
    """
    data = OneHotEncoder(inputCol=col[0], outputCol=col[0]+"_vector").fit(data).transform(data)
    if len(col) > 1:
        return col_ohe(col[1:], data)
    else:
        return data

assembler = VectorAssembler(inputCols=[
    "user_id_vector", "dollar_value",
    "order_doy_vector",
], outputCol='features')



modified_transaction_sdf = transaction_20210828_20220227_sdf.filter(F.col("merchant_name").isNotNull())\
                                                            .withColumn("order_doy", F.dayofyear("order_datetime"))
ohe_transaction_sdf = col_ohe(["user_id", "order_doy"], data=modified_transaction_sdf)
idx_transaction_sdf = col_indexer(["prod_desc"], data=ohe_transaction_sdf)
assembled_transaction_sdf = assembler.transform(idx_transaction_sdf).select("features", "prod_desc_idx")
assembled_transaction_sdf.limit(5)

features,prod_desc_idx
"(24447,[1,24081,2...",16.0
"(24447,[1,24081,2...",2.0
"(24447,[1,24081,2...",9.0
"(24447,[1,24081,2...",6.0
"(24447,[1,24081,2...",11.0


In [None]:
train_set, test_set = assembled_transaction_sdf.randomSplit([0.8, 0.2], seed=2022)
nb = NaiveBayes(smoothing=1.0, 
                modelType="multinomial", 
                labelCol="prod_desc_idx").fit(train_set)

prediction = nb.transform(test_set)

evaluator = MulticlassClassificationEvaluator(labelCol="prod_desc_idx",
                                              predictionCol="prediction",
                                              metricName="accuracy")

accuracy = evaluator.evaluate(prediction)
print("Test set accuracy = " + str(accuracy))

Test set accuracy = 0.10442367067689359


### Summary
#### Since all the models have poor performance in terms of accuracy/RMSE, it is not feasible to deploy them in an imputation process. From our perspectives, tuning the model will be ultimately a waste of time because it will only refine our model instead of improving it significantly. <u>Therefore, we decide to leave these merchants as unknown which can still be involved in our portfolio</u>.