# Persona Score Notebook
### **Objective**: This notebook aims to define a score for the general profile of customers of a merchant. The higher the score is, the better the merchant.
### **Overview**: The persona score is based on **our research and knowledge about the BNPL industry**, and it comprises several factors such as the median salary and age distribution of a region. Since the only thing we know about each customer is their geographical location (postcode), we attach the postcode-level statistics to every transaction and take the mean per merchant. <u>See the code for more details</u>.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
import numpy as np
from functools import reduce

In [2]:
# Session-level Spark settings, applied to the builder in the order listed.
spark_settings = {
    "spark.sql.repl.eagerEval.enabled": True,
    "spark.sql.parquet.cacheMetadata": "true",
    "spark.sql.session.timeZone": "Etc/UTC",
    "spark.driver.memory": "15g",
}

# NOTE(review): the application name mentions taxi data although this notebook
# computes persona scores -- looks like a leftover from another project; confirm
# before renaming.
builder = SparkSession.builder.appName("preprocessing of taxi data")
for option_name, option_value in spark_settings.items():
    builder = builder.config(option_name, option_value)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/04 23:30:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/04 23:31:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the two raw 2016 extracts (age and education). Both files carry a header
# row; no schema is supplied or inferred, so the columns keep Spark's CSV
# default type.
age_sdf = spark.read.csv("../data/raw/2016_age.csv", header=True)
education_sdf = spark.read.csv("../data/raw/2016_education.csv", header=True)

In [4]:
# Output layout of the age table: adult head-count, six age-band shares, postcode.
columns = ["sub_total", "18_24", "25_34",
           "35_44", "45_54", "55_64",
           "65_plus", "postcode"]

# Everything except the postcode key (note: this includes "sub_total").
age_group = columns[:-1]

# Half-open [start, stop) ranges of single-year age columns, one per band name
# in columns[1:7].
age_bands = [(18, 25), (25, 35), (35, 45), (45, 55), (55, 65), (65, 116)]


def sum_age_columns(start, stop):
    """Column expression adding up the single-year age columns start..stop-1."""
    return reduce(lambda x, y: x + y, [F.col(str(age)) for age in range(start, stop)])


# Total count over ages 18..115; used as the denominator of every band share.
age_sdf = age_sdf.withColumn(columns[0], sum_age_columns(18, 116))

# Share of each age band within the 18+ total.
for band_name, (start, stop) in zip(columns[1:7], age_bands):
    age_sdf = age_sdf.withColumn(band_name, sum_age_columns(start, stop) / F.col(columns[0]))

# Keep only the first run of digits of the 'AGEP Age' label as the postcode,
# then drop every raw column.
age_sdf = age_sdf.withColumn(columns[7], F.regexp_extract('AGEP Age', r'\d+', 0))
age_sdf = age_sdf.select(*columns)

age_sdf.limit(5)

22/10/04 23:31:07 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


sub_total,18_24,25_34,35_44,45_54,55_64,65_plus,postcode
25969.0,0.2166044129539066,0.4111825638261003,0.1598059224459933,0.08290654241595749,0.0642689360391235,0.0652316223189187,2000
1210.0,0.9347107438016528,0.0454545454545454,0.0074380165289256,0.004958677685950...,0.0074380165289256,0.0,2006
8376.0,0.3998328557784145,0.3390639923591213,0.119269340974212,0.05742597898758357,0.0403533906399235,0.0440544412607449,2007
11191.0,0.3991600393173085,0.3594853006880529,0.1048163702975605,0.061388615852023945,0.0426235367706192,0.0325261370744348,2008
11534.0,0.1143575515866135,0.3546904803190567,0.2046991503381307,0.11938616264955783,0.1049939309866481,0.101872724119993,2009


In [5]:
# Reduce the education table to two columns: the first run of digits of the
# postcode field, and the "total" column under a self-explanatory name.
postcode_digits = F.regexp_extract("postcode", r'\d+', 0).alias("postcode")
student_total = F.col("total").alias("total_parttime_or_fulltime_students")

student_sdf = education_sdf.select(postcode_digits, student_total)

student_sdf.limit(5)

postcode,total_parttime_or_fulltime_students
2000,8101
2006,1332
2007,4008
2008,5147
2009,1726


In [6]:
# Cleaned income table: only the postcode key and its median salary are needed.
income_sdf = (
    spark.read
    .csv("../data/raw/income_cleaned.csv", header=True)
    .select("postcode", "median_salary")
)

income_sdf.limit(5)

postcode,median_salary
2000,575.0
2006,75.0
2007,350.0
2008,575.0
2009,1125.0


In [7]:
def mean(column_name):
    """
    build the aggregate expression for the mean of one column
        parameters:
                   column_name (str): name of the column to average
        returns:
                a Spark column expression aliased as "mean_" + column_name
    """
    alias = "mean_" + column_name
    return F.mean(column_name).alias(alias)

def preprocessing_pipeline(dlist):
    """
    attach the postcode-level persona data to every dataframe in the list and
    average it per merchant

    relies on the notebook-level dataframes income_sdf, student_sdf and age_sdf
    and on the notebook-level list age_group

        parameters:
                   dlist: an iterable of dataframes; each one needs a "postcode"
                          and a "merchant_abn" column
        returns:
                a generator of dataframes with one row per merchant_abn holding
                mean_median_salary, mean_total_parttime_or_fulltime_students and
                one mean_<name> column for every name in age_group
    """
    # one representative column from each of the three persona tables
    persona_columns = ["median_salary",
                       "sub_total",
                       "total_parttime_or_fulltime_students"]

    for data in dlist:
        # join persona data
        data = data.join(income_sdf, "postcode", "left") \
                   .join(student_sdf, "postcode", "left") \
                   .join(age_sdf, "postcode", "left")

        # remove rows that contain a null in any persona column; the previous
        # filter OR-ed the isNotNull checks and therefore only dropped rows in
        # which all three were null
        data = data.dropna(how="any", subset=persona_columns)

        # take the mean of all persona
        agg_data = data.groupBy("merchant_abn") \
                       .agg(
                            mean("median_salary"),
                            mean("total_parttime_or_fulltime_students"),
                            *[mean(name) for name in age_group]
                            )
        yield agg_data
        

In [8]:
# Curated transactions without the fraud-related helper columns.
transaction_sdf = spark.read.parquet("../data/curated/transactions_withoutfraud") \
                            .drop('user_id2',
                                  'order_datetime2',
                                  'fraud rate',
                                  'fraud2',
                                  'user_id3',
                                  'order_datetime3',
                                  'fraud_probability',
                                  'fraud3')

# Cutoff for the training split, written as a zero-padded ISO date. The previous
# literal "2022-1-16" only compares correctly when Spark casts it to a date; the
# padded form is also correct if order_datetime is stored as a yyyy-MM-dd string
# (the column type is not visible in this notebook).
TRAIN_CUTOFF_DATE = "2022-01-16"

# training data: every transaction with order_datetime up to the cutoff
train_transaction_sdf = transaction_sdf.where(F.col("order_datetime") <= TRAIN_CUTOFF_DATE)

# index 0 = full data, index 1 = training data
transaction_splits = [transaction_sdf, train_transaction_sdf]

# per-merchant persona means for each split
data_list = list(preprocessing_pipeline(transaction_splits))

data_list[0].limit(5)

                                                                                

merchant_abn,mean_median_salary,mean_total_parttime_or_fulltime_students,mean_sub_total,mean_18_24,mean_25_34,mean_35_44,mean_45_54,mean_55_64,mean_65_plus
34440496342,665.3571428571429,726.0285714285715,8046.771428571428,0.0961617859760343,0.1463846996979718,0.1560887016469868,0.1735831056885002,0.2077186469623857,0.220063060028121
15613631617,647.4765258215963,635.1502347417841,6140.830985915493,0.0935556474374147,0.1497074487677036,0.1565489160996497,0.1798134718044901,0.1928799103803407,0.227494605510401
83412691377,665.4689608636977,658.808439383791,6706.924426450742,0.1042725906515696,0.1514202470684468,0.1565249678996656,0.1837871420446609,0.1838697186090033,0.2201253337266535
24406529929,661.9674887892377,613.1066666666667,6236.961883408072,0.097030107357684,0.1508463018793074,0.1555258074090165,0.1851817022969094,0.1853457563373256,0.2260703247197568
73256306726,660.6919642857143,703.9039145907474,7043.998214285714,0.0993180955650494,0.1498894898085377,0.1557895084700852,0.187349726576246,0.1821697535401777,0.2254834260399038


In [9]:
def compute_persona_score(dlist, filenames):
    """
    compute persona scores for all merchants in a dataframe and write the result to a .csv file

    this is a generator: nothing is computed or written until it is iterated

    parameters:
               dlist (list): a list of dataframes with one row per merchant_abn and the
                             mean_* persona columns
               filenames (list): a list of filenames (relative to ../data/curated/) in which
                                 the final results will be written, paired with dlist by position
    return:
           a generator of dataframes containing merchant ABN, the four min-max normalized
           components and the persona score
    """
    # weight of each age-band share in the population index; the absolute scale of
    # the weights does not matter because pop_index is min-max normalized below
    age_weights = {
        "mean_18_24": 0.26,
        "mean_25_34": 0.35,
        "mean_35_44": 0.20,
        "mean_45_54": 0.12,
        "mean_55_64": 0.05,
        "mean_65_plus": 0.01,
    }

    # components of the score, in output column order
    feature_columns = ["mean_median_salary",
                       "mean_total_parttime_or_fulltime_students",
                       "pop_index",
                       "mean_sub_total"]

    for data, filename in zip(dlist, filenames):
        # weighted sum of the age-band shares
        pop_index = reduce(lambda total, term: total + term,
                           [weight * F.col(name) for name, weight in age_weights.items()])

        data = data.withColumn("pop_index", pop_index) \
                   .select("merchant_abn", *feature_columns)

        # fetch min and max of every component in a single Spark job instead of
        # two collect() calls per column
        bounds = data.agg(
            *[F.min(name).alias("min_" + name) for name in feature_columns],
            *[F.max(name).alias("max_" + name) for name in feature_columns]
        ).collect()[0]

        # min-max normalize every component
        # NOTE(review): a component whose max equals its min divides by zero here;
        # with Spark's default (non-ANSI) settings that presumably yields null -- confirm
        data = data.select(
            "merchant_abn",
            *[
                (
                    (F.col(name) - F.lit(bounds["min_" + name]))
                    / (F.lit(bounds["max_" + name]) - F.lit(bounds["min_" + name]))
                ).alias("normalized_" + name)
                for name in feature_columns
            ]
        )

        # the score is the product of the four normalized components
        data = data.withColumn("persona_score",
                               F.col("normalized_mean_median_salary") *
                               F.col("normalized_mean_total_parttime_or_fulltime_students") *
                               F.col("normalized_pop_index") *
                               F.col("normalized_mean_sub_total")
                               )

        # the directory separator was missing before, which put the files next to
        # the curated folder (../data/curated<filename>) instead of inside it
        output_path = "../data/curated/" + filename
        data.select("merchant_abn", "persona_score").toPandas().to_csv(output_path, index=False)

        print("%s is done" % filename)

        yield data


In [10]:
# Output file for each entry of data_list, paired by position (full data first,
# training data second).
filenames = ["full_persona_score.csv", "trained_persona_score.csv"]

# Materialising the generator is what triggers the computation and the CSV writes.
persona_score_sdfs = list(compute_persona_score(data_list, filenames))

# Preview the scores computed on the full data.
persona_score_sdfs[0].limit(5)

                                                                                

full_persona_score.csv is done


                                                                                

trained_persona_score.csv is done


                                                                                

merchant_abn,normalized_mean_median_salary,normalized_mean_total_parttime_or_fulltime_students,normalized_pop_index,normalized_mean_sub_total,persona_score
34440496342,0.3808755760368664,0.0708113304816708,0.401276553644346,0.1677114736127164,0.001815066069980...
15613631617,0.3693396940784492,0.0619477455127069,0.4044960327438271,0.1279679494936086,0.001184314286334...
83412691377,0.3809477166862566,0.0642551876898264,0.4208215090973179,0.139772383569329,0.001439769178086...
24406529929,0.3786887024446694,0.0597977827627686,0.4111098666098314,0.1299725140422068,0.001209976259968398
73256306726,0.3778657834101382,0.0686534589476979,0.4131089754842859,0.1468011972284117,0.001573237242596...
