# Persona Score Notebook
### **Objective**: This notebook aims to define a score for the general profile of customers of a merchant. The higher the score is, the better the merchant.
### **Overview**: The persona score is based on **our research and knowledge about the BNPL industry**, and it is comprised of several factors such as median salary and age distributions of that region. Since we only have the geographical location of the customers, we will take the mean of all customers. <u>See the codes for more details</u>.

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
import numpy as np
from functools import reduce

In [4]:
spark = (
    SparkSession.builder.appName("preprocessing of taxi data")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [5]:
age_sdf = spark.read.option("header", True).csv("../data/tables/external/by_postcode/1 year age.csv")
education_sdf = spark.read.option("header", True).csv("../data/tables/external/by_postcode/education.csv")

                                                                                

In [6]:
columns = ["sub_total", "18_24", "25_34", "35_44", "45_54", "55_64", "65_plus", "postcode"]
age_group = columns[:-1]
age_sdf = age_sdf.withColumn(columns[0], reduce(lambda x,y:x+y, [F.col(str(x)) for x in range(18, 116)])) \
                .withColumn(columns[1], reduce(lambda x,y:x+y, [F.col(str(x)) for x in range(18, 25)]) / F.col(columns[0])) \
                .withColumn(columns[2], reduce(lambda x,y:x+y, [F.col(str(x)) for x in range(25, 35)]) / F.col(columns[0])) \
                .withColumn(columns[3], reduce(lambda x,y:x+y, [F.col(str(x)) for x in range(35, 45)]) / F.col(columns[0])) \
                .withColumn(columns[4], reduce(lambda x,y:x+y, [F.col(str(x)) for x in range(45, 55)]) / F.col(columns[0])) \
                .withColumn(columns[5], reduce(lambda x,y:x+y, [F.col(str(x)) for x in range(55, 65)]) / F.col(columns[0])) \
                .withColumn(columns[6], reduce(lambda x,y:x+y, [F.col(str(x)) for x in range(65, 116)]) / F.col(columns[0])) \
                .withColumn(columns[7], F.regexp_extract('AGEP Age', r'\d+', 0)) \
                .select(*columns)
age_sdf.limit(5)

22/10/06 21:43:59 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


sub_total,18_24,25_34,35_44,45_54,55_64,65_plus,postcode
25969.0,0.2166044129539066,0.4111825638261003,0.1598059224459933,0.08290654241595749,0.0642689360391235,0.0652316223189187,2000
1210.0,0.9347107438016528,0.0454545454545454,0.0074380165289256,0.004958677685950...,0.0074380165289256,0.0,2006
8376.0,0.3998328557784145,0.3390639923591213,0.119269340974212,0.05742597898758357,0.0403533906399235,0.0440544412607449,2007
11191.0,0.3991600393173085,0.3594853006880529,0.1048163702975605,0.061388615852023945,0.0426235367706192,0.0325261370744348,2008
11534.0,0.1143575515866135,0.3546904803190567,0.2046991503381307,0.11938616264955783,0.1049939309866481,0.101872724119993,2009


In [7]:
student_sdf = education_sdf.withColumn("postcode", F.regexp_extract("postcode", r'\d+', 0)) \
                           .withColumnRenamed("total", "total_parttime_or_fulltime_students") \
                           .select("postcode", "total_parttime_or_fulltime_students")

student_sdf.limit(5)

postcode,total_parttime_or_fulltime_students
2000,8101
2006,1332
2007,4008
2008,5147
2009,1726


In [9]:
income_sdf = spark.read.option("header", True).csv("../data/curated/persona/input/income_cleaned.csv") \
                                              .select("postcode", "median_salary")
income_sdf.limit(5)

postcode,median_salary
2000,575.0
2006,75.0
2007,350.0
2008,575.0
2009,1125.0


In [10]:
def mean(column_name):
   return F.mean(column_name).alias("mean_" + column_name)

def preprocessing_pipeline(dlist):
    """
    preprocess all dataframe in the list
        parameters: 
                   a list of dataframes
        returns:
                a generator of preprocessed dataframes
    """
    for data in dlist:
        # join persona data
        data = data.join(income_sdf, "postcode", "left") \
                   .join(student_sdf, "postcode", "left") \
                   .join(age_sdf, "postcode", "left")

        # remove rows contain null in persona
        data = data.filter(F.col("median_salary").isNotNull() \
                         | F.col("sub_total").isNotNull() \
                         | F.col("total_parttime_or_fulltime_students").isNotNull())

        # take the mean of all persona
        agg_data = data.groupBy("merchant_abn") \
                       .agg(
                            mean("median_salary"),
                            mean("total_parttime_or_fulltime_students"),
                            *[mean(name) for name in age_group]
                            )
        yield agg_data
        

In [16]:
transaction_sdf = spark.read.parquet("../data/curated/fraud/output/transactions_withoutfraud") \
                            .drop('user_id2',
                                  'order_datetime2',
                                  'fraud rate',
                                  'fraud2',
                                  'user_id3',
                                  'order_datetime3',
                                  'fraud_probability',
                                  'fraud3')

# training data is the part of data before the first fortnight of 2022
train_transaction_sdf = transaction_sdf.where(F.col("order_datetime") <= "2022-1-16")

data_list = [transaction_sdf, train_transaction_sdf]

data_list = list(preprocessing_pipeline(data_list))

data_list[0].limit(5)

                                                                                

merchant_abn,mean_median_salary,mean_total_parttime_or_fulltime_students,mean_sub_total,mean_18_24,mean_25_34,mean_35_44,mean_45_54,mean_55_64,mean_65_plus
24406529929,659.668001334668,678.0102921646746,6699.218218218218,0.0996586172815709,0.1534714554992223,0.1567116082161027,0.1859255201946583,0.1824763157896933,0.2217564830187523
35344855546,669.4192377495463,678.635294117647,6642.09165154265,0.103974865113541,0.161525246867963,0.1592912800681017,0.1836030600523174,0.1771480578959088,0.2144574900021678
83412691377,665.0935332616978,669.2734071484072,6706.147406466738,0.1004879132010974,0.1518681098266721,0.1575514444389872,0.1860731278860281,0.1819941024905343,0.2220253021566809
73256306726,660.8091313448456,691.7938036969539,6926.07326007326,0.1004888137879705,0.1532768305997583,0.1564661336160533,0.1862470174391106,0.1812607455503809,0.2222604590067261
38700038932,671.3919288645691,700.781019058732,7101.62634356068,0.0995006112054604,0.1532356202175673,0.1575686463386714,0.185597442249937,0.1808587483577184,0.2232389316306455


In [17]:
def compute_persona_score(dlist, filenames):
    """
    compute persona scores for all merchants in a dataframe and write the result to a .csv file
    parameters:
               dlist (list): a list of dataframes
               filenames (list): a list of filenames in which the final results will be written
    return:
           a generator of dataframes containing merchant ABN and persona score
    """
    for data, filename in zip(dlist, filenames):
        data = data.withColumn("pop_index",
                                0.26*F.col("mean_18_24") + 
                                0.35*F.col("mean_25_34") + 
                                0.20*F.col("mean_35_44") + 
                                0.12*F.col("mean_45_54") + 
                                0.05*F.col("mean_55_64") +
                                0.01*F.col("mean_65_plus")
                                )
                                
        data = data.select("merchant_abn",
                            "mean_median_salary", 
                            "mean_total_parttime_or_fulltime_students",
                            "pop_index",
                            "mean_sub_total")

        for col in data.columns[1:]:
            col_max = data.agg({col:"max"}).collect()[0][0]
            col_min = data.agg({col:"min"}).collect()[0][0]
            data = data.withColumn(col, (F.col(col)-F.lit(col_min)) / (F.lit(col_max)-F.lit(col_min)) )
            data = data.withColumnRenamed(col, "normalized_"+col)
        

        data = data.withColumn("persona_score",
                               F.col("normalized_mean_median_salary") *
                               F.col("normalized_mean_total_parttime_or_fulltime_students") *
                               F.col("normalized_pop_index") *
                               F.col("normalized_mean_sub_total")
                               )
        
        data.select("merchant_abn", "persona_score").toPandas().to_csv("../data/curated/final_model/input/" + filename, index=False)
        
        print("%s is done" % filename)

        yield data


In [18]:
filenames = ["persona_full.csv", "persona_train.csv"]
list(compute_persona_score(data_list, filenames))[0].limit(5)

                                                                                

persona_full.csv is done


                                                                                

persona_train.csv is done


                                                                                

merchant_abn,normalized_mean_median_salary,normalized_mean_total_parttime_or_fulltime_students,normalized_pop_index,normalized_mean_sub_total,persona_score
24406529929,0.3779721750736243,0.1328737180881907,0.4779098484947136,0.1883568128671969,0.00452091384808476
35344855546,0.3864515110865619,0.1329962034461027,0.5008619514909254,0.1867470243058767,0.004807352688715...
83412691377,0.3826900289232154,0.1311614986572525,0.4768259062620374,0.1885520727721908,0.004512785288221149
73256306726,0.3789644620389961,0.13557495499679,0.4783969948138737,0.1947494366971922,0.004786770514734344
38700038932,0.3881668946648426,0.1373362331575774,0.4775652307801571,0.1996964055445847,0.005084012069308436
