# Aggregate data for first 3 variables of final model (WITH TAKE RATE and WITHOUT FRAUD)

Purpose: 
- aggregate the transaction data into merchant-fortnights
- create the first three variables for the final model (historic mean revenue * take rate; historic sd of revenue * take rate; historic corr with market)


In [2]:
# import libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np
from statistics import mean, stdev

import json

In [None]:
# create directory persona if does not exist
import os

dirs_to_create = ['../data/curated/final_model', '../data/curated/final_model/input', '../data/curated/final_model/output']

def create_dirs(dirs_to_create):
    # check if it exists as it makedir will raise an error if it does exist
    for dir_to_create in dirs_to_create:
        if not os.path.exists(dir_to_create):
            os.makedirs(dir_to_create)

create_dirs(dirs_to_create)

In [3]:
# setup spark
spark = (
    SparkSession.builder.appName("aggregate data for first 3 final model variables")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

22/10/06 11:57:51 WARN Utils: Your hostname, modaxuexiweiyuanzhangde-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.13.88.250 instead (on interface en0)
22/10/06 11:57:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/06 11:57:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/06 11:57:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/06 11:57:53 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
data = spark.read.parquet("../data/curated/fraud/output/transactions_withoutfraud")

                                                                                

In [5]:
data.count()

                                                                                

12143162

In [6]:
# create some new columns
data = data.withColumn("Week", F.weekofyear("order_datetime"))\
        .withColumn("Year", F.year("order_datetime"))\
        .withColumn("Month", F.month("order_datetime"))\
        .withColumn("Day", F.dayofmonth("order_datetime"))\
        .withColumn("Fortnight", ((F.col("Week")+1)/2).cast('int'))

In [7]:
# drop dates that don't make a full week at start and end
data = data.filter(data["order_datetime"] >= F.lit('2021-03-01')) \
       .filter(data["order_datetime"] <= F.lit('2022-08-14'))
data

                                                                                

user_id2,order_datetime2,fraud rate,fraud2,user_id,merchant_abn,dollar_value,order_id,order_datetime,user_id3,order_datetime3,fraud_probability,fraud3,Week,Year,Month,Day,Fortnight
1,2021-04-18,9.290493571169254,0,1,82912636758,1435.794850018464,e79aeb7e-043b-45f...,2021-04-18,,,,,15,2021,4,18,8
1,2021-04-23,9.287148398864032,0,1,33604812025,93.0140647407636,ea1107f4-3d57-441...,2021-04-23,,,,,16,2021,4,23,8
1,2021-05-02,9.287148398864032,0,1,64203420245,13.795710586158274,7b2952e9-17d8-429...,2021-05-02,,,,,17,2021,5,2,9
1,2021-06-08,9.287148398864032,0,1,19575005485,14.277862286033695,a72b651c-bffb-4a9...,2021-06-08,,,,,23,2021,6,8,12
1,2021-07-23,9.287148398864032,0,1,46298404088,92.313093954783,2773225f-80a5-411...,2021-07-23,,,,,29,2021,7,23,15
1,2021-09-30,9.290493571169254,0,1,46674437504,517.5794837525516,e5746eb5-19e0-43d...,2021-09-30,,,,,39,2021,9,30,20
1,2021-10-30,9.287148398864032,0,1,51279178333,9.611126453764486,bfaf2ae8-068f-48f...,2021-10-30,,,,,43,2021,10,30,22
1,2021-11-03,9.287148398864032,0,1,98268965514,269.4124531844491,7d0a4cce-0d66-455...,2021-11-03,,,,,44,2021,11,3,22
1,2021-11-04,9.466559691318782,0,1,49891706470,46.15699347673238,df726124-0859-45c...,2021-11-04,,,,,44,2021,11,4,22
1,2021-11-04,9.466559691318782,0,1,45572698303,333.761456660311,7d351762-2877-4c2...,2021-11-04,,,,,44,2021,11,4,22


In [8]:
data.count()

                                                                                

12143162

## Aggregate by Weeks

In [9]:
transact_agg_by_comp_fortnight = data.groupBy("Year", "Fortnight", "merchant_abn")\
        .agg(F.sum("dollar_value")\
        .alias("sum_transactions"), \
         F.count("order_id").alias("number_of_customers"), \
         F.countDistinct("user_id").alias("distinct_customers"))

In [10]:
transact_agg_comp_fortnight_df = transact_agg_by_comp_fortnight.toPandas()

# get train data: 23 weeks
transact_agg_by_comp_fortnight_train_df = transact_agg_comp_fortnight_df[(transact_agg_comp_fortnight_df['Year'] == 2021) | \
    ((transact_agg_comp_fortnight_df['Year'] == 2022) & ((transact_agg_comp_fortnight_df['Fortnight'] == 1) | \
         (transact_agg_comp_fortnight_df['Fortnight'] == 26)))]

# get validate data: 16 weeks
transact_agg_by_comp_fortnight_validate_df = transact_agg_comp_fortnight_df[(transact_agg_comp_fortnight_df['Year'] == 2022) & \
    ((transact_agg_comp_fortnight_df['Fortnight'] > 1) & (transact_agg_comp_fortnight_df['Fortnight'] <= 16))]

                                                                                

In [11]:
len(transact_agg_by_comp_fortnight_train_df)

88181

In [12]:
len(transact_agg_by_comp_fortnight_validate_df)

55876

In [13]:
len(transact_agg_comp_fortnight_df)

144057

In [14]:
def get_fill_na_df(transact_agg_comp_period_df, period):
    """ helper function to create dataframe of all combos and 0 value for fillup """

    ## Fill up na
    # get set of merchants
    distinct_merchants = set(transact_agg_comp_period_df.merchant_abn)

    # get set of year and period
    year_period_set = list()

    for id, dta in transact_agg_comp_period_df.groupby(['Year', period]):
        year_period_set.append(id)

    year_period_set = year_period_set[:-1]

    years = [x[0] for x in year_period_set]
    periods = [x[1] for x in year_period_set]

    # make dataframes and join up
    fill_na_time = pd.DataFrame({"Year": years, period: periods})

    fill_na_companies = pd.DataFrame({"merchant_abn": list(distinct_merchants)})

    fill_na_df = fill_na_time.merge(fill_na_companies, how = 'cross')

    for col in transact_agg_comp_period_df.columns[3:]:
        fill_na_df[col] = 0

    return fill_na_df

In [15]:
def correct_stray_period(transact_agg_comp_period_df, period):
        """ correct problem of first few days of 2022 classified as wk 52 or fortnight 26, 
        which under our aggregation becomes separate week of 2022 week 52 or fortnight 26 """

        # figure out what the stray period number is
        if period == 'Week':
                stray_period_id = 52
        else:
                stray_period_id = 26

        # take out just the stray period data
        stray_period = transact_agg_comp_period_df[(transact_agg_comp_period_df[period] == stray_period_id) & 
                (transact_agg_comp_period_df['Year'] == 2022)]

        transact_agg_comp_period_df = transact_agg_comp_period_df.drop(stray_period.index)

        # update the stray period data's year
        stray_period['Year'] = stray_period['Year']-1
        # put it back
        transact_agg_comp_period_df = pd.concat([transact_agg_comp_period_df, stray_period])

        return transact_agg_comp_period_df

In [16]:
def fill_na(transact_agg_comp_period_df, fill_na_df, period):
    """ Helper function to fill na """
    
    transact_agg_comp_period_filled_df = pd.concat([transact_agg_comp_period_df, fill_na_df])
    transact_agg_comp_period_filled_df = transact_agg_comp_period_filled_df.groupby(['Year', period, 'merchant_abn'])\
        .agg({'sum_transactions': sum, 'number_of_customers': sum, 'distinct_customers': sum})
    transact_agg_comp_period_filled_df = transact_agg_comp_period_filled_df.reset_index()

    return transact_agg_comp_period_filled_df

In [17]:
def remove_prestart_filled_weeks(transact_agg_comp_period_filled_df, period): 
    """ Helper function to clean out wrongly filled 0 weeks (which will distort mean and variance) """

    # remove weeks before first BNPL transaction
    transact_agg_comp_period_filled_adjusted_df = pd.DataFrame()

    for id, dta in transact_agg_comp_period_filled_df.groupby(['merchant_abn']):

        # iterate down the rows, and drop rows until first row where there is non0 transaction value
        dta = dta.sort_values(['Year', 'Fortnight'])
        dta.index = range(len(dta))

        for row in dta.iterrows(): 
            if row[1][5] == 0: # if row's transaction value = 0
                dta = dta.drop(row[0])
            
            else:
                break

        transact_agg_comp_period_filled_adjusted_df = pd.concat([transact_agg_comp_period_filled_adjusted_df, dta])

    return transact_agg_comp_period_filled_adjusted_df

In [18]:
def get_transactions_agg_by_period(transact_agg_comp_period_df, period):
    """ ETL function that runs all helpers to clean the aggregated merchant transaction data """

    fill_na_df = get_fill_na_df(transact_agg_comp_period_df, period)

    transact_agg_comp_period_df = correct_stray_period(transact_agg_comp_period_df, period)

    transact_agg_comp_period_filled_df = fill_na(transact_agg_comp_period_df, fill_na_df, period)

    transact_agg_comp_period_filled_adjusted_df = remove_prestart_filled_weeks(transact_agg_comp_period_filled_df, period)

    return transact_agg_comp_period_filled_adjusted_df

In [19]:
transact_agg_comp_fortnight_filled_adjusted_df = get_transactions_agg_by_period(transact_agg_comp_fortnight_df, 'Fortnight')

transact_agg_comp_fortnight_fill_adjusted_train_df = get_transactions_agg_by_period(transact_agg_by_comp_fortnight_train_df, 'Fortnight')

transact_agg_comp_fortnight_fill_adjusted_validate_df = get_transactions_agg_by_period(transact_agg_by_comp_fortnight_validate_df, 'Fortnight')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stray_period['Year'] = stray_period['Year']-1


Run once without take rate - export for other model use

In [20]:
transact_agg_comp_fortnight_filled_adjusted_df.to_csv('../data/curated/final_model/input/fortnightly_agg_merchant_transactions_NOTAKE_NOFRAUD.csv', index=False)

transact_agg_comp_fortnight_fill_adjusted_train_df.to_csv('../data/curated/final_model/input/fortnightly_agg_merchant_transactions_train_NOTAKE_NOFRAUD.csv', index=False)

transact_agg_comp_fortnight_fill_adjusted_validate_df.to_csv('../data/curated/final_model/input/fortnightly_agg_merchant_transactions_validate_NOTAKE_NOFRAUD.csv', index=False)

Apply take rate and rerun

In [23]:
take_rate = pd.read_csv('../data/curated/clusters/output/merchant_take_rate_full.csv')
take_rate = take_rate.drop('label', axis=1)

In [24]:
def apply_take_rate(transact_agg_comp_filled_adjusted_df, take_rate):
    """ Helper function to run """

    transact_agg_comp_filled_adjusted_df = transact_agg_comp_filled_adjusted_df.merge(take_rate, on='merchant_abn', how='inner')
    transact_agg_comp_filled_adjusted_df['sum_transactions'] = transact_agg_comp_filled_adjusted_df['sum_transactions'] * transact_agg_comp_filled_adjusted_df['take_rate']
    
    return transact_agg_comp_filled_adjusted_df

In [25]:
transact_agg_comp_fortnight_filled_adjusted_df = apply_take_rate(transact_agg_comp_fortnight_filled_adjusted_df, take_rate)

transact_agg_comp_fortnight_fill_adjusted_train_df = apply_take_rate(transact_agg_comp_fortnight_fill_adjusted_train_df, take_rate)

transact_agg_comp_fortnight_fill_adjusted_validate_df = apply_take_rate(transact_agg_comp_fortnight_fill_adjusted_validate_df, take_rate)

In [26]:
transact_agg_comp_fortnight_filled_adjusted_df.to_csv('../data/curated/final_model/input/fortnightly_agg_merchant_transactions_NOFRAUD.csv', index=False)

transact_agg_comp_fortnight_fill_adjusted_train_df.to_csv('../data/curated/final_model/input/fortnightly_agg_merchant_transactions_train_NOFRAUD.csv', index=False)

transact_agg_comp_fortnight_fill_adjusted_validate_df.to_csv('../data/curated/final_model/input/fortnightly_agg_merchant_transactions_validate_NOFRAUD.csv', index=False)

In [27]:
len(transact_agg_comp_fortnight_filled_adjusted_df)

162709

In [28]:
len(transact_agg_comp_fortnight_fill_adjusted_train_df)

97179

In [29]:
len(transact_agg_comp_fortnight_fill_adjusted_validate_df)

62021

# Get Mean and Variance

In [30]:
def get_nperiods_and_drop_low_counts(transact_agg_comp_periods_df, period, low_counts=2):
    """ get nperiod values and also drop merchants that don't have enough periods of records """

    # get number of periods with observed data per merchant
    n_periods = transact_agg_comp_periods_df.groupby(['merchant_abn']).agg({period: 'count'})
    n_periods = n_periods.rename(columns = {period: 'n_periods'})

    # get list of low count abns and drop
    low_count_merchants = n_periods[n_periods['n_periods'] <= low_counts].index

    for abn in list(low_count_merchants):
        transact_agg_comp_periods_df = transact_agg_comp_periods_df[transact_agg_comp_periods_df['merchant_abn'] != abn]

    return transact_agg_comp_periods_df, n_periods

In [31]:
def get_mean_sd_nperiods(transact_agg_comp_period_df, n_periods):
     """ Helper to get mean, sd and nperiods of merchants """

     # create replicate column to allow for two aggregations on the same column of data
     transact_agg_comp_period_df['sum_transactions2'] = transact_agg_comp_period_df['sum_transactions']
     mean_sd = transact_agg_comp_period_df.groupby(['merchant_abn']).agg({'sum_transactions': mean, 'sum_transactions2': stdev})

     mean_sd = mean_sd.rename(columns = {'sum_transactions': 'mean',\
          'sum_transactions2': 'stdev'})

     mean_sd = mean_sd.reset_index()
     
     mean_sd_nperiods = mean_sd.merge(n_periods, on = ['merchant_abn'], how = 'inner')

     return mean_sd_nperiods

In [32]:
def get_merchant_mean_sd_nperiods(transact_agg_comp_periods_df, period, low_counts = 2):
    """ Function that runs helpers to get the merchant data in mean, sd and nperiods """

    transact_agg_comp_periods_df, n_periods = get_nperiods_and_drop_low_counts(transact_agg_comp_periods_df, period, low_counts)

    mean_sd_nperiods = get_mean_sd_nperiods(transact_agg_comp_periods_df, n_periods)

    return mean_sd_nperiods

In [33]:
mean_sd_nperiods_fortnight = get_merchant_mean_sd_nperiods(transact_agg_comp_fortnight_filled_adjusted_df, 'Fortnight', low_counts = 2)

mean_sd_nperiods_fortnight_train = get_merchant_mean_sd_nperiods(transact_agg_comp_fortnight_fill_adjusted_train_df, 'Fortnight', low_counts = 2)

mean_sd_nperiods_fortnight_validate = get_merchant_mean_sd_nperiods(transact_agg_comp_fortnight_fill_adjusted_validate_df, 'Fortnight', low_counts = 2)

In [34]:
mean_sd_nperiods_fortnight

Unnamed: 0,merchant_abn,mean,stdev,n_periods
0,10023283211,71944.954088,15268.757666,38
1,10142254217,22687.034837,6496.702667,38
2,10187291046,6831.716274,3379.999688,38
3,10192359162,40955.314495,17639.407542,38
4,10206519221,88038.400673,21292.527062,38
...,...,...,...,...
4372,99938978285,106551.591313,21699.148448,38
4373,99974311662,6326.735554,4783.350900,38
4374,99976658299,871306.137224,186985.910229,38
4375,99987905597,16290.096561,10052.773545,38


# Market

In [35]:
def get_market(transact_agg_comp_period_df, period):
    """ get the sum of all merchants data by period """

    market = transact_agg_comp_period_df.groupby(['Year', period])\
        .agg({'sum_transactions': sum, 'number_of_customers': 'count', 'distinct_customers': lambda x: x.nunique()})
    
    market = market.reset_index()

    return market

In [36]:
def get_market_mean_and_sd(market):
    """ get the mean and sd of the market """

    market_mean = mean(market['sum_transactions'])
    market_sd = stdev(market['sum_transactions'])

    return market_mean, market_sd

In [37]:
def get_market_and_stats(transact_agg_comp_period_df, period):

    market = get_market(transact_agg_comp_period_df, period)

    market_mean, market_sd = get_market_mean_and_sd(market)

    return market, market_mean, market_sd

In [38]:
market_fortnight, market_mean_fortnight, market_sd_fortnight = get_market_and_stats(transact_agg_comp_fortnight_df, 'Fortnight')

market_fortnight_train, market_mean_fortnight_train, market_sd_fortnight_train = get_market_and_stats(transact_agg_comp_fortnight_df, 'Fortnight')

market_fortnight_validate, market_mean_fortnight_validate, market_sd_fortnight_validate = get_market_and_stats(transact_agg_comp_fortnight_df, 'Fortnight')

In [39]:
market_fortnight.to_csv('../data/curated/final_model/input/market-all_by_fortnight_NOFRAUD.csv', index=False)

market_fortnight_train.to_csv('../data/curated/final_model/input/market-all_by_fortnight_train_NOFRAUD.csv', index=False)

market_fortnight_validate.to_csv('../data/curated/final_model/input/market-all_by_fortnight_validate_NOFRAUD.csv', index=False)

# Correlation

In [40]:
def get_corr_df(transact_agg_comp_period_df, market, period):
    """ get dataframe of correlations """

    merchant_abns = list()
    corr = list()

    # get each abn's periodic data, left join on market, and then get correlation
    for abn in list(set(transact_agg_comp_period_df['merchant_abn'])):
        merchant = transact_agg_comp_period_df[transact_agg_comp_period_df['merchant_abn'] == abn]

        merchant_market = merchant.merge(market, how = 'left', on = ['Year', period])
        merchant_abns.append(abn)
        corr.append(np.corrcoef(merchant_market['sum_transactions_x'], merchant_market['sum_transactions_y'])[0][1])

    corr_df = pd.DataFrame({'merchant_abn': merchant_abns, 'corr': corr})

    return corr_df

In [41]:
def get_mean_sd_corr_nperiods(mean_sd_nperiods, corr_df, market_sd):
    """ add corr and corr related variables to mean, sd dataframe """

    mean_sd_corr_nperiods = mean_sd_nperiods.merge(corr_df, on = ['merchant_abn'], how = 'inner')
    
    return mean_sd_corr_nperiods

In [42]:
def get_corr(transact_agg_comp_period_df, market, period, mean_sd_nperiods, market_sd):
    
    corr_df = get_corr_df(transact_agg_comp_period_df, market, period)

    mean_sd_corr_nperiods = get_mean_sd_corr_nperiods(mean_sd_nperiods, corr_df, market_sd)

    return mean_sd_corr_nperiods

In [43]:
mean_sd_corr_nperiods_fortnight = get_corr(transact_agg_comp_fortnight_df, market_fortnight, \
    'Fortnight', mean_sd_nperiods_fortnight, market_sd_fortnight)

mean_sd_corr_nperiods_fortnight_train = get_corr(transact_agg_by_comp_fortnight_train_df, market_fortnight_train, \
    'Fortnight', mean_sd_nperiods_fortnight_train, market_sd_fortnight_train)

mean_sd_corr_nperiods_fortnight_validate = get_corr(transact_agg_by_comp_fortnight_validate_df, market_fortnight_validate, \
    'Fortnight', mean_sd_nperiods_fortnight_validate, market_sd_fortnight_validate)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [44]:
mean_sd_corr_nperiods_fortnight.to_csv('../data/curated/final_model/input/agg_fortnightly_mean_sd_marketcorr_NOFRAUD.csv', index=False)

mean_sd_corr_nperiods_fortnight_train.to_csv('../data/curated/final_model/input/agg_fortnightly_mean_sd_marketcorr_train_NOFRAUD.csv', index=False)

mean_sd_corr_nperiods_fortnight_validate.to_csv('../data/curated/final_model/input/agg_fortnightly_mean_sd_marketcorr_validate_NOFRAUD.csv', index=False)

# Analysis

In [45]:
mean_sd_corr_nperiods_fortnight.describe().drop(['merchant_abn', 'n_periods'], axis=1)[1:]

Unnamed: 0,mean,stdev,corr
mean,99232.16,28569.586714,0.48695
std,238667.2,50442.428787,0.368809
min,82.86602,331.464099,-1.0
25%,6326.736,5485.799123,0.238953
50%,22570.57,12733.813006,0.521843
75%,102994.6,33207.642652,0.804873
max,2412375.0,529327.504715,1.0
