# Aggregate data for first 3 variables of final model

In [168]:
# import libraries
from gettext import npgettext
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np
from statistics import mean, stdev

import json

In [2]:
# setup spark
spark = (
    SparkSession.builder.appName("aggregate data for first 3 final model variables")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/14 20:15:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [17]:
# import data
data1 = spark.read.parquet("../data/tables/transactions_20210228_20210827_snapshot/")
data2 = spark.read.parquet("../data/tables/transactions_20210828_20220227_snapshot/")
data3 = spark.read.parquet("../data/tables/transactions_20220228_20220828_snapshot/")

                                                                                

In [18]:
data = data1.union(data2)
data = data.union(data3)

In [19]:
# create some new columns
data = data.withColumn("Week", F.weekofyear("order_datetime"))\
        .withColumn("Year", F.year("order_datetime"))\
        .withColumn("Month", F.month("order_datetime"))\
        .withColumn("Day", F.dayofmonth("order_datetime"))\
        .withColumn("Fortnight", ((F.col("Week")+1)/2).cast('int'))

In [21]:
# drop dates that don't make a full week at start and end
data = data.filter(data["order_datetime"] >= F.lit('2021-03-01')) \
       .filter(data["order_datetime"] <= F.lit('2022-08-14'))
data

## ONE THOUGHT: START THE WEEK ON A SUNDAY... but how does it affect the cycle? domain knowledge

## THEORY: create a new column that is 1 day later than current date, and then take weekofyear based on it.

user_id,merchant_abn,dollar_value,order_id,order_datetime,Week,Year,Month,Day,Fortnight
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20,33,2021,8,20,17
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20,33,2021,8,20,17
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20,33,2021,8,20,17
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20,33,2021,8,20,17
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20,33,2021,8,20,17
3,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,2021-08-20,33,2021,8,20,17
18479,67609108741,86.4040605836911,d0e180f0-cb06-42a...,2021-08-20,33,2021,8,20,17
3,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,2021-08-20,33,2021,8,20,17
18482,70501974849,68.75486276223054,8505fb33-b69a-412...,2021-08-20,33,2021,8,20,17
4,49891706470,48.89796461900801,ed11e477-b09f-4ae...,2021-08-20,33,2021,8,20,17


In [22]:
data.count()

                                                                                

12157084

## Aggregate by Weeks

In [85]:
# aggregate by week and fortnight; retain sum of transactions, number of customers, number of transactions
transact_agg_by_comp_week = data.groupBy("Year", "Week", "merchant_abn")\
        .agg(F.sum("dollar_value")\
        .alias("sum_transactions"), \
         F.count("order_id").alias("number_of_customers"), \
         F.countDistinct("user_id").alias("distinct_customers"))


transact_agg_by_comp_fortnight = data.groupBy("Year", "Fortnight", "merchant_abn")\
        .agg(F.sum("dollar_value")\
        .alias("sum_transactions"), \
         F.count("order_id").alias("number_of_customers"), \
         F.countDistinct("user_id").alias("distinct_customers"))

In [86]:
# turn both into pandas dataframe
transact_agg_comp_week_df = transact_agg_by_comp_week.toPandas()

transact_agg_comp_fortnight_df = transact_agg_by_comp_fortnight.toPandas()

                                                                                

In [175]:
len(transact_agg_comp_week_df)

263758

In [176]:
len(transact_agg_comp_fortnight_df)

145859

In [114]:
def get_fill_na_df(transact_agg_comp_period_df, period):
    """ helper function to create dataframe of all combos and 0 value for fillup """

    ## Fill up na
    # get set of merchants
    distinct_merchants = set(transact_agg_comp_period_df.merchant_abn)

    # get set of year and period
    year_period_set = list()

    for id, dta in transact_agg_comp_period_df.groupby(['Year', period]):
        year_period_set.append(id)

    year_period_set = year_period_set[:-1]

    years = [x[0] for x in year_period_set]
    periods = [x[1] for x in year_period_set]

    # make dataframes and join up
    fill_na_time = pd.DataFrame({"Year": years, period: periods})

    fill_na_companies = pd.DataFrame({"merchant_abn": list(distinct_merchants)})

    fill_na_df = fill_na_time.merge(fill_na_companies, how = 'cross')

    for col in transact_agg_comp_week_df.columns[3:]:
        fill_na_df[col] = 0

    return fill_na_df

In [115]:
def correct_stray_period(transact_agg_comp_period_df, period):
        """ correct problem of first few days of 2022 classified as wk 52 or fortnight 26, 
        which under our aggregation becomes separate week of 2022 week 52 or fortnight 26 """

        # figure out what the stray period number is
        if period == 'Week':
                stray_period_id = 52
        else:
                stray_period_id = 26

        # take out just the stray period data
        stray_period = transact_agg_comp_period_df[(transact_agg_comp_period_df[period] == stray_period_id) & 
                (transact_agg_comp_period_df['Year'] == 2022)]

        transact_agg_comp_period_df = transact_agg_comp_period_df.drop(stray_period.index)

        # update the stray period data's year
        stray_period['Year'] = stray_period['Year']-1
        # put it back
        transact_agg_comp_period_df = pd.concat([transact_agg_comp_period_df, stray_period])

        return transact_agg_comp_period_df

In [116]:
def fill_na(transact_agg_comp_period_df, fill_na_df, period):
    """ Helper function to fill na """
    
    transact_agg_comp_period_filled_df = pd.concat([transact_agg_comp_period_df, fill_na_df])
    transact_agg_comp_period_filled_df = transact_agg_comp_period_filled_df.groupby(['Year', period, 'merchant_abn'])\
        .agg({'sum_transactions': sum, 'number_of_customers': sum, 'distinct_customers': sum})
    transact_agg_comp_period_filled_df = transact_agg_comp_period_filled_df.reset_index()

    return transact_agg_comp_period_filled_df

In [117]:
def remove_prestart_filled_weeks(transact_agg_comp_period_filled_df, period): 
    """ Helper function to clean out wrongly filled 0 weeks (which will distort mean and variance) """

    # remove weeks before first BNPL transaction
    transact_agg_comp_period_filled_adjusted_df = pd.DataFrame()

    for id, dta in transact_agg_comp_period_filled_df.groupby(['merchant_abn']):
        
        # iterate down the rows, and drop rows until first row where there is non0 transaction value
        dta = dta.sort_values(['Year', period])

        for row in dta.iterrows(): 
            if row[1][5] == 0: # if row's transaction value = 0
                dta = dta.drop(row[0])
            
            else:
                break

        transact_agg_comp_period_filled_adjusted_df = pd.concat([transact_agg_comp_period_filled_df, dta])

    return transact_agg_comp_period_filled_adjusted_df

In [118]:
def get_transactions_agg_by_period(transact_agg_comp_period_df, period):
    """ ETL function that runs all helpers to clean the aggregated merchant transaction data """

    fill_na_df = get_fill_na_df(transact_agg_comp_period_df, period)

    transact_agg_comp_period_df = correct_stray_period(transact_agg_comp_period_df, period)

    transact_agg_comp_period_filled_df = fill_na(transact_agg_comp_period_df, fill_na_df, period)

    transact_agg_comp_period_filled_adjusted_df = remove_prestart_filled_weeks(transact_agg_comp_period_filled_df, period)

    return transact_agg_comp_period_filled_adjusted_df

In [121]:
transact_agg_comp_week_filled_adjusted_df = get_transactions_agg_by_period(transact_agg_comp_week_df, 'Week')

transact_agg_comp_fortnight_filled_adjusted_df = get_transactions_agg_by_period(transact_agg_comp_fortnight_df, 'Fortnight')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stray_period['Year'] = stray_period['Year']-1


In [147]:
transact_agg_comp_week_filled_adjusted_df.to_csv('../data/curated/final_model/weekly_agg_merchant_transactions.csv', index=False)

transact_agg_comp_fortnight_filled_adjusted_df.to_csv('../data/curated/final_model/fortnightly_agg_merchant_transactions.csv', index=False)

In [174]:
len(transact_agg_comp_week_filled_adjusted_df)

331841

In [173]:
len(transact_agg_comp_fortnight_filled_adjusted_df)

168032

# Get Mean and Variance

In [100]:
## Inspect how many merchants have less than i weeks on record

# nweeks = transact_agg_comp_week_df.groupby(['merchant_abn']).agg({'Week': 'count'})

# print("Number of merchants with less than i weeks of activity")
# for i in range(20):

#     print(i, len(nweeks[nweeks['Week']<= i]))

In [124]:
def get_nperiods_and_drop_low_counts(transact_agg_comp_periods_df, period, low_counts=2):
    """ get nperiod values and also drop merchants that don't have enough periods of records """

    # get number of periods with observed data per merchant
    n_periods = transact_agg_comp_periods_df.groupby(['merchant_abn']).agg({period: 'count'})
    n_periods = n_periods.rename(columns = {period: 'n_periods'})

    # get list of low count abns and drop
    low_count_merchants = n_periods[n_periods['n_periods'] <= low_counts].index

    for abn in list(low_count_merchants):
        transact_agg_comp_periods_df = transact_agg_comp_periods_df[transact_agg_comp_periods_df['merchant_abn'] != abn]

    return transact_agg_comp_periods_df, n_periods

In [125]:
def get_mean_sd_nperiods(transact_agg_comp_week_df, n_periods):
     """ Helper to get mean, sd and nperiods of merchants """

     # create replicate column to allow for two aggregations on the same column of data
     transact_agg_comp_week_df['sum_transactions2'] = transact_agg_comp_week_df['sum_transactions']
     mean_sd = transact_agg_comp_week_df.groupby(['merchant_abn']).agg({'sum_transactions': mean, 'sum_transactions2': stdev})

     mean_sd = mean_sd.rename(columns = {'sum_transactions': 'mean',\
          'sum_transactions2': 'stdev'})

     mean_sd = mean_sd.reset_index()
     
     mean_sd_nperiods = mean_sd.merge(n_periods, on = ['merchant_abn'], how = 'inner')

     return mean_sd_nperiods

In [128]:
def get_merchant_mean_sd_nperiods(transact_agg_comp_periods_df, period, low_counts = 2):
    """ Function that runs helpers to get the merchant data in mean, sd and nperiods """

    transact_agg_comp_periods_df, n_periods = get_nperiods_and_drop_low_counts(transact_agg_comp_periods_df, period, low_counts)

    mean_sd_nperiods = get_mean_sd_nperiods(transact_agg_comp_week_df, n_periods)

    return mean_sd_nperiods

In [143]:
mean_sd_nperiods_week = get_merchant_mean_sd_nperiods(transact_agg_comp_week_filled_adjusted_df, 'Week', low_counts = 2)
mean_sd_nperiods_fortnight = get_merchant_mean_sd_nperiods(transact_agg_comp_fortnight_filled_adjusted_df, 'Fortnight', low_counts = 2)

# Market

In [136]:
def get_market(transact_agg_comp_period_df, period):
    """ get the sum of all merchants data by period """

    market = transact_agg_comp_period_df.groupby(['Year', period])\
        .agg({'sum_transactions': sum, 'number_of_customers': 'count', 'distinct_customers': lambda x: x.nunique()})

    market = market.reset_index()

    return market

In [137]:
def get_market_mean_and_sd(market):
    """ get the mean and sd of the market """

    market_mean = mean(market['sum_transactions'])
    market_sd = stdev(market['sum_transactions'])

    return market_mean, market_sd

In [138]:
def get_market_and_stats(transact_agg_comp_period_df, period):

    market = get_market(transact_agg_comp_period_df, period)

    market_mean, market_sd = get_market_mean_and_sd(market)

    return market, market_mean, market_sd

In [139]:
market_week, market_mean_week, market_sd_week = get_market_and_stats(transact_agg_comp_week_df, 'Week')

market_fortnight, market_mean_fortnight, market_sd_fortnight = get_market_and_stats(transact_agg_comp_fortnight_df, 'Fortnight')

In [148]:
market_week.to_csv('../data/curated/final_model/market-all_by_week.csv', index=False)

market_fortnight.to_csv('../data/curated/final_model/market-all_by_fortnight.csv', index=False)

In [154]:
json_data = {'Week': {'mean': market_mean_week, 'sd': market_sd_week}, \
    'Fortnight': {'mean': market_mean_fortnight, 'sd': market_sd_fortnight}}

Data = json.dumps(str(json_data))

with open('../data/curated/final_model/Market_stats.json', 'w') as f:
    json.dump(Data, f)

# Correlation

In [140]:
def get_corr_df(transact_agg_comp_period_df, market, period):
    """ get dataframe of correlations """

    merchant_abns = list()
    corr = list()

    # get each abn's periodic data, left join on market, and then get correlation
    for abn in list(set(transact_agg_comp_period_df['merchant_abn'])):
        merchant = transact_agg_comp_period_df[transact_agg_comp_period_df['merchant_abn'] == abn]

        merchant_market = merchant.merge(market, how = 'left', on = ['Year', period])
        merchant_abns.append(abn)
        corr.append(np.corrcoef(merchant_market['sum_transactions_x'], merchant_market['sum_transactions_y'])[0][1])

    corr_df = pd.DataFrame({'merchant_abn': merchant_abns, 'corr': corr})

    return corr_df

In [141]:
def get_mean_sd_corr_nperiods(mean_sd_nperiods, corr_df, market_sd):
    """ add corr and corr related variables to mean, sd dataframe """

    mean_sd_corr_nperiods = mean_sd_nperiods.merge(corr_df, on = ['merchant_abn'], how = 'inner')
    mean_sd_corr_nperiods['covar'] = mean_sd_corr_nperiods['stdev'] * mean_sd_corr_nperiods['corr'] * market_sd
    mean_sd_corr_nperiods['beta'] = mean_sd_corr_nperiods['covar']/(np.power(market_sd,2))
    
    return mean_sd_corr_nperiods

In [142]:
def get_corr(transact_agg_comp_period_df, market, period, mean_sd_nperiods, market_sd):
    
    corr_df = get_corr_df(transact_agg_comp_period_df, market, period)

    mean_sd_corr_nperiods = get_mean_sd_corr_nperiods(mean_sd_nperiods, corr_df, market_sd)

    return mean_sd_corr_nperiods

In [144]:
mean_sd_corr_nperiods_week = get_corr(transact_agg_comp_week_df, market_week, \
    'Week', mean_sd_nperiods_week, market_sd_week)
mean_sd_corr_nperiods_fortnight = get_corr(transact_agg_comp_fortnight_df, market_fortnight, \
    'Fortnight', mean_sd_nperiods_fortnight, market_sd_fortnight)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


In [155]:
mean_sd_corr_nperiods_week.to_csv('../data/curated/final_model/agg_weekly_mean_sd_marketcorr.csv', index=False)
mean_sd_corr_nperiods_fortnight.to_csv('../data/curated/final_model/agg_fortnightly_mean_sd_marketcorr.csv', index=False)

# Analysis

In [166]:
mean_sd_corr_nperiods_week.describe().drop(['merchant_abn', 'n_periods'], axis=1)[1:]


Unnamed: 0,mean,stdev,corr,covar,beta
mean,6820.88957,2664.480345,0.347603,6125052000.0,0.000221
std,13492.039026,4048.940496,0.314924,14934820000.0,0.000538
min,122.235276,67.364259,-0.9997,-88808390000.0,-0.0032
25%,752.050987,477.289633,0.141162,314186900.0,1.1e-05
50%,2396.810028,1250.690582,0.349894,1697796000.0,6.1e-05
75%,8446.905268,3177.137782,0.595043,7674692000.0,0.000277
max,109458.201106,44378.120027,0.999227,122895700000.0,0.004429


In [167]:
mean_sd_corr_nperiods_fortnight.describe().drop(['merchant_abn', 'n_periods'], axis=1)[1:]

Unnamed: 0,mean,stdev,corr,covar,beta
mean,6820.88957,2664.480345,0.481387,19196470000.0,0.000119
std,13492.039026,4048.940496,0.365678,42455420000.0,0.000263
min,122.235276,67.364259,-1.0,-212796100000.0,-0.001319
25%,752.050987,477.289633,0.230151,1281612000.0,8e-06
50%,2396.810028,1250.690582,0.513409,5983286000.0,3.7e-05
75%,8446.905268,3177.137782,0.802041,25311060000.0,0.000157
max,109458.201106,44378.120027,1.0,415899100000.0,0.002579


22/09/14 22:25:03 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 305719 ms exceeds timeout 120000 ms
22/09/14 22:25:03 WARN SparkContext: Killing executors is not supported by current scheduler.
