# Aggregate data for first 3 variables of final model

In [54]:
# import libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [55]:
# setup spark
spark = (
    SparkSession.builder.appName("aggregate data for first 3 final model variables")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [56]:
# import data
data1 = spark.read.parquet("../data/tables/transactions_20210228_20210827_snapshot/")
data2 = spark.read.parquet("../data/tables/transactions_20210828_20220227_snapshot/")
data3 = spark.read.parquet("../data/tables/transactions_20220228_20220828_snapshot/")

                                                                                

In [57]:
data = data1.union(data2)
data = data.union(data3)

In [58]:
# create some new columns
data = data.withColumn("Week", F.weekofyear("order_datetime"))\
        .withColumn("Year", F.year("order_datetime"))\
        .withColumn("Month", F.month("order_datetime"))\
        .withColumn("Day", F.dayofmonth("order_datetime"))

In [59]:
# drop 2-28 because we are aggregating by week/fortnight; 2021-03-01 is first monday of dataset, 2022-08-14 is last sunday of dataset  
data = data.filter(((data.Month >= 3) & (data.Year >= 2021)) | ((data.Day <= 14) & (data.Month <= 8) & (data.Year <= 2022)))

## ONE THOUGHT: START THE WEEK ON A SUNDAY... but how does it affect the cycle? domain knowledge

## THEORY: create a new column that is 1 day later than current date, and then take weekofyear based on it.

In [60]:
data

user_id,merchant_abn,dollar_value,order_id,order_datetime,Week,Year,Month,Day
18478,62191208634,63.255848959735246,949a63c8-29f7-4ab...,2021-08-20,33,2021,8,20
2,15549624934,130.3505283105634,6a84c3cf-612a-457...,2021-08-20,33,2021,8,20
18479,64403598239,120.15860593212784,b10dcc33-e53f-425...,2021-08-20,33,2021,8,20
3,60956456424,136.6785200286976,0f09c5a5-784e-447...,2021-08-20,33,2021,8,20
18479,94493496784,72.96316578355305,f6c78c1a-4600-4c5...,2021-08-20,33,2021,8,20
3,76819856970,448.529684285612,5ace6a24-cdf0-4aa...,2021-08-20,33,2021,8,20
18479,67609108741,86.4040605836911,d0e180f0-cb06-42a...,2021-08-20,33,2021,8,20
3,34096466752,301.5793450525113,6fb1ff48-24bb-4f9...,2021-08-20,33,2021,8,20
18482,70501974849,68.75486276223054,8505fb33-b69a-412...,2021-08-20,33,2021,8,20
4,49891706470,48.89796461900801,ed11e477-b09f-4ae...,2021-08-20,33,2021,8,20


In [61]:
data.count()

                                                                                

13598410

## Aggregate by Weeks

In [88]:
# data.orderBy(F.col("order_datetime").desc())

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime,Week,Year,Month,Day
11144,75104340635,182.8761482952268,73183bb6-d2f5-46f...,2022-10-26,43,2022,10,26
11142,57900494384,9.631165044586432,9a901339-9239-414...,2022-10-26,43,2022,10,26
22289,47086412084,30.69223575655585,fad96b4c-99b4-4a3...,2022-10-26,43,2022,10,26
22284,48701447259,113.3370672936551,534f804f-c8b4-421...,2022-10-26,43,2022,10,26
22287,41944909975,64.812002635129,988453e9-9d47-42c...,2022-10-26,43,2022,10,26
22288,81219314324,145.09435777132026,c956b44f-0386-4cc...,2022-10-26,43,2022,10,26
11144,80518954462,661.4437679336016,4b26e4a6-5bb0-42c...,2022-10-26,43,2022,10,26
22284,63290521567,25.30482082897359,359028c0-59b7-4e2...,2022-10-26,43,2022,10,26
11139,40252040480,55.01140475396662,d031d695-6041-4bf...,2022-10-26,43,2022,10,26
22286,96442520325,19.738085840632262,472bcf37-47a9-412...,2022-10-26,43,2022,10,26


In [142]:
# aggregate by week; retain sum of transactions, number of customers, number of transactions
transact_agg_by_comp_week = data.groupBy("Year", "Week", "merchant_abn")\
        .agg(F.sum("dollar_value")\
        .alias("sum_transactions"), \
         F.count("order_id").alias("number_of_customers"), \
         F.countDistinct("user_id").alias("distinct_customers"))

In [144]:
# turn into pandas dataframe
transact_agg_comp_week_df = transact_agg_by_comp_week.toPandas()

                                                                                

In [163]:
len(transact_agg_comp_week_df)

290684

In [145]:
## Fill up na
# get set of merchants
distinct_merchants = set(transact_agg_comp_week_df.merchant_abn)

In [146]:
# get set of year and week
year_week_set = list()

for id, dta in transact_agg_comp_week_df.groupby(['Year', 'Week']):
    year_week_set.append(id)

In [147]:
year_week_set = year_week_set[:-1]

In [148]:
years = [x[0] for x in year_week_set]
weeks = [x[1] for x in year_week_set]

fill_na_time = pd.DataFrame({"Year": years, "Week": weeks})

In [149]:
fill_na_companies = pd.DataFrame({"merchant_abn": list(distinct_merchants)})

In [150]:
fill_na = fill_na_time.merge(fill_na_companies, how = 'cross')

for col in transact_agg_comp_week_df.columns[3:]:
    fill_na[col] = 0

In [151]:
transact_agg_comp_week_df

Unnamed: 0,Year,Week,merchant_abn,sum_transactions,number_of_customers,distinct_customers
0,2021,33,97259357542,663.743667,9,9
1,2021,20,55270512213,619.075001,4,4
2,2021,21,72472909171,74922.723247,1041,1024
3,2021,34,38425242099,938.438232,6,6
4,2021,31,89502033586,15954.471504,218,215
...,...,...,...,...,...,...
290679,2021,38,92426644718,1401.439911,1,1
290680,2022,42,49802393568,619.726511,1,1
290681,2022,36,56488744658,5038.085170,1,1
290682,2021,32,98065308169,113.314105,1,1


In [152]:
# correct problem of first few days of 2022 classified as wk 52, and under our aggregation becomes
# 2022 week 52

week52 = transact_agg_comp_week_df[(transact_agg_comp_week_df['Week'] == 52) & 
        (transact_agg_comp_week_df['Year'] == 2022)]

transact_agg_comp_week_df = transact_agg_comp_week_df.drop(week52.index)

week52['Year'] = week52['Year']-1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  week52['Year'] = week52['Year']-1


In [153]:
transact_agg_comp_week_df = pd.concat([transact_agg_comp_week_df, week52])


In [154]:
transact_agg_comp_week_filled_df = pd.concat([transact_agg_comp_week_df, fill_na])
transact_agg_comp_week_filled_df = transact_agg_comp_week_filled_df.groupby(['Year', 'Week', 'merchant_abn'])\
    .agg({'sum_transactions': sum, 'number_of_customers': sum, 'distinct_customers': sum})

In [155]:
transact_agg_comp_week_filled_df = transact_agg_comp_week_filled_df.reset_index()

In [156]:
transact_agg_comp_week_filled_df

Unnamed: 0,Year,Week,merchant_abn,sum_transactions,number_of_customers,distinct_customers
0,2021,9,10023283211,6362.419840,27,27
1,2021,9,10142254217,1138.039207,30,30
2,2021,9,10165489824,0.000000,0,0
3,2021,9,10187291046,536.875497,3,3
4,2021,9,10192359162,1198.636395,4,4
...,...,...,...,...,...,...
371275,2022,43,99974311662,0.000000,0,0
371276,2022,43,99976658299,22885.007579,154,154
371277,2022,43,99987905597,571.928910,1,1
371278,2022,43,99989036621,0.000000,0,0


In [157]:
# remove weeks before joining BNPL
transact_agg_comp_week_filled_adj_df = pd.DataFrame()

for id, dta in transact_agg_comp_week_df.groupby(['merchant_abn']):
    for row in dta.iterrows():
        if row[1][5] == 0:
            dta = dta.drop(row[0])
        
        else:
            break

    transact_agg_comp_week_filled_adj_df = pd.concat([transact_agg_comp_week_filled_adj_df, dta])


In [162]:
transact_agg_comp_week_filled_adj_df.sort_values(['Year', 'Week'])

Unnamed: 0,Year,Week,merchant_abn,sum_transactions,number_of_customers,distinct_customers
127562,2021,9,10023283211,6362.419840,27,27
17792,2021,9,10142254217,1138.039207,30,30
245732,2021,9,10187291046,536.875497,3,3
192236,2021,9,10192359162,1198.636395,4,4
219180,2021,9,10206519221,3214.728342,85,84
...,...,...,...,...,...,...
99868,2022,43,99861963809,104.636859,1,1
256789,2022,43,99904689266,4396.947704,43,43
222156,2022,43,99938978285,3594.756447,100,100
86456,2022,43,99976658299,22885.007579,154,154


Create mean and stdev

In [179]:
# nweeks = transact_agg_comp_week_df.groupby(['merchant_abn']).agg({'Week': 'count'})

# print("Number of companies with less than i weeks of activity")
# for i in range(20):

#     print(i, len(nweeks[nweeks['Week']<= i]))

Number of companies with less than i weeks of activity
0 0
1 15
2 31
3 60
4 92
5 122
6 148
7 170
8 197
9 225
10 255
11 291
12 321
13 345
14 367
15 392
16 422
17 443
18 459
19 476


In [182]:
drop_low_count = transact_agg_comp_week_df.groupby(['merchant_abn']).agg({'Week': 'count'})
low_count_merchants = drop_low_count[drop_low_count['Week'] == 1].index
low_count_merchants

Int64Index([10404542215, 18261886835, 29068447069, 37145789569, 39150153670,
            47047735645, 50154587122, 50532670634, 50794104497, 62688594508,
            67213521157, 67264251405, 86868464441, 93267734067, 99989036621],
           dtype='int64', name='merchant_abn')

In [184]:
for abn in list(low_count_merchants):
    transact_agg_comp_week_df = transact_agg_comp_week_df[transact_agg_comp_week_df['merchant_abn'] != abn]

In [187]:
from statistics import mean, stdev

transact_agg_comp_week_df['sum_transactions2'] = transact_agg_comp_week_df['sum_transactions']
mean_sd = transact_agg_comp_week_df.groupby(['merchant_abn']).agg({'sum_transactions': mean, 'sum_transactions2': stdev})

mean_sd = mean_sd.rename(columns = {'sum_transactions': 'mean',\
     'sum_transactions2': 'stdev'})

In [188]:
mean_sd

Unnamed: 0_level_0,mean,stdev
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1
10023283211,7907.369218,2447.382313
10142254217,1335.596313,503.821199
10165489824,11310.946039,8097.733445
10187291046,478.606199,285.418837
10192359162,2039.630643,1293.121995
...,...,...
99938978285,5890.611154,1536.838745
99974311662,560.216446,429.376847
99976658299,39085.939358,10195.335136
99987905597,964.755729,714.906307


In [189]:
mean_sd.describe()

Unnamed: 0,mean,stdev
count,4405.0,4405.0
mean,6942.65818,2831.66216
std,13636.636108,4346.592733
min,118.989899,66.797244
25%,767.880926,494.669073
50%,2469.659743,1304.649868
75%,8745.031169,3429.232284
max,111008.514444,46934.404559


# Market

In [204]:
market = transact_agg_comp_week_df.groupby(['Year', 'Week'])\
    .agg({'sum_transactions': sum, 'number_of_customers': 'count', 'distinct_customers': lambda x: x.nunique()})

In [205]:
market = market.reset_index()

In [209]:
market_mean = mean(market['sum_transactions'])
market_sd = stdev(market['sum_transactions'])

market_mean, market_sd

(26899283.712585505, 6184225.293264331)