In [55]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import DataFrameStatFunctions as statFunc
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost
import sklearn


In [56]:
spark = (
    SparkSession.builder.appName("preprocessing of taxi data")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [57]:
transaction_20210828_20220227_sdf = spark.read.parquet("../data/curated/transactions_20210828_20220227_all_details")
transaction_20210228_20210827_sdf = spark.read.parquet("../data/curated/transactions_20210228_20210827_all_details")
transaction_20210228_20210827_sdf = spark.read.parquet("../data/curated/transactions_20220228_20220828_all_details")
transaction_sdf = transaction_20210828_20220227_sdf \
                    .union(transaction_20210228_20210827_sdf) \
                    .union(transaction_20210228_20210827_sdf)

In [None]:
monthly_agg = transaction_sdf.groupBy(
    "merchant_abn",
    F.year("order_datetime"),
    F.month("order_datetime")
    ) \
    .agg(
      F.count("order_id").alias("number_of_order"),
      F.sum("dollar_value").alias("total_dollar_value")
    ) \
    .groupBy("merchant_abn") \
    .agg(
      F.mean("number_of_order").alias("monthly_average_number_of_orders"),
      F.mean("total_dollar_value").alias("monthly_average_bnpl_revenue"),
      F.stddev("total_dollar_value").alias("stddev_of_monthly_average_bnpl_revenue")
    )

agg_transaction_pred_sdf = transaction_sdf.filter(F.col("merchant_name").isNull()) \
                                          .groupBy("merchant_abn") \
                                          .agg(
                                               F.countDistinct("consumer_id").alias("total_number_of_distinct_customers"),
                                               F.stddev("dollar_value").alias("stddev_of_dorllar_value"),
                                               F.mean("dollar_value").alias("average_dollar_value"),
                                               F.percentile_approx("dollar_value", 0.5).alias("median_dollar_value")
                                               ) \
                                          .join(
                                            monthly_agg,
                                            ['merchant_abn', ],
                                            "left"
                                          ) \
                                          .write.csv("agg_transaction_pred.csv")



agg_transaction_train_sdf = transaction_sdf.filter(F.col("merchant_name").isNotNull()) \
                                          .groupBy("merchant_abn") \
                                          .agg(
                                               F.countDistinct("consumer_id").alias("total_number_of_distinct_customers"),
                                               F.stddev("dollar_value").alias("stddev_of_dorllar_value"),
                                               F.mean("dollar_value").alias("average_dollar_value"),
                                               F.percentile_approx("dollar_value", 0.5).alias("median_dollar_value")
                                               ) \
                                          .join(
                                            monthly_agg,
                                            ['merchant_abn', ],
                                            "left"
                                          ) \
                                          .write.csv("agg_transaction_train.csv")

In [5]:
transaction_sdf

                                                                                

user_id,merchant_abn,dollar_value,order_id,order_datetime,merchant_name,prod_desc,revenue_level,take_rate,consumer_name,address,state,postcode,gender,consumer_id
7,71041015148,226.70111326425848,ba1c3c0b-6143-49c...,2021-11-26,Orci Ltd,digital goods: bo...,c,2.1,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
7,14492521225,28.69248621050033,c82c2951-6a2c-4bf...,2021-11-26,Praesent Eu Assoc...,tent and awning s...,c,2.5,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
7,68004106739,93.96901941736228,40d71489-2203-485...,2021-11-26,Nec Ante Ltd,"cable, satellite,...",a,5.6,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
7,34096466752,92.43454448910455,293dc228-8398-49f...,2021-11-29,Nullam Enim Ltd,"computers, comput...",b,3.2,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
7,62773208456,24.606053328805864,918d63ec-d125-43b...,2021-11-29,Ac Institute,"watch, clock, and...",c,2.6,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
7,67874735704,871.400942067755,0dc0a909-7b11-423...,2021-11-29,Ultricies Adipisc...,florists supplies...,c,2.0,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
7,36125151647,22.37653874205458,ebd96051-41c5-44d...,2021-11-29,Sed Nec Corp.,"hobby, toy and ga...",c,1.8,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
7,86578477987,14.932370248640328,b13248d9-3c6d-4b0...,2021-11-29,Leo In Consulting,"watch, clock, and...",a,6.4,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
7,48534649627,16.362338872391526,96a5d5f4-2f24-4c9...,2021-11-29,Dignissim Maecena...,"opticians, optica...",a,6.6,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685
7,76767266140,144.7959046161148,734de9e3-dcc1-4da...,2021-11-29,Phasellus At Limited,"furniture, home f...",b,4.6,Andrea Jones,122 Brandon Cliff,QLD,4606,Female,511685


# Getting the data that need to be predicted

In [6]:
agg_transaction_pred_sdf = transaction_sdf.filter(F.col("merchant_name").isNull()) \
                                     .groupBy("merchant_abn") \
                                     .agg(F.countDistinct("consumer_id"),
                                          F.count("merchant_abn"),
                                          F.sum("dollar_value"),
                                          F.stddev("dollar_value"),
                                          F.sum("dollar_value") / F.count("order_id"),
                                          F.percentile_approx("dollar_value", 0.5)
                                         )\
                                     .withColumnRenamed("count(consumer_id)", "distinct_consumer_count") \
                                     .withColumnRenamed("count(merchant_abn)", "order_count") \
                                     .withColumnRenamed("sum(dollar_value)", "total_revenue") \
                                     .withColumnRenamed("percentile_approx(dollar_value, 0.5, 10000)",
                                                        "median_dollar_value") \
                                     .withColumnRenamed("stddev_samp(dollar_value)", "dollar_value_standard_dev") \
                                     .withColumnRenamed("(sum(dollar_value) / count(order_id))", "dollar_per_order")
                                    
agg_transaction_pred_sdf

                                                                                

merchant_abn,distinct_consumer_count,order_count,total_revenue,dollar_value_standard_dev,dollar_per_order,median_dollar_value
24406529929,2943,4968,327429.3857203502,65.83948623583439,65.90768633662444,45.96901058385736
28767881738,3,4,16538.731756372705,2543.5079878257584,4134.682939093176,2883.3416199954663
56395390867,38,58,55589.8695520354,637.1183882996874,958.446026759231,861.6872806970155
98345415950,8,12,105269.44253670526,7913.006782425887,8772.453544725438,3896.208340820999
45925655949,67,99,771700.7514011513,5544.280890452739,7794.957084860114,5929.815410254777
18899854203,10,17,32738.084698023667,916.912241509521,1925.7696881190395,1931.0378187385836
31507950402,43,70,828801.12754154,6479.452915154887,11840.016107736285,10930.959611877985
87802246756,2057,3386,245853.59327101917,43.91162022558144,72.60885802451836,63.93463642725792
87921002735,313,505,1038863.8138238942,1193.9446974968198,2057.1560669780083,1822.206905815044
92220967360,301,480,725244.5330631013,763.5021679402876,1510.9261105481278,1479.0966455009404


In [14]:
agg_transaction_pred_sdf.toPandas().to_csv('agg_transaction_pred_sdf.csv')

                                                                                

# Getting the training dataset

In [7]:
agg_transaction_train_sdf = transaction_sdf.filter(F.col("merchant_name").isNotNull()) \
                                     .groupBy("merchant_abn") \
                                     .agg(F.countDistinct("consumer_id"),
                                          F.count("merchant_abn"),
                                          F.sum("dollar_value"),
                                          F.stddev("dollar_value"),
                                          F.sum("dollar_value") / F.count("order_id"),
                                          F.percentile_approx("dollar_value", 0.5),
                                          F.first("take_rate")
                                         )\
                                     .withColumnRenamed("count(consumer_id)", "distinct_consumer_count") \
                                     .withColumnRenamed("count(merchant_abn)", "order_count") \
                                     .withColumnRenamed("sum(dollar_value)", "total_revenue") \
                                     .withColumnRenamed("percentile_approx(dollar_value, 0.5, 10000)",
                                                        "median_dollar_value") \
                                     .withColumnRenamed("stddev_samp(dollar_value)", "dollar_value_standard_dev") \
                                     .withColumnRenamed("(sum(dollar_value) / count(order_id))", "dollar_per_order") \
                                     .withColumnRenamed("first(take_rate)", "take_rate")
agg_transaction_train_sdf

                                                                                

merchant_abn,distinct_consumer_count,order_count,total_revenue,dollar_value_standard_dev,dollar_per_order,median_dollar_value,take_rate
10023283211,2303,3839,833176.759060391,141.15063824319634,217.02963247209976,183.498769765699,0.1
10342410215,615,988,382746.0645105066,277.6588585943003,387.3948021361402,319.4335420059913,6.3
10346855916,4,7,20039.35054697081,2195.2154339169288,2862.764363852973,1823.107772751072,3.5
10385163239,56,93,31227.18974316143,214.4157719556225,335.7762337974347,303.3375490743265,6.6
10648956813,11769,25654,1655961.4818585718,22.01799247391858,64.549835575683,64.30379773305644,6.6
10714068705,2454,4106,524871.5849475799,106.6270620695824,127.83039087861177,99.76539947573724,2.5
11024352823,337,564,118272.65769805566,123.47104096058492,209.70329379087883,181.33232586474924,2.6
11076688542,44,64,42672.34193990541,209.72979041869183,666.7553428110219,695.2640451261304,2.5
11243046390,362,565,134674.4700038157,207.6491002921548,238.3618938120632,188.6112371781011,1.9
11848576000,80,126,264091.9924150258,1070.69227581246,2095.9681937700457,1910.080955818674,1.5


In [16]:
agg_transaction_train_sdf.toPandas().to_csv('agg_transaction_train_sdf.csv')

                                                                                

In [8]:
agg_transaction_train_df = agg_transaction_train_sdf.toPandas()
agg_transaction_pred_df = agg_transaction_pred_sdf.toPandas()

                                                                                

In [9]:
agg_transaction_train_df = agg_transaction_train_df.drop('merchant_abn', axis=1)

In [10]:
agg_transaction_train_df = agg_transaction_train_df.fillna(0)

In [11]:
X = agg_transaction_train_df.iloc[:,:-1]
y = agg_transaction_train_df.iloc[:,-1]
X.iloc[:1,:1]

Unnamed: 0,distinct_consumer_count
0,2303


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=30034)

In [52]:
r = xgboost.XGBRegressor(n_estimators=10, learning_rate=0.3)

In [53]:
r.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.3, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=10, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [54]:
testing_accuracy = r.score(X_test, y_test)
print(testing_accuracy)

-0.04640719718824493


22/09/16 00:21:25 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 304270 ms exceeds timeout 120000 ms
22/09/16 00:21:25 WARN SparkContext: Killing executors is not supported by current scheduler.


In [27]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(r, X, y, cv=5)

In [28]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

-0.21 accuracy with a standard deviation of 0.04


In [59]:
rf = RandomForestRegressor()
rf.fit(X,y)
rf.score(X,y)

0.8445085179839061

In [50]:
scores = cross_val_score(rf, X, y, cv=5)

In [51]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

-0.13 accuracy with a standard deviation of 0.02


In [30]:
X.count()

distinct_monthly_consumer_count    4022
monthly_order_count                4022
monthly_revenue                    4022
dollar_value_standard_dev          4017
dollar_per_order                   4022
median_dollar_value                4022
dtype: int64

In [84]:
X.dtypes

distinct_monthly_consumer_count      int64
monthly_order_count                  int64
monthly_revenue                    float64
dollar_value_standard_dev          float64
dollar_per_order                   float64
median_dollar_value                float64
dtype: object

In [1]:
from sklearn.neural_network import MLPRegressor
regr = MLPRegressor().fit(X_train, y_train)
regr.score(X_train, y_train)

NameError: name 'X_train' is not defined