In [1]:
import numpy
import random
from time import time
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql import Row
from os.path import expanduser, join, abspath
from pyspark.sql.functions import col
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from pyspark.sql import functions as F
from pyspark.sql.functions import udf

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.sql.functions import col, asc
from pyspark.ml.feature import MinMaxScaler

from pyspark.sql import SparkSession
from pyspark.sql import Row

from matplotlib import cm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import sys
from pyspark.sql import *
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

from pysparkling import *
import h2o

PySparkling is using internally bundled H2O of version 3.24.0.1, but H2O installed in the python environment is of version 3.24.0.5.


In [2]:
sc

In [3]:
#Fetch data
feature_query="select * from grp_gdoop_clv_db.keep_cdf_final_features_t365d where record_date = '2018-06-12'"

df_features=spark.sql(feature_query)
df_features = df_features.filter(df_features.is_activation_x == 1).sample(False,0.01,301913)

df_features.cache()

print(df_features.count())

61116


In [4]:
df_features.groupby("most_recent_l1_cat_x","most_recent_l2_cat_x").count().show()

+--------------------+--------------------+-----+
|most_recent_l1_cat_x|most_recent_l2_cat_x|count|
+--------------------+--------------------+-----+
|          L1 - Local|    L2 - Home & Auto| 4168|
|          L1 - Local|               other|    4|
|          L1 - Local|L2 - Things to Do...| 2371|
|          L1 - Local|L2 - Things to Do...|17012|
|          L1 - Local|   L2 - Food & Drink| 6964|
|         L1 - Travel|              Travel| 1661|
|       L1 - Shopping|            Shopping|11462|
|               Local|               other|   33|
|          L1 - Local|         L2 - Retail| 7732|
|          L1 - Local|L2 - Health / Bea...| 9708|
|               other|               other|    1|
+--------------------+--------------------+-----+



In [4]:
target_query="select * from grp_gdoop_clv_db.temp_deact_target_t365d"

df_target=spark.sql(target_query)

df_target.cache()
df_target.groupby("deactivated").count().show()

+-----------+--------+
|deactivated|   count|
+-----------+--------+
|          1| 3262913|
|          0|11493143|
+-----------+--------+



In [5]:
tf = df_features.alias('tf')
tt = df_target.alias('tt')

training_data = tf.join(tt, "consumer_id" ,how='left_outer') 

training_data=training_data.fillna(1,subset="deactivated").drop("consumer_id","is_activation","is_reactivation_x","brand","record_date")

# need to fix queries and then remove
training_data = training_data.filter(training_data.most_recent_l2_cat_x != 'other')

training_data.cache()

DataFrame[recency_9block_cat_x: string, frequency_9block_cat_x: string, tenure_days_x: int, recency_x: int, frequency_t24m_x: bigint, nob_t24m_x: double, gp_t24m_x: double, frequency_t12m_x: bigint, nob_t12m_x: double, gp_t12m_x: double, local_orders_t24m_x: bigint, shopping_orders_t24m_x: bigint, travel_orders_t24m_x: bigint, app_orders_t24m_x: bigint, touch_orders_t24m_x: bigint, web_orders_t24m_x: bigint, most_recent_l1_cat_x: string, most_recent_l2_cat_x: string, most_recent_promo_type_cat_x: string, most_recent_platform_cat_x: string, is_activation_x: int, unique_purchase_quarters_t24m_x: bigint, visit_recency_x: int, visit_freq_7d_x: bigint, visit_freq_14_28d_x: bigint, visit_freq_28_60d_x: bigint, visit_freq_60_120d_x: bigint, visit_freq_120_180d_x: bigint, visit_freq_180_365d_x: bigint, appengaged_flag90d_x: int, webengaged_flag90d_x: int, touchengaged_flag90d_x: int, send_recency_x: int, sends_7d_x: bigint, sends_30d_x: bigint, open_recency_x: int, uniq_3day_opens_7d_x: bigint

In [7]:
training_data.groupby("deactivated").count().show()

+-----------+-----+
|deactivated|count|
+-----------+-----+
|          1|52124|
|          0| 8954|
+-----------+-----+



In [6]:
import pandas as pd
pd.DataFrame(training_data.take(5), columns=training_data.columns).transpose()

Unnamed: 0,0,1,2,3,4
recency_9block_cat_x,3-Low Rec (121-365 Days),3-Low Rec (121-365 Days),2-Med Rec (31-120 Days),3-Low Rec (121-365 Days),3-Low Rec (121-365 Days)
frequency_9block_cat_x,1-Low Freq (1 Order),1-Low Freq (1 Order),1-Low Freq (1 Order),1-Low Freq (1 Order),1-Low Freq (1 Order)
tenure_days_x,270,312,102,330,217
recency_x,270,312,102,330,217
frequency_t24m_x,1,1,1,1,1
nob_t24m_x,208.99,40,28.98,18,5
gp_t24m_x,-10,10.048,5.4763,8.5716,4.756
frequency_t12m_x,1,1,1,1,1
nob_t12m_x,208.99,40,28.98,18,5
gp_t12m_x,-10.3868,10.048,5.4763,8.5716,4.756


In [7]:
trainingdf,validatedf = training_data.randomSplit([.5,.5],492382)

trainingdf.cache()
validatedf.cache()
trainingdf.printSchema()

root
 |-- recency_9block_cat_x: string (nullable = true)
 |-- frequency_9block_cat_x: string (nullable = true)
 |-- tenure_days_x: integer (nullable = true)
 |-- recency_x: integer (nullable = true)
 |-- frequency_t24m_x: long (nullable = true)
 |-- nob_t24m_x: double (nullable = true)
 |-- gp_t24m_x: double (nullable = true)
 |-- frequency_t12m_x: long (nullable = true)
 |-- nob_t12m_x: double (nullable = true)
 |-- gp_t12m_x: double (nullable = true)
 |-- local_orders_t24m_x: long (nullable = true)
 |-- shopping_orders_t24m_x: long (nullable = true)
 |-- travel_orders_t24m_x: long (nullable = true)
 |-- app_orders_t24m_x: long (nullable = true)
 |-- touch_orders_t24m_x: long (nullable = true)
 |-- web_orders_t24m_x: long (nullable = true)
 |-- most_recent_l1_cat_x: string (nullable = true)
 |-- most_recent_l2_cat_x: string (nullable = true)
 |-- most_recent_promo_type_cat_x: string (nullable = true)
 |-- most_recent_platform_cat_x: string (nullable = true)
 |-- is_activation_x: integ

In [8]:
numeric_features = [t[0] for t in trainingdf.dtypes if t[1] != 'string']
trainingdf.select(numeric_features).describe().toPandas().transpose()


Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
tenure_days_x,30484,179.69646371867208,122.54160804994825,0,1167
recency_x,30484,174.70289332108646,107.93731848382116,0,365
frequency_t24m_x,30484,1.0347723395879806,0.4251986928723138,1,33
nob_t24m_x,30484,52.63257249704759,92.89785788326925,0.25,2000.0
gp_t24m_x,30484,11.541916644797276,19.213254199957213,-10.0,400.0
frequency_t12m_x,30484,1.0230612780475004,0.2778193801067243,1,20
nob_t12m_x,30484,51.33778703582201,79.6674537628344,0.25,1100.0
gp_t12m_x,30484,11.346563288938457,18.998393653228543,-160.0773,400.0
local_orders_t24m_x,30484,0.803437869046057,0.507600588542376,0,26


In [9]:
categorical_features = [t[0] for t in trainingdf.dtypes if t[1] == 'string']
trainingdf.select(categorical_features).describe().toPandas().transpose()


Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
recency_9block_cat_x,30484,,,1-High Rec (0-30 Days),3-Low Rec (121-365 Days)
frequency_9block_cat_x,30484,,,1-Low Freq (1 Order),3-High Freq (5+ Orders)
most_recent_l1_cat_x,30484,,,L1 - Local,L1 - Travel
most_recent_l2_cat_x,30484,,,L2 - Food & Drink,Travel
most_recent_promo_type_cat_x,30484,,,ILS,organic
most_recent_platform_cat_x,30484,,,app,web
gender_cat_x,30484,,,F,U
marital_status_cat_x,30484,,,0U,5U


In [10]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
stages = []

for categoricalCol in categorical_features:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]


# assemblerInputs = [c + "classVec" for c in categorical_features] + numeric_features
# assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
# stages += [assembler]

In [11]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages = stages)
pipelineModel = pipeline.fit(trainingdf)
trainingdf = pipelineModel.transform(trainingdf)
validatedf = pipelineModel.transform(validatedf)

In [12]:
# selectedCols = ['retained', 'features'] 
featureCols = numeric_features + [c + "classVec" for c in categorical_features]
training_final = trainingdf.select(featureCols)
validate_final = validatedf.select(featureCols)
training_final.printSchema()

root
 |-- tenure_days_x: integer (nullable = true)
 |-- recency_x: integer (nullable = true)
 |-- frequency_t24m_x: long (nullable = true)
 |-- nob_t24m_x: double (nullable = true)
 |-- gp_t24m_x: double (nullable = true)
 |-- frequency_t12m_x: long (nullable = true)
 |-- nob_t12m_x: double (nullable = true)
 |-- gp_t12m_x: double (nullable = true)
 |-- local_orders_t24m_x: long (nullable = true)
 |-- shopping_orders_t24m_x: long (nullable = true)
 |-- travel_orders_t24m_x: long (nullable = true)
 |-- app_orders_t24m_x: long (nullable = true)
 |-- touch_orders_t24m_x: long (nullable = true)
 |-- web_orders_t24m_x: long (nullable = true)
 |-- is_activation_x: integer (nullable = true)
 |-- unique_purchase_quarters_t24m_x: long (nullable = true)
 |-- visit_recency_x: integer (nullable = true)
 |-- visit_freq_7d_x: long (nullable = true)
 |-- visit_freq_14_28d_x: long (nullable = true)
 |-- visit_freq_28_60d_x: long (nullable = true)
 |-- visit_freq_60_120d_x: long (nullable = true)
 |-- 

In [13]:
hc = H2OContext.getOrCreate(spark)

Connecting to H2O server at http://10.22.142.8:54321 ... successful.


0,1
H2O cluster uptime:,54 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.1
H2O cluster version age:,2 months and 19 days
H2O cluster name:,sparkling-water-pcourbois_application_1556810695108_842945
H2O cluster total nodes:,40
H2O cluster free memory:,464.0 Gb
H2O cluster total cores:,1440
H2O cluster allowed cores:,80



Sparkling Water Context:
 * H2O name: sparkling-water-pcourbois_application_1556810695108_842945
 * cluster size: 40
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (20,cerebro-worker502.snc1,54321)
  (36,cerebro-worker492.snc1,54321)
  (13,cerebro-worker386.snc1,54321)
  (7,cerebro-worker437.snc1,54321)
  (26,cerebro-worker407.snc1,54321)
  (11,cerebro-worker398.snc1,54321)
  (19,cerebro-worker378.snc1,54321)
  (32,cerebro-worker422.snc1,54321)
  (35,cerebro-worker385.snc1,54321)
  (9,cerebro-worker387.snc1,54321)
  (40,cerebro-worker314.snc1,54321)
  (34,cerebro-worker406.snc1,54321)
  (37,cerebro-worker493.snc1,54321)
  (18,cerebro-worker281.snc1,54321)
  (12,cerebro-worker346.snc1,54321)
  (25,cerebro-worker350.snc1,54321)
  (38,cerebro-worker326.snc1,54321)
  (24,cerebro-worker347.snc1,54321)
  (3,cerebro-worker320.snc1,54321)
  (28,cerebro-worker250.snc1,54321)
  (8,cerebro-worker156.snc1,54321)
  (17,cerebro-worker139.snc1,54321)
  (10,cerebro-wor

In [22]:
sc

In [14]:
h2o_train = hc.as_h2o_frame(training_final,framename="train")
h2o_test = hc.as_h2o_frame(validate_final,framename="test")

In [15]:
h2o_train.describe()

Rows:30484
Cols:73




Unnamed: 0,tenure_days_x,recency_x,frequency_t24m_x,nob_t24m_x,gp_t24m_x,frequency_t12m_x,nob_t12m_x,gp_t12m_x,local_orders_t24m_x,shopping_orders_t24m_x,travel_orders_t24m_x,app_orders_t24m_x,touch_orders_t24m_x,web_orders_t24m_x,is_activation_x,unique_purchase_quarters_t24m_x,visit_recency_x,visit_freq_7d_x,visit_freq_14_28d_x,visit_freq_28_60d_x,visit_freq_60_120d_x,visit_freq_120_180d_x,visit_freq_180_365d_x,appengaged_flag90d_x,webengaged_flag90d_x,touchengaged_flag90d_x,send_recency_x,sends_7d_x,sends_30d_x,open_recency_x,uniq_3day_opens_7d_x,uniq_3day_opens_30d_x,click_recency_x,uniq_3day_clicks_7d_x,uniq_3day_clicks_30d_x,unsubscription_30d_x,fnd_inv_csi_x,hbw_inv_csi_x,hna_inv_csi_x,retail_inv_csi_x,ttd_inv_csi_x,charity_inv_csi_x,total_inv_csi_x,age_x,email_contactable_x,mobile_contactable_x,deactivated,recency_9block_cat_xclassVec0,recency_9block_cat_xclassVec1,frequency_9block_cat_xclassVec0,frequency_9block_cat_xclassVec1,most_recent_l1_cat_xclassVec0,most_recent_l1_cat_xclassVec1,most_recent_l2_cat_xclassVec0,most_recent_l2_cat_xclassVec1,most_recent_l2_cat_xclassVec2,most_recent_l2_cat_xclassVec3,most_recent_l2_cat_xclassVec4,most_recent_l2_cat_xclassVec5,most_recent_l2_cat_xclassVec6,most_recent_promo_type_cat_xclassVec0,most_recent_promo_type_cat_xclassVec1,most_recent_promo_type_cat_xclassVec2,most_recent_promo_type_cat_xclassVec3,most_recent_platform_cat_xclassVec0,most_recent_platform_cat_xclassVec1,most_recent_platform_cat_xclassVec2,gender_cat_xclassVec0,gender_cat_xclassVec1,marital_status_cat_xclassVec0,marital_status_cat_xclassVec1,marital_status_cat_xclassVec2,marital_status_cat_xclassVec3
type,int,int,int,real,real,int,real,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,real,real,real,real,real,int,real,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,int
mins,0.0,0.0,1.0,0.25,-10.0,1.0,0.25,-160.0773,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,179.696463719,174.702893321,1.03477233959,52.632572497,11.5419166448,1.02306127805,51.3377870358,11.3465632889,0.803437869046,0.20309014565,0.0281459126099,0.318691772733,0.373113764598,0.328139351791,1.0,1.010825351,144.373966671,0.53454271093,0.983597953025,2.31393517911,4.84211389581,4.09359008004,11.116257709,0.266139614224,0.161264925863,0.209913397192,38.125344443,16.1245243406,61.6106482089,70.3505773521,1.07092245112,4.03864322267,89.27932686,0.0700695446792,0.249934391812,0.0138105235533,5.39078163299,37.0417311672,6.32916870489,2.24531641845,25.4569781623,0.0,76.46669029,40.0468770503,0.655491405327,0.241339719197,0.857039758562,0.627837554127,0.270994620129,0.9918645847,0.00642960241438,0.78595328697,0.186720902769,0.279490880462,0.186720902769,0.159526308883,0.125508463456,0.114191051043,0.0673796089752,0.0398569741504,0.598379477759,0.183342081092,0.102053536281,0.0867668285002,0.362846083191,0.317149980318,0.307800813542,0.657853300092,0.223625508463,0.752394698858,0.0869636530639,0.0857827056817,0.0627214276342
maxs,1167.0,365.0,33.0,2000.0,400.0,20.0,1100.0,400.0,26.0,19.0,3.0,19.0,10.0,10.0,1.0,7.0,900.0,36.0,63.0,116.0,200.0,191.0,563.0,1.0,1.0,1.0,99.0,46.0,176.0,99.0,37.0,121.0,99.0,8.0,31.0,1.0,42.1136,154.7243,24.0,15.6109,102.2619,0.0,302.9369,99.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
sigma,122.54160805,107.937318484,0.425198692872,92.8978578833,19.2132542,0.277819380107,79.6674537628,18.9983936532,0.507600588542,0.473476156641,0.167364037232,0.531799210374,0.511014527748,0.501633393569,0.0,0.155852027519,191.44363054,2.57443482688,4.12508152343,7.6543905008,14.2640439445,12.5161523269,26.6732547606,0.441945389296,0.367781166904,0.407253242591,47.9181982858,14.5890252709,55.7149709019,42.6025586109,3.81322056014,13.7340370319,27.6101573048,0.398893404706,1.08939636172,0.116705783028,6.10142732505,24.4837911519,4.14233311134,2.26416829194,21.49711071,0.0,54.3375858597,8.60751774322,0.475215562679,0.427902869344,0.350038041049,0.483389309905,0.444480614724,0.0898303680845,0.0799279187469,0.410166108434,0.389693711659,0.448756431002,0.389693711659,0.366172178214,0.331299999433,0.318048381847,0.250682386165,0.195626304886,0.490233987136,0.386953064644,0.302723666458,0.281497682761,0.480828854471,0.465374016015,0.461591228445,0.474436212271,0.416680736257,0.431628344169,0.28178641002,0.280047506329,0.242465211282
zeros,116,116,0,0,0,0,0,0,6484,24756,29635,21077,19346,20723,0,0,748,27085,25281,21744,18291,19240,12716,22371,25568,24085,17870,12069,11642,2211,24613,20909,224,29117,27087,30063,1490,392,408,2013,166,30484,5,0,10502,23127,4358,11345,22223,248,30288,6525,24792,21964,24792,25621,26658,27003,28430,29269,12243,24895,27373,27839,19423,20816,21101,10430,23667,7548,27833,27869,28572
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,2.0,2.0,1.0,62.0,10.5338,1.0,62.0,10.5338,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,4.0,4.0,99.0,0.0,0.0,99.0,0.0,0.0,0.0,4.0,35.0,6.0,1.69,20.0,0.0,66.69,38.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1,4.0,4.0,1.0,20.0,4.228,1.0,20.0,4.228,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,3.0,36.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,99.0,0.0,0.0,99.0,0.0,0.0,99.0,0.0,0.0,0.0,11.8503,63.6877,8.3856,6.6015,45.6149,0.0,136.14,38.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
2,5.0,5.0,1.0,12.98,4.3055,1.0,12.98,4.3055,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,5.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,99.0,0.0,0.0,99.0,0.0,0.0,99.0,0.0,0.0,0.0,1.2079,81.3326,8.6768,1.6166,21.9196,0.0,114.7535,38.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [16]:
from h2o.estimators import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch

In [17]:
h2o_train["deactivated"] = h2o_train["deactivated"].asfactor()
h2o_test["deactivated"] = h2o_test["deactivated"].asfactor()
feature_names = [c for c in h2o_train.names if "_x" in c]

In [18]:
feature_names = [c for c in h2o_train.names if "_x" in c]

rf_grid = h2o.grid.H2OGridSearch(
                h2o.estimators.H2ORandomForestEstimator(model_id="RF_grid"
                                                    ,score_tree_interval=25
                                                    ,min_rows=1000
                                                    ,stopping_metric='auc'
                                                    ,col_sample_rate_per_tree=0.66
                                                    ,sample_rate=0.50
                                                    ,stopping_tolerance=0.02
                                                    ,ntrees=300
                                                    ,seed=28372),
                   hyper_params = {
                       "max_depth": [6, 10, 20, 30],
                   },
                grid_id='rf_grid',                      
                search_criteria={'strategy': 'RandomDiscrete', 'max_models': 4, 'seed': 3923}
        )

In [None]:
rf_grid.train(y="deactivated",x=feature_names
                 ,training_frame   = h2o_train,
                validation_frame = h2o_test
                ,)

drf Grid Build progress: |████████████████████████████████████