# Compute targets for whether or not a user completed a purchase

In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import IntegerType
from datetime import datetime


In [17]:
# Obtain the Spark entry points used by the rest of the notebook.
# getOrCreate() reuses an already-running context/session when there is one,
# so re-running this cell does not start a second Spark application.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [35]:
# Load the user dictionary, which maps each user_id_hash to a unique integer id.
# The file has no header row, so pandas labels the columns 0 (hash) and 1 (id).
users = pd.read_csv('data/user_dict.csv', header=None)
# Pair the two columns directly rather than going through iterrows():
# iterrows() builds a Series per row (slow on large files) and coerces both
# cells of a row to one common dtype before they are read back.
user_dict = {
    user_hash: int(user_id)
    for user_hash, user_id in zip(users[0], users[1])
}

In [4]:
# Column holding the hashed user identifier.
name = 'user_id_hash'
# Spark UDF translating a hashed id into its integer id through user_dict.
# A hash that is missing from user_dict raises KeyError inside the UDF.
fn = UserDefinedFunction(
    lambda user_hash: user_dict[user_hash],
    IntegerType(),
)

In [None]:
# Read the raw event log; the first row supplies the column names and Spark
# infers each column's type from the data.
df = ss.read.csv(
    'data/events.csv',
    inferSchema=True,
    header=True,
)


In [None]:
# Replace the hashed id with the integer user_id, discard the columns that are
# not needed for labelling, keep only rows whose event code is 8 (presumably
# the purchase event -- confirm against the event schema), and collect the
# result to the driver as a pandas DataFrame.
purchases = (
    df.withColumn('user_id', fn(df['user_id_hash']))
    .drop('user_id_hash', 'app_id', 'session_id')
    .filter("event == 8")
    .toPandas()
)

# Turn the millisecond epoch timestamp into a datetime.
# NOTE(review): fromtimestamp() without a tz argument yields naive *local*
# time, so the values depend on the time zone of the machine running this.
purchases['datetime'] = purchases['event_timestamp'].apply(
    lambda millis: datetime.fromtimestamp(millis / 1000)
)

In [11]:
# Get purchases for the 7 day and 14 day periods that start on 2018-12-01.
# A bare date string compares as midnight at the start of that day, so the end
# of each window is written as an exclusive bound on the following day.
# (`<= '2018-12-14'` would drop almost all of Dec 14 and yield 13 days.)
from_window_start = purchases['datetime'] >= '2018-12-01'
# Only purchases with a non-zero event_value count towards the label.
has_value = purchases['event_value'] != 0

purchases14 = purchases[from_window_start
                        & (purchases['datetime'] < '2018-12-15')
                        & has_value]
purchases7 = purchases[from_window_start
                       & (purchases['datetime'] < '2018-12-08')
                       & has_value]

In [12]:
# Collect the distinct user_id values that appear in each purchase window.
labels14 = purchases14['user_id'].unique()
labels7 = purchases7['user_id'].unique()

array([390699, 151528, 142719, 540810, 147408, 377990, 462529, 203937,
       471260, 422223])

In [49]:
# Label every known user: 1 if their id appears among the purchasers for the
# window, otherwise 0.
# Membership is checked against sets built once up front; testing
# `user in <ndarray>` rescans the whole array for every user.
purchasers14 = set(labels14.tolist())
purchasers7 = set(labels7.tolist())
labels14_dict = {user: 1 if user in purchasers14 else 0 for user in user_dict.values()}
labels7_dict = {user: 1 if user in purchasers7 else 0 for user in user_dict.values()}

In [None]:
# Turn each {user_id: label} mapping into a two-column DataFrame.
label_columns = ['user_id', 'label']
labels14_df = pd.DataFrame(data=labels14_dict.items(), columns=label_columns)
labels7_df = pd.DataFrame(data=labels7_dict.items(), columns=label_columns)

## Save df to csv

In [52]:
# Write both label tables to the working directory, omitting the row index.
labels14_df.to_csv('labels14.csv', index=False)
labels7_df.to_csv('labels7.csv', index=False)