In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

In [3]:
portfolio.head()

Unnamed: 0,channels,difficulty,duration,id,offer_type,reward
0,"[email, mobile, social]",10,7,ae264e3637204a6fb9bb56bc8210ddfd,bogo,10
1,"[web, email, mobile, social]",10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,bogo,10
2,"[web, email, mobile]",0,4,3f207df678b143eea3cee63160fa8bed,informational,0
3,"[web, email, mobile]",5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,bogo,5
4,"[web, email]",20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,discount,5


In [4]:
transcript.head()

Unnamed: 0,event,person,time,value
0,offer received,78afa995795e4d85b5d9ceeca43f5fef,0,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'}
1,offer received,a03223e636434f42ac4c3df47e8bac43,0,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'}
2,offer received,e2127556f4f64592b11af22de27a7932,0,{'offer id': '2906b810c7d4411798c6938adc9daaa5'}
3,offer received,8ec6ce2a7e7949b1bf142def7d0e0586,0,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'}
4,offer received,68617ca6246f4fbc85e91a2a49552598,0,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'}


In [5]:
profile.head()

Unnamed: 0,age,became_member_on,gender,id,income
0,118,20170212,,68be06ca386d4c31939f3a4f0e3dd783,
1,55,20170715,F,0610b486422d4921ae7d2bf64640c50b,112000.0
2,118,20180712,,38fe809add3b4fcf9315a9694bb96ff5,
3,75,20170509,F,78afa995795e4d85b5d9ceeca43f5fef,100000.0
4,118,20170804,,a03223e636434f42ac4c3df47e8bac43,


In [6]:
df_1 = pd.merge(left=profile, right=transcript, left_on='id', right_on='person')

In [7]:
df_1 = df_1.drop('person', axis=1)

In [8]:
df_1.head()

Unnamed: 0,age,became_member_on,gender,id,income,event,time,value
0,118,20170212,,68be06ca386d4c31939f3a4f0e3dd783,,offer received,168,{'offer id': '2906b810c7d4411798c6938adc9daaa5'}
1,118,20170212,,68be06ca386d4c31939f3a4f0e3dd783,,offer viewed,216,{'offer id': '2906b810c7d4411798c6938adc9daaa5'}
2,118,20170212,,68be06ca386d4c31939f3a4f0e3dd783,,offer received,336,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'}
3,118,20170212,,68be06ca386d4c31939f3a4f0e3dd783,,offer viewed,348,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'}
4,118,20170212,,68be06ca386d4c31939f3a4f0e3dd783,,transaction,360,{'amount': 0.35000000000000003}


In [9]:
def parse_value_keys(row, value_type):
    if value_type in row.keys():
        label = row[value_type]
        return label
    else: pass

In [10]:
df_1['reward_amount'] = df_1.value.apply(lambda x: parse_value_keys(x, 'reward'))
df_1['transaction_amount'] = df_1.value.apply(lambda x: parse_value_keys(x, 'amount'))
df_1['offer_id'] = df_1.value.apply(lambda x: parse_value_keys(x, 'offer id'))

In [11]:
df_2 = pd.merge(left=df_1, right=portfolio, left_on='offer_id', right_on='id', how='outer',suffixes=('_customer','_offer'))

In [12]:
df_complete = df_2.drop(['value','id_offer'], axis=1)

In [13]:
df_complete.head()

Unnamed: 0,age,became_member_on,gender,id_customer,income,event,time,reward_amount,transaction_amount,offer_id,channels,difficulty,duration,offer_type,reward
0,118,20170212,,68be06ca386d4c31939f3a4f0e3dd783,,offer received,168,,,2906b810c7d4411798c6938adc9daaa5,"[web, email, mobile]",10.0,7.0,discount,2.0
1,118,20170212,,68be06ca386d4c31939f3a4f0e3dd783,,offer viewed,216,,,2906b810c7d4411798c6938adc9daaa5,"[web, email, mobile]",10.0,7.0,discount,2.0
2,68,20180426,M,e2127556f4f64592b11af22de27a7932,70000.0,offer received,0,,,2906b810c7d4411798c6938adc9daaa5,"[web, email, mobile]",10.0,7.0,discount,2.0
3,68,20180426,M,e2127556f4f64592b11af22de27a7932,70000.0,offer viewed,18,,,2906b810c7d4411798c6938adc9daaa5,"[web, email, mobile]",10.0,7.0,discount,2.0
4,118,20170925,,8ec6ce2a7e7949b1bf142def7d0e0586,,offer received,408,,,2906b810c7d4411798c6938adc9daaa5,"[web, email, mobile]",10.0,7.0,discount,2.0


In [14]:
df_complete['channels'] = df_complete['channels'].str.len()

`channels` column convert to length. All channels have `[web, email]` which is now 2. `[web, email, mobile]` is three and `[web, email, mobile, social]` is now 4. 

The offer types are also numeric now `bogo` = 1, `discount` = 2, and `informational` = 3

Events are numeric as such: 
    
`offer completed` = 2, `offer viewed` = 1, `offer received` = 0, `transaction` = 3

In [15]:
def parse_offers(row):
    if row == 'bogo':
        label = 1
    elif row == 'discount':
        label = 2
    else: label = 3
    return label

In [16]:
df_complete['offer_type'] = df_complete.offer_type.apply(lambda x: parse_offers(x))

In [17]:
events = {'offer received':0,'offer viewed':1,'offer completed':2,'transaction':3}
genders = {'M':1,'F':2, 'O':3, None:0}

df_complete.event = [events[item] for item in df_complete.event]
df_complete.gender = [genders[item] for item in df_complete.gender]

In [18]:
df_complete.fillna(value=0, inplace=True)

In [19]:
df_complete.head()

Unnamed: 0,age,became_member_on,gender,id_customer,income,event,time,reward_amount,transaction_amount,offer_id,channels,difficulty,duration,offer_type,reward
0,118,20170212,0,68be06ca386d4c31939f3a4f0e3dd783,0.0,0,168,0.0,0.0,2906b810c7d4411798c6938adc9daaa5,3.0,10.0,7.0,2,2.0
1,118,20170212,0,68be06ca386d4c31939f3a4f0e3dd783,0.0,1,216,0.0,0.0,2906b810c7d4411798c6938adc9daaa5,3.0,10.0,7.0,2,2.0
2,68,20180426,1,e2127556f4f64592b11af22de27a7932,70000.0,0,0,0.0,0.0,2906b810c7d4411798c6938adc9daaa5,3.0,10.0,7.0,2,2.0
3,68,20180426,1,e2127556f4f64592b11af22de27a7932,70000.0,1,18,0.0,0.0,2906b810c7d4411798c6938adc9daaa5,3.0,10.0,7.0,2,2.0
4,118,20170925,0,8ec6ce2a7e7949b1bf142def7d0e0586,0.0,0,408,0.0,0.0,2906b810c7d4411798c6938adc9daaa5,3.0,10.0,7.0,2,2.0


## FIX the offer_id and id_customer sha types to integers

In [50]:
from sklearn.externals import joblib

In [55]:
joblib.dump(df_complete, 'data/dataframe')

['data/dataframe']

In [58]:
#df_complete = joblib.load('data/dataframe')

Let's see if we can do a prediction for users who are likely to acheive any of the four labeled **events**. We can tag this outcome with probabilities to understand who is most likely to achieve each event type. 

We can then possible work backwards to understand the profile of a customer who is likely to acheive a given offer - or which offers typically give us specific **event** outcomes. 

In [36]:
from ml_test_tools import test_model

In [None]:
# !conda install -y -c conda-forge xgboost

In [41]:
import xgboost as xgb

In [42]:
models = {}

In [44]:
import random
N_TRIALS = 500
for i in range(N_TRIALS):
    learning_rate_factor = random.randint(1,25)
    xgb_rs = xgb.XGBClassifier(
        learning_rate=0.25 / learning_rate_factor,
        n_estimators=random.randint(20, 400),
        max_depth=random.randint(2, 10),
        min_child_weight=random.randint(1, 10),
        gamma=random.randint(0, 50)/5,
        subsample=1-(random.randint(1, 60)/100),
        colsample_bytree=1-(random.randint(1, 60)/100),
        random_state=9450, 
        objective='multi:softmax'
    )
    model = test_model(df_complete, xgb_rs, cv_folds=10, model_id=i)
    models.update({i:model})

ValueError: could not convert string to float: '68be06ca386d4c31939f3a4f0e3dd783'