In [1]:
import pandas as pd
import numpy as np
import math
import json

from pprint import pprint
import matplotlib.pyplot as plt

from scipy.stats import uniform, randint
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, r2_score

In [2]:
# load in DataFrame created in the data_aggregation notebook
enriched_dtevent = pd.read_csv('enriched_dtevent.csv')
pd.set_option('display.max_columns', None)
enriched_dtevent

Unnamed: 0,person,offer_id,viewed_dt_mean,completed_dt_mean,viewed_count,completed_count,completed,gender,age,became_member_on,income,gender_encoded,membership_startdate,reward,difficulty,duration,offer_type,ch_email,ch_web,ch_mobile,ch_social,ot_bogo,ot_discount,ot_informational,offer_type_encoded
0,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,1.500000,,1.0,,0,M,33,20170421,72000.0,1,2017-04-21,0,0,4,informational,1,1,1,0,0,0,1,2
1,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,1.000000,,1.0,,0,M,33,20170421,72000.0,1,2017-04-21,0,0,3,informational,1,0,1,1,0,0,1,2
2,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,2.000000,,1.0,,0,M,33,20170421,72000.0,1,2017-04-21,5,5,5,bogo,1,1,1,1,1,0,0,0
3,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,1.500000,,1.0,,0,M,33,20170421,72000.0,1,2017-04-21,2,10,10,discount,1,1,1,1,0,1,0,1
4,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,1.000000,6.000000,1.0,1.0,1,O,40,20180109,57000.0,2,2018-01-09,5,20,10,discount,1,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42334,fffad4f4828548d1b5583907f2e9906b,f19421c1d4aa40978ebb69ca19b0e20d,2.250000,0.750000,2.0,2.0,1,M,34,20170123,34000.0,1,2017-01-23,5,5,5,bogo,1,1,1,1,1,0,0,0
42335,ffff82501cea40309d5fdd7edcca4a07,0b1e1539f2cc45b7b9fa7c272da2e1d7,0.250000,1.000000,1.0,1.0,1,F,45,20161125,62000.0,0,2016-11-25,5,20,10,discount,1,1,0,0,0,1,0,1
42336,ffff82501cea40309d5fdd7edcca4a07,2906b810c7d4411798c6938adc9daaa5,0.416667,2.666667,3.0,3.0,1,F,45,20161125,62000.0,0,2016-11-25,2,10,7,discount,1,1,1,0,0,1,0,1
42337,ffff82501cea40309d5fdd7edcca4a07,9b98b8c7a33c4b65b9aebfe6a799e6d9,1.250000,,1.0,,0,F,45,20161125,62000.0,0,2016-11-25,5,5,7,bogo,1,1,1,0,1,0,0,0


In [3]:
# any null values?
# - yes
enriched_dtevent.isnull().any()

person                  False
offer_id                False
viewed_dt_mean           True
completed_dt_mean        True
viewed_count             True
completed_count          True
completed               False
gender                  False
age                     False
became_member_on        False
income                  False
gender_encoded          False
membership_startdate    False
reward                  False
difficulty              False
duration                False
offer_type              False
ch_email                False
ch_web                  False
ch_mobile               False
ch_social               False
ot_bogo                 False
ot_discount             False
ot_informational        False
offer_type_encoded      False
dtype: bool

In [4]:
# convert any nans we might use later during regression...
enriched_dtevent['viewed_count'] = enriched_dtevent.viewed_count.replace(np.nan, 0)
enriched_dtevent['completed_count'] = enriched_dtevent.completed_count.replace(np.nan, 0)
enriched_dtevent['completed'] = enriched_dtevent.completed.replace(np.nan, 0)
enriched_dtevent

Unnamed: 0,person,offer_id,viewed_dt_mean,completed_dt_mean,viewed_count,completed_count,completed,gender,age,became_member_on,income,gender_encoded,membership_startdate,reward,difficulty,duration,offer_type,ch_email,ch_web,ch_mobile,ch_social,ot_bogo,ot_discount,ot_informational,offer_type_encoded
0,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,1.500000,,1.0,0.0,0,M,33,20170421,72000.0,1,2017-04-21,0,0,4,informational,1,1,1,0,0,0,1,2
1,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,1.000000,,1.0,0.0,0,M,33,20170421,72000.0,1,2017-04-21,0,0,3,informational,1,0,1,1,0,0,1,2
2,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,2.000000,,1.0,0.0,0,M,33,20170421,72000.0,1,2017-04-21,5,5,5,bogo,1,1,1,1,1,0,0,0
3,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,1.500000,,1.0,0.0,0,M,33,20170421,72000.0,1,2017-04-21,2,10,10,discount,1,1,1,1,0,1,0,1
4,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,1.000000,6.000000,1.0,1.0,1,O,40,20180109,57000.0,2,2018-01-09,5,20,10,discount,1,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42334,fffad4f4828548d1b5583907f2e9906b,f19421c1d4aa40978ebb69ca19b0e20d,2.250000,0.750000,2.0,2.0,1,M,34,20170123,34000.0,1,2017-01-23,5,5,5,bogo,1,1,1,1,1,0,0,0
42335,ffff82501cea40309d5fdd7edcca4a07,0b1e1539f2cc45b7b9fa7c272da2e1d7,0.250000,1.000000,1.0,1.0,1,F,45,20161125,62000.0,0,2016-11-25,5,20,10,discount,1,1,0,0,0,1,0,1
42336,ffff82501cea40309d5fdd7edcca4a07,2906b810c7d4411798c6938adc9daaa5,0.416667,2.666667,3.0,3.0,1,F,45,20161125,62000.0,0,2016-11-25,2,10,7,discount,1,1,1,0,0,1,0,1
42337,ffff82501cea40309d5fdd7edcca4a07,9b98b8c7a33c4b65b9aebfe6a799e6d9,1.250000,,1.0,0.0,0,F,45,20161125,62000.0,0,2016-11-25,5,5,7,bogo,1,1,1,0,1,0,0,0


In [5]:
# transform high and variable income values into fewer categorical values:
le = LabelEncoder()
income_cats = pd.cut(enriched_dtevent.income, 5, labels=['low', 'moderate' ,'considerable', 'high', 'very high'],retbins=True)[0]
enriched_dtevent['income_bins_encoded'] = le.fit_transform(income_cats)
# transform high and variable age values into fewer categorical values:
le = LabelEncoder()
age_cuts = pd.cut(enriched_dtevent.age, 5, labels=['low', 'moderate' ,'considerable', 'high', 'very high'],retbins=True)[0]
enriched_dtevent['age_bins_encoded'] = le.fit_transform(age_cuts)

enriched_dtevent

Unnamed: 0,person,offer_id,viewed_dt_mean,completed_dt_mean,viewed_count,completed_count,completed,gender,age,became_member_on,income,gender_encoded,membership_startdate,reward,difficulty,duration,offer_type,ch_email,ch_web,ch_mobile,ch_social,ot_bogo,ot_discount,ot_informational,offer_type_encoded,income_bins_encoded,age_bins_encoded
0,0009655768c64bdeb2e877511632db8f,3f207df678b143eea3cee63160fa8bed,1.500000,,1.0,0.0,0,M,33,20170421,72000.0,1,2017-04-21,0,0,4,informational,1,1,1,0,0,0,1,2,0,2
1,0009655768c64bdeb2e877511632db8f,5a8bc65990b245e5a138643cd4eb9837,1.000000,,1.0,0.0,0,M,33,20170421,72000.0,1,2017-04-21,0,0,3,informational,1,0,1,1,0,0,1,2,0,2
2,0009655768c64bdeb2e877511632db8f,f19421c1d4aa40978ebb69ca19b0e20d,2.000000,,1.0,0.0,0,M,33,20170421,72000.0,1,2017-04-21,5,5,5,bogo,1,1,1,1,1,0,0,0,0,2
3,0009655768c64bdeb2e877511632db8f,fafdcd668e3743c1bb461111dcafc2a4,1.500000,,1.0,0.0,0,M,33,20170421,72000.0,1,2017-04-21,2,10,10,discount,1,1,1,1,0,1,0,1,0,2
4,0011e0d4e6b944f998e987f904e8c1e5,0b1e1539f2cc45b7b9fa7c272da2e1d7,1.000000,6.000000,1.0,1.0,1,O,40,20180109,57000.0,2,2018-01-09,5,20,10,discount,1,1,0,0,0,1,0,1,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42334,fffad4f4828548d1b5583907f2e9906b,f19421c1d4aa40978ebb69ca19b0e20d,2.250000,0.750000,2.0,2.0,1,M,34,20170123,34000.0,1,2017-01-23,5,5,5,bogo,1,1,1,1,1,0,0,0,2,2
42335,ffff82501cea40309d5fdd7edcca4a07,0b1e1539f2cc45b7b9fa7c272da2e1d7,0.250000,1.000000,1.0,1.0,1,F,45,20161125,62000.0,0,2016-11-25,5,20,10,discount,1,1,0,0,0,1,0,1,3,3
42336,ffff82501cea40309d5fdd7edcca4a07,2906b810c7d4411798c6938adc9daaa5,0.416667,2.666667,3.0,3.0,1,F,45,20161125,62000.0,0,2016-11-25,2,10,7,discount,1,1,1,0,0,1,0,1,3,3
42337,ffff82501cea40309d5fdd7edcca4a07,9b98b8c7a33c4b65b9aebfe6a799e6d9,1.250000,,1.0,0.0,0,F,45,20161125,62000.0,0,2016-11-25,5,5,7,bogo,1,1,1,0,1,0,0,0,3,3


In [6]:
# derive feature matrix and target vector from input data for prediction
X = enriched_dtevent[['gender_encoded','age_bins_encoded','income_bins_encoded','ch_social','ch_mobile','ch_email','ch_web','offer_type_encoded','reward', 'difficulty', 'duration']]
y = enriched_dtevent[['completed']]
display(X,y)

Unnamed: 0,gender_encoded,age_bins_encoded,income_bins_encoded,ch_social,ch_mobile,ch_email,ch_web,offer_type_encoded,reward,difficulty,duration
0,1,2,0,0,1,1,1,2,0,0,4
1,1,2,0,1,1,1,0,2,0,0,3
2,1,2,0,1,1,1,1,0,5,5,5
3,1,2,0,1,1,1,1,1,2,10,10
4,2,3,3,0,0,1,1,1,5,20,10
...,...,...,...,...,...,...,...,...,...,...,...
42334,1,2,2,1,1,1,1,0,5,5,5
42335,0,3,3,0,0,1,1,1,5,20,10
42336,0,3,3,0,1,1,1,1,2,10,7
42337,0,3,3,0,1,1,1,0,5,5,7


Unnamed: 0,completed
0,0
1,0
2,0
3,0
4,1
...,...
42334,1
42335,1
42336,1
42337,0


In [7]:
# split data set in to train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [8]:
# define the model
model = LinearRegression()

In [9]:
# train (fit) the model
model.fit(X_train, y_train)

LinearRegression()

In [10]:
# print report results on predicted X_test data
print(classification_report(y_test, model.predict(X_test).round()))

              precision    recall  f1-score   support

           0       0.78      0.57      0.66      5582
           1       0.63      0.83      0.72      5003

    accuracy                           0.69     10585
   macro avg       0.71      0.70      0.69     10585
weighted avg       0.71      0.69      0.69     10585

