In [35]:
# Import the Necessary Dependencies
import os
import gc
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import LabelEncoder,LabelBinarizer

from keras.layers import Input, Embedding, Dense, Flatten, Dropout, concatenate
from keras.layers import BatchNormalization, SpatialDropout1D
from keras.callbacks import Callback
from keras.models import Model
from keras.optimizers import Adam

warnings.filterwarnings('ignore')
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
#os.environ['OMP_NUM_THREADS'] = '4'

In [65]:
#path = '/Users/804357/Desktop/MyFiles/Learn/LOM/Data/'
path = '/Users/Vishy/Files/AVDatahack/LOM/Data/' 
train = pd.read_csv(path+'train.csv', nrows=40000)
test = pd.read_csv(path+'test.csv', nrows=25000)
camp = pd.read_csv(path+'campaign_data.csv')

train_df = pd.merge(train, camp, on='campaign_id')
test_df = pd.merge(test, camp, on='campaign_id')
print('Shape of the train data is', train_df.shape)
print('Shape of the test data is', test_df.shape)

len_train = len(train_df)
train_df = train_df.append(test_df)
del test_df
gc.collect()

Shape of the train data is (40000, 14)
Shape of the test data is (25000, 12)


619

In [66]:
print('New variable creations')
#train['send_date'] = pd.to_datetime(train['send_date'])
train_df['hour'] = pd.to_datetime(train_df.send_date).dt.hour.astype('uint8')
train_df['day'] = pd.to_datetime(train_df.send_date).dt.day.astype('uint8')
train_df['dow'] = pd.to_datetime(train_df.send_date).dt.dayofweek.astype('uint8')

New variable creations


KeyboardInterrupt: 

In [61]:
train_df['dow'] = train_df['dow'].map({0:'Mon', 1:'Tue', 2:'Wed', 3:'Thr',4: 'Fri', 5: 'Sat', 6: 'Sun'} ).astype(str)
train_df['hour'] = train_df['hour'].map({0:'AM1', 1:'AM1', 2:'AM1', 3:'AM2', 4:'AM2', 5:'AM2', 6:'AM3', 7:'AM3', 8:'AM3',
                                         9:'AM4', 10:'AM4', 11:'AM4', 12:'PM1', 13:'PM1', 14:'PM1', 15:'PM2', 16:'PM2',
                                         17:'PM2', 18:'PM3', 19:'PM3', 20:'PM3', 21:'PM4', 22:'PM4', 23:'PM4'}).astype(str)
train_df['day'] = train_df['day'].map({1:'VEAR', 2:'VEAR', 3:'VEAR', 4:'VEAR', 5:'VEAR', 6:'EAR', 7:'EAR', 8:'EAR',
                                       9:'EAR', 10:'EAR', 11:'MID', 12:'MID', 13:'MID', 14:'MID', 15:'MID', 16:'VMID',
                                       17:'VMID', 18:'VMID', 19:'VMID', 20:'VMID', 21:'LAT', 22:'LAT', 23:'LAT', 
                                       24:'LAT', 25:'LAT', 26:'VLAT', 27:'VLAT', 28:'VLAT', 29:'VLAT', 30:'VLAT',
                                       31:'VLAT'}).astype(str)

In [43]:
print('Grouping variables creation')
# ip - user_id,  Channel - campaign_id
gp = train_df[['user_id','campaign_id']].groupby(by=['user_id'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'totcnt'})
train_df = train_df.merge(gp, on=['user_id'], how='left')
del gp
gc.collect()
gp = train_df[['user_id','is_open']].groupby(by=['user_id'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opncnt'})
train_df = train_df.merge(gp, on=['user_id'], how='left')
del gp
gc.collect()
gp = train_df[['user_id','is_click']].groupby(by=['user_id'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkcnt'})
train_df = train_df.merge(gp, on=['user_id'], how='left')
del gp
gc.collect()

gp = train_df[['user_id','dow','campaign_id']].groupby(by=['user_id','dow'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'dowcnt'})
train_df = train_df.merge(gp, on=['user_id','dow'], how='left')
del gp
gc.collect()
gp = train_df[['user_id','dow','is_click']].groupby(by=['user_id','dow'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'dowclk'})
train_df = train_df.merge(gp, on=['user_id','dow'], how='left')
del gp
gc.collect()

gp = train_df[['user_id','day','campaign_id']].groupby(by=['user_id','day'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'daycnt'})
train_df = train_df.merge(gp, on=['user_id','day'], how='left')
del gp
gc.collect()
gp = train_df[['user_id','day','is_click']].groupby(by=['user_id','day'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'dayclk'})
train_df = train_df.merge(gp, on=['user_id','day'], how='left')
del gp
gc.collect()

gp = train_df[['user_id','hour','campaign_id']].groupby(by=['user_id','hour'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'hourcnt'})
train_df = train_df.merge(gp, on=['user_id','hour'], how='left')
del gp
gc.collect()
gp = train_df[['user_id','hour','is_click']].groupby(by=['user_id','hour'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'hourclk'})
train_df = train_df.merge(gp, on=['user_id','hour'], how='left')
del gp
gc.collect()

print("Assigning data types for variables")
train_df['totcnt'] = train_df['totcnt'].fillna(0).astype('uint16')
train_df['opncnt'] = train_df['opncnt'].fillna(0).astype('uint16')
train_df['clkcnt'] = train_df['clkcnt'].fillna(0).astype('uint16')
train_df['dowcnt'] = train_df['dowcnt'].fillna(0).astype('uint16')
train_df['dowclk'] = train_df['dowclk'].fillna(0).astype('uint16')
train_df['daycnt'] = train_df['daycnt'].fillna(0).astype('uint16')
train_df['dayclk'] = train_df['dayclk'].fillna(0).astype('uint16')
train_df['hourcnt'] = train_df['hourcnt'].fillna(0).astype('uint16')
train_df['hourclk'] = train_df['hourclk'].fillna(0).astype('uint16')

Grouping variables creation
Assigning data types for variables


In [67]:
train_df.head()

Unnamed: 0,campaign_id,communication_type,email_body,email_url,id,is_click,is_open,no_of_images,no_of_internal_links,no_of_sections,send_date,subject,total_links,user_id
0,42,Newsletter,"September Newsletter\r\n \r\nDear AVians,\r\n ...",http://r.newsletters.analyticsvidhya.com/7v3rd...,42_14051,0.0,0.0,13,79,4,01-09-2017 19:55,[September] Exciting days ahead with DataHack ...,88,14051
1,42,Newsletter,"September Newsletter\r\n \r\nDear AVians,\r\n ...",http://r.newsletters.analyticsvidhya.com/7v3rd...,42_177808,0.0,0.0,13,79,4,01-09-2017 20:13,[September] Exciting days ahead with DataHack ...,88,177808
2,42,Newsletter,"September Newsletter\r\n \r\nDear AVians,\r\n ...",http://r.newsletters.analyticsvidhya.com/7v3rd...,42_133077,0.0,0.0,13,79,4,01-09-2017 20:11,[September] Exciting days ahead with DataHack ...,88,133077
3,42,Newsletter,"September Newsletter\r\n \r\nDear AVians,\r\n ...",http://r.newsletters.analyticsvidhya.com/7v3rd...,42_118677,0.0,0.0,13,79,4,01-09-2017 20:15,[September] Exciting days ahead with DataHack ...,88,118677
4,42,Newsletter,"September Newsletter\r\n \r\nDear AVians,\r\n ...",http://r.newsletters.analyticsvidhya.com/7v3rd...,42_25809,0.0,0.0,13,79,4,01-09-2017 19:49,[September] Exciting days ahead with DataHack ...,88,25809


In [5]:
print("label encoding....")

train_df[['user_id', 'campaign_id', 'hour', 'day', 'dow']].apply(LabelEncoder().fit_transform)
test_df = train_df[len_train:]
train_df = train_df[:len_train]
y_train = train_df['is_click'].values
train_df.drop(['id','user_id','send_date','is_click','communication_type','email_body','email_url',
               'is_open','subject'], 1, inplace=True)
print('Data is prepared')

label encoding....
Data is prepared


In [6]:
train_df.head()

Unnamed: 0,campaign_id,no_of_images,no_of_internal_links,no_of_sections,total_links,hour,day,dow,qty,camp_count
0,42,13,79,4,88,19,9,0,1,16
1,42,13,79,4,88,20,9,0,1,11
2,42,13,79,4,88,20,9,0,1,10
3,42,13,79,4,88,20,9,0,1,11
4,42,13,79,4,88,19,9,0,1,12


In [7]:
print('Getting the max numbers for Neural Network')
max_camp = np.max([train_df['campaign_id'].max(), test_df['campaign_id'].max()])+1
max_img = np.max([train_df['no_of_images'].max(), test_df['no_of_images'].max()])+1
max_ilink = np.max([train_df['no_of_internal_links'].max(), test_df['no_of_internal_links'].max()])+1
max_sec = np.max([train_df['no_of_sections'].max(), test_df['no_of_sections'].max()])+1
max_totlnk = np.max([train_df['total_links'].max(), test_df['total_links'].max()])+1
max_hour = np.max([train_df['hour'].max(), test_df['hour'].max()])+1
max_day = np.max([train_df['day'].max(), test_df['day'].max()])+1
max_dow = np.max([train_df['dow'].max(), test_df['dow'].max()])+1
max_qty = np.max([train_df['qty'].max(), test_df['qty'].max()])+1
max_c1 = np.max([train_df['camp_count'].max(), test_df['camp_count'].max()])+1

Getting the max numbers for Neural Network


In [8]:
def get_keras_data(dataset):
    X = {
        'cid': np.array(dataset.campaign_id),
        'noimg': np.array(dataset.no_of_images),
        'noinlinks': np.array(dataset.no_of_internal_links),
        'nosections': np.array(dataset.no_of_sections),
        'totlinks': np.array(dataset.total_links),
        'h': np.array(dataset.hour),
        'd': np.array(dataset.day),
        'wd': np.array(dataset.dow),
        'qty': np.array(dataset.qty),
        'c1': np.array(dataset.camp_count)
    }
    return X

In [9]:
train_df = get_keras_data(train_df)
print('Building the Network')

emb_n = 50
dense_n = 1000

in_cid = Input(shape=[1], name='cid')
emb_cid = Embedding(max_camp, emb_n)(in_cid)
in_img = Input(shape=[1], name='noimg')
emb_img = Embedding(max_img, emb_n)(in_img)
in_inlnk = Input(shape=[1], name='noinlinks')
emb_inlnk = Embedding(max_ilink, emb_n)(in_inlnk)
in_nosec = Input(shape=[1], name='nosections')
emb_nosec = Embedding(max_sec, emb_n)(in_nosec)
in_totlnk = Input(shape=[1], name='totlinks')
emb_totlnk = Embedding(max_totlnk, emb_n)(in_totlnk)
in_h = Input(shape=[1], name='h')
emb_h = Embedding(max_hour, emb_n)(in_h)
in_d = Input(shape=[1], name='d')
emb_d = Embedding(max_day, emb_n)(in_d)
in_wd = Input(shape=[1], name='wd')
emb_wd = Embedding(max_dow, emb_n)(in_wd)
in_qty = Input(shape=[1], name='qty')
emb_qty = Embedding(max_qty, emb_n)(in_qty)
in_c1 = Input(shape=[1], name='c1')
emb_c1 = Embedding(max_c1, emb_n)(in_c1)

fe = concatenate([(emb_cid), (emb_img), (emb_inlnk), (emb_nosec), (emb_totlnk), (emb_h),
                 (emb_d), (emb_wd), (emb_qty), (emb_c1)])
s_dout = SpatialDropout1D(0.2)(fe)
x = Flatten()(s_dout)
x = Dropout(0.2)(Dense(dense_n, activation='relu')(x))
x = Dropout(0.2)(Dense(dense_n, activation='relu')(x))
outp = Dense(1, activation='sigmoid')(x)
model = Model(inputs=[in_cid, in_img, in_inlnk, in_nosec, in_totlnk, in_h, in_d, in_wd, in_qty,
              in_c1 ], outputs=outp)

batch_size = 512
epochs = 3
exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
steps = int(len(train_df) / batch_size) * epochs
lr_init, lr_fin = 0.001, 0.0001
lr_decay = exp_decay(lr_init, lr_fin, steps)
optimizer_adam = Adam(lr=0.001, decay=lr_decay)
model.compile(loss='binary_crossentropy', optimizer=optimizer_adam,
              metrics=['accuracy'])
model.summary()

print('Training the Model')
model.fit(train_df, y_train, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=2)
del train_df, y_train
gc.collect()
# model.save_weights('dl_support.h5')

Building the Network
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
cid (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
noimg (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
noinlinks (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
nosections (InputLayer)         (None, 1)            0                                            
________________________________________________________________________________________

14490

In [10]:
sub = pd.DataFrame()
sub['id'] = test_df['id'].astype('int')
test_df.drop(['id','user_id','send_date','is_click','communication_type','email_body','email_url',
               'is_open','subject'], 1, inplace=True)
test_df = get_keras_data(test_df)

print("Predicting the result")
sub['is_click'] = model.predict(test_df, batch_size=batch_size, verbose=2)
del test_df
gc.collect()
print("Predictions are ready")
sub.to_csv('dl_support.csv', index=False)

Predicting the result
Predictions are ready
