In [1]:
import numpy as np
import pandas as pd
from time import time

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion,Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,HashingVectorizer, TfidfTransformer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, concatenate, GRU, Embedding, Flatten, Activation
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K

import multiprocessing as mp
from scipy.sparse import csr_matrix
import os
from sklearn.metrics import roc_auc_score
import gc
from sklearn.base import BaseEstimator, TransformerMixin
import re
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

import warnings
warnings.filterwarnings("ignore")

warnings.filterwarnings('ignore')
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
os.environ['MKL_NUM_THREADS'] = '4'
os.environ['OMP_NUM_THREADS'] = '4'
os.environ['JOBLIB_START_METHOD'] = 'forkserver'

In [57]:
start = time()
path = '/Users/Vishy/Files/AVDatahack/LOM/Data/'
# train = pd.read_csv(path+'train.csv',)
# test = pd.read_csv(path+'test.csv')
# camp = pd.read_csv(path+'campaign_data.csv')

# print('New variable creations')
# train['hour'] = pd.to_datetime(train.send_date).dt.hour.astype('uint8')
# train['day'] = pd.to_datetime(train.send_date).dt.day.astype('uint8')
# train['dow'] = pd.to_datetime(train.send_date).dt.dayofweek.astype('uint8')

# test['hour'] = pd.to_datetime(test.send_date).dt.hour.astype('uint8')
# test['day'] = pd.to_datetime(test.send_date).dt.day.astype('uint8')
# test['dow'] = pd.to_datetime(test.send_date).dt.dayofweek.astype('uint8')

# camp['sub_tokens'] = camp['subject'].map(tokenizer)
# camp['email_tokens'] = camp['email_body'].map(tokenizer)
# train_df = pd.merge(train, camp, on='campaign_id')
# test_df = pd.merge(test, camp, on='campaign_id')

train_df = pd.read_csv(path+'train_df.csv',nrows=20000,parse_dates=['send_date'])
#test_df = pd.read_csv(path+'test_df.csv',nrows=100000,parse_dates=['send_date'])

In [4]:
start = time()
print('Renaming of the new date variables created')
train_df['dow'] = train_df['dow'].map({0:'MON', 1:'TUE', 2:'WED', 3:'THR',4: 'FRI', 5: 'SAT', 6: 'SUN'} ).astype(str)
train_df['hour'] = train_df['hour'].map({0:'AM1', 1:'AM1', 2:'AM1', 3:'AM2', 4:'AM2', 5:'AM2', 6:'AM3', 7:'AM3', 8:'AM3',
                                         9:'AM4', 10:'AM4', 11:'AM4', 12:'PM1', 13:'PM1', 14:'PM1', 15:'PM2', 16:'PM2',
                                         17:'PM2', 18:'PM3', 19:'PM3', 20:'PM3', 21:'PM4', 22:'PM4', 23:'PM4'}).astype(str)
train_df['day'] = train_df['day'].map({1:'VEAR', 2:'VEAR', 3:'VEAR', 4:'VEAR', 5:'VEAR', 6:'EAR', 7:'EAR', 8:'EAR',
                                       9:'EAR', 10:'EAR', 11:'MID', 12:'MID', 13:'MID', 14:'MID', 15:'MID', 16:'VMID',
                                       17:'VMID', 18:'VMID', 19:'VMID', 20:'VMID', 21:'LAT', 22:'LAT', 23:'LAT', 
                                       24:'LAT', 25:'LAT', 26:'VLAT', 27:'VLAT', 28:'VLAT', 29:'VLAT', 30:'VLAT',
                                       31:'VLAT'}).astype(str)

test_df['dow'] = test_df['dow'].map({0:'MON', 1:'TUE', 2:'WED', 3:'THR',4: 'FRI', 5: 'SAT', 6: 'SUN'} ).astype(str)
test_df['hour'] = test_df['hour'].map({0:'AM1', 1:'AM1', 2:'AM1', 3:'AM2', 4:'AM2', 5:'AM2', 6:'AM3', 7:'AM3', 8:'AM3',
                                         9:'AM4', 10:'AM4', 11:'AM4', 12:'PM1', 13:'PM1', 14:'PM1', 15:'PM2', 16:'PM2',
                                         17:'PM2', 18:'PM3', 19:'PM3', 20:'PM3', 21:'PM4', 22:'PM4', 23:'PM4'}).astype(str)
test_df['day'] = test_df['day'].map({1:'VEAR', 2:'VEAR', 3:'VEAR', 4:'VEAR', 5:'VEAR', 6:'EAR', 7:'EAR', 8:'EAR',
                                       9:'EAR', 10:'EAR', 11:'MID', 12:'MID', 13:'MID', 14:'MID', 15:'MID', 16:'VMID',
                                       17:'VMID', 18:'VMID', 19:'VMID', 20:'VMID', 21:'LAT', 22:'LAT', 23:'LAT', 
                                       24:'LAT', 25:'LAT', 26:'VLAT', 27:'VLAT', 28:'VLAT', 29:'VLAT', 30:'VLAT',
                                       31:'VLAT'}).astype(str)


print('Time taken for renaming date variable: ', time()-start, ' secs')

Renaming of the new date variables created
Time taken for renaming date variable:  0.33565282821655273  secs


In [5]:
#ntraindf = train_df[['user_id','campaign_id','is_open','is_click','hour','day','dow','communication_type']]
#ntraindf.to_excel('train.xlsx', index=False)

In [6]:
#ntraindf.user_id.nunique()
#del ntraindf

In [7]:
train_df.columns

Index(['id', 'user_id', 'campaign_id', 'send_date', 'is_open', 'is_click',
       'hour', 'day', 'dow', 'communication_type', 'total_links',
       'no_of_internal_links', 'no_of_images', 'no_of_sections', 'email_body',
       'subject', 'email_url', 'sub_tokens', 'email_tokens'],
      dtype='object')

In [8]:
# Group by next campaign
GROUP_BY_NEXT_CAMP = [
    {'groupby': ['user_id']},
    {'groupby': ['user_id', 'communication_type']},
    {'groupby': ['user_id', 'day']},
    {'groupby': ['user_id', 'dow']},
]

for spec in GROUP_BY_NEXT_CAMP:
    new_feature = '{}_nxt'.format('_'.join(spec['groupby']))    
    all_features = spec['groupby'] + ['send_date']
    train_df[new_feature] = train_df[all_features].groupby(spec['groupby']).send_date.transform(lambda x: x.diff().shift(-1)).dt.seconds
print('Group by next campaign for train complete')
    
for spec in GROUP_BY_NEXT_CAMP:
    new_feature = '{}_nxt'.format('_'.join(spec['groupby']))    
    all_features = spec['groupby'] + ['send_date']
    test_df[new_feature] = test_df[all_features].groupby(spec['groupby']).send_date.transform(lambda x: x.diff().shift(-1)).dt.seconds
print('Group by next campaign for test complete')

Group by next campaign for train complete
Group by next campaign for test complete


In [12]:
print("Assigning data types for new variables in train data")
train_df['user_id_nxt'] = train_df['user_id_nxt'].fillna(0).astype('uint16')
train_df['user_id_communication_type_nxt'] = train_df['user_id_communication_type_nxt'].fillna(0).astype('uint16')
train_df['user_id_day_nxt'] = train_df['user_id_day_nxt'].fillna(0).astype('uint16')
train_df['user_id_dow_nxt'] = train_df['user_id_dow_nxt'].fillna(0).astype('uint16')

Assigning data types for new variables in train data


In [14]:
print("Assigning data types for new variables in train data")
test_df['user_id_nxt'] = test_df['user_id_nxt'].fillna(0).astype('uint16')
test_df['user_id_communication_type_nxt'] = test_df['user_id_communication_type_nxt'].fillna(0).astype('uint16')
test_df['user_id_day_nxt'] = test_df['user_id_day_nxt'].fillna(0).astype('uint16')
test_df['user_id_dow_nxt'] = test_df['user_id_dow_nxt'].fillna(0).astype('uint16')

Assigning data types for new variables in train data


In [15]:
# Number of prev ang next campaign
HISTORY_CAMP = {
    'typ1': ['user_id', 'communication_type'],
    'typ2': ['user_id', 'communication_type', 'dow'],
    'typ3': ['user_id', 'communication_type', 'day'],
}

for fname, fset in HISTORY_CAMP.items():
    train_df['prev_'+fname] = train_df.groupby(fset).cumcount().rename('prev_'+fname)
    train_df['futr_'+fname] = train_df.iloc[::-1].groupby(fset).cumcount().rename('futr_'+fname).iloc[::-1]
print('Group by previous and next campaign for train complete')
    
for fname, fset in HISTORY_CAMP.items():
    test_df['prev_'+fname] = test_df.groupby(fset).cumcount().rename('prev_'+fname)
    test_df['futr_'+fname] = test_df.iloc[::-1].groupby(fset).cumcount().rename('futr_'+fname).iloc[::-1]
print('Group by previous and next campaign for test complete')

Group by previous and next campaign for train complete
Group by previous and next campaign for test complete


In [16]:
print("Assigning data types for new variables in train data")
train_df['prev_typ1'] = train_df['prev_typ1'].fillna(0).astype('uint16')
train_df['prev_typ2'] = train_df['prev_typ2'].fillna(0).astype('uint16')
train_df['prev_typ3'] = train_df['prev_typ3'].fillna(0).astype('uint16')
train_df['futr_typ1'] = train_df['futr_typ1'].fillna(0).astype('uint16')
train_df['futr_typ2'] = train_df['futr_typ2'].fillna(0).astype('uint16')
train_df['futr_typ3'] = train_df['futr_typ3'].fillna(0).astype('uint16')

Assigning data types for new variables in train data


In [17]:
print("Assigning data types for new variables in train data")
test_df['prev_typ1'] = test_df['prev_typ1'].fillna(0).astype('uint16')
test_df['prev_typ2'] = test_df['prev_typ2'].fillna(0).astype('uint16')
test_df['prev_typ3'] = test_df['prev_typ3'].fillna(0).astype('uint16')
test_df['futr_typ1'] = test_df['futr_typ1'].fillna(0).astype('uint16')
test_df['futr_typ2'] = test_df['futr_typ2'].fillna(0).astype('uint16')
test_df['futr_typ3'] = test_df['futr_typ3'].fillna(0).astype('uint16')

Assigning data types for new variables in train data


In [18]:
# Grouping of variables # User_ID level
print(' User_ID Level Grouping variables creation for train_data')

# Dow
gp = train_df[['user_id','dow','campaign_id']].groupby(by=['user_id','dow'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'udowcnt'})
train_df = train_df.merge(gp, on=['user_id','dow'], how='left')
test_df = test_df.merge(gp, on=['user_id','dow'], how='left')
del gp
gc.collect()

# Day
gp = train_df[['user_id','day','campaign_id']].groupby(by=['user_id','day'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'udaycnt'})
train_df = train_df.merge(gp, on=['user_id','day'], how='left')
test_df = test_df.merge(gp, on=['user_id','day'], how='left')
del gp
gc.collect()

# Hour
gp = train_df[['user_id','hour','campaign_id']].groupby(by=['user_id','hour'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'uhourcnt'})
train_df = train_df.merge(gp, on=['user_id','hour'], how='left')
test_df = test_df.merge(gp, on=['user_id','hour'], how='left')
del gp
gc.collect()


 User_ID Level Grouping variables creation for train_data


108

In [19]:
print("Assigning data types for variables train data")
train_df['udowcnt'] = train_df['udowcnt'].fillna(0).astype('uint16')
train_df['udaycnt'] = train_df['udaycnt'].fillna(0).astype('uint16')
train_df['uhourcnt'] = train_df['uhourcnt'].fillna(0).astype('uint16')

Assigning data types for variables train data


In [20]:
print("Assigning data types for variables test data")
test_df['udowcnt'] = test_df['udowcnt'].fillna(0).astype('uint16')
test_df['udaycnt'] = test_df['udaycnt'].fillna(0).astype('uint16')
test_df['uhourcnt'] = test_df['uhourcnt'].fillna(0).astype('uint16')

Assigning data types for variables test data


In [21]:
train_df.columns

Index(['id', 'user_id', 'campaign_id', 'send_date', 'is_open', 'is_click',
       'hour', 'day', 'dow', 'communication_type', 'total_links',
       'no_of_internal_links', 'no_of_images', 'no_of_sections', 'email_body',
       'subject', 'email_url', 'sub_tokens', 'email_tokens', 'user_id_nxt',
       'user_id_communication_type_nxt', 'user_id_day_nxt', 'user_id_dow_nxt',
       'prev_typ1', 'futr_typ1', 'prev_typ2', 'futr_typ2', 'prev_typ3',
       'futr_typ3', 'udowcnt', 'udaycnt', 'uhourcnt'],
      dtype='object')

In [22]:
# Combining the data
train_df, dev_df = train_test_split(train_df, random_state=42, test_size=0.10)

print('Shape of the train data is ',train_df.shape,' Test data is ',test_df.shape,' Valid data is',dev_df.shape)
print('Time taken for importing the data is: ', time()-start, ' secs')

Y_train = train_df.is_click.values.reshape(-11, 1)
Y_dev = dev_df.is_click.values.reshape(-1, 1)
test_id = test_df['id'].values
del test_df['id'],train_df['is_open'],train_df['is_click'],dev_df['is_open'],dev_df['is_click']

n_trains = train_df.shape[0]
n_devs = dev_df.shape[0]
n_test = test_df.shape[0]

full_df = pd.concat([train_df,dev_df,test_df])
del train_df,dev_df,test_df
gc.collect()

Shape of the train data is  (180000, 32)  Test data is  (100000, 30)  Valid data is (20000, 32)
Time taken for importing the data is:  1064.1751079559326  secs


47

In [23]:
# Converting the Categorical data
print("Processing categorical data...")
start = time()
le = LabelEncoder()

le.fit(full_df.user_id)
full_df.user_id = le.transform(full_df.user_id)

le.fit(full_df.dow)
full_df.dow = le.transform(full_df.dow)

le.fit(full_df.hour)
full_df.hour = le.transform(full_df.hour)

le.fit(full_df.day)
full_df.day = le.transform(full_df.day)

le.fit(full_df.communication_type)
full_df.communication_type = le.transform(full_df.communication_type)

del le
print('Time taken for processing categorical is:', time()-start )


Processing categorical data...
Time taken for processing categorical is: 2.05899715423584


In [25]:
# Transform text data into sequences
print("Transforming text data to sequences...")
start = time()
raw_text = np.hstack([full_df.subject.str.lower(), full_df.email_body.str.lower()])

print("Fitting tokenizer...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)

print("Transforming text to sequences...")
full_df['seq_subject'] = tok_raw.texts_to_sequences(full_df.subject.str.lower())
full_df['seq_email'] = tok_raw.texts_to_sequences(full_df.email_body.str.lower())

del raw_text
#del tok_raw
print('Time taken for Transforming text data to sequences is:', time()-start)

Transforming text data to sequences...
Fitting tokenizer...
Transforming text to sequences...
Time taken for Transforming text data to sequences is: 274.07536911964417


In [26]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 38762 to 99999
Data columns (total 32 columns):
campaign_id                       300000 non-null int64
communication_type                300000 non-null int64
day                               300000 non-null int64
dow                               300000 non-null int64
email_body                        300000 non-null object
email_tokens                      300000 non-null object
email_url                         300000 non-null object
futr_typ1                         300000 non-null uint16
futr_typ2                         300000 non-null uint16
futr_typ3                         300000 non-null uint16
hour                              300000 non-null int64
id                                200000 non-null object
no_of_images                      300000 non-null int64
no_of_internal_links              300000 non-null int64
no_of_sections                    300000 non-null int64
prev_typ1                         30000

In [27]:
# Define constants to use when define RNN model
MAX_SUB_SEQ = 20
MAX_EMAIL_SEQ = 200
MAX_TEXT = np.max([np.max(full_df.seq_subject.max()), np.max(full_df.seq_email.max()),]) + 4
MAX_USER = np.max(full_df.user_id.max()) + 1
MAX_DOW = np.max(full_df.dow.max()) + 1
MAX_HOUR = np.max(full_df.hour.max()) + 1
MAX_DAY = np.max(full_df.day.max()) + 1
MAX_COM = np.max(full_df.communication_type.max()) + 1
max_udowcnt = np.max(full_df.udowcnt.max())+1
max_udaycnt = np.max(full_df.udaycnt.max())+1
max_uhourcnt = np.max(full_df.uhourcnt.max())+1
max_user_id_nxt = np.max(full_df.user_id_nxt.max())+1
max_user_id_communication_type_nxt = np.max(full_df.user_id_communication_type_nxt.max())+1
max_user_id_day_nxt = np.max(full_df.user_id_day_nxt.max())+1
max_user_id_dow_nxt = np.max(full_df.user_id_dow_nxt.max())+1
max_prev_typ1 = np.max(full_df.prev_typ1.max())+1
max_prev_typ2 = np.max(full_df.prev_typ2.max())+1
max_prev_typ3 = np.max(full_df.prev_typ3.max())+1
max_futr_typ1 = np.max(full_df.futr_typ1.max())+1
max_futr_typ2 = np.max(full_df.futr_typ2.max())+1
max_futr_typ3 = np.max(full_df.futr_typ3.max())+1

In [28]:
# Generate data for the RNN

def get_keras_data(df):
    X = {
        'user_id': np.array(df.user_id),
        'subject': pad_sequences(df.seq_subject, maxlen=MAX_SUB_SEQ),
        'email': pad_sequences(df.seq_email, maxlen=MAX_EMAIL_SEQ),
        'dow': np.array(df.dow),
        'hour': np.array(df.hour),
        'day': np.array(df.day),
        'communication_type': np.array(df.communication_type),
        'udowcnt': np.array(df.udowcnt),
        'udaycnt': np.array(df.udaycnt),
        'uhourcnt': np.array(df.uhourcnt),
        'user_id_nxt': np.array(df.user_id_nxt),
        'user_id_communication_type_nxt': np.array(df.user_id_communication_type_nxt),
        'user_id_day_nxt': np.array(df.user_id_day_nxt),
        'user_id_dow_nxt': np.array(df.user_id_dow_nxt),
        'prev_typ1': np.array(df.prev_typ1),
        'prev_typ2': np.array(df.prev_typ2),
        'prev_typ3': np.array(df.prev_typ3),
        'futr_typ1': np.array(df.futr_typ1),
        'futr_typ2': np.array(df.futr_typ2),
        'futr_typ3': np.array(df.futr_typ3),
    }
    return X

In [29]:
print("Generate data for the RNN...")
start = time()
train = full_df[:n_trains]
dev = full_df[n_trains:n_trains+n_devs]
test = full_df[n_trains+n_devs:]

X_train = get_keras_data(train)
X_dev = get_keras_data(dev)
X_test = get_keras_data(test)
print('Time taken for Generate data for the RNN is:', time()-start )

Generate data for the RNN...
Time taken for Generate data for the RNN is: 10.397393941879272


In [54]:
# defining the RNN Model

def new_rnn_model(lr=0.001, decay=0.0):    
    # Inputs
    user_id = Input(shape=[1], name="user_id")
    subject = Input(shape=[X_train["subject"].shape[1]], name="subject")
    email = Input(shape=[X_train["email"].shape[1]], name="email")
    dow = Input(shape=[1], name="dow")
    hour = Input(shape=[1], name="hour")
    day = Input(shape=[1], name="day")
    communication_type = Input(shape=[1], name="communication_type")
    udowcnt = Input(shape=[1], name="udowcnt")
    udaycnt = Input(shape=[1], name="udaycnt")
    uhourcnt = Input(shape=[1], name="uhourcnt")
    user_id_nxt = Input(shape=[1], name="user_id_nxt")
    user_id_communication_type_nxt = Input(shape=[1], name="user_id_communication_type_nxt")
    user_id_day_nxt = Input(shape=[1], name="user_id_day_nxt")
    user_id_dow_nxt = Input(shape=[1], name="user_id_dow_nxt")
    prev_typ1 = Input(shape=[1], name="prev_typ1")
    prev_typ2 = Input(shape=[1], name="prev_typ2")
    prev_typ3 = Input(shape=[1], name="prev_typ3")
    futr_typ1 = Input(shape=[1], name="futr_typ1")
    futr_typ2 = Input(shape=[1], name="futr_typ2")
    futr_typ3 = Input(shape=[1], name="futr_typ3")

    
    # Embeddings layers
    emb_user_id = Embedding(MAX_USER, 10)(user_id)
    emb_subject = Embedding(MAX_TEXT, 20)(subject)
    emb_email = Embedding(MAX_TEXT, 60)(email)
    emb_dow = Embedding(MAX_DOW, 10)(dow)
    emb_hour = Embedding(MAX_HOUR, 10)(hour)
    emb_day = Embedding(MAX_DAY, 5)(day)
    emb_communication_type = Embedding(MAX_COM, 5)(communication_type)    
    emb_udowcnt = Embedding(max_udowcnt, 5)(udowcnt)
    emb_udaycnt = Embedding(max_udaycnt, 5)(udaycnt)
    emb_uhourcnt = Embedding(max_uhourcnt, 5)(uhourcnt)
    emb_user_id_nxt = Embedding(max_user_id_nxt, 5)(user_id_nxt)
    emb_user_id_communication_type_nxt = Embedding(max_user_id_communication_type_nxt, 5)(user_id_communication_type_nxt)
    emb_user_id_day_nxt = Embedding(max_user_id_day_nxt, 5)(user_id_day_nxt)
    emb_user_id_dow_nxt = Embedding(max_user_id_dow_nxt, 5)(user_id_dow_nxt)
    emb_prev_typ1 = Embedding(max_prev_typ1, 5)(prev_typ1)
    emb_prev_typ2 = Embedding(max_prev_typ2, 5)(prev_typ2)
    emb_prev_typ3 = Embedding(max_prev_typ3, 5)(prev_typ3)
    emb_futr_typ1 = Embedding(max_futr_typ1, 5)(futr_typ1)
    emb_futr_typ2 = Embedding(max_futr_typ2, 5)(futr_typ2)
    emb_futr_typ3 = Embedding(max_futr_typ3, 5)(futr_typ3)

    
    # rnn layers
    rnn_layer1 = GRU(32) (emb_email)
    rnn_layer2 = GRU(16) (emb_subject)

    # main layers
    main_l = concatenate([
        Flatten()(emb_user_id),
        Flatten()(emb_dow),
        Flatten()(emb_hour),
        Flatten()(emb_day),
        Flatten()(emb_communication_type),
        Flatten()(emb_udowcnt),
        Flatten()(emb_udaycnt),
        Flatten()(emb_uhourcnt),
        Flatten()(emb_user_id_nxt),
        Flatten()(emb_user_id_communication_type_nxt),
        Flatten()(emb_user_id_day_nxt),
        Flatten()(emb_user_id_dow_nxt),
        Flatten()(emb_prev_typ1),
        Flatten()(emb_prev_typ2),
        Flatten()(emb_prev_typ3),
        Flatten()(emb_futr_typ1),
        Flatten()(emb_futr_typ2),
        Flatten()(emb_futr_typ3),
        rnn_layer1,
        rnn_layer2
    ])
    
    """s_dout = SpatialDropout1D(0.2)(fe)
    fl1 = Flatten()(s_dout)
    conv = Conv1D(100, kernel_size=4, strides=1, padding='same')(s_dout)
    fl2 = Flatten()(conv)
    concat = concatenate([(fl1), (fl2)])
    x = Dropout(0.2)(Dense(dense_n,activation='relu')(concat))
    x = Dropout(0.2)(Dense(dense_n,activation='relu')(x))
    outp = Dense(1,activation='sigmoid')(x)"""

    main_l = Dropout(0.2)(main_l)
    main_l = Dense(512)(main_l)
    main_l = Activation('relu')(main_l)

    main_l = Dropout(0.2)(main_l)
    main_l = Dense(256)(main_l)
    main_l = Activation('relu')(main_l)
    
    main_l = Dropout(0.2)(main_l)
    main_l = Dense(64)(main_l)
    main_l = Activation('relu')(main_l)
    
    # the output layer.
    output = Dense(1, activation="sigmoid") (main_l)
    model = Model([user_id, subject, email, dow , hour, day, communication_type, udowcnt, udaycnt, uhourcnt,
                   user_id_nxt, user_id_communication_type_nxt, user_id_day_nxt, user_id_dow_nxt,
                   prev_typ1, futr_typ1, prev_typ2, futr_typ2, prev_typ3, futr_typ3], output) #udowclk,udayclk,uhourclk

    optimizer = Adam(lr=lr, decay=decay)
    model.compile(loss="binary_crossentropy", optimizer=optimizer,metrics=['accuracy'])

    return model

model = new_rnn_model()
#model.summary()
#del model

In [55]:
# Learning for the RNN
print("Training the RNN...")
start = time()
# Set hyper parameters for the model.
BATCH_SIZE = 1024
epochs = 3

# Calculate learning rate decay.
exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
steps = int(n_trains / BATCH_SIZE) * epochs
lr_init, lr_fin = 0.007, 0.0005
lr_decay = exp_decay(lr_init, lr_fin, steps)

rnn_model = new_rnn_model(lr=lr_init, decay=lr_decay)

rnn_model.fit(X_train, Y_train, epochs=epochs, 
            batch_size=BATCH_SIZE,validation_data=(X_dev, Y_dev), verbose=2)
print('Time taken for Training the RNN:', time()-start )

print("Evaluating the model on validation data...")
Y_dev_preds_rnn = rnn_model.predict(X_dev, batch_size=BATCH_SIZE)
print("ROC-AUC error:", roc_auc_score(Y_dev, Y_dev_preds_rnn))

Training the RNN...
Train on 180000 samples, validate on 20000 samples
Epoch 1/3
 - 235s - loss: 0.0793 - acc: 0.9868 - val_loss: 0.0706 - val_acc: 0.9867
Epoch 2/3
 - 220s - loss: 0.0411 - acc: 0.9869 - val_loss: 0.1001 - val_acc: 0.9867
Epoch 3/3
 - 223s - loss: 0.0172 - acc: 0.9910 - val_loss: 0.2074 - val_acc: 0.9349
Time taken for Training the RNN: 681.9215769767761
Evaluating the model on validation data...
ROC-AUC error: 0.5623234507673868


In [47]:
rnn_preds = rnn_model.predict(X_dev, batch_size=BATCH_SIZE, verbose=1)
print ('Prediction for RNN complete')

Prediction for RNN complete


In [None]:
out = pd.DataFrame({'id': test_id, 'is_click': rnn_preds.reshape(-1)})
out.head()
out.to_csv("pred4.csv", index=False)