In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
#import xgboost as xgb
import lightgbm as lgb
from time import time
import gc
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [3]:
path = '/Users/804357/Desktop/MyFiles/Learn/LOM/Data/'
#path = '/Users/Vishy/Files/AVDatahack/LOM/Data/' 
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')
camp = pd.read_csv(path+'campaign_data.csv')

train_df = pd.merge(train, camp, on='campaign_id')
test_df = pd.merge(test, camp, on='campaign_id')
train_df, dev_df = train_test_split(train_df, random_state=42, test_size=0.10)

print('Shape of the train data is ',train_df.shape,' Test data is ',test_df.shape,' Valid data is',dev_df.shape)

Shape of the train data is  (920871, 14)  Test data is  (773858, 12)  Valid data is (102320, 14)


In [33]:
camp.communication_type.value_counts()

Hackathon          13
Corporate          12
Newsletter          9
Conference          8
Upcoming Events     7
Others              2
Webinar             1
Name: communication_type, dtype: int64

In [4]:
n_trains = train_df.shape[0]
n_devs = dev_df.shape[0]
n_tests = test_df.shape[0]

full_df = pd.concat([train_df,dev_df,test_df])
del train_df,dev_df,test_df
gc.collect()

33

In [5]:
print('New date variable creations')
#train
full_df['hour'] = pd.to_datetime(full_df.send_date).dt.hour.astype('uint8')
full_df['day'] = pd.to_datetime(full_df.send_date).dt.day.astype('uint8')
full_df['dow'] = pd.to_datetime(full_df.send_date).dt.dayofweek.astype('uint8')

New date variable creations


In [6]:
full_df.columns.values

array(['campaign_id', 'communication_type', 'email_body', 'email_url',
       'id', 'is_click', 'is_open', 'no_of_images', 'no_of_internal_links',
       'no_of_sections', 'send_date', 'subject', 'total_links', 'user_id',
       'hour', 'day', 'dow'], dtype=object)

In [7]:
print('Renaming of the new date variables created')
full_df['dow'] = full_df['dow'].map({0:'MON', 1:'TUE', 2:'WED', 3:'THR',4: 'FRI', 5: 'SAT', 6: 'SUN'} ).astype(str)
full_df['hour'] = full_df['hour'].map({0:'AM1', 1:'AM1', 2:'AM1', 3:'AM2', 4:'AM2', 5:'AM2', 6:'AM3', 7:'AM3', 8:'AM3',
                                         9:'AM4', 10:'AM4', 11:'AM4', 12:'PM1', 13:'PM1', 14:'PM1', 15:'PM2', 16:'PM2',
                                         17:'PM2', 18:'PM3', 19:'PM3', 20:'PM3', 21:'PM4', 22:'PM4', 23:'PM4'}).astype(str)
full_df['day'] = full_df['day'].map({1:'VEAR', 2:'VEAR', 3:'VEAR', 4:'VEAR', 5:'VEAR', 6:'EAR', 7:'EAR', 8:'EAR',
                                       9:'EAR', 10:'EAR', 11:'MID', 12:'MID', 13:'MID', 14:'MID', 15:'MID', 16:'VMID',
                                       17:'VMID', 18:'VMID', 19:'VMID', 20:'VMID', 21:'LAT', 22:'LAT', 23:'LAT', 
                                       24:'LAT', 25:'LAT', 26:'VLAT', 27:'VLAT', 28:'VLAT', 29:'VLAT', 30:'VLAT',
                                       31:'VLAT'}).astype(str)

Renaming of the new date variables created


In [8]:
train_df = full_df[:n_trains]
dev_df = full_df[n_trains:n_trains+n_devs]
test_df = full_df[n_trains+n_devs:]
print(train_df.shape,test_df.shape,dev_df.shape)

(920871, 17) (773858, 17) (102320, 17)


In [9]:
# del gp

In [10]:
print('Grouping variables creation for train_data')
# ip - user_id,  Channel - campaign_id

# Campaigns per communication
gp = train_df[['communication_type','campaign_id']].groupby(by=['communication_type'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'totcom'})
train_df = train_df.merge(gp, on=['communication_type'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type'], how='left')
test_df = test_df.merge(gp, on=['communication_type'], how='left')
del gp
gc.collect()
# open count per communication
gp = train_df[['communication_type','is_open']].groupby(by=['communication_type'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opncom'})
train_df = train_df.merge(gp, on=['communication_type'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type'], how='left')
test_df = test_df.merge(gp, on=['communication_type'], how='left')
del gp
gc.collect()
# Click count per communication
gp = train_df[['communication_type','is_click']].groupby(by=['communication_type'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkcom'})
train_df = train_df.merge(gp, on=['communication_type'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type'], how='left')
test_df = test_df.merge(gp, on=['communication_type'], how='left')
del gp
gc.collect()


gp = train_df[['communication_type','dow','campaign_id']].groupby(by=['communication_type','dow'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'totdow'})
train_df = train_df.merge(gp, on=['communication_type','dow'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','dow'], how='left')
test_df = test_df.merge(gp, on=['communication_type','dow'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','dow','is_open']].groupby(by=['communication_type','dow'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opndow'})
train_df = train_df.merge(gp, on=['communication_type','dow'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','dow'], how='left')
test_df = test_df.merge(gp, on=['communication_type','dow'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','dow','is_click']].groupby(by=['communication_type','dow'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkdow'})
train_df = train_df.merge(gp, on=['communication_type','dow'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','dow'], how='left')
test_df = test_df.merge(gp, on=['communication_type','dow'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','day','campaign_id']].groupby(by=['communication_type','day'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'totday'})
train_df = train_df.merge(gp, on=['communication_type','day'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','day'], how='left')
test_df = test_df.merge(gp, on=['communication_type','day'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','day','is_open']].groupby(by=['communication_type','day'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opnday'})
train_df = train_df.merge(gp, on=['communication_type','day'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','day'], how='left')
test_df = test_df.merge(gp, on=['communication_type','day'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','day','is_click']].groupby(by=['communication_type','day'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkday'})
train_df = train_df.merge(gp, on=['communication_type','day'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','day'], how='left')
test_df = test_df.merge(gp, on=['communication_type','day'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','hour','campaign_id']].groupby(by=['communication_type','hour'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'tothour'})
train_df = train_df.merge(gp, on=['communication_type','hour'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','hour'], how='left')
test_df = test_df.merge(gp, on=['communication_type','hour'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','hour','is_open']].groupby(by=['communication_type','hour'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opnhour'})
train_df = train_df.merge(gp, on=['communication_type','hour'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','hour'], how='left')
test_df = test_df.merge(gp, on=['communication_type','hour'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','hour','is_click']].groupby(by=['communication_type','hour'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkhour'})
train_df = train_df.merge(gp, on=['communication_type','hour'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','hour'], how='left')
test_df = test_df.merge(gp, on=['communication_type','hour'], how='left')
del gp
gc.collect()


Grouping variables creation for train_data


118

In [11]:
print("Assigning data types for Training variables")
train_df['totcom'] = train_df['totcom'].fillna(0).astype('uint16')
train_df['opncom'] = train_df['opncom'].fillna(0).astype('uint16')
train_df['clkcom'] = train_df['clkcom'].fillna(0).astype('uint16')
train_df['opncomrate'] = ((train_df['opncom']/train_df['totcom']).replace(np.inf, 0))
train_df['clkcomrate'] = ((train_df['clkcom']/train_df['totcom']).replace(np.inf, 0))

train_df['totdow'] = train_df['totdow'].fillna(0).astype('uint16')
train_df['opndow'] = train_df['opndow'].fillna(0).astype('uint16')
train_df['clkdow'] = train_df['clkdow'].fillna(0).astype('uint16')
train_df['opndowrate'] = ((train_df['opndow']/train_df['totdow']).replace(np.inf, 0))
train_df['clkdowrate'] = ((train_df['clkdow']/train_df['totdow']).replace(np.inf, 0))

train_df['totday'] = train_df['totday'].fillna(0).astype('uint16')
train_df['opnday'] = train_df['opnday'].fillna(0).astype('uint16')
train_df['clkday'] = train_df['clkday'].fillna(0).astype('uint16')
train_df['opndayrate'] = ((train_df['opnday']/train_df['totday']).replace(np.inf, 0))
train_df['clkdayrate'] = ((train_df['clkday']/train_df['totday']).replace(np.inf, 0))

train_df['tothour'] = train_df['tothour'].fillna(0).astype('uint16')
train_df['opnhour'] = train_df['opnhour'].fillna(0).astype('uint16')
train_df['clkhour'] = train_df['clkhour'].fillna(0).astype('uint16')
train_df['opnhourrate'] = ((train_df['opnhour']/train_df['tothour']).replace(np.inf, 0))
train_df['clkhourrate'] = ((train_df['clkhour']/train_df['tothour']).replace(np.inf, 0))

del train_df['totcom'],train_df['opncom'],train_df['clkcom'],train_df['totdow'],train_df['opndow'],train_df['clkdow']
del train_df['totday'],train_df['opnday'],train_df['clkday'],train_df['tothour'],train_df['opnhour'],train_df['clkhour']

Assigning data types for Training variables


In [12]:
dev_df['totcom'] = dev_df['totcom'].fillna(0).astype('uint16')
dev_df['opncom'] = dev_df['opncom'].fillna(0).astype('uint16')
dev_df['clkcom'] = dev_df['clkcom'].fillna(0).astype('uint16')
dev_df['opncomrate'] = ((dev_df['opncom']/dev_df['totcom']).replace(np.inf, 0))
dev_df['clkcomrate'] = ((dev_df['clkcom']/dev_df['totcom']).replace(np.inf, 0))

dev_df['totdow'] = dev_df['totdow'].fillna(0).astype('uint16')
dev_df['opndow'] = dev_df['opndow'].fillna(0).astype('uint16')
dev_df['clkdow'] = dev_df['clkdow'].fillna(0).astype('uint16')
dev_df['opndowrate'] = ((dev_df['opndow']/dev_df['totdow']).replace(np.inf, 0))
dev_df['clkdowrate'] = ((dev_df['clkdow']/dev_df['totdow']).replace(np.inf, 0))

dev_df['totday'] = dev_df['totday'].fillna(0).astype('uint16')
dev_df['opnday'] = dev_df['opnday'].fillna(0).astype('uint16')
dev_df['clkday'] = dev_df['clkday'].fillna(0).astype('uint16')
dev_df['opndayrate'] = ((dev_df['opnday']/dev_df['totday']).replace(np.inf, 0))
dev_df['clkdayrate'] = ((dev_df['clkday']/dev_df['totday']).replace(np.inf, 0))

dev_df['tothour'] = dev_df['tothour'].fillna(0).astype('uint16')
dev_df['opnhour'] = dev_df['opnhour'].fillna(0).astype('uint16')
dev_df['clkhour'] = dev_df['clkhour'].fillna(0).astype('uint16')
dev_df['opnhourrate'] = ((dev_df['opnhour']/dev_df['tothour']).replace(np.inf, 0))
dev_df['clkhourrate'] = ((dev_df['clkhour']/dev_df['tothour']).replace(np.inf, 0))

del dev_df['totcom'],dev_df['opncom'],dev_df['clkcom'],dev_df['totdow'],dev_df['opndow'],dev_df['clkdow']
del dev_df['totday'],dev_df['opnday'],dev_df['clkday'],dev_df['tothour'],dev_df['opnhour'],dev_df['clkhour']

In [13]:
test_df['totcom'] = test_df['totcom'].fillna(0).astype('uint16')
test_df['opncom'] = test_df['opncom'].fillna(0).astype('uint16')
test_df['clkcom'] = test_df['clkcom'].fillna(0).astype('uint16')
test_df['opncomrate'] = ((test_df['opncom']/test_df['totcom']).replace(np.inf, 0))
test_df['clkcomrate'] = ((test_df['clkcom']/test_df['totcom']).replace(np.inf, 0))

test_df['totdow'] = test_df['totdow'].fillna(0).astype('uint16')
test_df['opndow'] = test_df['opndow'].fillna(0).astype('uint16')
test_df['clkdow'] = test_df['clkdow'].fillna(0).astype('uint16')
test_df['opndowrate'] = ((test_df['opndow']/test_df['totdow']).replace(np.inf, 0))
test_df['clkdowrate'] = ((test_df['clkdow']/test_df['totdow']).replace(np.inf, 0))

test_df['totday'] = test_df['totday'].fillna(0).astype('uint16')
test_df['opnday'] = test_df['opnday'].fillna(0).astype('uint16')
test_df['clkday'] = test_df['clkday'].fillna(0).astype('uint16')
test_df['opndayrate'] = ((test_df['opnday']/test_df['totday']).replace(np.inf, 0))
test_df['clkdayrate'] = ((test_df['clkday']/test_df['totday']).replace(np.inf, 0))

test_df['tothour'] = test_df['tothour'].fillna(0).astype('uint16')
test_df['opnhour'] = test_df['opnhour'].fillna(0).astype('uint16')
test_df['clkhour'] = test_df['clkhour'].fillna(0).astype('uint16')
test_df['opnhourrate'] = ((test_df['opnhour']/test_df['tothour']).replace(np.inf, 0))
test_df['clkhourrate'] = ((test_df['clkhour']/test_df['tothour']).replace(np.inf, 0))

del test_df['totcom'],test_df['opncom'],test_df['clkcom'],test_df['totdow'],test_df['opndow'],test_df['clkdow']
del test_df['totday'],test_df['opnday'],test_df['clkday'],test_df['tothour'],test_df['opnhour'],test_df['clkhour']

In [14]:
train_df.columns

Index(['campaign_id', 'communication_type', 'email_body', 'email_url', 'id',
       'is_click', 'is_open', 'no_of_images', 'no_of_internal_links',
       'no_of_sections', 'send_date', 'subject', 'total_links', 'user_id',
       'hour', 'day', 'dow', 'opncomrate', 'clkcomrate', 'opndowrate',
       'clkdowrate', 'opndayrate', 'clkdayrate', 'opnhourrate', 'clkhourrate'],
      dtype='object')

In [16]:
test_df.columns

Index(['campaign_id', 'communication_type', 'email_body', 'email_url', 'id',
       'is_click', 'is_open', 'no_of_images', 'no_of_internal_links',
       'no_of_sections', 'send_date', 'subject', 'total_links', 'user_id',
       'hour', 'day', 'dow', 'opncomrate', 'clkcomrate', 'opndowrate',
       'clkdowrate', 'opndayrate', 'clkdayrate', 'opnhourrate', 'clkhourrate'],
      dtype='object')

In [17]:
Y_train = train_df[['is_click', 'is_open']]
Y_Dev = dev_df[[ 'is_click', 'is_open']]
del train_df['is_click'],train_df['is_open'],dev_df['is_click'],dev_df['is_open'],test_df['is_click'],test_df['is_open']

In [20]:
# Delete unnecessary variables
id_test = test_df['id']
del train_df['campaign_id'],train_df['email_body'],train_df['email_url'],train_df['id'],train_df['send_date'],train_df['subject'],train_df['user_id']
del dev_df['campaign_id'],dev_df['email_body'],dev_df['email_url'],dev_df['id'],dev_df['send_date'],dev_df['subject'],dev_df['user_id']
del test_df['campaign_id'],test_df['email_body'],test_df['email_url'],test_df['id'],test_df['send_date'],test_df['subject'],test_df['user_id']
print(train_df.shape,dev_df.shape,test_df.shape)

(920871, 16) (102320, 16) (773858, 16)


In [22]:
train_df.head()

Unnamed: 0,communication_type,no_of_images,no_of_internal_links,no_of_sections,total_links,hour,day,dow,opncomrate,clkcomrate,opndowrate,clkdowrate,opndayrate,clkdayrate,opnhourrate,clkhourrate
0,Conference,13,100,1,104,PM4,MID,SUN,0.720094,0.077426,0.24334,0.065421,0.24334,0.065421,0.24334,0.065421
1,Others,1,3,1,7,PM2,LAT,MON,0.201903,0.014378,0.226826,0.011877,0.226826,0.011877,0.223211,0.015696
2,Conference,16,117,1,119,PM2,VLAT,THR,0.720094,0.077426,0.500594,0.122309,0.500594,0.122309,0.65136,0.063647
3,Upcoming Events,7,14,1,18,PM1,EAR,SUN,3.965905,0.555484,0.141751,0.012699,3.965905,0.555484,0.321901,0.039611
4,Newsletter,13,79,4,88,PM3,EAR,MON,2.65794,0.398875,0.505684,0.121652,0.262997,0.062272,1.064677,0.121755


In [28]:
train_df = pd.get_dummies(data=train_df, columns=['communication_type','dow','hour','day'], drop_first=True)
dev_df = pd.get_dummies(data=dev_df, columns=['communication_type','dow','hour','day'], drop_first=True)
test_df = pd.get_dummies(data=test_df, columns=['communication_type','dow','hour','day'], drop_first=True)
print (train_df.shape,test_df.shape,dev_df.shape)

(920871, 32) (773858, 30) (102320, 32)


In [34]:
dev_df.columns

Index(['no_of_images', 'no_of_internal_links', 'no_of_sections', 'total_links',
       'opncomrate', 'clkcomrate', 'opndowrate', 'clkdowrate', 'opndayrate',
       'clkdayrate', 'opnhourrate', 'clkhourrate',
       'communication_type_Corporate', 'communication_type_Hackathon',
       'communication_type_Newsletter', 'communication_type_Others',
       'communication_type_Upcoming Events', 'communication_type_Webinar',
       'dow_MON', 'dow_SAT', 'dow_SUN', 'dow_THR', 'dow_TUE', 'dow_WED',
       'hour_PM1', 'hour_PM2', 'hour_PM3', 'hour_PM4', 'day_LAT', 'day_MID',
       'day_VLAT', 'day_VMID'],
      dtype='object')

In [29]:
train_df.columns

Index(['no_of_images', 'no_of_internal_links', 'no_of_sections', 'total_links',
       'opncomrate', 'clkcomrate', 'opndowrate', 'clkdowrate', 'opndayrate',
       'clkdayrate', 'opnhourrate', 'clkhourrate',
       'communication_type_Corporate', 'communication_type_Hackathon',
       'communication_type_Newsletter', 'communication_type_Others',
       'communication_type_Upcoming Events', 'communication_type_Webinar',
       'dow_MON', 'dow_SAT', 'dow_SUN', 'dow_THR', 'dow_TUE', 'dow_WED',
       'hour_PM1', 'hour_PM2', 'hour_PM3', 'hour_PM4', 'day_LAT', 'day_MID',
       'day_VLAT', 'day_VMID'],
      dtype='object')

In [31]:
test_df.columns

Index(['no_of_images', 'no_of_internal_links', 'no_of_sections', 'total_links',
       'opncomrate', 'clkcomrate', 'opndowrate', 'clkdowrate', 'opndayrate',
       'clkdayrate', 'opnhourrate', 'clkhourrate',
       'communication_type_Hackathon', 'communication_type_Newsletter',
       'communication_type_Upcoming Events', 'dow_MON', 'dow_SAT', 'dow_SUN',
       'dow_THR', 'dow_TUE', 'dow_WED', 'hour_AM3', 'hour_AM4', 'hour_PM1',
       'hour_PM2', 'hour_PM3', 'hour_PM4', 'day_MID', 'day_VEAR', 'day_VMID'],
      dtype='object')

In [36]:
test_df['communication_type_Corporate'],test_df['communication_type_Others'],test_df['communication_type_Webinar'],test_df['day_VMID'],test_df['day_LAT'],test_df['day_VLAT'],test_df['hour_AM2']=0,0,0,0,0,0,0
train_df['hour_AM2'],train_df['hour_AM3'],train_df['hour_AM4'],train_df['day_VEAR'] = 0,0,0,0
dev_df['hour_AM2'],dev_df['hour_AM3'],dev_df['hour_AM4'],dev_df['day_VEAR'] = 0,0,0,0
print(train_df.shape,test_df.shape,dev_df.shape)

(920871, 36) (773858, 36) (102320, 36)


In [37]:
train_df.columns

Index(['no_of_images', 'no_of_internal_links', 'no_of_sections', 'total_links',
       'opncomrate', 'clkcomrate', 'opndowrate', 'clkdowrate', 'opndayrate',
       'clkdayrate', 'opnhourrate', 'clkhourrate',
       'communication_type_Corporate', 'communication_type_Hackathon',
       'communication_type_Newsletter', 'communication_type_Others',
       'communication_type_Upcoming Events', 'communication_type_Webinar',
       'dow_MON', 'dow_SAT', 'dow_SUN', 'dow_THR', 'dow_TUE', 'dow_WED',
       'hour_PM1', 'hour_PM2', 'hour_PM3', 'hour_PM4', 'day_LAT', 'day_MID',
       'day_VLAT', 'day_VMID', 'hour_AM2', 'hour_AM3', 'hour_AM4', 'day_VEAR'],
      dtype='object')

In [38]:
train_df=train_df[['no_of_images', 'no_of_internal_links', 'no_of_sections', 'total_links',
       'opncomrate', 'clkcomrate', 'opndowrate', 'clkdowrate', 'opndayrate',
       'clkdayrate', 'opnhourrate', 'clkhourrate',
       'communication_type_Corporate', 'communication_type_Hackathon',
       'communication_type_Newsletter', 'communication_type_Others',
       'communication_type_Upcoming Events', 'communication_type_Webinar',
       'dow_MON', 'dow_SAT', 'dow_SUN', 'dow_THR', 'dow_TUE', 'dow_WED',
       'hour_PM1', 'hour_PM2', 'hour_PM3', 'hour_PM4', 'day_LAT', 'day_MID',
       'day_VLAT', 'day_VMID', 'hour_AM2', 'hour_AM3', 'hour_AM4', 'day_VEAR']]
dev_df=dev_df[['no_of_images', 'no_of_internal_links', 'no_of_sections', 'total_links',
       'opncomrate', 'clkcomrate', 'opndowrate', 'clkdowrate', 'opndayrate',
       'clkdayrate', 'opnhourrate', 'clkhourrate',
       'communication_type_Corporate', 'communication_type_Hackathon',
       'communication_type_Newsletter', 'communication_type_Others',
       'communication_type_Upcoming Events', 'communication_type_Webinar',
       'dow_MON', 'dow_SAT', 'dow_SUN', 'dow_THR', 'dow_TUE', 'dow_WED',
       'hour_PM1', 'hour_PM2', 'hour_PM3', 'hour_PM4', 'day_LAT', 'day_MID',
       'day_VLAT', 'day_VMID', 'hour_AM2', 'hour_AM3', 'hour_AM4', 'day_VEAR']]
test_df=test_df[['no_of_images', 'no_of_internal_links', 'no_of_sections', 'total_links',
       'opncomrate', 'clkcomrate', 'opndowrate', 'clkdowrate', 'opndayrate',
       'clkdayrate', 'opnhourrate', 'clkhourrate',
       'communication_type_Corporate', 'communication_type_Hackathon',
       'communication_type_Newsletter', 'communication_type_Others',
       'communication_type_Upcoming Events', 'communication_type_Webinar',
       'dow_MON', 'dow_SAT', 'dow_SUN', 'dow_THR', 'dow_TUE', 'dow_WED',
       'hour_PM1', 'hour_PM2', 'hour_PM3', 'hour_PM4', 'day_LAT', 'day_MID',
       'day_VLAT', 'day_VMID', 'hour_AM2', 'hour_AM3', 'hour_AM4', 'day_VEAR']]

In [70]:
params = {'learning_rate': 0.002,'application': 'binay','max_depth': 8,'num_leaves': 60,'metric': 'AUC',
          'objective' : 'binary', 'data_random_seed': 1, 'bagging_fraction': 0.8,'nthread': 4,'scale_pos_weight':90}
 
params2 = {'learning_rate': 0.006,'application': 'binary','max_depth': 16,'num_leaves': 130,'metric': 'AUC',
           'objective' : 'binary', 'data_random_seed': 2,'bagging_fraction': 1,'nthread': 4,'scale_pos_weight':90} 

In [71]:
# Light GBM 
y_train = Y_train['is_click'].values
train_X, valid_X, train_y, valid_y = train_test_split(train_df,y_train,test_size = 0.2,random_state = 42)
d_train = lgb.Dataset(train_X, label=train_y) 
d_valid = lgb.Dataset(valid_X, label=valid_y) 
watchlist = [d_train, d_valid] 

model3 = lgb.train(params, train_set=d_train, num_boost_round=5000, valid_sets=watchlist,
                   early_stopping_rounds=100, verbose_eval=100) 
preds3 = model3.predict(dev_df)

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.591432	valid_1's auc: 0.593349
Early stopping, best iteration is:
[7]	training's auc: 0.59066	valid_1's auc: 0.595612


In [72]:
train_X2, valid_X2, train_y2, valid_y2 = train_test_split(train_df, y_train, test_size = 0.2, random_state = 42)  
d_train2 = lgb.Dataset(train_X2, label=train_y2)
d_valid2 = lgb.Dataset(valid_X2, label=valid_y2)
watchlist2 = [d_train2, d_valid2]

model4 = lgb.train(params2, train_set=d_train2, num_boost_round=6000, valid_sets=watchlist2, 
                   early_stopping_rounds=100, verbose_eval=100)  
preds4 = model4.predict(dev_df)

Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.591433	valid_1's auc: 0.593402
Early stopping, best iteration is:
[1]	training's auc: 0.591433	valid_1's auc: 0.593402


In [73]:
y_valid= Y_Dev['is_click'].values
print ('ROC-AUC score using LGB1 is', roc_auc_score(y_valid, preds3))
print ('ROC-AUC score using LGB2 is', roc_auc_score(y_valid, preds4))

ROC-AUC score using LGB1 is 0.591842155248
ROC-AUC score using LGB2 is 0.591608109419


In [74]:
preds3 = model3.predict(test_df)
preds4 = model4.predict(test_df)

In [75]:
preds = (0.50*preds3+0.50*preds4)

out = pd.DataFrame({'id': id_test, 'is_click': preds})
out.to_csv("pred3.csv", index=False)