In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb
from time import time
import gc
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [2]:
#path = '/Users/804357/Desktop/MyFiles/Learn/LOM/Data/'
path = '/Users/Vishy/Files/AVDatahack/LOM/Data/' 
train = pd.read_csv(path+'train.csv')
test = pd.read_csv(path+'test.csv')
camp = pd.read_csv(path+'campaign_data.csv')

train_df = pd.merge(train, camp, on='campaign_id')
test_df = pd.merge(test, camp, on='campaign_id')
train_df, dev_df = train_test_split(train_df, random_state=42, test_size=0.10)

print('Shape of the train data is ',train_df.shape,' Test data is ',test_df.shape,' Valid data is',dev_df.shape)

Shape of the train data is  (920871, 14)  Test data is  (773858, 12)  Valid data is (102320, 14)


In [3]:
n_trains = train_df.shape[0]
n_devs = dev_df.shape[0]
n_tests = test_df.shape[0]

full_df = pd.concat([train_df,dev_df,test_df])
del train_df,dev_df,test_df
gc.collect()

14

In [None]:
print('New date variable creations')
#train
full_df['hour'] = pd.to_datetime(full_df.send_date).dt.hour.astype('uint8')
full_df['day'] = pd.to_datetime(full_df.send_date).dt.day.astype('uint8')
full_df['dow'] = pd.to_datetime(full_df.send_date).dt.dayofweek.astype('uint8')

New date variable creations


In [6]:
full_df.columns.values

array(['campaign_id', 'communication_type', 'email_body', 'email_url',
       'id', 'is_click', 'is_open', 'no_of_images',
       'no_of_internal_links', 'no_of_sections', 'send_date', 'subject',
       'total_links', 'user_id', 'hour', 'day', 'dow'], dtype=object)

In [65]:
print('Renaming of the new date variables created')
full_df['dow'] = full_df['dow'].map({0:'MON', 1:'TUE', 2:'WED', 3:'THR',4: 'FRI', 5: 'SAT', 6: 'SUN'} ).astype(str)
full_df['hour'] = full_df['hour'].map({0:'AM1', 1:'AM1', 2:'AM1', 3:'AM2', 4:'AM2', 5:'AM2', 6:'AM3', 7:'AM3', 8:'AM3',
                                         9:'AM4', 10:'AM4', 11:'AM4', 12:'PM1', 13:'PM1', 14:'PM1', 15:'PM2', 16:'PM2',
                                         17:'PM2', 18:'PM3', 19:'PM3', 20:'PM3', 21:'PM4', 22:'PM4', 23:'PM4'}).astype(str)
full_df['day'] = full_df['day'].map({1:'VEAR', 2:'VEAR', 3:'VEAR', 4:'VEAR', 5:'VEAR', 6:'EAR', 7:'EAR', 8:'EAR',
                                       9:'EAR', 10:'EAR', 11:'MID', 12:'MID', 13:'MID', 14:'MID', 15:'MID', 16:'VMID',
                                       17:'VMID', 18:'VMID', 19:'VMID', 20:'VMID', 21:'LAT', 22:'LAT', 23:'LAT', 
                                       24:'LAT', 25:'LAT', 26:'VLAT', 27:'VLAT', 28:'VLAT', 29:'VLAT', 30:'VLAT',
                                       31:'VLAT'}).astype(str)

Renaming of the new date variables created


In [62]:
train_df = full_df[:n_trains]
dev_df = full_df[n_trains:n_trains+n_devs]
test_df = full_df[n_trains+n_devs:]
print(train_df.shape,test_df.shape,dev_df.shape)

(920871, 17) (773858, 17) (102320, 17)


In [60]:
# del gp

In [63]:
print('Grouping variables creation for train_data')
# ip - user_id,  Channel - campaign_id

# Campaigns per communication
gp = train_df[['communication_type','campaign_id']].groupby(by=['communication_type'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'totcom'})
train_df = train_df.merge(gp, on=['communication_type'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type'], how='left')
test_df = test_df.merge(gp, on=['communication_type'], how='left')
del gp
gc.collect()
# open count per communication
gp = train_df[['communication_type','is_open']].groupby(by=['communication_type'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opncom'})
train_df = train_df.merge(gp, on=['communication_type'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type'], how='left')
test_df = test_df.merge(gp, on=['communication_type'], how='left')
del gp
gc.collect()
# Click count per communication
gp = train_df[['communication_type','is_click']].groupby(by=['communication_type'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkcom'})
train_df = train_df.merge(gp, on=['communication_type'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type'], how='left')
test_df = test_df.merge(gp, on=['communication_type'], how='left')
del gp
gc.collect()


gp = train_df[['communication_type','dow','campaign_id']].groupby(by=['communication_type','dow'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'totdow'})
train_df = train_df.merge(gp, on=['communication_type','dow'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','dow'], how='left')
test_df = test_df.merge(gp, on=['communication_type','dow'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','dow','is_open']].groupby(by=['communication_type','dow'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opndow'})
train_df = train_df.merge(gp, on=['communication_type','dow'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','dow'], how='left')
test_df = test_df.merge(gp, on=['communication_type','dow'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','dow','is_click']].groupby(by=['communication_type','dow'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkdow'})
train_df = train_df.merge(gp, on=['communication_type','dow'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','dow'], how='left')
test_df = test_df.merge(gp, on=['communication_type','dow'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','day','campaign_id']].groupby(by=['communication_type','day'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'totday'})
train_df = train_df.merge(gp, on=['communication_type','day'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','day'], how='left')
test_df = test_df.merge(gp, on=['communication_type','day'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','day','is_open']].groupby(by=['communication_type','day'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opnday'})
train_df = train_df.merge(gp, on=['communication_type','day'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','day'], how='left')
test_df = test_df.merge(gp, on=['communication_type','day'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','day','is_click']].groupby(by=['communication_type','day'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkday'})
train_df = train_df.merge(gp, on=['communication_type','day'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','day'], how='left')
test_df = test_df.merge(gp, on=['communication_type','day'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','hour','campaign_id']].groupby(by=['communication_type','hour'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'tothour'})
train_df = train_df.merge(gp, on=['communication_type','hour'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','hour'], how='left')
test_df = test_df.merge(gp, on=['communication_type','hour'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','hour','is_open']].groupby(by=['communication_type','hour'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opnhour'})
train_df = train_df.merge(gp, on=['communication_type','hour'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','hour'], how='left')
test_df = test_df.merge(gp, on=['communication_type','hour'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','hour','is_click']].groupby(by=['communication_type','hour'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkhour'})
train_df = train_df.merge(gp, on=['communication_type','hour'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','hour'], how='left')
test_df = test_df.merge(gp, on=['communication_type','hour'], how='left')
del gp
gc.collect()


Grouping variables creation for train_data
Assigning data types for variables


In [None]:
print("Assigning data types for Training variables")
train_df['totcom'] = train_df['totcom'].fillna(0).astype('uint16')
train_df['opncom'] = train_df['opncom'].fillna(0).astype('uint16')
train_df['clkcom'] = train_df['clkcom'].fillna(0).astype('uint16')
train_df['opncomrate'] = ((train_df['opncom']/train_df['totcom']).replace(np.inf, 0))
train_df['clkcomrate'] = ((train_df['clkcom']/train_df['totcom']).replace(np.inf, 0))

train_df['totdow'] = train_df['totdow'].fillna(0).astype('uint16')
train_df['opndow'] = train_df['opndow'].fillna(0).astype('uint16')
train_df['clkdow'] = train_df['clkdow'].fillna(0).astype('uint16')
train_df['opndowrate'] = ((train_df['opndow']/train_df['totdow']).replace(np.inf, 0))
train_df['clkdowrate'] = ((train_df['clkdow']/train_df['totdow']).replace(np.inf, 0))

train_df['totday'] = train_df['totday'].fillna(0).astype('uint16')
train_df['opnday'] = train_df['opnday'].fillna(0).astype('uint16')
train_df['clkday'] = train_df['clkday'].fillna(0).astype('uint16')
train_df['opndayrate'] = ((train_df['opnday']/train_df['totday']).replace(np.inf, 0))
train_df['clkdayrate'] = ((train_df['clkday']/train_df['totday']).replace(np.inf, 0))

train_df['tothour'] = train_df['tothour'].fillna(0).astype('uint16')
train_df['opnhour'] = train_df['opnhour'].fillna(0).astype('uint16')
train_df['clkhour'] = train_df['clkhour'].fillna(0).astype('uint16')
train_df['opnhourrate'] = ((train_df['opnhour']/train_df['tothour']).replace(np.inf, 0))
train_df['clkhourrate'] = ((train_df['clkhour']/train_df['tothour']).replace(np.inf, 0))

del train_df['totcom'],train_df['opncom'],train_df['clkcom'],train_df['totdow'],train_df['opndow'],train_df['clkdow']
del train_df['totday'],train_df['opnday'],train_df['clkday'],train_df['tothour'],train_df['opnhour'],train_df['clkhour']

In [None]:
dev_df['totcom'] = dev_df['totcom'].fillna(0).astype('uint16')
dev_df['opncom'] = dev_df['opncom'].fillna(0).astype('uint16')
dev_df['clkcom'] = dev_df['clkcom'].fillna(0).astype('uint16')
dev_df['opncomrate'] = ((dev_df['opncom']/dev_df['totcom']).replace(np.inf, 0))
dev_df['clkcomrate'] = ((dev_df['clkcom']/dev_df['totcom']).replace(np.inf, 0))

dev_df['totdow'] = dev_df['totdow'].fillna(0).astype('uint16')
dev_df['opndow'] = dev_df['opndow'].fillna(0).astype('uint16')
dev_df['clkdow'] = dev_df['clkdow'].fillna(0).astype('uint16')
dev_df['opndowrate'] = ((dev_df['opndow']/dev_df['totdow']).replace(np.inf, 0))
dev_df['clkdowrate'] = ((dev_df['clkdow']/dev_df['totdow']).replace(np.inf, 0))

dev_df['totday'] = dev_df['totday'].fillna(0).astype('uint16')
dev_df['opnday'] = dev_df['opnday'].fillna(0).astype('uint16')
dev_df['clkday'] = dev_df['clkday'].fillna(0).astype('uint16')
dev_df['opndayrate'] = ((dev_df['opnday']/dev_df['totday']).replace(np.inf, 0))
dev_df['clkdayrate'] = ((dev_df['clkday']/dev_df['totday']).replace(np.inf, 0))

dev_df['tothour'] = dev_df['tothour'].fillna(0).astype('uint16')
dev_df['opnhour'] = dev_df['opnhour'].fillna(0).astype('uint16')
dev_df['clkhour'] = dev_df['clkhour'].fillna(0).astype('uint16')
dev_df['opnhourrate'] = ((dev_df['opnhour']/dev_df['tothour']).replace(np.inf, 0))
dev_df['clkhourrate'] = ((dev_df['clkhour']/dev_df['tothour']).replace(np.inf, 0))

del dev_df['totcom'],dev_df['opncom'],dev_df['clkcom'],dev_df['totdow'],dev_df['opndow'],dev_df['clkdow']
del dev_df['totday'],dev_df['opnday'],dev_df['clkday'],dev_df['tothour'],dev_df['opnhour'],dev_df['clkhour']

In [None]:
test_df['totcom'] = test_df['totcom'].fillna(0).astype('uint16')
test_df['opncom'] = test_df['opncom'].fillna(0).astype('uint16')
test_df['clkcom'] = test_df['clkcom'].fillna(0).astype('uint16')
test_df['opncomrate'] = ((test_df['opncom']/test_df['totcom']).replace(np.inf, 0))
test_df['clkcomrate'] = ((test_df['clkcom']/test_df['totcom']).replace(np.inf, 0))

test_df['totdow'] = test_df['totdow'].fillna(0).astype('uint16')
test_df['opndow'] = test_df['opndow'].fillna(0).astype('uint16')
test_df['clkdow'] = test_df['clkdow'].fillna(0).astype('uint16')
test_df['opndowrate'] = ((test_df['opndow']/test_df['totdow']).replace(np.inf, 0))
test_df['clkdowrate'] = ((test_df['clkdow']/test_df['totdow']).replace(np.inf, 0))

test_df['totday'] = test_df['totday'].fillna(0).astype('uint16')
test_df['opnday'] = test_df['opnday'].fillna(0).astype('uint16')
test_df['clkday'] = test_df['clkday'].fillna(0).astype('uint16')
test_df['opndayrate'] = ((test_df['opnday']/test_df['totday']).replace(np.inf, 0))
test_df['clkdayrate'] = ((test_df['clkday']/test_df['totday']).replace(np.inf, 0))

test_df['tothour'] = test_df['tothour'].fillna(0).astype('uint16')
test_df['opnhour'] = test_df['opnhour'].fillna(0).astype('uint16')
test_df['clkhour'] = test_df['clkhour'].fillna(0).astype('uint16')
test_df['opnhourrate'] = ((test_df['opnhour']/test_df['tothour']).replace(np.inf, 0))
test_df['clkhourrate'] = ((test_df['clkhour']/test_df['tothour']).replace(np.inf, 0))

del test_df['totcom'],test_df['opncom'],test_df['clkcom'],test_df['totdow'],test_df['opndow'],test_df['clkdow']
del test_df['totday'],test_df['opnday'],test_df['clkday'],test_df['tothour'],test_df['opnhour'],test_df['clkhour']

In [7]:
train_df.columns

Index(['campaign_id', 'communication_type', 'email_body', 'email_url', 'id',
       'is_click', 'is_open', 'no_of_images', 'no_of_internal_links',
       'no_of_sections', 'send_date', 'subject', 'total_links', 'user_id',
       'hour', 'day', 'dow', 'totcnt', 'opncnt', 'clkcnt', 'dowcnt', 'dowclk',
       'daycnt', 'dayclk', 'hourcnt', 'hourclk'],
      dtype='object')

In [8]:
# prep data for Base Model
target_train = train_df['is_click']
id_test = test_df['id']

del train['campaign_id'],train['id'],train['user_id'],train['is_open'],train['is_click'],train['email_body'],train['subject'],train['email_url'],train['send_date'],train['no_of_images'],train['no_of_internal_links'],train['no_of_sections'],train['total_links']
del test['campaign_id'],test['id'],test['user_id'],test['is_open'],test['is_click'],test['email_body'],test['subject'],test['email_url'],test['send_date'],test['no_of_images'],test['no_of_internal_links'],test['no_of_sections'],test['total_links']

print(train.shape,test.shape)

(1023191, 13) (773858, 13)


In [9]:
train_cats = pd.get_dummies(data=train, columns=['communication_type','dow','hour','day'], drop_first=True)
test_cats = pd.get_dummies(data=test, columns=['communication_type','dow','hour','day'], drop_first=True)
print (train_cats.shape,test_cats.shape)

(1023191, 29) (773858, 27)


In [10]:
train_cats.columns

Index(['totcnt', 'opncnt', 'clkcnt', 'dowcnt', 'dowclk', 'daycnt', 'dayclk',
       'hourcnt', 'hourclk', 'communication_type_Corporate',
       'communication_type_Hackathon', 'communication_type_Newsletter',
       'communication_type_Others', 'communication_type_Upcoming Events',
       'communication_type_Webinar', 'dow_Mon', 'dow_Sat', 'dow_Sun',
       'dow_Thr', 'dow_Tue', 'dow_Wed', 'hour_PM1', 'hour_PM2', 'hour_PM3',
       'hour_PM4', 'day_LAT', 'day_MID', 'day_VLAT', 'day_VMID'],
      dtype='object')

In [11]:
listA = list(train_cats.columns.values)
listB = list(test_cats.columns.values)

In [12]:
for item in listA:
    if item not in listB:
        print (item)

communication_type_Corporate
communication_type_Others
communication_type_Webinar
day_LAT
day_VLAT


In [13]:
for item in listB:
    if item not in listA:
        print (item)

hour_AM3
hour_AM4
day_VEAR


In [14]:
test_cats['communication_type_Corporate'],test_cats['communication_type_Others'],test_cats['communication_type_Webinar'],test_cats['day_LAT'],test_cats['day_VLAT']=0,0,0,0,0
train_cats['hour_AM3'],train_cats['hour_AM4'],train_cats['hour_AM4'] =0,0,0

In [15]:
train_cats.columns

Index(['totcnt', 'opncnt', 'clkcnt', 'dowcnt', 'dowclk', 'daycnt', 'dayclk',
       'hourcnt', 'hourclk', 'communication_type_Corporate',
       'communication_type_Hackathon', 'communication_type_Newsletter',
       'communication_type_Others', 'communication_type_Upcoming Events',
       'communication_type_Webinar', 'dow_Mon', 'dow_Sat', 'dow_Sun',
       'dow_Thr', 'dow_Tue', 'dow_Wed', 'hour_PM1', 'hour_PM2', 'hour_PM3',
       'hour_PM4', 'day_LAT', 'day_MID', 'day_VLAT', 'day_VMID', 'hour_AM3',
       'hour_AM4'],
      dtype='object')

In [16]:
train_cats = train_cats[['totcnt', 'opncnt', 'clkcnt', 'dowcnt', 'dowclk', 'daycnt', 'dayclk','hourcnt', 'hourclk',
                        'communication_type_Corporate','communication_type_Hackathon','communication_type_Newsletter',
                        'communication_type_Others','communication_type_Upcoming Events','communication_type_Webinar',
                        'dow_Mon', 'dow_Sat', 'dow_Sun','dow_Thr', 'dow_Tue', 'dow_Wed', 'hour_PM1', 'hour_PM2',
                        'hour_PM3','hour_PM4', 'day_LAT', 'day_MID', 'day_VLAT', 'day_VMID', 'hour_AM3','hour_AM4']]

test_cats = test_cats[['totcnt', 'opncnt', 'clkcnt', 'dowcnt', 'dowclk', 'daycnt', 'dayclk','hourcnt', 'hourclk',
                      'communication_type_Corporate','communication_type_Hackathon','communication_type_Newsletter',
                      'communication_type_Others','communication_type_Upcoming Events','communication_type_Webinar',
                      'dow_Mon', 'dow_Sat', 'dow_Sun','dow_Thr', 'dow_Tue', 'dow_Wed', 'hour_PM1', 'hour_PM2',
                      'hour_PM3','hour_PM4', 'day_LAT', 'day_MID', 'day_VLAT', 'day_VMID', 'hour_AM3','hour_AM4']]

In [17]:
traindf = np.array(train_cats)
testdf = np.array(test_cats)

xgb_preds = []

In [18]:
K = 5
kf = KFold(n_splits = K, random_state = 42, shuffle = True)

In [21]:
start = time()

for train_index, test_index in kf.split(traindf):
    train_X, valid_X = traindf[train_index], traindf[test_index]
    train_y, valid_y = target_train[train_index], target_train[test_index]

    # params configuration also from the1owl's kernel
    # https://www.kaggle.com/the1owl/forza-baseline
    xgb_params = {'eta': 0.02, 'objective': 'binary:logistic', max_depth= 6, subsample = 1, colsample_bytree = 1,
                  min_chil_weight=1, 'eval_metric': 'auc', 'seed': 42, 'silent': True}

    d_train = xgb.DMatrix(train_X, train_y)
    d_valid = xgb.DMatrix(valid_X, valid_y)
    d_test = xgb.DMatrix(testdf)
    
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    model = xgb.train(xgb_params, d_train, 5000,  watchlist, 
                      maximize=True, verbose_eval=50, early_stopping_rounds=50)
                        
    xgb_pred = model.predict(d_test)
    xgb_preds.append(list(xgb_pred))

end = time()
print ('Time taken is:', end-start)

SyntaxError: invalid syntax (<ipython-input-21-a6849180cd34>, line 9)

In [70]:
preds=[]
for i in range(len(xgb_preds[0])):
    sum=0
    for j in range(K):
        sum+=xgb_preds[j][i]
    preds.append(sum / K)

out = pd.DataFrame({'id': id_test, 'is_click': preds})
out.to_csv("pred3.csv", index=False)