In [1]:
import pandas as pd
import numpy as np
import scipy
from time import time
import gc

from nltk.corpus import stopwords
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
import re

from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb


import warnings
warnings.filterwarnings("ignore")

warnings.filterwarnings('ignore')
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [2]:
def tokenizer(text):
    try:
        tokens_ = [word_tokenize(sent) for sent in sent_tokenize(text)]
        
        tokens = []
        for token_by_sent in tokens_:
            tokens += token_by_sent

        tokens = list(filter(lambda t: t.lower() not in stop, tokens))
        tokens = list(filter(lambda t: t not in punctuation, tokens))
        tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``'], tokens))
        filtered_tokens = []
        for token in tokens:
            if re.search('[a-zA-Z]', token):
                filtered_tokens.append(token)

        filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

        return filtered_tokens
    except Error as e:
        print(e)

In [8]:
start = time()

# train = pd.read_csv(path+'train.csv',)
# test = pd.read_csv(path+'test.csv')
# camp = pd.read_csv(path+'campaign_data.csv')

# print('New variable creations')
# train['hour'] = pd.to_datetime(train.send_date).dt.hour.astype('uint8')
# train['day'] = pd.to_datetime(train.send_date).dt.day.astype('uint8')
# train['dow'] = pd.to_datetime(train.send_date).dt.dayofweek.astype('uint8')

# test['hour'] = pd.to_datetime(test.send_date).dt.hour.astype('uint8')
# test['day'] = pd.to_datetime(test.send_date).dt.day.astype('uint8')
# test['dow'] = pd.to_datetime(test.send_date).dt.dayofweek.astype('uint8')

# camp['sub_tokens'] = camp['subject'].map(tokenizer)
# camp['email_tokens'] = camp['email_body'].map(tokenizer)
# train_df = pd.merge(train, camp, on='campaign_id')
# test_df = pd.merge(test, camp, on='campaign_id')

train_df = pd.read_csv(path+'train_df.csv')
test_df = pd.read_csv(path+'test_df.csv')


train_df, dev_df = train_test_split(train_df, random_state=42, test_size=0.10)

print('Shape of the train data is ',train_df.shape,' Test data is ',test_df.shape,' Valid data is',dev_df.shape)
print('Time taken for importing the data is: ', time()-start, ' secs')

Shape of the train data is  (920871, 19)  Test data is  (773858, 17)  Valid data is (102320, 19)
Time taken for importing the data is:  41.4264178276062  secs


In [9]:
print('Renaming of the new date variables created')
train_df['dow'] = train_df['dow'].map({0:'MON', 1:'TUE', 2:'WED', 3:'THR',4: 'FRI', 5: 'SAT', 6: 'SUN'} ).astype(str)
train_df['hour'] = train_df['hour'].map({0:'AM1', 1:'AM1', 2:'AM1', 3:'AM2', 4:'AM2', 5:'AM2', 6:'AM3', 7:'AM3', 8:'AM3',
                                         9:'AM4', 10:'AM4', 11:'AM4', 12:'PM1', 13:'PM1', 14:'PM1', 15:'PM2', 16:'PM2',
                                         17:'PM2', 18:'PM3', 19:'PM3', 20:'PM3', 21:'PM4', 22:'PM4', 23:'PM4'}).astype(str)
train_df['day'] = train_df['day'].map({1:'VEAR', 2:'VEAR', 3:'VEAR', 4:'VEAR', 5:'VEAR', 6:'EAR', 7:'EAR', 8:'EAR',
                                       9:'EAR', 10:'EAR', 11:'MID', 12:'MID', 13:'MID', 14:'MID', 15:'MID', 16:'VMID',
                                       17:'VMID', 18:'VMID', 19:'VMID', 20:'VMID', 21:'LAT', 22:'LAT', 23:'LAT', 
                                       24:'LAT', 25:'LAT', 26:'VLAT', 27:'VLAT', 28:'VLAT', 29:'VLAT', 30:'VLAT',
                                       31:'VLAT'}).astype(str)


dev_df['dow'] = dev_df['dow'].map({0:'MON', 1:'TUE', 2:'WED', 3:'THR',4: 'FRI', 5: 'SAT', 6: 'SUN'} ).astype(str)
dev_df['hour'] = dev_df['hour'].map({0:'AM1', 1:'AM1', 2:'AM1', 3:'AM2', 4:'AM2', 5:'AM2', 6:'AM3', 7:'AM3', 8:'AM3',
                                         9:'AM4', 10:'AM4', 11:'AM4', 12:'PM1', 13:'PM1', 14:'PM1', 15:'PM2', 16:'PM2',
                                         17:'PM2', 18:'PM3', 19:'PM3', 20:'PM3', 21:'PM4', 22:'PM4', 23:'PM4'}).astype(str)
dev_df['day'] = dev_df['day'].map({1:'VEAR', 2:'VEAR', 3:'VEAR', 4:'VEAR', 5:'VEAR', 6:'EAR', 7:'EAR', 8:'EAR',
                                       9:'EAR', 10:'EAR', 11:'MID', 12:'MID', 13:'MID', 14:'MID', 15:'MID', 16:'VMID',
                                       17:'VMID', 18:'VMID', 19:'VMID', 20:'VMID', 21:'LAT', 22:'LAT', 23:'LAT', 
                                       24:'LAT', 25:'LAT', 26:'VLAT', 27:'VLAT', 28:'VLAT', 29:'VLAT', 30:'VLAT',
                                       31:'VLAT'}).astype(str)


test_df['dow'] = test_df['dow'].map({0:'MON', 1:'TUE', 2:'WED', 3:'THR',4: 'FRI', 5: 'SAT', 6: 'SUN'} ).astype(str)
test_df['hour'] = test_df['hour'].map({0:'AM1', 1:'AM1', 2:'AM1', 3:'AM2', 4:'AM2', 5:'AM2', 6:'AM3', 7:'AM3', 8:'AM3',
                                         9:'AM4', 10:'AM4', 11:'AM4', 12:'PM1', 13:'PM1', 14:'PM1', 15:'PM2', 16:'PM2',
                                         17:'PM2', 18:'PM3', 19:'PM3', 20:'PM3', 21:'PM4', 22:'PM4', 23:'PM4'}).astype(str)
test_df['day'] = test_df['day'].map({1:'VEAR', 2:'VEAR', 3:'VEAR', 4:'VEAR', 5:'VEAR', 6:'EAR', 7:'EAR', 8:'EAR',
                                       9:'EAR', 10:'EAR', 11:'MID', 12:'MID', 13:'MID', 14:'MID', 15:'MID', 16:'VMID',
                                       17:'VMID', 18:'VMID', 19:'VMID', 20:'VMID', 21:'LAT', 22:'LAT', 23:'LAT', 
                                       24:'LAT', 25:'LAT', 26:'VLAT', 27:'VLAT', 28:'VLAT', 29:'VLAT', 30:'VLAT',
                                       31:'VLAT'}).astype(str)

Renaming of the new date variables created


In [10]:
print('Grouping variables creation for train_data')
# ip - user_id,  Channel - campaign_id

# Campaigns per communication
gp = train_df[['user_id','campaign_id']].groupby(by=['communication_type'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'totcom'})
train_df = train_df.merge(gp, on=['communication_type'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type'], how='left')
test_df = test_df.merge(gp, on=['communication_type'], how='left')
del gp
gc.collect()
# open count per communication
gp = train_df[['communication_type','is_open']].groupby(by=['communication_type'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opncom'})
train_df = train_df.merge(gp, on=['communication_type'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type'], how='left')
test_df = test_df.merge(gp, on=['communication_type'], how='left')
del gp
gc.collect()
# Click count per communication
gp = train_df[['communication_type','is_click']].groupby(by=['communication_type'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkcom'})
train_df = train_df.merge(gp, on=['communication_type'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type'], how='left')
test_df = test_df.merge(gp, on=['communication_type'], how='left')
del gp
gc.collect()


gp = train_df[['communication_type','dow','campaign_id']].groupby(by=['communication_type','dow'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'totdow'})
train_df = train_df.merge(gp, on=['communication_type','dow'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','dow'], how='left')
test_df = test_df.merge(gp, on=['communication_type','dow'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','dow','is_open']].groupby(by=['communication_type','dow'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opndow'})
train_df = train_df.merge(gp, on=['communication_type','dow'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','dow'], how='left')
test_df = test_df.merge(gp, on=['communication_type','dow'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','dow','is_click']].groupby(by=['communication_type','dow'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkdow'})
train_df = train_df.merge(gp, on=['communication_type','dow'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','dow'], how='left')
test_df = test_df.merge(gp, on=['communication_type','dow'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','day','campaign_id']].groupby(by=['communication_type','day'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'totday'})
train_df = train_df.merge(gp, on=['communication_type','day'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','day'], how='left')
test_df = test_df.merge(gp, on=['communication_type','day'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','day','is_open']].groupby(by=['communication_type','day'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opnday'})
train_df = train_df.merge(gp, on=['communication_type','day'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','day'], how='left')
test_df = test_df.merge(gp, on=['communication_type','day'], how='left')
del gp
gc.collect()
gp = train_df[['communication_type','day','is_click']].groupby(by=['communication_type','day'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkday'})
train_df = train_df.merge(gp, on=['communication_type','day'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','day'], how='left')
test_df = test_df.merge(gp, on=['communication_type','day'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','hour','campaign_id']].groupby(by=['communication_type','hour'])[['campaign_id']].count().reset_index().rename(index=str, columns={'campaign_id': 'tothour'})
train_df = train_df.merge(gp, on=['communication_type','hour'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','hour'], how='left')
test_df = test_df.merge(gp, on=['communication_type','hour'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','hour','is_open']].groupby(by=['communication_type','hour'])[['is_open']].sum().reset_index().rename(index=str, columns={'is_open': 'opnhour'})
train_df = train_df.merge(gp, on=['communication_type','hour'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','hour'], how='left')
test_df = test_df.merge(gp, on=['communication_type','hour'], how='left')
del gp
gc.collect()

gp = train_df[['communication_type','hour','is_click']].groupby(by=['communication_type','hour'])[['is_click']].sum().reset_index().rename(index=str, columns={'is_click': 'clkhour'})
train_df = train_df.merge(gp, on=['communication_type','hour'], how='left')
dev_df = dev_df.merge(gp, on=['communication_type','hour'], how='left')
test_df = test_df.merge(gp, on=['communication_type','hour'], how='left')
del gp
gc.collect()


Grouping variables creation for train_data


116

In [11]:
print("Assigning data types for Training variables")
train_df['totcom'] = train_df['totcom'].fillna(0).astype('uint16')
train_df['opncom'] = train_df['opncom'].fillna(0).astype('uint16')
train_df['clkcom'] = train_df['clkcom'].fillna(0).astype('uint16')
train_df['opncomrate'] = ((train_df['opncom']/train_df['totcom']).replace(np.inf, 0))
train_df['clkcomrate'] = ((train_df['clkcom']/train_df['totcom']).replace(np.inf, 0))

train_df['totdow'] = train_df['totdow'].fillna(0).astype('uint16')
train_df['opndow'] = train_df['opndow'].fillna(0).astype('uint16')
train_df['clkdow'] = train_df['clkdow'].fillna(0).astype('uint16')
train_df['opndowrate'] = ((train_df['opndow']/train_df['totdow']).replace(np.inf, 0))
train_df['clkdowrate'] = ((train_df['clkdow']/train_df['totdow']).replace(np.inf, 0))

train_df['totday'] = train_df['totday'].fillna(0).astype('uint16')
train_df['opnday'] = train_df['opnday'].fillna(0).astype('uint16')
train_df['clkday'] = train_df['clkday'].fillna(0).astype('uint16')
train_df['opndayrate'] = ((train_df['opnday']/train_df['totday']).replace(np.inf, 0))
train_df['clkdayrate'] = ((train_df['clkday']/train_df['totday']).replace(np.inf, 0))

train_df['tothour'] = train_df['tothour'].fillna(0).astype('uint16')
train_df['opnhour'] = train_df['opnhour'].fillna(0).astype('uint16')
train_df['clkhour'] = train_df['clkhour'].fillna(0).astype('uint16')
train_df['opnhourrate'] = ((train_df['opnhour']/train_df['tothour']).replace(np.inf, 0))
train_df['clkhourrate'] = ((train_df['clkhour']/train_df['tothour']).replace(np.inf, 0))

del train_df['totcom'],train_df['opncom'],train_df['clkcom'],train_df['totdow'],train_df['opndow'],train_df['clkdow']
del train_df['totday'],train_df['opnday'],train_df['clkday'],train_df['tothour'],train_df['opnhour'],train_df['clkhour']

Assigning data types for Training variables


In [12]:
dev_df['totcom'] = dev_df['totcom'].fillna(0).astype('uint16')
dev_df['opncom'] = dev_df['opncom'].fillna(0).astype('uint16')
dev_df['clkcom'] = dev_df['clkcom'].fillna(0).astype('uint16')
dev_df['opncomrate'] = ((dev_df['opncom']/dev_df['totcom']).replace(np.inf, 0))
dev_df['clkcomrate'] = ((dev_df['clkcom']/dev_df['totcom']).replace(np.inf, 0))

dev_df['totdow'] = dev_df['totdow'].fillna(0).astype('uint16')
dev_df['opndow'] = dev_df['opndow'].fillna(0).astype('uint16')
dev_df['clkdow'] = dev_df['clkdow'].fillna(0).astype('uint16')
dev_df['opndowrate'] = ((dev_df['opndow']/dev_df['totdow']).replace(np.inf, 0))
dev_df['clkdowrate'] = ((dev_df['clkdow']/dev_df['totdow']).replace(np.inf, 0))

dev_df['totday'] = dev_df['totday'].fillna(0).astype('uint16')
dev_df['opnday'] = dev_df['opnday'].fillna(0).astype('uint16')
dev_df['clkday'] = dev_df['clkday'].fillna(0).astype('uint16')
dev_df['opndayrate'] = ((dev_df['opnday']/dev_df['totday']).replace(np.inf, 0))
dev_df['clkdayrate'] = ((dev_df['clkday']/dev_df['totday']).replace(np.inf, 0))

dev_df['tothour'] = dev_df['tothour'].fillna(0).astype('uint16')
dev_df['opnhour'] = dev_df['opnhour'].fillna(0).astype('uint16')
dev_df['clkhour'] = dev_df['clkhour'].fillna(0).astype('uint16')
dev_df['opnhourrate'] = ((dev_df['opnhour']/dev_df['tothour']).replace(np.inf, 0))
dev_df['clkhourrate'] = ((dev_df['clkhour']/dev_df['tothour']).replace(np.inf, 0))

del dev_df['totcom'],dev_df['opncom'],dev_df['clkcom'],dev_df['totdow'],dev_df['opndow'],dev_df['clkdow']
del dev_df['totday'],dev_df['opnday'],dev_df['clkday'],dev_df['tothour'],dev_df['opnhour'],dev_df['clkhour']

In [13]:
test_df['totcom'] = test_df['totcom'].fillna(0).astype('uint16')
test_df['opncom'] = test_df['opncom'].fillna(0).astype('uint16')
test_df['clkcom'] = test_df['clkcom'].fillna(0).astype('uint16')
test_df['opncomrate'] = ((test_df['opncom']/test_df['totcom']).replace(np.inf, 0))
test_df['clkcomrate'] = ((test_df['clkcom']/test_df['totcom']).replace(np.inf, 0))

test_df['totdow'] = test_df['totdow'].fillna(0).astype('uint16')
test_df['opndow'] = test_df['opndow'].fillna(0).astype('uint16')
test_df['clkdow'] = test_df['clkdow'].fillna(0).astype('uint16')
test_df['opndowrate'] = ((test_df['opndow']/test_df['totdow']).replace(np.inf, 0))
test_df['clkdowrate'] = ((test_df['clkdow']/test_df['totdow']).replace(np.inf, 0))

test_df['totday'] = test_df['totday'].fillna(0).astype('uint16')
test_df['opnday'] = test_df['opnday'].fillna(0).astype('uint16')
test_df['clkday'] = test_df['clkday'].fillna(0).astype('uint16')
test_df['opndayrate'] = ((test_df['opnday']/test_df['totday']).replace(np.inf, 0))
test_df['clkdayrate'] = ((test_df['clkday']/test_df['totday']).replace(np.inf, 0))

test_df['tothour'] = test_df['tothour'].fillna(0).astype('uint16')
test_df['opnhour'] = test_df['opnhour'].fillna(0).astype('uint16')
test_df['clkhour'] = test_df['clkhour'].fillna(0).astype('uint16')
test_df['opnhourrate'] = ((test_df['opnhour']/test_df['tothour']).replace(np.inf, 0))
test_df['clkhourrate'] = ((test_df['clkhour']/test_df['tothour']).replace(np.inf, 0))

del test_df['totcom'],test_df['opncom'],test_df['clkcom'],test_df['totdow'],test_df['opndow'],test_df['clkdow']
del test_df['totday'],test_df['opnday'],test_df['clkday'],test_df['tothour'],test_df['opnhour'],test_df['clkhour']

In [14]:
# Combining the data
y_train = (train_df[['is_open', 'is_click']])
y_dev = (dev_df[['is_open', 'is_click']])
test_id = test_df['id'].values
del test_df['id'],train_df['is_open'],train_df['is_click'],dev_df['is_open'],dev_df['is_click']

n_trains = train_df.shape[0]
n_devs = dev_df.shape[0]
n_tests = test_df.shape[0]

full_df = pd.concat([train_df,dev_df,test_df])
del train_df,dev_df,test_df
gc.collect()

240

In [33]:
start = time()
print('Renaming of the new date variables created')
full_df['dow'] = full_df['dow'].map({0:'MON', 1:'TUE', 2:'WED', 3:'THR',4: 'FRI', 5: 'SAT', 6: 'SUN'} ).astype(str)
full_df['hour'] = full_df['hour'].map({0:'AM1', 1:'AM1', 2:'AM1', 3:'AM2', 4:'AM2', 5:'AM2', 6:'AM3', 7:'AM3', 8:'AM3',
                                         9:'AM4', 10:'AM4', 11:'AM4', 12:'PM1', 13:'PM1', 14:'PM1', 15:'PM2', 16:'PM2',
                                         17:'PM2', 18:'PM3', 19:'PM3', 20:'PM3', 21:'PM4', 22:'PM4', 23:'PM4'}).astype(str)
full_df['day'] = full_df['day'].map({1:'VEAR', 2:'VEAR', 3:'VEAR', 4:'VEAR', 5:'VEAR', 6:'EAR', 7:'EAR', 8:'EAR',
                                       9:'EAR', 10:'EAR', 11:'MID', 12:'MID', 13:'MID', 14:'MID', 15:'MID', 16:'VMID',
                                       17:'VMID', 18:'VMID', 19:'VMID', 20:'VMID', 21:'LAT', 22:'LAT', 23:'LAT', 
                                       24:'LAT', 25:'LAT', 26:'VLAT', 27:'VLAT', 28:'VLAT', 29:'VLAT', 30:'VLAT',
                                       31:'VLAT'}).astype(str)

print('Time taken for renaming date variable: ', time()-start, ' secs')

Renaming of the new date variables created
Time taken for renaming date variable:  1.6065678596496582  secs


In [15]:
# TfidfVectorizer for subject
start=time()
print("TfidfVectorizer for subject......................")
TFIDF1 = TfidfVectorizer(max_features = 1000, ngram_range = (1,2), stop_words = "english")
X_sub = TFIDF1.fit_transform(full_df["sub_tokens"])
print('Time taken for TfidfVectorizer for subject: ', time()-start, ' secs with shape: ', X_sub.shape)

TfidfVectorizer for subject......................
Time taken for TfidfVectorizer for subject:  41.483054876327515  secs with shape:  (1797049, 435)


In [16]:
# TfidfVectorizer for email
start=time()
print("TfidfVectorizer for email Description......................")
TFIDF2 = TfidfVectorizer(max_features = 100000, ngram_range = (1,2), stop_words = "english")
X_email = TFIDF1.fit_transform(full_df["email_tokens"])
print('Time taken for TfidfVectorizer for email Description: ', time()-start, ' secs with shape: ', X_email.shape)

TfidfVectorizer for email Description......................
Time taken for TfidfVectorizer for email Description:  523.4367690086365  secs with shape:  (1797049, 1000)


In [17]:
# Dummification of the categorical variables in the data
start=time()
print("Dummy Encoders for communication and day, time..................")
X_dummy = scipy.sparse.csr_matrix(pd.get_dummies(full_df[['communication_type','day','dow','hour']], sparse = True).values)
print('Time taken for Dummification is: ', time()-start,' secs with shape: ',X_dummy.shape)

Dummy Encoders for communication and day, time..................
Time taken for Dummification is:  25.771622896194458  secs with shape:  (1797049, 27)


In [18]:
full_df.columns

Index(['campaign_id', 'clkcomrate', 'clkdayrate', 'clkdowrate', 'clkhourrate',
       'communication_type', 'day', 'dow', 'email_body', 'email_tokens',
       'email_url', 'hour', 'id', 'no_of_images', 'no_of_internal_links',
       'no_of_sections', 'opncomrate', 'opndayrate', 'opndowrate',
       'opnhourrate', 'send_date', 'sub_tokens', 'subject', 'total_links',
       'user_id'],
      dtype='object')

In [19]:
# Gathering the continous variables
start=time()
print("Gathering the email property continous variables..................")
X_continous = scipy.sparse.csr_matrix(full_df[['no_of_images','no_of_internal_links','no_of_sections',
                                               'total_links','clkcomrate', 'clkdayrate', 'clkdowrate', 
                                               'clkhourrate']].values)
print('Time taken for Gathering the email property continous variables is: ', time()-start,' secs with shape: ',X_continous.shape)

Gathering the email property continous variables..................
Time taken for Gathering the email property continous variables is:  2.3386292457580566  secs with shape:  (1797049, 8)


In [20]:
start=time()
print("Stacking up the X......................")
X = scipy.sparse.hstack((X_sub,X_email,X_dummy,X_continous)).tocsr()
# ,X_counts
X_train = X[:n_trains]
X_dev = X[n_trains:n_trains+n_devs]
X_test = X[n_trains+n_devs:]

del X_sub,X_email,X_dummy,X_continous #,full_df
#,X_counts\
gc.collect()

print('X is ready with shape:',X_train.shape, X_dev.shape, X_test.shape)

Stacking up the X......................
X is ready with shape: (920871, 1470) (102320, 1470) (773858, 1470)


In [21]:
start = time()
y=(y_train['is_click'].values)
y_valid= (y_dev['is_click'].values)

print ('Ridge Modeling now in progress.......................')
model1 = RidgeClassifier(solver = "saga", fit_intercept=False, random_state=42, alpha=2)
model1.fit(X_train, y)
preds1 = model1.predict(X_dev)
print('Model complete with accuracy: ', roc_auc_score(y_valid, preds1))

model2 = RidgeClassifier(solver="sag", fit_intercept=False, random_state=42, alpha=2)
model2.fit(X_train, y)
preds2 = model2.predict(X_dev)
print('Model complete with accuracy: ', roc_auc_score(y_valid, preds2))
print('Time taken for Ridge Modeling is: ', time()-start,' secs')

Ridge Modeling now in progress.......................
Model complete with accuracy:  0.5
Model complete with accuracy:  0.5
Time taken for Ridge Modeling is:  188.465106010437  secs


In [22]:
print ('Light GBM Modeling now in progress.......................')
start = time()
train_X, valid_X, train_y, valid_y = train_test_split(X_train, y, test_size = 0.2, random_state = 42) 
d_train = lgb.Dataset(train_X, label=train_y)
d_valid = lgb.Dataset(valid_X, label=valid_y)
watchlist = [d_train, d_valid]

params = {'learning_rate': 0.05, 'objective': 'binary','metric': 'AUC'}

params2 = {'learning_rate': 0.04, 'objective': 'binary', 'metric':'AUC'}
            
model3 = lgb.train(params, train_set=d_train, num_boost_round=2000, valid_sets=watchlist,
                    early_stopping_rounds=100, verbose_eval=100)
preds3 = model3.predict(X_dev)
print('Prediction for lgb 1 completed.', time()-start,  ' secs')
print('Accuracy: ', roc_auc_score(y_valid, preds3))

start = time()    
train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X_train, y, test_size = 0.2, random_state = 42) 
d_train2 = lgb.Dataset(train_X2, label=train_y2)
d_valid2 = lgb.Dataset(valid_X2, label=valid_y2)
watchlist2 = [d_train2, d_valid2]

model4 = lgb.train(params2, train_set=d_train2, num_boost_round=2000, valid_sets=watchlist2,
                    early_stopping_rounds=100, verbose_eval=100) 
preds4 = model4.predict(X_dev)
print('Accuracy: ', roc_auc_score(y_valid, preds4))
print('Prediction for lgb 2 completed.', time()-start, ' secs')

Light GBM Modeling now in progress.......................
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.591433	valid_1's auc: 0.593402
Early stopping, best iteration is:
[1]	training's auc: 0.591433	valid_1's auc: 0.593402
Prediction for lgb 1 completed. 56.00239896774292  secs
Accuracy:  0.5916081094189191
Training until validation scores don't improve for 100 rounds.
[100]	training's auc: 0.591433	valid_1's auc: 0.593402
Early stopping, best iteration is:
[1]	training's auc: 0.591433	valid_1's auc: 0.593402
Accuracy:  0.5916081094189191
Prediction for lgb 2 completed. 66.15966606140137  secs


In [23]:
preds3 = model3.predict(X_test)
preds4 = model4.predict(X_test)
preds = (0.5*preds3)+(0.5*preds4) 

In [24]:
out = pd.DataFrame({'id': test_id, 'is_click': preds})
out.head()
out.to_csv("pred4.csv", index=False)

In [25]:
out.shape

(773858, 2)