In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input tree and print the full path of every file found.
# `_` receives each directory's sub-directory list, which is not used here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_rows=100
pd.options.display.max_columns=100
pd.options.display.max_colwidth=200
import datetime as dt
import warnings
warnings.filterwarnings(action='ignore')
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score,f1_score
from sklearn.model_selection import KFold,GroupKFold,ShuffleSplit,StratifiedKFold
import gc
import lightgbm as lgb
from tqdm import tqdm_notebook as tqdm
from imblearn.over_sampling import SMOTE

In [None]:
# Directory prefix for the competition CSVs; file names are appended directly
# (f'{path}train.csv'), so the trailing slash is required.
path = '/kaggle/input/wns-wizard/'

In [None]:
#Reduce the memory usage - Inspired by Panchajanya Banerjee
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that holds their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched. Integer columns are
    cast to the first of int8/int16/int32/int64 whose range contains
    ``[min, max]``; float columns to the first of float16/float32 that does,
    otherwise float64.

    Caveat: the float check looks at range only. float16 keeps roughly three
    significant decimal digits, so values may be rounded by the downcast.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose extreme value equals the
                # dtype limit (e.g. 127 for int8) still fits that dtype.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf (comparisons are False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


In [None]:
# Load the four competition tables and downcast their numeric columns with
# reduce_mem_usage. Timestamp columns are parsed at read time.
# NOTE(review): dayfirst=True only matters for ambiguous day/month strings —
# confirm it matches the timestamp format actually stored in the CSVs.
train_df = reduce_mem_usage(pd.read_csv(f'{path}train.csv',parse_dates=['impression_time',],dayfirst=True))
test_df = reduce_mem_usage(pd.read_csv(f'{path}test.csv',parse_dates=['impression_time',],dayfirst=True))
item_data = reduce_mem_usage(pd.read_csv(f'{path}item_data.csv'))
view_log = reduce_mem_usage(pd.read_csv(f'{path}view_log.csv',parse_dates=['server_time',],dayfirst=True))

In [None]:
from collections import Counter

def get_class_weights(y):
    """Return ``{label: weight}`` for the labels found in ``y``.

    Each weight is the count of the most frequent label divided by the count
    of that label, rounded to two decimals (so the majority label gets 1.0).
    """
    label_counts = Counter(y)
    largest = float(max(label_counts.values()))
    weights = {}
    for label, n_samples in label_counts.items():
        weights[label] = round(largest / float(n_samples), 2)
    return weights

# Weights for the click target: majority-class count divided by each class's count.
class_weights = get_class_weights(train_df['is_click'])
print(class_weights)

In [None]:
# Number of training rows per value of the click target.
train_df['is_click'].value_counts()

In [None]:
# (rows, columns) of the raw training frame.
train_df.shape

In [None]:
## is_click==1
# Share of training rows with is_click == 1, computed from the data so it
# stays correct if the input file changes (previously hard-coded counts).
(train_df['is_click'] == 1).mean()

In [None]:
# Preview the first 100 training rows.
train_df.head(100)

In [None]:
# Preview the first 100 test rows.
test_df.head(100)

In [None]:
# Preview the first 100 rows of the view log.
view_log.head(100)

In [None]:
# Tag each row's origin (1 = train, 0 = test) so the two frames can be
# concatenated for joint feature engineering and separated again afterwards.
train_df['is_train']=1
test_df['is_train']=0

In [None]:
# Calendar features derived from each timestamp column, added to both frames.
date_cols = ['impression_time']
for date_col in date_cols:
    # The feature set is identical for train and test, so build it in one loop
    # instead of repeating every statement per frame.
    for frame in (train_df, test_df):
        stamps = frame[date_col]
        # Seconds elapsed since 2000-01-01: a numeric form of the timestamp.
        frame[date_col + "_in_seconds"] = (stamps - dt.datetime(2000,1,1)).dt.total_seconds()
        frame[date_col + "_month"] = stamps.dt.month
        frame[date_col + "_hour"] = stamps.dt.hour
        # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0.
        # Prefer isocalendar() where it exists; fall back on older pandas.
        if hasattr(stamps.dt, "isocalendar"):
            iso_week = stamps.dt.isocalendar().week
            # Mirror the legacy dtype: int64 normally, float64 (NaN) when a
            # timestamp is missing.
            frame[date_col + "_week"] = iso_week.astype(np.float64 if iso_week.isna().any() else np.int64)
        else:
            frame[date_col + "_week"] = stamps.dt.week
        frame[date_col + "_weekDay"] = stamps.dt.weekday

In [None]:
# Store the click target as int8.
train_df['is_click'] = train_df['is_click'].astype(np.int8)

In [None]:
# Check the newly added date feature columns.
train_df.head()

In [None]:
# Per-hour summary of the training set: row count ('Shape') plus the number of
# non-clicks ('Zeros') and clicks ('Ones'), shown with the most-clicked hours first.
hour_col = train_df['impression_time_hour']
dist = pd.DataFrame(
    np.zeros((hour_col.nunique(), 4)),
    columns=['Var', 'Shape', 'Zeros', 'Ones'],
)
not_clicked = train_df['is_click'] == 0
clicked = train_df['is_click'] == 1
for row_pos, hour in enumerate(hour_col.unique()):
    in_hour = hour_col == hour
    n_rows = int(in_hour.sum())
    n_zeros = int((in_hour & not_clicked).sum())
    n_ones = int((in_hour & clicked).sum())
    dist.iloc[row_pos, :] = [hour, n_rows, n_zeros, n_ones]
dist.sort_values(by='Ones', ascending=False)

In [None]:
# Inspect only the training rows that were clicked.
train_df[train_df['is_click']==1]

In [None]:
train_app_unique = train_df['app_code'].unique()
test_app_unique = test_df['app_code'].unique()
# `x in ndarray` rescans the whole array for every element; hash-based sets
# give the same membership answer in constant time.
train_app_set = set(train_app_unique)
test_app_set = set(test_app_unique)
s = [i for i in train_app_unique if i not in test_app_set]    # app codes seen only in train
ss = [i for i in test_app_unique if i not in train_app_set]   # app codes seen only in test

In [None]:
# Report the calendar span and row count of the train and test sets.
train_start, train_end = train_df.impression_time.min(), train_df.impression_time.max()
test_start, test_end = test_df.impression_time.min(), test_df.impression_time.max()

print('Train min/max date: %s / %s' % (train_start.date(), train_end.date()))
print('Test  min/max date: %s / %s' % (test_start.date(), test_end.date()))
print('')
# +1 so that both the first and the last day are counted.
print('Number of days in train: %d' % ((train_end - train_start).days + 1))
print('Number of days in test:  %d' % ((test_end - test_start).days + 1))
print('')
print('Train shape: %d rows' % len(train_df))
print('Test shape: %d rows'  % len(test_df))

## Train and test data preprocessing

In [None]:
# Stack train on top of test into one frame with a fresh 0..n-1 index, so
# per-user features are computed over both sets together.
train_test = pd.concat((train_df, test_df), axis=0, sort=False, ignore_index=True)
#del train_df,test_df
gc.collect()

In [None]:
# Order all rows chronologically so the per-user shift()/cumcount() features
# computed below follow time order.
# NOTE(review): rows sharing the same impression_time have no guaranteed relative
# order (the default sort kind is not stable) — confirm this does not matter.
train_test = train_test.sort_values(by=['impression_time']).reset_index(drop=True)
train_test.head()

In [None]:
# (rows, columns) of the combined train+test frame.
train_test.shape

In [None]:
def hour_encode(val):
    """Bucket an hour-of-day value into three coarse classes.

    Returns 0 for hours 2-7, 1 for hours 9-13 and 2 for every other value.
    Hour 8 sits between the two ranges and therefore falls into class 2.
    """
    if val in range(2, 8):
        return 0
    if val in range(9, 14):
        return 1
    return 2

In [None]:
#train_test['hour_encode'] = train_test['impression_time_hour'].apply(hour_encode)

In [None]:
##unique app_code
# 1 when the row's app_code occurs in only one of train/test (s = train-only,
# ss = test-only codes), 0 when the code is shared by both sets.
# DataFrame.ix was deprecated in pandas 0.20 and removed in 1.0; a single
# vectorised isin() produces the same int8 flag without masked assignments.
train_test['app_code_prsent'] = train_test['app_code'].isin(ss+s).astype(np.int8)

In [None]:
## impression_time
# Per-user mean of the impression timestamp (in seconds), joined back onto
# every row of that user.
user_time_mean = (
    train_test.groupby('user_id')['impression_time_in_seconds']
    .mean()
    .reset_index(name='user_impression_time_in_second_mean')
)
train_test = train_test.merge(user_time_mean, on='user_id', how='left')
# 0-based running position of each row within its user, in current row order.
running_position = train_test.groupby("user_id")["impression_time_in_seconds"].cumcount()
train_test["user_cumnum_impression_time"] = running_position.values

In [None]:
# Total number of rows (train + test) belonging to each user.
rows_per_user = train_test.groupby('user_id').size().reset_index(name='user_id_count')
train_test = train_test.merge(rows_per_user, on='user_id', how='left')

In [None]:
# Gap in seconds to the same user's previous / next row (current row order).
# Both are "current minus neighbour"; rows without such a neighbour get NaN.
seconds_now = train_test['impression_time_in_seconds']
seconds_prev = train_test.groupby('user_id')['impression_time_in_seconds'].shift(1)
train_test['time_gap_between_prev'] = seconds_now - seconds_prev
seconds_next = train_test.groupby('user_id')['impression_time_in_seconds'].shift(-1)
train_test['time_gap_between_next'] = seconds_now - seconds_next

In [None]:
# Gap in seconds to the same user's row two positions earlier / later.
# Both are "current minus neighbour"; rows without such a neighbour get NaN.
seconds_now = train_test['impression_time_in_seconds']
seconds_prev_2 = train_test.groupby('user_id')['impression_time_in_seconds'].shift(2)
train_test['time_gap_between_prev_2'] = seconds_now - seconds_prev_2
seconds_next_2 = train_test.groupby('user_id')['impression_time_in_seconds'].shift(-2)
train_test['time_gap_between_next_2'] = seconds_now - seconds_next_2

In [None]:
# Gap in seconds to the same user's row three positions earlier / later.
# Both are "current minus neighbour"; rows without such a neighbour get NaN.
seconds_now = train_test['impression_time_in_seconds']
seconds_prev_3 = train_test.groupby('user_id')['impression_time_in_seconds'].shift(3)
train_test['time_gap_between_prev_3'] = seconds_now - seconds_prev_3
seconds_next_3 = train_test.groupby('user_id')['impression_time_in_seconds'].shift(-3)
train_test['time_gap_between_next_3'] = seconds_now - seconds_next_3

In [None]:
# Number of distinct app codes per user.
apps_per_user = (
    train_test.groupby(['user_id'])['app_code']
    .nunique()
    .reset_index(name='userId_appCode_unique')
)
train_test = train_test.merge(apps_per_user, how='left', on='user_id')
# Number of rows per (user, app code) pair.
rows_per_user_app = (
    train_test.groupby(['user_id', 'app_code'])
    .size()
    .reset_index(name='userId_appCode_size')
)
train_test = train_test.merge(rows_per_user_app, how='left', on=['user_id', 'app_code'])
del apps_per_user, rows_per_user_app
gc.collect()

In [None]:
## os_version
# Number of distinct OS versions per user.
os_per_user = (
    train_test.groupby(['user_id'])['os_version']
    .nunique()
    .reset_index(name='userId_osVersion_unique')
)
train_test = train_test.merge(os_per_user, how='left', on='user_id')
# Number of rows per (user, OS version) pair.
rows_per_user_os = (
    train_test.groupby(['user_id', 'os_version'])
    .size()
    .reset_index(name='userId_osVersion_size')
)
train_test = train_test.merge(rows_per_user_os, how='left', on=['user_id', 'os_version'])
del os_per_user, rows_per_user_os
gc.collect()

In [None]:
## is_4G
# Number of distinct is_4G values per user.
is4g_per_user = (
    train_test.groupby(['user_id'])['is_4G']
    .nunique()
    .reset_index(name='userId_is4G_unique')
)
train_test = train_test.merge(is4g_per_user, how='left', on='user_id')
# Number of rows per (user, is_4G) pair.
rows_per_user_is4g = (
    train_test.groupby(['user_id', 'is_4G'])
    .size()
    .reset_index(name='userId_is4G_size')
)
train_test = train_test.merge(rows_per_user_is4g, how='left', on=['user_id', 'is_4G'])
del is4g_per_user, rows_per_user_is4g
gc.collect()

In [None]:
# Where a (user, value) pair count equals the user's total row count — i.e. all
# of that user's rows share the same value — replace the count with the sentinel -1.
# DataFrame.ix was removed in pandas 1.0; .loc performs the same label/boolean
# assignment, and one loop replaces three copy-pasted statements.
for size_col in ('userId_appCode_size', 'userId_osVersion_size', 'userId_is4G_size'):
    covers_all_rows = train_test[size_col] == train_test['user_id_count']
    train_test.loc[covers_all_rows, size_col] = -1

In [None]:
## impression_id
# Distinct impression ids per user, and their share of the user's row count.
impressions_per_user = (
    train_test.groupby(['user_id'])['impression_id']
    .nunique()
    .reset_index(name='user_impression_nunique')
)
train_test = train_test.merge(impressions_per_user, how='left', on=['user_id'])
train_test['user_impression_ratio'] = train_test['user_impression_nunique'].div(train_test['user_id_count'])

In [None]:
# Shape check after the per-user impression features were added.
train_test.shape

In [None]:
# Attach the item attributes to every view-log row; the standalone item table
# is then released.
view_log = view_log.merge(item_data, how='left', on='item_id')
del item_data
gc.collect()

In [None]:
# Preview the view log after joining the item attributes.
view_log.head(100)

In [None]:
def label_encode_df(dataframe,cols):
    """Add an integer-coded ``<col>_encode`` column for each name in ``cols``.

    Values are converted to strings first and each column gets its own freshly
    fitted LabelEncoder. ``dataframe`` is modified in place; nothing is returned.
    """
    for col in cols:
        as_text = dataframe[col].astype(str)
        encoder = preprocessing.LabelEncoder()
        dataframe[str(col) + '_encode'] = encoder.fit_transform(as_text)

In [None]:
# Adds impression_id_encode and os_version_encode to train_test, and
# device_type_encode to view_log.
label_encode_df(train_test,['impression_id','os_version'])
label_encode_df(view_log,['device_type'])

In [None]:
# Calendar features derived from the view-log timestamp.
date_cols = ['server_time']
for date_col in date_cols:
    stamps = view_log[date_col]
    # Seconds elapsed since 2000-01-01: a numeric form of the timestamp.
    view_log[date_col + "_in_seconds"] = (stamps - dt.datetime(2000,1,1)).dt.total_seconds()
    view_log[date_col + "_month"] = stamps.dt.month
    # Series.dt.week was deprecated in pandas 1.1 and removed in 2.0.
    # Prefer isocalendar() where it exists; fall back on older pandas.
    if hasattr(stamps.dt, "isocalendar"):
        iso_week = stamps.dt.isocalendar().week
        # Mirror the legacy dtype: int64 normally, float64 (NaN) when a
        # timestamp is missing.
        view_log[date_col + "_week"] = iso_week.astype(np.float64 if iso_week.isna().any() else np.int64)
    else:
        view_log[date_col + "_week"] = stamps.dt.week
    view_log[date_col + "_weekDay"] = stamps.dt.weekday

In [None]:
# Distinct device types per user in the view log, joined onto both the view
# log and the modelling frame (users absent from the log get NaN).
devices_per_user = (
    view_log.groupby('user_id')['device_type']
    .nunique()
    .reset_index(name='device_type_nunique')
)
view_log = view_log.merge(devices_per_user, on='user_id', how='left')
train_test = train_test.merge(devices_per_user, on='user_id', how='left')
del devices_per_user
gc.collect()

In [None]:
# Distinct sessions per user in the view log, joined onto both frames.
sessions_per_user = (
    view_log.groupby('user_id')['session_id']
    .nunique()
    .reset_index(name='session_id_nunique')
)
view_log = view_log.merge(sessions_per_user, on='user_id', how='left')
train_test = train_test.merge(sessions_per_user, on='user_id', how='left')
del sessions_per_user
gc.collect()


In [None]:
# Distinct items viewed per user, joined onto both frames.
items_per_user = (
    view_log.groupby('user_id')['item_id']
    .nunique()
    .reset_index(name='item_id_nunique')
)
view_log = view_log.merge(items_per_user, on='user_id', how='left')
train_test = train_test.merge(items_per_user, on='user_id', how='left')
del items_per_user
gc.collect()

In [None]:
# Number of view-log rows per user, joined onto both frames.
views_per_user = (
    view_log.groupby(['user_id'])
    .size()
    .reset_index(name='no_of_time_user_connected')
)
view_log = view_log.merge(views_per_user, on='user_id', how='left')
train_test = train_test.merge(views_per_user, on='user_id', how='left')
del views_per_user
gc.collect()

In [None]:
# Per-user mean of the view-log timestamp (in seconds), joined onto both frames.
mean_view_time = (
    view_log.groupby('user_id')['server_time_in_seconds']
    .mean()
    .reset_index(name='user_server_time_in_second_mean')
)
view_log = view_log.merge(mean_view_time, on='user_id', how='left')
train_test = train_test.merge(mean_view_time, on='user_id', how='left')
del mean_view_time
gc.collect()

In [None]:
# Distinct sessions / distinct items per logged view, for each user.
# The same two ratios are added to the modelling frame and to the view log.
for frame in (train_test, view_log):
    views = frame['no_of_time_user_connected']
    frame['user_session_ratio'] = frame['session_id_nunique'] / views
    frame['user_item_ratio'] = frame['item_id_nunique'] / views

In [None]:
# Per-session duration: last minus first server time of each (user, session).
# NOTE(review): first()/last() follow the current row order of view_log, so this
# equals the session's time span only if rows are chronological within a
# session — confirm, or consider min()/max().
_ = view_log.groupby(['user_id','session_id'])['server_time_in_seconds'].first().reset_index()
_.columns = ['user_id','session_id','first']
view_log = pd.merge(view_log,_,how='left',on=['user_id','session_id'])

_ = view_log.groupby(['user_id','session_id'])['server_time_in_seconds'].last().reset_index()
_.columns = ['user_id','session_id','last']
view_log = pd.merge(view_log,_,how='left',on=['user_id','session_id'])
view_log['overall_session_time'] = view_log['last']-view_log['first']
del _ 
gc.collect()


# Per-user total of overall_session_time, joined onto both frames.
# NOTE(review): the sum runs over view_log rows, so each session's duration is
# added once per logged row of that session — confirm this weighting is intended.
_ = view_log.groupby(['user_id'])['overall_session_time'].sum().reset_index()
_.columns = ['user_id','overall_time_sum']
view_log = pd.merge(view_log,_,how='left',on='user_id')
train_test = pd.merge(train_test,_,on='user_id',how='left')

del _
gc.collect()

In [None]:
# Remove the per-session helper columns from the view log.
view_log.drop(columns=['first','last','overall_session_time'],inplace=True)

In [None]:
# Min / max / mean / std of the prices of the items each user viewed,
# joined onto both the modelling frame and the view log.
price_stats = (
    view_log.groupby(["user_id"])["item_price"]
    .agg(["min", "max", "mean", "std"])
    .reset_index()
)
price_stats.columns = [
    'user_id',
    'user_item_price_min',
    'user_item_price_max',
    'user_item_price_mean',
    'user_item_price_std',
]
train_test = train_test.merge(price_stats, on='user_id', how='left')
view_log = view_log.merge(price_stats, how='left', on='user_id')
del price_stats
gc.collect()

In [None]:
# Check the newly merged price statistics.
train_test.head()

In [None]:
#train_test['diff_user_impression_session_time_in_second_mean'] = train_test['user_impression_time_in_second_mean']-train_test['user_server_time_in_second_mean']

In [None]:
# Preview the view log with its per-user aggregate columns.
view_log.head(100)

In [None]:
# Preview the modelling frame before the rank and pivot features.
train_test.head()

In [None]:
# Rank of user_item_price_mean within each (user_id, app_code) group.
# NOTE(review): user_item_price_mean is a per-user aggregate merged on user_id,
# so it is constant within each group and every row gets the tied (average)
# rank — confirm this is the intended feature.
train_test['user_app_price_rank'] = train_test.groupby(['user_id','app_code'])['user_item_price_mean'].rank()

In [None]:
# List all columns built so far.
train_test.columns

In [None]:
# One count pivot per key: for every user, the number of impressions falling
# into each value of the key, joined back as one column per value.
#  * The weekday pivot used to be computed and merged twice (copy-paste), which
#    duplicated those features under _x/_y suffixes; each key is now pivoted once.
#  * Pivot columns used to be bare values (0, 1, 2, ...) that collided between
#    pivots; they are now prefixed with the key they came from.
pivot_keys = [
    "impression_time_weekDay",
    "impression_time_month",
    "impression_time_hour",
    "impression_time_week",
    "app_code",
]
for pivot_key in pivot_keys:
    _ = pd.pivot_table(train_test, index="user_id", columns=pivot_key, values="impression_id", aggfunc="count", fill_value=0)
    _.columns = ["user_{}_count_{}".format(pivot_key, value) for value in _.columns]
    train_test = pd.merge(train_test, _.reset_index(), on="user_id", how="left")

del _
gc.collect()


In [None]:
# Despite the "_diff" suffix, these flags are 1 when the app_code EQUALS that of
# the same user's previous / next row, and 0 otherwise (including when there is
# no such neighbour).
prev_app = train_test.groupby(['user_id'])['app_code'].shift(1)
train_test['prev_app_code_diff'] = train_test['app_code'].eq(prev_app).astype(int)
next_app = train_test.groupby(['user_id'])['app_code'].shift(-1)
train_test['next_app_code_diff'] = train_test['app_code'].eq(next_app).astype(int)


In [None]:
# Preview the frame after the pivot and neighbour features.
train_test.head()

In [None]:
# Shape check after the pivot features (one column per pivoted value).
train_test.shape

In [None]:
# Re-sort chronologically with user_id as tie-breaker. The index is not reset
# here, so index labels no longer match row positions after this line.
train_test = train_test.sort_values(by=['impression_time','user_id'])

In [None]:
# Seconds since the same user's previous row under the new ordering
# (NaN for each user's first row).
train_test['user_impressionTime_diff_in_seconds']  = train_test.groupby(['user_id'])['impression_time_in_seconds'].diff()

In [None]:
# Force a garbage-collection pass after the feature-building cells.
gc.collect()

In [None]:
# Final shape of the engineered frame.
train_test.shape

In [None]:
# Persist the engineered frame to the working directory.
train_test.to_pickle('train_test_1.pkl')  

In [None]:
# Columns kept out of the model: the raw timestamp, the target, the train/test
# flag and raw id/string columns. Names that do not exist in train_test are
# simply never matched.
cols_to_exclude = ["impression_time", "is_click",'is_train','impression_id','os_version','device_type','diff_user_impression_session_time_in_second_mean']
# Every remaining column, in frame order, is used as a feature.
cols_to_use = list(filter(lambda name: name not in cols_to_exclude, train_test.columns))

In [None]:
# Split the engineered frame back into train and test rows using the flag.
is_train_row = train_test['is_train'] == 1
is_test_row = train_test['is_train'] == 0
train_df = train_test.loc[is_train_row]
test_df = train_test.loc[is_test_row]

In [None]:
# Shape of the training part after feature engineering.
train_df.shape

In [None]:
# Drop the name of the combined frame; train_df and test_df are used from here on.
del train_test

In [None]:
# Force a garbage-collection pass after deleting the combined frame.
gc.collect()

In [None]:
# Cast the target to int8 on the training rows.
# train_df is a boolean-mask selection of train_test, so this assignment may
# emit SettingWithCopyWarning (silenced by the global warnings filter).
train_df['is_click'] = train_df['is_click'].astype(np.int8)

In [None]:
# Number of clicked training rows (first element of the shape).
train_df[train_df['is_click']==1].shape

In [None]:
# LightGBM training parameters for the binary click model, evaluated on AUC.
params = {
    "objective": "binary",
    "max_depth": -1,
    #  "min_sum_hessian_in_leaf": 50,
    "learning_rate": 0.008,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.2,
    "feature_fraction_seed": 42,
    "bagging_freq": 1,
    "bagging_seed": 42,
    "verbosity": -1,
    "metric": "auc",
}

In [None]:
# 5-fold stratified cross-validation of the LightGBM model.
target = train_df['is_click']
folds = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
oof = np.zeros(len(train_df))            # out-of-fold predictions, filled fold by fold
predictions = np.zeros(len(test_df))     # test predictions, averaged over the folds
feature_importance_df = pd.DataFrame()   # created for later use; not populated in this cell
num_round = 400000                       # upper bound only; early stopping ends training sooner

# StratifiedKFold derives its splits from the labels alone, so a placeholder X
# avoids materialising train_df.values (a dense copy of the whole mixed-dtype
# frame) just to obtain indices. The resulting folds are unchanged.
fold_splits = folds.split(np.zeros(len(target)), target.values)

# NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments are
# not accepted by newer LightGBM releases (callbacks replace them) — pin the
# library version or migrate before upgrading.
for fold_, (trn_idx, val_idx) in enumerate(tqdm(fold_splits, total=folds.n_splits)):
    print("Fold {}".format(fold_))
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][cols_to_use], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(train_df.iloc[val_idx][cols_to_use], label=target.iloc[val_idx])
    clf = lgb.train(params, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=500, early_stopping_rounds = 200)
    # Score the held-out fold with the best iteration found by early stopping.
    oof[val_idx] = clf.predict(train_df.iloc[val_idx][cols_to_use], num_iteration=clf.best_iteration)
    # Each fold contributes an equal share to the final test prediction.
    predictions += clf.predict(test_df[cols_to_use], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curve of the out-of-fold predictions against the training labels.
auc = roc_auc_score(train_df['is_click'], oof)
fpr, tpr, thresholds = roc_curve(train_df['is_click'], oof)
plt.figure()
# The model trained in this notebook is LightGBM, so label the curve accordingly
# (the legend previously read "Adaptive Boosting Classifier").
plt.plot(fpr, tpr, label='LightGBM (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# plt.savefig('Log_ROC')
plt.show()

In [None]:
# Submission file: one row per test impression id with its fold-averaged prediction.
sub_df = test_df[["impression_id"]].assign(is_click=predictions)
sub_df.to_csv("lgbm-0.751913487461546.csv", index=False)