In [None]:
#install the pydrive wrapper and import libraries
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the pydrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# running this cell to mount google drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install catboost

In [None]:
!ls '/content/drive/My Drive/AV_data/WNS_Analytics_Wizerd_23424/'

In [None]:
# !unzip -q '/content/drive/My Drive/AV_data/WNS_Analytics_Wizard_24082019/sample_submission_IPsBlCT.zip'
# !unzip -q '/content/drive/My Drive/AV_data/WNS_Analytics_Wizard_24082019/test.zip'
# !unzip -q '/content/drive/My Drive/AV_data/WNS_Analytics_Wizard_24082019/train.zip'
!ls

In [None]:
# load packages
# Every import ended with a stray comma, which is a SyntaxError; the statements
# are otherwise the same set of names as before, grouped stdlib -> third-party.
import string
import warnings
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
from scipy.stats import mode

import lightgbm as lgb
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn import metrics, model_selection, preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import (
    GridSearchCV,
    GroupKFold,
    KFold,
    StratifiedKFold,
    train_test_split,
)

import matplotlib.pyplot as plt
import plotly.graph_objs as go
import seaborn as sns
from plotly import tools
from plotly.offline import init_notebook_mode, iplot

# Show up to 100 columns when a wide frame is displayed.
pd.options.display.max_columns = 100
# Suppress all warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

In [None]:
train_df = pd.read_csv('train.csv')

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.shape

In [None]:
train_df.groupby('is_click')['impression_id'].count()

In [None]:
train_df['is_click'].value_counts(normalize=True)
# The target classes are imbalanced, so an upsampling technique is needed.

In [None]:
# Report the size of the training frame, then every column that has missing values.
n_rows, n_cols = train_df.shape
print(f'Number of samples in train: {n_rows}')
print(f'Number of columns in train: {n_cols}')
null_counts = train_df.isnull().sum()
for col, n_missing in null_counts[null_counts > 0].items():
    print(col, n_missing)

In [None]:
train_df.dtypes

In [None]:
missing_data = (((train_df.isnull().sum())*100)/len(train_df))
missing_data

In [None]:
#### Read item_data
item_data_df = pd.read_csv('item_data.csv')

In [None]:
item_data_df.head()

In [None]:
item_data_df.isnull().sum()

In [None]:
item_data_df.shape

In [None]:
### View Log data
view_log_df = pd.read_csv('view_log.csv')

In [None]:
view_log_df.head()

In [None]:
view_log_df.isnull().sum()

In [None]:
view_log_df.shape

In [None]:
# Join the View Log and Item Data datasets:
# left-join on item_id so every view-log row is kept and gains the item's attributes.
item_view_log_df = pd.merge(view_log_df, item_data_df, on='item_id', how='left')

In [None]:
item_view_log_df.shape

In [None]:
item_view_log_df.drop_duplicates(inplace=True)

In [None]:
item_view_log_df.shape
item_view_log_df.head()

In [None]:
item_view_log_df[item_view_log_df['user_id'] ==0].head()

In [None]:
item_view_log_df.dtypes

In [None]:
# Replace string-valued columns with integer codes from a LabelEncoder.
cols = ['device_type']
for col in cols:
    # Only object-dtype columns are encoded; anything else is left as-is.
    if item_view_log_df[col].dtype != object:
        continue
    print(col)
    as_text = list(item_view_log_df[col].values.astype('str'))
    lbl = preprocessing.LabelEncoder()
    item_view_log_df[col] = lbl.fit_transform(as_text)

In [None]:
item_view_log_df['server_time'] = pd.to_datetime(item_view_log_df['server_time'])

In [None]:
# Calendar and clock features derived from the view timestamp.
server_time = item_view_log_df["server_time"]
item_view_log_df["log_Year"] = server_time.dt.year
item_view_log_df["log_Month"] = server_time.dt.month
item_view_log_df["log_Day"] = server_time.dt.day
item_view_log_df["log_WeekDay"] = server_time.dt.weekday
item_view_log_df["log_time"] = server_time.dt.time
# Take hour/minute/second from the datetime accessor instead of splitting the
# string form of `log_time`, which fails when a timestamp has fractional seconds.
item_view_log_df["log_h"] = server_time.dt.hour
item_view_log_df["log_m"] = server_time.dt.minute
item_view_log_df["log_s"] = server_time.dt.second

In [None]:
item_view_log_df.head()

In [None]:
item_view_log_df['log_Year'].value_counts()

In [None]:
item_view_log_df['log_Month'].value_counts()

In [None]:
item_view_log_df['log_Day'].value_counts()

In [None]:
# ### Convert all the variables into category
# item_view_log_df.session_id=dftest.session_id.astype(np.object)
# item_view_log_df.user_id=dftest.user_id.astype(np.object)
# item_view_log_df.item_id=dftest.item_id.astype(np.object)
#
# item_view_log_df.category_1=dftest.category_1.astype(np.object)
# item_view_log_df.category_2=dftest.category_2.astype(np.object)
# item_view_log_df.category_3=dftest.category_3.astype(np.object)
#
# item_view_log_df.product_type=dftest.product_type.astype(np.object)

In [None]:
### Group the view log data and extract features
item_view_log_df['device_type'].value_counts()

In [None]:
item_view_log_df['product_type'].value_counts().head()

In [None]:
# Per-user activity features from the view log.
user_server_time = item_view_log_df.groupby('user_id')['server_time']
# Whole days between a user's first and last view; a span of 0 days is stored as 1.
days_active = (user_server_time.max() - user_server_time.min()).dt.days.clip(lower=1)
# Number of distinct day-of-year values on which the user has a view.
unique_days_active = user_server_time.agg(lambda x: len(np.unique(x.dt.dayofyear)))
user_time_features = pd.concat([days_active, unique_days_active], axis=1).reset_index()
user_time_features.columns = ['user_id', 'log_days_active', 'log_unique_days_active']

In [None]:
user_time_features.head()

In [None]:
user_time_features.shape

In [None]:
user_time_features['user_id'].nunique()

In [None]:
item_view_log_df.head()

In [None]:
item_view_log_df[item_view_log_df['session_id'] == 112333]

In [None]:
item_view_log_df['session_id'].value_counts().head()

In [None]:
item_view_log_df['category_1'].nunique()

In [None]:
item_view_log_df['category_2'].nunique()

In [None]:
item_view_log_df['category_3'].nunique()

In [None]:
item_view_log_df['item_id'].nunique()

In [None]:
item_view_log_df['product_type'].nunique()

In [None]:
item_view_log_df.columns

In [None]:
item_view_log_df.dtypes

In [None]:
# Count of view-log rows per user and calendar month (one column per month, 0 when absent).
log_Month_df = pd.pivot_table(
    item_view_log_df,
    values="session_id",
    index="user_id",
    columns="log_Month",
    aggfunc="count",
    fill_value=0,
).reset_index()
print(log_Month_df.columns)

In [None]:
# Label each pivot column with the month it counts. The months are read from the
# log itself (sorted, like the pivot's columns) instead of assuming 10..12 are present.
log_months = sorted(item_view_log_df["log_Month"].dropna().unique())
log_Month_df.columns = ["user_id"] + [f"log_Month_{int(m)}" for m in log_months]

In [None]:
# Count of view-log rows per user and weekday (one column per weekday, 0 when absent).
log_WeekDay_df = pd.pivot_table(
    item_view_log_df,
    values="session_id",
    index="user_id",
    columns="log_WeekDay",
    aggfunc="count",
    fill_value=0,
).reset_index()
print(log_WeekDay_df.columns)

In [None]:
# Label each pivot column with the weekday it counts. The weekdays are read from
# the log itself (sorted, like the pivot's columns) instead of assuming all of 0..6 occur.
log_weekdays = sorted(item_view_log_df["log_WeekDay"].dropna().unique())
log_WeekDay_df.columns = ["user_id"] + [f"log_WeekDay_{int(d)}" for d in log_weekdays]

In [None]:
log_WeekDay_df.head()

In [None]:
log_WeekDay_df.shape

In [None]:
item_view_log_df[(item_view_log_df['user_id'] == 4557)].shape

In [None]:
item_view_log_df[(item_view_log_df['user_id'] == 4557) & (item_view_log_df['category_1'] == 16.0)].head()

In [None]:
item_view_log_df['category_1'].min(), item_view_log_df['category_1'].max()

In [None]:
# Count of viewed items per user and category_1 value (0 when the user never viewed it).
category_1_df = (
    pd.pivot_table(
        item_view_log_df,
        values="item_id",
        index="user_id",
        columns="category_1",
        aggfunc="count",
        fill_value=0,
    )
    # Name every column after the category it counts, rather than assuming the
    # categories are exactly the contiguous range 0..16.
    .rename(columns=lambda category: f"cat_1_{int(category)}")
    .rename_axis(columns=None)
    .reset_index()
)
print(category_1_df.columns)

In [None]:
category_1_df.head()

In [None]:
category_1_df.shape

In [None]:
item_view_log_df['category_2'].min(), item_view_log_df['category_2'].max()

In [None]:
# Count of viewed items per user and category_2 value (0 when the user never viewed it).
category_2_df = (
    pd.pivot_table(
        item_view_log_df,
        values="item_id",
        index="user_id",
        columns="category_2",
        aggfunc="count",
        fill_value=0,
    )
    # Name every column after the category it counts, rather than assuming the
    # categories are exactly the contiguous range 0..78.
    .rename(columns=lambda category: f"cat_2_{int(category)}")
    .rename_axis(columns=None)
    .reset_index()
)

In [None]:
category_2_df.head()

In [None]:
item_view_log_df['category_3'].min(), item_view_log_df['category_3'].max()

In [None]:
item_view_log_df['product_type'].min(), item_view_log_df['product_type'].max()

In [None]:
bins = [0,7,15,22,31]
group_names = [1, 2, 3, 4]
item_view_log_df['Month_wk_grp'] = pd.cut(item_view_log_df['log_Day'], bins, labels=group_names)
item_view_log_df.head()

In [None]:
item_view_log_df['Month_wk_grp'] = pd.to_numeric(item_view_log_df['Month_wk_grp'])

In [None]:
# Count of view-log rows per user and week-of-month group (0 when absent).
log_Month_wk_grp_df = (
    pd.pivot_table(
        item_view_log_df,
        index="user_id",
        columns="Month_wk_grp",
        values="session_id",
        aggfunc="count",
        fill_value=0,
    )
    # Name columns after the group they count instead of assuming all of 1..4 occur.
    .rename(columns=lambda grp: f"log_Month_wk_grp_{int(grp)}")
    .rename_axis(columns=None)
    .reset_index()
)

In [None]:
log_Month_wk_grp_df.head()

In [None]:
# `test_df` is used from here on but is never loaded in the cells above, so a
# fresh-kernel run stops with a NameError at this point.
# NOTE(review): assumes the extracted file is 'test.csv', mirroring 'train.csv' — confirm.
test_df = pd.read_csv('test.csv')
test_df.head()

In [None]:
test_df.shape

In [None]:
test_df['app_code'].value_counts().head()

In [None]:
from matplotlib_venn import venn2, venn2_circles

def get_venn(axarr, feature):
    """Draw four two-set Venn diagrams of `feature` values on a 2x2 grid of axes.

    Reads the module-level frames `train_df`, `test_df` and `item_view_log_df`
    (labelled 'hist' in the plots).

    Args:
        axarr: 2x2 array of matplotlib axes, indexed as ``axarr[row, col]``.
        feature: name of a column present in all three frames.
    """
    # Build each value set once; the panels below reuse them.
    train_values = set(train_df[feature].values)
    test_values = set(test_df[feature].values)
    hist_values = set(item_view_log_df[feature].values)

    panels = [
        (axarr[0, 0], 'train', train_values, 'test', test_values),
        (axarr[0, 1], 'train', train_values, 'hist', hist_values),
        (axarr[1, 0], 'test', test_values, 'hist', hist_values),
        # NOTE(review): this panel repeats the train/test pair of the first one;
        # kept as in the original — confirm whether another comparison was intended.
        (axarr[1, 1], 'train', train_values, 'test', test_values),
    ]
    for ax, left_name, left_set, right_name, right_set in panels:
        ax.set_title(f'Overlap between {feature} in {left_name} and {right_name}')
        venn2([left_set, right_set], set_labels=(left_name, right_name), ax=ax)

In [None]:
fig, axarr = plt.subplots(2,2, figsize=(10,6))
get_venn(axarr, 'user_id')

In [None]:
#### Join Train and LogView data

In [None]:
item_view_log_df.head()

In [None]:
train_df.head()

In [None]:
train_df[train_df['user_id'] == 87862]

In [None]:
train_df['os_version'].value_counts()

In [None]:
# * join the datasets
train_df['is_train']  = 1
test_df['is_click'] = -99
test_df['is_train'] = 0

In [None]:
# Stack train on top of test. DataFrame.append was removed in pandas 2.0, so use
# pd.concat; ignore_index gives the combined frame a clean 0..n-1 index instead of
# two overlapping index ranges.
full_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)

In [None]:
full_df['impression_time'] = pd.to_datetime(full_df['impression_time'])

In [None]:
full_df.dtypes

In [None]:
# Calendar and clock features derived from the impression timestamp.
impression_time = full_df["impression_time"]
full_df["Year"] = impression_time.dt.year
full_df["Month"] = impression_time.dt.month
full_df["Day"] = impression_time.dt.day
full_df["WeekDay"] = impression_time.dt.weekday
full_df["time"] = impression_time.dt.time
# Take hour/minute/second from the datetime accessor instead of splitting the
# string form of `time`, which fails when a timestamp has fractional seconds.
full_df["h"] = impression_time.dt.hour
full_df["m"] = impression_time.dt.minute
full_df["s"] = impression_time.dt.second

In [None]:
full_df.head()

In [None]:
full_df.dtypes

In [None]:
# Impressions per user and calendar year, merged back onto every row of that user.
gdf = (
    pd.pivot_table(
        full_df,
        index="user_id",
        columns="Year",
        values="impression_id",
        aggfunc="count",
        fill_value=0,
    )
    # Give the pivot columns string names ("Year_<year>") like the Month_/Day_/WeekDay_
    # features, rather than merging them into full_df under bare integer labels.
    .rename(columns=lambda year: f"Year_{int(year)}")
    .rename_axis(columns=None)
    .reset_index()
)
full_df = pd.merge(full_df, gdf, on="user_id", how="left")

In [None]:
full_df['Year'].value_counts()

In [None]:
full_df.head()

In [None]:
# Impressions per user and calendar month, merged back onto every row of that user.
gdf = (
    pd.pivot_table(
        full_df,
        index="user_id",
        columns="Month",
        values="impression_id",
        aggfunc="count",
        fill_value=0,
    )
    # Name columns after the month they count instead of assuming exactly 11 and 12 occur.
    .rename(columns=lambda month: f"Month_{int(month)}")
    .rename_axis(columns=None)
    .reset_index()
)
full_df = pd.merge(full_df, gdf, on="user_id", how="left")

In [None]:
full_df['app_code'].nunique()

In [None]:
full_df.head()

In [None]:
# Impressions per user and day of month, merged back onto every row of that user.
gdf = (
    pd.pivot_table(
        full_df,
        index="user_id",
        columns="Day",
        values="impression_id",
        aggfunc="count",
        fill_value=0,
    )
    # Name columns after the day they count; the previous fixed list of 30 names
    # (1..30) raises a length mismatch as soon as a day 31 appears or a day is missing.
    .rename(columns=lambda day: f"Day_{int(day)}")
    .rename_axis(columns=None)
    .reset_index()
)
full_df = pd.merge(full_df, gdf, on="user_id", how="left")

In [None]:
full_df.head()

In [None]:
### Pivot on user and weekday
# Impressions per user and weekday, merged back onto every row of that user.
gdf = (
    pd.pivot_table(
        full_df,
        index="user_id",
        columns="WeekDay",
        values="impression_id",
        aggfunc="count",
        fill_value=0,
    )
    # Name columns after the weekday they count instead of assuming all of 0..6 occur.
    .rename(columns=lambda weekday: f"WeekDay_{int(weekday)}")
    .rename_axis(columns=None)
    .reset_index()
)
full_df = pd.merge(full_df, gdf, on="user_id", how="left")

In [None]:
full_df.head()

In [None]:
# Per-user activity features from the impression timestamps.
user_impressions = full_df.groupby('user_id')['impression_time']
# Whole days between a user's first and last impression; a span of 0 days is stored as 1.
ad_days_active = (user_impressions.max() - user_impressions.min()).dt.days.clip(lower=1)
# Number of distinct day-of-year values on which the user has an impression.
ad_unique_days_active = user_impressions.agg(lambda x: len(np.unique(x.dt.dayofyear)))
ad_user_time_features = pd.concat(
    [ad_days_active, ad_unique_days_active], axis=1
).reset_index()
ad_user_time_features.columns = ['user_id', 'ad_days_active', 'ad_unique_days_active']

In [None]:
full_df = pd.merge(full_df, ad_user_time_features, on=\"user_id\", how=\"left\")

In [None]:
full_df.head()

In [None]:
full_df = pd.merge(full_df, user_time_features, on=\"user_id\", how=\"left\")

In [None]:
full_df.tail()

In [None]:
full_df = pd.merge(full_df, category_1_df, on=\"user_id\", how=\"left\")

In [None]:
full_df.head()

In [None]:
full_df = pd.merge(full_df, log_WeekDay_df, on=\"user_id\", how=\"left\")

In [None]:
full_df.head()

In [None]:
full_df = pd.merge(full_df, log_Month_df, on=\"user_id\", how=\"left\")

In [None]:
full_df.head()

In [None]:
full_df = pd.merge(full_df, category_2_df, on=\"user_id\", how=\"left\")

In [None]:
full_df.head()

In [None]:
# `log_Day_df` is not built in any cell above, so this merge raised a NameError.
# Build it here the same way as the other view-log pivots: rows per user and day of month.
# NOTE(review): reconstructed by analogy with log_Month_df / log_WeekDay_df — confirm.
log_Day_df = (
    pd.pivot_table(
        item_view_log_df,
        values="session_id",
        index="user_id",
        columns="log_Day",
        aggfunc="count",
        fill_value=0,
    )
    .rename(columns=lambda day: f"log_Day_{int(day)}")
    .rename_axis(columns=None)
    .reset_index()
)
full_df = pd.merge(full_df, log_Day_df, on="user_id", how="left")

In [None]:
full_df = pd.merge(full_df, log_Month_wk_grp_df, on=\"user_id\", how=\"left\")

In [None]:
full_df.shape

In [None]:
full_df.head()

In [None]:
#### Add the user time features from the item log window
testcount = len(test_df)
count = len(full_df)-testcount
print(count)

In [None]:
full_df.isnull().sum()

In [None]:
full_df.fillna(0,inplace=True)

In [None]:
full_df.head()

In [None]:
full_df['app_code'].nunique()

In [None]:
full_df['os_version'].nunique()

In [None]:
#### Label-encode the categorical columns (integer codes, not one-hot)
cols = ['os_version']
for col in cols:
    # Only object-dtype columns are encoded; anything else is left as-is.
    if full_df[col].dtype == object:
        print(col)
        as_text = list(full_df[col].values.astype('str'))
        lbl = preprocessing.LabelEncoder()
        full_df[col] = lbl.fit_transform(as_text)

In [None]:
# Column groups for which a confidence-weighted click rate feature is computed.
ATTRIBUTION_CATEGORIES = [
    # V1 features: single columns
    ['app_code'], ['os_version'], ['is_4G'],

    # V2 features: pairwise combinations
    ['app_code', 'os_version'],
    ['app_code', 'is_4G'],
    ['os_version', 'is_4G'],
]

In [None]:
print(ATTRIBUTION_CATEGORIES)

In [None]:
# Confidence-weighted click rate for each column group in ATTRIBUTION_CATEGORIES.
# A group's raw click rate is scaled by log(group size) / log(100000), capped at 1:
# 1000 views -> 60% confidence, 100 views -> 40% confidence.
log_group = np.log(100000)


def rate_calculation(x):
    """Calculate the click rate of one group, scaled by a size-based confidence."""
    rate = x.sum() / float(x.count())
    conf = np.min([1, np.log(x.count()) / log_group])
    return rate * conf


# Test rows carry the placeholder is_click == -99, so rates are computed from the
# labelled (train) rows only; including the placeholder would corrupt every rate.
labelled_df = full_df[full_df['is_train'] == 1]

for cols in ATTRIBUTION_CATEGORIES:
    # New feature name (a plain string; a stray trailing comma used to make it a tuple).
    new_feature = '_'.join(cols) + '_confRate'

    # Perform the groupby
    group_object = labelled_df.groupby(cols)

    # Group sizes
    group_sizes = group_object.size()
    print(">> Calculating confidence-weighted rate for: {}.\n    Saving to: {}. Group Max /Mean / Median / Min: {} / {} / {} / {}".format(
        cols, new_feature,
        group_sizes.max(),
        np.round(group_sizes.mean(), 2),
        np.round(group_sizes.median(), 2),
        group_sizes.min()
    ))

    # One row per group with its confidence-weighted rate
    group_rates = (
        group_object['is_click']
        .apply(rate_calculation)
        .reset_index()
        .rename(columns={'is_click': new_feature})
    )

    # Perform the merge
    full_df = full_df.merge(group_rates[cols + [new_feature]], on=cols, how='left')
    # Groups that occur only in the unlabelled rows have no click history: use 0,
    # the same value a group of size 1 gets (zero confidence).
    full_df[new_feature] = full_df[new_feature].fillna(0)

In [None]:
full_df.head()

In [None]:
full_df.columns

In [None]:
temp = item_view_log_df.groupby('user_id').size().reset_index()
temp.columns = ['user_id', 'count']
full_df = full_df.join(temp.set_index('user_id'), on = 'user_id', how = 'left')
full_df.head()

In [None]:
# Number of view-log rows sharing each exact (user_id, server_time) pair.
tmep = item_view_log_df.groupby(['user_id', 'server_time']).size().reset_index()
tmep.columns = ['user_id', 'server_time', 'same_user_time_count']

# Per user: variance and mean of those per-timestamp row counts.
temp = tmep.groupby('user_id').agg({"same_user_time_count": ['var', 'mean']}).reset_index()
temp.columns = ['user_id', 'same_user_time_count_var', 'same_user_time_count_mean']
temp.head()

In [None]:
full_df = full_df.join(temp.set_index('user_id'), on = 'user_id', how = 'left')
full_df['same_user_time_count_var'] = full_df['same_user_time_count_var'].fillna(0)
full_df['same_user_time_count_mean'] = full_df['same_user_time_count_mean'].fillna(0)
full_df.head()

In [None]:
def prev_view(user_id, DateTime):
    """Gap between each event and the same user's previous event.

    Args:
        user_id: sequence of user identifiers, in event order.
        DateTime: event times aligned with `user_id`. Numeric values are used
            as given; a datetime64 array/Series is converted to integer
            nanoseconds first.

    Returns:
        Float numpy array of ``len(user_id)``: 0 for a user's first event,
        otherwise ``(time - previous time of that user) / 10**10``.
    """
    times = np.asarray(DateTime)
    if np.issubdtype(times.dtype, np.datetime64):
        # Timestamp differences cannot be stored in a float array directly,
        # so work on integer nanoseconds.
        times = times.astype('datetime64[ns]').astype(np.int64)

    last_seen = {}  # user -> time of that user's most recent event so far
    gaps = np.zeros(len(user_id))
    for i, (u, t) in enumerate(zip(user_id, times)):
        if u in last_seen:
            gaps[i] = t - last_seen[u]
        last_seen[u] = t
    return gaps / 10**10

In [None]:
def expanding_count(x):
    """Running occurrence count for every element of `x`.

    Args:
        x: iterable of hashable values.

    Returns:
        Tuple ``(exp_count, count_dict)``: `exp_count` is a list where entry i
        says how many times ``x[i]`` has been seen up to and including
        position i; `count_dict` maps each value to its total number of
        occurrences.
    """
    count_dict = defaultdict(int)
    exp_count = []
    for i in x:
        count_dict[i] += 1
        exp_count.append(count_dict[i])
    return exp_count, count_dict

In [None]:
def mean_likelihood(df, cat_var, target, alpha = 0.5):
    """Smoothed mean encoding of `target` for each level of `cat_var`.

    For a level with mean ``P_c`` over ``n_c`` rows and global target mean
    ``P_global`` the encoding is
    ``(P_c * n_c + P_global * alpha) / (n_c + alpha)``.

    Args:
        df: frame containing `cat_var` and `target`; it is not modified.
        cat_var: name of the categorical column.
        target: name of the numeric target column.
        alpha: weight of the global mean in the blend.

    Returns:
        DataFrame indexed by the levels of `cat_var` with a single column 'enc'.
    """
    grouped_target = df.groupby(cat_var)[target]
    P_c = grouped_target.transform('mean')
    n_c = grouped_target.transform('count')
    P_global = df[target].mean()
    enc = (P_c * n_c + P_global * alpha) / (n_c + alpha)
    # assign() builds a new frame, so no column is written onto a slice of `df`.
    return df[[cat_var]].assign(enc=enc).groupby(cat_var).mean()

In [None]:
full_df.head()

In [None]:
item_view_log_df['user_id_count'] = item_view_log_df['user_id'].map(Counter(item_view_log_df['user_id']))
exp_count, _ = expanding_count(item_view_log_df['user_id'])
item_view_log_df['user_id_exp_count'] = exp_count

In [None]:
item_view_log_df.head()

In [None]:
item_view_log_df['user_id'].max()

In [None]:
item_view_log_df[item_view_log_df['user_id'] == 74788].shape

In [None]:
item_view_log_df['user_id_exp_count'].min()

In [None]:
# Composite key identifying a (user, item) pair.
item_view_log_df['user_itemid'] = item_view_log_df['user_id'].astype(str)+'_'+item_view_log_df['item_id'].astype(str)
# Total number of view-log rows for each (user, item) pair.
item_view_log_df['user_itemid_count'] = item_view_log_df['user_itemid'].map(Counter(item_view_log_df['user_itemid']))
# Running count of the pair up to each row. It gets its own column (as for
# user_id_exp_count) instead of overwriting the total computed just above.
exp_count, _ = expanding_count(item_view_log_df['user_itemid'])
item_view_log_df['user_itemid_exp_count'] = exp_count

In [None]:
item_view_log_df.head()

In [None]:
item_view_log_df['user_itemid_count'].max()

In [None]:
item_view_log_df[item_view_log_df['user_itemid_count'] == 217]

In [None]:
item_view_log_df['user_product'] = item_view_log_df['user_id'].astype(str)+'_'+item_view_log_df['product_type'].astype(str)
item_view_log_df['user_product_count'] = item_view_log_df['user_product'].map(Counter(item_view_log_df['user_product']))

In [None]:
item_view_log_df.head()

In [None]:
item_view_log_df['product_item_id'] = item_view_log_df['product_type'].astype(str) + '_' + item_view_log_df['item_id'].astype(str)
item_view_log_df['product_item_id_count'] = item_view_log_df['product_item_id'].map(Counter(item_view_log_df['product_item_id']))

In [None]:
item_view_log_df.head()

In [None]:
item_view_log_df['user_product_item_id'] = item_view_log_df['user_id'].astype(str)+ '_' + item_view_log_df['product_item_id']
item_view_log_df['user_itemid_product_count'] = item_view_log_df['user_product_item_id'].map(Counter(item_view_log_df['user_product_item_id']))
exp_count, _ = expanding_count(item_view_log_df['user_product_item_id'])
item_view_log_df['user_itemid_product_exp_count'] = exp_count

In [None]:
item_view_log_df.head()

In [None]:
temp = item_view_log_df.groupby(['user_id', 'user_itemid_product_count']).size().unstack().fillna(0)
full_df = full_df.join(temp, on = 'user_id', how = 'left')

In [None]:
temp.head()

In [None]:
full_df.head()

In [None]:
full_df[full_df[217] == 217]

In [None]:
full_df[full_df[217] == 217]

In [None]:
# Split the combined frame back using the is_train flag set before stacking.
# Slicing by position (`full_df[:count]`) silently mis-assigns rows if any merge
# above added or reordered rows; the flag travels with each row.
# The index is reset so later positional fold indices line up with row labels.
train = full_df[full_df['is_train'] == 1].reset_index(drop=True)
test = full_df[full_df['is_train'] == 0].reset_index(drop=True)

In [None]:
print(full_df.shape)
print(train.shape)
print(test.shape)

In [None]:
train_df = train.copy()
test_df = test.copy()

In [None]:
train_df['is_click'].value_counts()

In [None]:
train_df.columns

In [None]:
cat_vars = ['app_code', 'os_version', 'is_4G']
# Out-of-fold mean encoding: each training row is encoded from the other folds only.
# KFold rejects random_state unless shuffle=True in current scikit-learn, so shuffle
# explicitly; the seed then actually takes effect.
cvlist = list(KFold(n_splits=10, shuffle=True, random_state=1).split(train_df))
for var in cat_vars + ['user_id']:
    enc_col = f'mean_enc_{var}'
    mean_enc_var = np.zeros(len(train_df))
    for tr_idx, val_idx in cvlist:
        # split() yields positions, so select rows with iloc rather than by label.
        X_tr, X_val = train_df.iloc[tr_idx], train_df.iloc[val_idx]
        X_tr_mean = mean_likelihood(X_tr, var, 'is_click')
        mean_enc_var[val_idx] = X_val[var].map(X_tr_mean['enc'])
    # Everything below needs all folds filled in, so it runs once per variable
    # after the fold loop rather than once per fold.
    train_df[enc_col] = mean_enc_var
    # Levels unseen in the other folds map to NaN: fall back to the column mean.
    train_df[enc_col] = train_df[enc_col].fillna(train_df[enc_col].mean())
    # Test rows are encoded from the full training data.
    test_df[enc_col] = test_df[var].map(mean_likelihood(train_df, var, 'is_click')['enc'])
    test_df[enc_col] = test_df[enc_col].fillna(train_df[enc_col].mean())

In [None]:
# Identifier, raw-time, target and bookkeeping columns are excluded from the model inputs.
dropped_cols = ['impression_id', 'impression_time', 'user_id', 'is_click', 'is_train', 'time']
train_X = train_df.drop(columns=dropped_cols)
test_X = test_df.drop(columns=dropped_cols)

# Target vector; `y` and `train_y` refer to the same array, `X` to the same frame as `train_X`.
train_y = train_df['is_click'].values
y = train_y
X = train_X

In [None]:
X.columns

In [None]:
train_df.columns

In [None]:
#Standardization
from sklearn.preprocessing import StandardScaler

# The feature frames mix string and integer column labels (some joined pivots keep
# integer names), which scikit-learn >= 1.2 rejects when given a DataFrame.
# Passing plain arrays avoids the feature-name check; the scaler is fitted on
# train only and then applied to test.
sc = StandardScaler()
train_X = sc.fit_transform(np.asarray(train_X))
X = train_X
test_X = sc.transform(np.asarray(test_X))

In [None]:
def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None):
    """Train a LightGBM binary classifier and predict on up to two feature sets.

    Args:
        train_X: training features.
        train_y: training labels.
        test_X: features to predict on; used as the validation set when
            `test_y` is given.
        test_y: optional labels for `test_X`. When given, training uses early
            stopping on this set and the ROC AUC on it is returned as `loss`.
        test_X2: optional second feature set to predict on.

    Returns:
        Tuple ``(pred_test_y, loss, pred_test_y2, model)``. `loss` is 0 when
        `test_y` is None; `pred_test_y2` is None when `test_X2` is None.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "max_depth": 8,
        "min_data_in_leaf": 1,
        "learning_rate": 0.01,
        "bagging_fraction": 0.7,
        "feature_fraction": 0.7,
        "bagging_freq": 1,
        "bagging_seed": 0,
        "verbosity": -1,
    }
    num_rounds = 20000

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        # The early_stopping_rounds / verbose_eval keyword arguments were removed
        # from lgb.train in LightGBM 4.0; callbacks work on old and new versions.
        # log_evaluation is the current name of the older print_evaluation callback.
        log_callback = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
        model = lgb.train(
            params,
            lgtrain,
            num_rounds,
            valid_sets=[lgtrain, lgtest],
            callbacks=[lgb.early_stopping(stopping_rounds=100), log_callback(period=500)],
        )
    else:
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)

    # The second feature set is optional; predicting on None would raise.
    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)

    loss = 0
    if test_y is not None:
        loss = roc_auc_score(test_y, pred_test_y)
    return pred_test_y, loss, pred_test_y2, model

In [None]:
cv_scores = []
pred_test_full = 0
n_splits = 5
kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=30)

for dev_index, val_index in kf.split(X, y):
    dev_X, val_X = train_X[dev_index], train_X[val_index]
    dev_y, val_y = train_y[dev_index], train_y[val_index]

    pred_val, loss, pred_test, model = runLGB(dev_X, dev_y, val_X, val_y, test_X)
    # Accumulate the test predictions of every fold; they are averaged after the loop.
    pred_test_full += pred_test
    cv_scores.append(loss)
    print(cv_scores)

# Average once, after all folds. Dividing inside the loop shrank the running sum
# on every iteration, so earlier folds were under-weighted in the final prediction.
pred_test_full /= n_splits
print(sum(cv_scores) / n_splits)

In [None]:
submission_df = pd.read_csv('sample_submission.csv')
submission_df.head()

In [None]:
# Assign the averaged predictions positionally. Wrapping them in a DataFrame made
# pandas align on index, which silently pads or drops values on a length mismatch;
# a plain array raises instead.
submission_df['is_click'] = np.asarray(pred_test_full)

In [None]:
submission_df.to_csv('lgb_base_v3.csv', index=False)

In [None]:
#### Baseline Model - Public Leaderboard score - 0.69876663249451
#Public Leaderboard score - 0.7296318527