# init

## imports

In [1]:
import os
import pandas as pd
import warnings
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.ensemble import RandomForestClassifier
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas copy/chained-assignment
# warnings and xgboost/sklearn deprecation notices raised by later cells —
# consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

## configs

In [2]:
# Root data directory. The original hard-coded location stays the default so
# existing runs are unchanged; set the AD_CLICK_DATA_DIR environment variable
# to point the notebook at another directory without editing this cell.
BASE_PATH = os.environ.get(
    'AD_CLICK_DATA_DIR', '/home/shaghayegh/class/ad_click/data/'
)

# Input dataset (parquet) read in the "load data" section.
DATASET_PATH = os.path.join(BASE_PATH, 'dataset.parquet')

# Chronological cut-off: rows whose `time` is before this value go to the
# train set, rows at or after it go to the test set.
TRAIN_TEST_SPLIT_DATE = '2017-05-11 00:00:00'

# Full feature set used by model 3. Order is kept exactly as originally
# written because it defines the column order handed to the model.
FEATURES = [
    'pvalue_level', 'shopping_level', 'price', 'age_level',
    'final_gender_code', 'user_ad_count', 'user_ad_clk_count',
    'adgroup_count', 'adgroup_clk_count', 'campaign_count',
    'campaign_clk_count', 'cate_count', 'cate_clk_count',
    'cat_gender_ad_count', 'cat_gender_ad_clk_count', 'cat_age_ad_count',
    'cat_age_ad_clk_count', 'gender_age_ad_count', 'gender_age_ad_clk_count',
]

# Ad-side subset used by model 1.
AD_FEATURES = [
    'price', 'adgroup_count', 'adgroup_clk_count', 'campaign_count',
    'campaign_clk_count', 'cate_count', 'cate_clk_count',
]

# User-side subset used by model 2.
USER_FEATURES = [
    'pvalue_level', 'shopping_level', 'age_level',
    'final_gender_code', 'user_ad_count', 'user_ad_clk_count',
]

# Target column, kept as a one-element list so `df[LABEL]` yields a
# single-column DataFrame.
LABEL = ['clk']

# Destination CSV for the scored test rows.
PREDICTION_DATA_PATH = os.path.join(BASE_PATH, 'test.csv')

# load data

In [3]:
# Read the prepared dataset from parquet, then preview its first rows.
dataset_df = pd.read_parquet(path=DATASET_PATH)
dataset_df.head(n=5)

Unnamed: 0,adgroup_id,userid,time_stamp,pid,nonclk,clk,cms_segid,cms_group_id,pvalue_level,shopping_level,...,campaign_count,campaign_clk_count,cate_count,cate_clk_count,cat_gender_ad_count,cat_gender_ad_clk_count,cat_age_ad_count,cat_age_ad_clk_count,gender_age_ad_count,gender_age_ad_clk_count
0,43,824392,1494108093,430548_1007,1,0,0,6,,1,...,10,2.0,15644,908.0,7927,426.0,311,18.0,0,
1,69,729782,1494287274,430548_1007,1,0,0,8,2.0,3,...,30,3.0,1258,95.0,1057,72.0,62,3.0,0,
2,85,10772,1494393723,430548_1007,1,0,0,4,,3,...,49,1.0,529,28.0,334,15.0,138,6.0,0,
3,96,765463,1494006071,430539_1007,1,0,35,4,2.0,3,...,0,,18,1.0,13,0.0,5,0.0,0,
4,100,828934,1494321308,430548_1007,1,0,0,10,2.0,2,...,4,0.0,1184,57.0,1066,52.0,436,21.0,0,


# train/test split

In [4]:
def train_test_split(dataset_df, split_date, time_col='time'):
    """Split a frame chronologically into a train part and a test part.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Frame containing a ``time_col`` column whose values can be compared
        with ``split_date``.
    split_date : str or datetime-like
        Cut-off value. Rows strictly before it form the train set; rows at
        or after it form the test set.
    time_col : str, default 'time'
        Name of the column the split is based on.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` as independent copies, so callers can add
        columns (e.g. predictions) without writing into a filtered slice of
        ``dataset_df`` and triggering pandas' SettingWithCopy warning.

    Raises
    ------
    KeyError
        If ``time_col`` is not a column of ``dataset_df``.

    Notes
    -----
    Rows whose time value is missing compare False on both sides of the
    cut-off and therefore end up in neither part.
    """
    # Two explicit comparisons (rather than negating one mask) so rows with a
    # missing time value stay excluded from both parts, as before.
    is_train = dataset_df[time_col] < split_date
    is_test = dataset_df[time_col] >= split_date

    train_df = dataset_df[is_train].copy()
    test_df = dataset_df[is_test].copy()
    return train_df, test_df

# Chronological split at the configured cut-off; preview the training part.
train_df, test_df = train_test_split(
    dataset_df=dataset_df,
    split_date=TRAIN_TEST_SPLIT_DATE,
)
train_df.head(n=5)

Unnamed: 0,adgroup_id,userid,time_stamp,pid,nonclk,clk,cms_segid,cms_group_id,pvalue_level,shopping_level,...,campaign_count,campaign_clk_count,cate_count,cate_clk_count,cat_gender_ad_count,cat_gender_ad_clk_count,cat_age_ad_count,cat_age_ad_clk_count,gender_age_ad_count,gender_age_ad_clk_count
0,43,824392,1494108093,430548_1007,1,0,0,6,,1,...,10,2.0,15644,908.0,7927,426.0,311,18.0,0,
1,69,729782,1494287274,430548_1007,1,0,0,8,2.0,3,...,30,3.0,1258,95.0,1057,72.0,62,3.0,0,
2,85,10772,1494393723,430548_1007,1,0,0,4,,3,...,49,1.0,529,28.0,334,15.0,138,6.0,0,
3,96,765463,1494006071,430539_1007,1,0,35,4,2.0,3,...,0,,18,1.0,13,0.0,5,0.0,0,
4,100,828934,1494321308,430548_1007,1,0,0,10,2.0,2,...,4,0.0,1184,57.0,1066,52.0,436,21.0,0,


In [5]:
# Preview the held-out part (rows at or after the split date).
test_df.head(n=5)

Unnamed: 0,adgroup_id,userid,time_stamp,pid,nonclk,clk,cms_segid,cms_group_id,pvalue_level,shopping_level,...,campaign_count,campaign_clk_count,cate_count,cate_clk_count,cat_gender_ad_count,cat_gender_ad_clk_count,cat_age_ad_count,cat_age_ad_clk_count,gender_age_ad_count,gender_age_ad_clk_count
37,102,1122493,1494544665,430539_1007,1,0,0,10,,3,...,483,20.0,2652,120.0,437,17.0,972,49.0,31,1.0
38,102,1030522,1494561308,430539_1007,1,0,82,10,3.0,3,...,510,20.0,2822,128.0,452,18.0,1031,53.0,32,1.0
39,102,686384,1494575099,430539_1007,1,0,0,10,,3,...,535,20.0,2922,132.0,464,18.0,1067,53.0,33,1.0
40,102,769032,1494577595,430548_1007,1,0,80,10,2.0,3,...,540,20.0,2940,134.0,466,18.0,1077,54.0,34,1.0
41,102,83112,1494579892,430539_1007,1,0,80,10,2.0,3,...,545,20.0,2965,134.0,469,18.0,1086,54.0,35,1.0


# methods

## xgboost

### training model 1

In [6]:
# Model 1: trained on the ad-side feature subset (AD_FEATURES) only.
# scale_pos_weight = 95 / 5 = 19 up-weights the positive class; presumably
# chosen for a ~5% click rate — TODO confirm against the training labels.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled xgboost build and is
# deprecated in newer xgboost releases — confirm the installed version.
xg1 = XGBClassifier(
    n_estimators=100,
    tree_method='gpu_hist',
    scale_pos_weight=95 / 5,
)
xg1.fit(X=train_df[AD_FEATURES], y=train_df[LABEL])

### predict 1

In [7]:
# Score the test rows with model 1, keeping column 1 of predict_proba
# (the probability of the second class, i.e. clk == 1 for 0/1 labels).
# `assign` returns a new frame instead of writing a column in place into the
# filtered slice produced by the train/test split, which avoids pandas'
# SettingWithCopy pitfall and keeps this cell safe to re-run.
test_df = test_df.assign(
    Pred_xg1=xg1.predict_proba(test_df[AD_FEATURES])[:, 1]
)
test_df.head()

Unnamed: 0,adgroup_id,userid,time_stamp,pid,nonclk,clk,cms_segid,cms_group_id,pvalue_level,shopping_level,...,campaign_clk_count,cate_count,cate_clk_count,cat_gender_ad_count,cat_gender_ad_clk_count,cat_age_ad_count,cat_age_ad_clk_count,gender_age_ad_count,gender_age_ad_clk_count,Pred_xg1
37,102,1122493,1494544665,430539_1007,1,0,0,10,,3,...,20.0,2652,120.0,437,17.0,972,49.0,31,1.0,0.425842
38,102,1030522,1494561308,430539_1007,1,0,82,10,3.0,3,...,20.0,2822,128.0,452,18.0,1031,53.0,32,1.0,0.413324
39,102,686384,1494575099,430539_1007,1,0,0,10,,3,...,20.0,2922,132.0,464,18.0,1067,53.0,33,1.0,0.395283
40,102,769032,1494577595,430548_1007,1,0,80,10,2.0,3,...,20.0,2940,134.0,466,18.0,1077,54.0,34,1.0,0.395283
41,102,83112,1494579892,430539_1007,1,0,80,10,2.0,3,...,20.0,2965,134.0,469,18.0,1086,54.0,35,1.0,0.395283


### training model 2

In [8]:
# Model 2: trained on the user-side feature subset (USER_FEATURES) only.
# scale_pos_weight = 95 / 5 = 19 up-weights the positive class; presumably
# chosen for a ~5% click rate — TODO confirm against the training labels.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled xgboost build and is
# deprecated in newer xgboost releases — confirm the installed version.
xg2 = XGBClassifier(
    n_estimators=100,
    tree_method='gpu_hist',
    scale_pos_weight=95 / 5,
)
xg2.fit(X=train_df[USER_FEATURES], y=train_df[LABEL])

### predict 2

In [9]:
# Score the test rows with model 2, keeping column 1 of predict_proba
# (the probability of the second class, i.e. clk == 1 for 0/1 labels).
# `assign` returns a new frame instead of writing a column in place into the
# filtered slice produced by the train/test split, which avoids pandas'
# SettingWithCopy pitfall and keeps this cell safe to re-run.
test_df = test_df.assign(
    Pred_xg2=xg2.predict_proba(test_df[USER_FEATURES])[:, 1]
)
test_df.head()

Unnamed: 0,adgroup_id,userid,time_stamp,pid,nonclk,clk,cms_segid,cms_group_id,pvalue_level,shopping_level,...,cate_count,cate_clk_count,cat_gender_ad_count,cat_gender_ad_clk_count,cat_age_ad_count,cat_age_ad_clk_count,gender_age_ad_count,gender_age_ad_clk_count,Pred_xg1,Pred_xg2
37,102,1122493,1494544665,430539_1007,1,0,0,10,,3,...,2652,120.0,437,17.0,972,49.0,31,1.0,0.425842,0.425445
38,102,1030522,1494561308,430539_1007,1,0,82,10,3.0,3,...,2822,128.0,452,18.0,1031,53.0,32,1.0,0.413324,0.557408
39,102,686384,1494575099,430539_1007,1,0,0,10,,3,...,2922,132.0,464,18.0,1067,53.0,33,1.0,0.395283,0.429266
40,102,769032,1494577595,430548_1007,1,0,80,10,2.0,3,...,2940,134.0,466,18.0,1077,54.0,34,1.0,0.395283,0.31443
41,102,83112,1494579892,430539_1007,1,0,80,10,2.0,3,...,2965,134.0,469,18.0,1086,54.0,35,1.0,0.395283,0.471036


### training model 3

In [10]:
# Model 3: trained on the complete feature list (FEATURES).
# scale_pos_weight = 95 / 5 = 19 up-weights the positive class; presumably
# chosen for a ~5% click rate — TODO confirm against the training labels.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled xgboost build and is
# deprecated in newer xgboost releases — confirm the installed version.
xg3 = XGBClassifier(
    n_estimators=100,
    tree_method='gpu_hist',
    scale_pos_weight=95 / 5,
)
xg3.fit(X=train_df[FEATURES], y=train_df[LABEL])

### predict 3

In [11]:
# Score the test rows with model 3, keeping column 1 of predict_proba
# (the probability of the second class, i.e. clk == 1 for 0/1 labels).
# `assign` returns a new frame instead of writing a column in place into the
# filtered slice produced by the train/test split, which avoids pandas'
# SettingWithCopy pitfall and keeps this cell safe to re-run.
test_df = test_df.assign(
    Pred_xg3=xg3.predict_proba(test_df[FEATURES])[:, 1]
)
test_df.head()

Unnamed: 0,adgroup_id,userid,time_stamp,pid,nonclk,clk,cms_segid,cms_group_id,pvalue_level,shopping_level,...,cate_clk_count,cat_gender_ad_count,cat_gender_ad_clk_count,cat_age_ad_count,cat_age_ad_clk_count,gender_age_ad_count,gender_age_ad_clk_count,Pred_xg1,Pred_xg2,Pred_xg3
37,102,1122493,1494544665,430539_1007,1,0,0,10,,3,...,120.0,437,17.0,972,49.0,31,1.0,0.425842,0.425445,0.361358
38,102,1030522,1494561308,430539_1007,1,0,82,10,3.0,3,...,128.0,452,18.0,1031,53.0,32,1.0,0.413324,0.557408,0.347493
39,102,686384,1494575099,430539_1007,1,0,0,10,,3,...,132.0,464,18.0,1067,53.0,33,1.0,0.395283,0.429266,0.356536
40,102,769032,1494577595,430548_1007,1,0,80,10,2.0,3,...,134.0,466,18.0,1077,54.0,34,1.0,0.395283,0.31443,0.282013
41,102,83112,1494579892,430539_1007,1,0,80,10,2.0,3,...,134.0,469,18.0,1086,54.0,35,1.0,0.395283,0.471036,0.398434


# save prediction

In [12]:
# Write the test rows, including the Pred_xg1/Pred_xg2/Pred_xg3 score columns
# added above, to CSV; index=False leaves the DataFrame index out of the file.
test_df.to_csv(path_or_buf=PREDICTION_DATA_PATH, index=False)