In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter

import os
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')



In [2]:
#reading files
train = pd.read_csv('~/kaggle/input/train.csv')
coupon_data = pd.read_csv('~/kaggle/input/coupon_item_mapping.csv')
campaign_data = pd.read_csv('~/kaggle/input/campaign_data.csv')
cust_tran_data = pd.read_csv('~/kaggle/input/customer_transaction_data.csv')
cust_demo_data = pd.read_csv('~/kaggle/input/customer_demographics.csv')
item_data = pd.read_csv('~/kaggle/input/item_data.csv')

test = pd.read_csv('~/kaggle/input/test.csv')
sample = pd.read_csv('~/kaggle/input/sample_submission_Byiv0dS.csv')

In [3]:
train.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status
0,1,13,27,1053,0
1,2,13,116,48,0
2,6,9,635,205,0
3,7,13,644,1050,0
4,9,8,1017,1489,0


<h1 align="center">Campaign Pre-processing</h1>

In [4]:
campaign_data.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,Y,21/10/13,20/12/13
1,25,Y,21/10/13,22/11/13
2,20,Y,07/09/13,16/11/13
3,23,Y,08/10/13,15/11/13
4,21,Y,16/09/13,18/10/13


In [5]:
#Converting date and time data to appropriate format
campaign_data['start_date'] = pd.to_datetime(campaign_data['start_date'], format = '%d/%m/%y')
campaign_data['end_date'] = pd.to_datetime(campaign_data['end_date'], format = '%d/%m/%y')

In [6]:
#adding campaign type to train and test
train['campaign_type'] = train.campaign_id.map(campaign_data.groupby('campaign_id').campaign_type.apply(lambda x: x.unique()[0]))
test['campaign_type'] = test.campaign_id.map(campaign_data.groupby('campaign_id').campaign_type.apply(lambda x: x.unique()[0]))

<h1 align="center">Customer Demographic Pre-processing</h1>

In [7]:
cust_demo_data.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2,,4
1,6,46-55,Married,0,2,,5
2,7,26-35,,0,3,1.0,3
3,8,26-35,,0,4,2.0,6
4,10,46-55,Single,0,1,,5


In [8]:
cust_demo_data.isna().any()

customer_id       False
age_range         False
marital_status     True
rented            False
family_size       False
no_of_children     True
income_bracket    False
dtype: bool

In [9]:
#type of family size, no of children = int64
cust_demo_data['family_size'] = cust_demo_data['family_size'].apply(lambda x: float(re.sub('\+','',str(x))))
cust_demo_data['no_of_children'] = cust_demo_data['no_of_children'].apply(lambda x: float(re.sub('\+','',str(x))) if (pd.notna(x)) else x)

In [10]:
cust_demo_data.head()

Unnamed: 0,customer_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,70+,Married,0,2.0,,4
1,6,46-55,Married,0,2.0,,5
2,7,26-35,,0,3.0,1.0,3
3,8,26-35,,0,4.0,2.0,6
4,10,46-55,Single,0,1.0,,5


In [11]:

#family size of 1 will be single
cust_demo_data.loc[pd.isnull(cust_demo_data.marital_status) & (cust_demo_data.family_size == 1),'marital_status'] = 'Single'

#family size - no of childrens = 1, will be single
cust_demo_data.loc[(cust_demo_data.family_size - cust_demo_data.no_of_children == 1) & pd.isnull(cust_demo_data.marital_status),'marital_status'] = 'Single'

#we have 142 of 152 customers with diff of 2 in their fam size and #childrens are Married
cust_demo_data.loc[(pd.isnull(cust_demo_data.marital_status)) & ((cust_demo_data.family_size - cust_demo_data.no_of_children) == 2)  & (pd.notnull(cust_demo_data.no_of_children)),'marital_status'] = 'Married'

#original data shows customers with fam size == 2, and nans in no of childrens are majorly Married
cust_demo_data.loc[pd.isnull(cust_demo_data.marital_status) & (pd.isnull(cust_demo_data.no_of_children)) & (cust_demo_data.family_size ==2),'marital_status'] = 'Married'

In [12]:

#Married people with family_size ==2 will have 0 childrens
cust_demo_data.loc[pd.isnull(cust_demo_data.no_of_children) & (cust_demo_data.marital_status == 'Married') & (cust_demo_data.family_size == 2),'no_of_children'] = 0.0

#customers with family size 1 will have zero childrens
cust_demo_data.loc[pd.isnull(cust_demo_data.no_of_children) & (cust_demo_data.family_size == 1), 'no_of_children'] = 0.0

#singles with family size == 2, will probably have 1 child
cust_demo_data.loc[pd.isnull(cust_demo_data.no_of_children) & (cust_demo_data.family_size == 2),'no_of_children'] = 1.0

<h1 align="center">Customer Transaction Pre-processing</h1>

In [13]:
cust_tran_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1.0,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1.0,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1.0,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1.0,67.32,0.0,0.0
4,2012-01-02,1501,48199,1.0,71.24,-28.14,0.0


In [14]:
cust_tran_data.isnull().any()

date               False
customer_id        False
item_id            False
quantity            True
selling_price       True
other_discount      True
coupon_discount     True
dtype: bool

In [15]:
#convert date and time to app ropritate format
cust_tran_data['date'] = pd.to_datetime(cust_tran_data['date'])

<h1 align="center">EDA and Processing of Data </h1>

In [16]:
#merging train and test with cust_demo on campaign_id
train = pd.merge(train,cust_demo_data, on='customer_id', how='left')
test = pd.merge(test,cust_demo_data, on='customer_id', how='left')

In [17]:
train.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,campaign_type,age_range,marital_status,rented,family_size,no_of_children,income_bracket
0,1,13,27,1053,0,X,46-55,Single,0.0,1.0,0.0,5.0
1,2,13,116,48,0,X,36-45,Married,0.0,2.0,0.0,3.0
2,6,9,635,205,0,Y,46-55,Married,0.0,2.0,0.0,7.0
3,7,13,644,1050,0,X,,,,,,
4,9,8,1017,1489,0,X,46-55,Married,0.0,2.0,0.0,3.0


In [18]:
# "bought_X" =  Intersection between Items bought by customer previously(from cust_tran) and all items available in coupon provided(from coupons)

In [19]:
#cust2items - dictionary mapping customer_ids to all items bought by them
cust_tran_data['str_item'] = cust_tran_data.item_id.apply(lambda x: str(x)) #did this to calculate dict_cust2items, no need further
dict_cust2items = cust_tran_data.groupby('customer_id').str_item.apply(lambda x: ' '.join(x)).to_dict()
cust_tran_data.drop('str_item',axis=1,inplace=True)
dict_cust2items.update({1358:'0.0'})
dict_cust2items.update({405:'0.0'})
dict_cust2items.update({218:'0.0'})
dict_cust2items.update({980:'0.0'})
dict_cust2items.update({1569:'0.0'})
dict_cust2items.update({991:'0.0'})

# These data values have no coupon redeemed on them so they are kind of a nuisance hence i associated a 0.0 to them to vacuosly proceed futher

1358 has no element?

In [20]:
print(dict_cust2items)

{1: '4953 5626 7808 8107 8307 14054 17047 17559 18807 20811 22147 23744 25583 27557 28801 28816 28939 28969 30775 36521 45074 45986 51608 51762 52016 52180 53418 7875 8307 26238 28276 28876 50127 51608 53524 56179 57921 6647 22147 22637 30818 32201 36924 57837 57921 5626 7130 7875 8307 8372 10327 10534 13295 16234 16298 17559 22208 23971 25583 30775 31196 45986 51608 58644 7875 8307 11411 15088 16009 20822 34324 45063 45986 51608 52350 54384 57921 61345 8307 8693 15481 16340 21553 28939 51608 56179 7475 9281 19083 19600 30633 49915 52016 58644 5552 7177 7875 8307 8548 11559 13931 15322 16009 17018 17047 19600 20232 27106 28276 28876 29420 30775 30833 31074 31191 32129 33446 35785 45986 46149 47842 48638 51514 52671 57921 58644 59913 4953 7875 8307 8355 11487 14537 15302 15322 16731 17559 23588 25381 25446 27104 28590 28801 30492 30818 32093 32142 32260 33079 36908 52082 52767 53524 56179 57921 60220 61345', 2: '31095 45231 45266 45393 45502 7229 12720 12894 30851 7309 10222 17085 28656

In [21]:
#cou2items - dictionary mapping coupon_ids to all items under them
dict_cou2items = coupon_data.groupby('coupon_id').item_id.apply(lambda x: ' '.join(list(x.apply(lambda x: str(x))))).to_dict()

In [22]:
#intersect of cust2items and cou2item
train['bought_X'] = train[['coupon_id','customer_id']].apply(lambda x : len(np.intersect1d(dict_cust2items[x[1]].split() , dict_cou2items[x[0]].split())) , axis=1)
test['bought_X'] = test[['coupon_id','customer_id']].apply(lambda x : len(np.intersect1d(dict_cust2items[x[1]].split() , dict_cou2items[x[0]].split())) , axis=1)

In [23]:
#ite2cou - dictionary mapping item_ids to all coupons applicable to them
d_ite2cou = coupon_data.groupby('item_id').coupon_id.apply(lambda x: ' '.join(list(x.apply(lambda x: str(x))))).to_dict()

In [24]:
#adding col for whether coupon was applied on that item (i.e redeemed or not)
cust_tran_data['redeem'] = cust_tran_data.coupon_discount.apply(lambda x: 1 if x<0 else 0)

In [25]:
##############  1.) Calculating redeemed % per item from cust_tran
#               2.) Summing all those %'s for items in a coupon, take mean finally
#               3.) map it to coupons

In [26]:
#per_item_redeemed_history = dict mapping item_ids to redeemed %
d_per_item_redeemed_history = ((cust_tran_data.groupby('item_id').redeem.sum() / cust_tran_data.groupby('item_id').redeem.count()) *100).to_dict()

In [27]:
#some items corresponding to test coupons are not in d_per_item_redeemed_hist hence need for this func
def item_redeem_func(x):
    for item in dict_cou2items[x].split():
        per = []
        try:
            per.append(d_per_item_redeemed_history[int(item)])

        except:
            pass
    k = [np.mean(per) if pd.isna(np.mean(per)) == False else 0]
    return k[0]

In [28]:
#applying the above func to coupon_id
train['item_redeem'] = train.coupon_id.apply(item_redeem_func)
test['item_redeem'] = test.coupon_id.apply(item_redeem_func)

In [29]:
##### 1.) Calculating redeemed % per customer from cust_tran
#     2.) map it to customer_ids in train and tests

In [30]:
#per_cust_redeem_history - dict mapping customer_id to redemmed %
d_per_cust_redeem_history = ((cust_tran_data.groupby('customer_id').redeem.sum() / cust_tran_data.groupby('customer_id').redeem.count())*100).to_dict()

In [31]:
#adding a col for cust redeem #increased score by 0.03
train['cust_redeem'] = train.customer_id.map(d_per_cust_redeem_history)
test['cust_redeem'] = test.customer_id.map(d_per_cust_redeem_history)

In [32]:
###############

In [33]:


#merging cust_trans with items on item_id
cust_tran_data = pd.merge(cust_tran_data, item_data, how='left', on='item_id')

In [34]:
cust_tran_data.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,redeem,brand,brand_type,category
0,2012-01-02,1501,26830,1.0,35.26,-10.69,0.0,0,56,Local,Natural Products
1,2012-01-02,1501,54253,1.0,53.43,-13.89,0.0,0,56,Local,Natural Products
2,2012-01-02,1501,31962,1.0,106.5,-14.25,0.0,0,524,Established,Grocery
3,2012-01-02,1501,33647,1.0,67.32,0.0,0.0,0,1134,Established,Grocery
4,2012-01-02,1501,48199,1.0,71.24,-28.14,0.0,0,524,Established,Grocery


In [35]:
#################### 1.) Calculating redeemed % per category ---> per_cat_redeem_history
#                    2.) Calculating redeemed % per customer based on cat using (1) ---> per_cust_redeem_history_catwali
#                    3.) map (2) to customer_ids in train and test

In [36]:
#redeem history based on category
d_per_cat_redeem_history = (cust_tran_data.groupby('category').redeem.sum() / cust_tran_data.groupby('category').redeem.count()*1000).to_dict()

In [37]:
#(increased score by 0.0001)
d_per_cust_redeem_history_catwali = cust_tran_data.groupby('customer_id').category.apply(lambda x: np.mean([d_per_cat_redeem_history[k] for k in x.values]))

train['cat_cust_redeem'] = train.customer_id.map(d_per_cust_redeem_history_catwali)
test['cat_cust_redeem'] = test.customer_id.map(d_per_cust_redeem_history_catwali)

In [38]:
############

In [39]:
############ if for a customer, brands bought by him previously are available in the coupon given, high chance of redeem

In [40]:
#cust2brands - dict mapping customer_ids to all brands bought by them
d_cust2brands = cust_tran_data.groupby('customer_id').brand.apply(lambda x: ' '.join([str(k) for k in x.unique()])).to_dict()
d_cust2brands.update({1358:'0.0'})
d_cust2brands.update({405:'0.0'})
d_cust2brands.update({218:'0.0'})
d_cust2brands.update({980:'0.0'})
d_cust2brands.update({1569:'0.0'})
d_cust2brands.update({991:'0.0'})

In [41]:
#item2brand - dict mapping items to their respective brands
d_item2brand = cust_tran_data.groupby('item_id').brand.apply(lambda x: x.unique()[0]).to_dict()

In [42]:
#filling nans in brand of which we have no prior info
coupon_data['brand'] = coupon_data.item_id.map(d_item2brand).fillna('99999999999')

In [43]:
#coupon2brands - dict mapping coupons to all brands available in them to purchase
d_coupon2brands = coupon_data.groupby('coupon_id').brand.apply(lambda x: ' '.join([str(int(k)) for k in x.unique()])).to_dict()

In [44]:
#getting no of common brands in cust2brands and coupon2brands
train['brand_bot'] = train[['customer_id','coupon_id']].apply(lambda x: len(np.intersect1d(d_cust2brands[x[0]].split(), d_coupon2brands[x[1]].split())), axis=1)
test['brand_bot'] = test[['customer_id','coupon_id']].apply(lambda x: len(np.intersect1d(d_cust2brands[x[0]].split(), d_coupon2brands[x[1]].split())), axis=1)

In [45]:
#########

In [46]:
######### Filling some nans in rented, age_range

In [47]:
#filling nans in train.rented with 2
train.rented.fillna(2,inplace=True)
test.rented.fillna(2,inplace=True)

In [48]:
#imputing age_range based on campaign_id

def d_age(df):
    k = df.groupby('campaign_id').age_range.value_counts()
    k = k.reset_index(name='value').sort_values(['campaign_id','value'], ascending=[True,False])
    d_age = {}
    for i in list(df.campaign_id.unique()):
        df = k.loc[k.campaign_id == i,['age_range','value']]
        df = df.set_index('age_range')
        max_val_per_campaign = df.idxmax().value
        d_age[i] = max_val_per_campaign
        
    return d_age

    
#filling nans with d_age
train.loc[(pd.isnull(train.age_range)),'age_range'] = train.loc[(pd.isnull(train.age_range)),'campaign_id'].map(d_age(train))
test.loc[(pd.isnull(test.age_range)),'age_range'] = test.loc[(pd.isnull(test.age_range)),'campaign_id'].map(d_age(test))


In [49]:
###############

In [50]:
#adding brand (most frequent) per coupon_id
train['brand'] = train.coupon_id.map(coupon_data.groupby('coupon_id').brand.apply(lambda x: x.values[0]).to_dict())
test['brand'] = test.coupon_id.map(coupon_data.groupby('coupon_id').brand.apply(lambda x: x.values[0]).to_dict())

In [51]:
############### val set

In [52]:
#array's containing common customer_ids and coupon_ids in train,test ---> (in order to make val set)
commom_cust = np.intersect1d(train.customer_id.unique(),test.customer_id.unique())
commom_coup = np.intersect1d(train.coupon_id.unique(),test.coupon_id.unique())

In [53]:
#adding col to see whether cust, coup is in test or not 
train['test_cust'] = train.customer_id.apply(lambda x: 1 if x in commom_cust else 0)
train['test_coup'] = train.coupon_id.apply(lambda x: 1 if x in commom_coup else 0)

In [54]:
####Validation set

#(len(train[pd.isnull(train.family_size) & (train.redemption_status == 1)]) / len(train)) * 7837 #16
index1 = train[pd.isnull(train.family_size) & (train.redemption_status == 1) & (train.test_cust == 1) & (train.test_coup == 1)].sample(16, random_state=1996).index

#(len(train[pd.notnull(train.family_size) & (train.redemption_status == 1)]) / len(train) ) * 7837 #57
index2 = train[pd.notnull(train.family_size) & (train.redemption_status == 1) & (train.test_cust == 1) & (train.test_coup == 1)].sample(57, random_state=1996).index

#(len(train[pd.isnull(train.family_size) & (train.redemption_status == 0)]) / len(train)) * 7837 #3455
index3 = train[pd.isnull(train.family_size) & (train.redemption_status == 0) & (train.test_cust == 1) & (train.test_coup == 1)].sample(3366, random_state=1996).index

#(len(train[pd.notnull(train.family_size) & (train.redemption_status == 0)]) / len(train)) * 7837 #4309
index4 = train[pd.notnull(train.family_size) & (train.redemption_status == 0) & (train.test_cust == 1) & (train.test_coup == 1)].sample(4309, random_state=1996).index



#new train and val set
val_index = []
for i in [index1,index2, index3, index4]:
    val_index.extend(i)#main val_index
    
train_index = set(train.index)
train_index = train_index.symmetric_difference(val_index)#main train index

new_train = train.loc[train_index]
val = train.loc[val_index].sample(frac=1, random_state = 1996)
new_test = test

In [94]:
import sklearn
#final_train = new_train.dropna(axis=1).drop(['test_cust','test_coup'], axis=1)
#final_test = new_test.dropna(axis=1)#.drop(['coup_redeem'], axis=1)
#val = val.dropna(axis=1).drop(['test_cust','test_coup'], axis=1)


final_train = final_train.astype(int)
final_train = train.dropna(axis=1).drop(['test_cust','test_coup'], axis=1)


################# Label Encoding

#label encoding features
final_train['campaign_type'] = final_train.campaign_type.map({'X':0,'Y':1})
#val['campaign_type'] = val.campaign_type.map({'X':0,'Y':1})

final_train['age_range'] = final_train.age_range.map({'46-55':0,'36-45':1,'18-25':2,'26-35':3,'56-70':4,'70+':5})
#val['age_range'] = val.age_range.map({'46-55':0,'36-45':1,'18-25':2,'26-35':3,'56-70':4,'70+':5})
final_test['age_range'] = final_test.age_range.map({'46-55':0,'36-45':1,'18-25':2,'26-35':3,'56-70':4,'70+':5})

###############

############## train_test

#preparing data
final_train['brand']=final_train['brand'].astype(int)

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(final_train.drop(['redemption_status'],axis=1), final_train['redemption_status'], test_size=0.2, random_state=1)

X_train_final, X_val, y_train_final, y_val = sklearn.model_selection.train_test_split(X_train, y_train, test_size=0.25, random_state=1)
#val_x = val.drop(['redemption_status'],axis=1)
#val_y = val.redemption_status


In [82]:

X_train.dtypes

id                 int64
campaign_id        int64
coupon_id          int64
customer_id        int64
campaign_type      int64
age_range          int64
rented           float64
bought_X           int64
item_redeem      float64
brand_bot          int64
brand              int64
dtype: object

In [83]:
y_train

37736    0
2962     0
29869    0
29805    0
13421    0
        ..
28018    0
30700    0
27794    0
12573    0
11992    0
Name: redemption_status, Length: 47021, dtype: int64

In [96]:
import xgboost
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from datetime import datetime

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))


In [91]:

xl = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=np.nan, monotone_constraints='()',
              n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

xl.fit(X_train, y_train)

# Predict
preds = xl.predict(X_test)

roc_auc_score(y_test, preds)

0.5127292810282501

In [99]:
# GridSearchCV for xgboost

params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    silent=True, nthread=1)

In [107]:
folds = 10
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X_train,y_train), verbose=3, random_state=1001 )

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
xgb.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable


Parameters: { "silent" } are not used.


 Time taken: 0 hours 0 minutes and 34.23 seconds.


In [108]:
random_search.fit(X_train, y_train)

Fitting 7 folds for each of 5 candidates, totalling 35 fits
Parameters: { "silent" } are not used.



RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f0008174f90>,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           callbacks=None, colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, feature_types=None,
                                           gamma=0, gpu_id=-1,
                                           grow_policy='depthwise',
                                           importance_type=Non...
                                           max_leaves=0, min_child_weight=1,
                                           missing=nan,
                                           monotone_constraints='()',
                   

In [111]:
pred = random_search.predict(X_test)
roc_auc_score(y_test, pred)

81.944456789
