In [None]:
import os
from datetime import date

import numpy as np
import pandas as pd

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Directory the input CSVs (train_offline.csv, test_offline.csv) are read from,
# relative to the notebook's working directory.
DATA_ROOT = "../input/"

In [None]:
# Load the offline training log.
dfoff = pd.read_csv(os.path.join(DATA_ROOT, 'train_offline.csv'))

# Load the test set and keep only rows that have a Coupon_id (the uid built
# further down casts Coupon_id to int, which cannot represent NaN).
# dropna(...).reset_index(...) returns a fresh frame, so nothing is mutated
# in place on a filtered slice.
dftest = (
    pd.read_csv(os.path.join(DATA_ROOT, 'test_offline.csv'))
    .dropna(subset=['Coupon_id'])
    .reset_index(drop=True)
)

print(dfoff.shape)
print(dftest.shape)
dfoff.head(20)

(1160742, 7)
(306313, 6)


Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,
5,2223968,3381,9776.0,10:5,2.0,20160129.0,
6,73611,2099,12034.0,100:10,,20160207.0,
7,163606,1569,5054.0,200:30,10.0,20160421.0,
8,3273056,4833,7802.0,200:20,10.0,20160130.0,
9,94107,3381,7610.0,200:20,2.0,20160412.0,


In [None]:
## Create target label
def _parse_yyyymmdd(value):
    """Convert a YYYYMMDD number or string (e.g. 20160217.0) to a Timestamp."""
    # Going through an integer string drops the trailing ".0" of float-typed
    # columns, so parsing does not rely on how pandas formats floats.
    return pd.to_datetime(str(int(value)), format='%Y%m%d')


def label(row, window_days=15):
    """Return the training target for one offline record.

    Rules, in the order they are checked:
      * ``Date_received`` is missing (no coupon on this record)      -> -1
        (these rows are of no interest and are dropped later)
      * coupon received but ``Date`` is missing (never used)         ->  0
      * coupon used within ``window_days`` days, boundary included   ->  1
      * coupon used later than ``window_days`` days after receipt    ->  0

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Date_received'`` and ``'Date'``, each either a
        YYYYMMDD value or a missing value (NaN / None).
    window_days : int, optional
        Length of the redemption window in days. Defaults to 15.

    Returns
    -------
    int
        -1, 0 or 1 as described above.
    """
    received = row['Date_received']
    purchased = row['Date']

    # pd.isna also accepts None / pd.NA, where np.isnan would raise TypeError.
    if pd.isna(received):
        return -1
    if pd.isna(purchased):
        return 0

    elapsed = _parse_yyyymmdd(purchased) - _parse_yyyymmdd(received)
    return 1 if elapsed <= pd.Timedelta(days=window_days) else 0

# Compute the target for every offline record (one call of label() per row),
# store it as a new column, then display the class distribution.
row_labels = dfoff.apply(label, axis="columns")
dfoff["label"] = row_labels
dfoff["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [None]:
# Fill missing values with sentinels in both frames: 'None' for Discount_rate
# and -1 for Distance. These two columns are the grouping / merge keys used
# further down.
MISSING_SENTINELS = {'Discount_rate': 'None', 'Distance': -1}
for frame in (dfoff, dftest):
    for column, sentinel in MISSING_SENTINELS.items():
        frame[column] = frame[column].fillna(sentinel)

In [None]:
# Discard the records labelled -1 and keep only labels 0 and 1.
is_labelled = dfoff["label"].gt(-1)
dfoff = dfoff.loc[is_labelled].reset_index(drop=True)
dfoff["label"].value_counts()

0    710665
1     36304
Name: label, dtype: int64

In [None]:
# Balance the two classes: draw label-1 rows with replacement (bootstrap) until
# there are as many of them as label-0 rows, then stack both parts.
# `resample` comes from sklearn.utils and is imported in the import cell at the
# top of the notebook together with the other sklearn names.
df = dfoff[['Discount_rate', 'Distance', 'label']]
df_0 = df.loc[df['label'] == 0]
df_1 = df.loc[df['label'] == 1]
df_1_upsample = resample(
    df_1,
    replace=True,            # explicit: rows are drawn with replacement
    n_samples=len(df_0),
    random_state=123,        # fixed seed so the draw is reproducible
)
# ignore_index=True gives a clean RangeIndex; the bootstrap sample would
# otherwise carry duplicated index labels into the combined frame.
df = pd.concat([df_0, df_1_upsample], ignore_index=True)

In [None]:
# Mean of `label` for each (Discount_rate, Distance) combination in the
# balanced frame, returned as a flat table with one row per combination.
group_keys = ['Discount_rate', 'Distance']
label_by_group = df.groupby(group_keys)['label']
mean_df = label_by_group.mean().reset_index()
mean_df.head()

Unnamed: 0,Discount_rate,Distance,label
0,0.2,-1.0,0.55794
1,0.2,6.0,0.0
2,0.5,-1.0,0.72
3,0.5,0.0,0.95977
4,0.5,1.0,0.904762


In [None]:
# Attach the per-(Discount_rate, Distance) mean label to every test row.
# A 'label' column left over from an earlier run of this cell is dropped first;
# without that, re-running the cell would produce label_x / label_y columns and
# the later cells that read 'label' would fail.
dftest = (
    dftest
    .drop(columns=['label'], errors='ignore')
    .merge(mean_df, on=['Discount_rate', 'Distance'], how='left')
)
dftest.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,label
0,1439408,4663,11002.0,150:20,1.0,20160528.0,0.40783
1,1439408,2632,8591.0,20:1,0.0,20160613.0,0.731937
2,1439408,2632,8591.0,20:1,0.0,20160516.0,0.731937
3,2029232,450,1532.0,30:5,0.0,20160530.0,0.749271
4,2029232,6459,12737.0,20:1,0.0,20160519.0,0.731937


In [None]:
# Build the submission key: uid = "<User_id>_<Coupon_id>_<Date_received>".
# Each id column is cast to an integer and then to text (so 11002.0 becomes
# "11002") with vectorised casts rather than one Python call per element.
uid_parts = ["User_id", "Coupon_id", "Date_received"]
for column in uid_parts:
    dftest[column] = dftest[column].astype("int64").astype(str)

# Vectorised string concatenation replaces the row-wise apply + '_'.join.
dftest["uid"] = dftest["User_id"].str.cat(
    [dftest["Coupon_id"], dftest["Date_received"]], sep="_"
)
dftest = dftest.reset_index(drop=True)
dftest.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,label,uid
0,1439408,4663,11002,150:20,1.0,20160528,0.40783,1439408_11002_20160528
1,1439408,2632,8591,20:1,0.0,20160613,0.731937,1439408_8591_20160613
2,1439408,2632,8591,20:1,0.0,20160516,0.731937,1439408_8591_20160516
3,2029232,450,1532,30:5,0.0,20160530,0.749271,2029232_1532_20160530
4,2029232,6459,12737,20:1,0.0,20160519,0.731937,2029232_12737_20160519


In [None]:
### NOTE: YOUR SUBMISSION FILE SHOULD HAVE COLUMN NAMES: uid, label
# Average the score of rows sharing the same uid. Only 'label' is aggregated:
# several other columns hold strings at this point, and GroupBy.mean() over
# the whole frame raises TypeError on pandas >= 2.0, where non-numeric columns
# are no longer dropped silently.
sub = dftest.groupby("uid", as_index=False)["label"].mean()
# Rows whose (Discount_rate, Distance) pair had no match in mean_df carry NaN
# after the left merge; they are scored 0.
sub = sub.fillna(0)
sub = sub[["uid", "label"]]
sub.to_csv("mio_balance_mean.csv", header=["uid", "label"], index=False)  # submission format

In [None]:
# Second output file: binarise the averaged score, 1 when strictly above 0.6
# and 0 otherwise, and write it in the same uid/label format.
above_cutoff = sub["label"].gt(0.6)
sub["label"] = above_cutoff.astype(int)
sub.to_csv("mio_balance_mean_06.csv", header=["uid", "label"], index=False)