In [1]:
import pandas as pd
import numpy as np

In [2]:
data_path = 'C:/Users/User/Documents/Python Scripts/kaggle/ml100marathon-02-01/'
df_train = pd.read_csv(data_path + 'train_offline.csv')
df_test = pd.read_csv(data_path + 'test_offline.csv')

In [3]:
df_train = df_train[~df_train.Date_received.isna()]
df_train.reset_index(drop=True, inplace=True)
df_train.shape

(746969, 7)

In [4]:
df_test = df_test[~df_test.Date_received.isna()]
df_test.reset_index(drop=True, inplace=True)
df_test.shape

(306313, 6)

In [5]:
def getLabel(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek+1 # add one to make it from 0~6 -> 1~7
    
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_throld'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_disct'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = 99
    return df

In [6]:
df_train["label"] = df_train.apply(getLabel, axis=1)

In [7]:
train_Y = df_train["label"]
train_num = train_Y.shape[0]
train_num

746969

In [8]:
df_train.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'label'],
      dtype='object')

In [49]:
df_test.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received
0,1439408,4663,11002.0,150:20,1.0,20160528.0
1,1439408,2632,8591.0,20:1,0.0,20160613.0
2,1439408,2632,8591.0,20:1,0.0,20160516.0
3,2029232,450,1532.0,30:5,0.0,20160530.0
4,2029232,6459,12737.0,20:1,0.0,20160519.0


In [8]:
df_train.loc[746968,:]

User_id               212662
Merchant_id             2934
Coupon_id               5686
Discount_rate           30:5
Distance                   2
Date_received    2.01603e+07
Date             2.01603e+07
label                      1
Name: 746968, dtype: object

In [9]:
df = pd.concat([df_train[['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance','Date_received']],df_test])
df.reset_index(drop=True, inplace=True)
df.loc[746969,:]

User_id              1439408
Merchant_id             4663
Coupon_id              11002
Discount_rate         150:20
Distance                   1
Date_received    2.01605e+07
Name: 746969, dtype: object

In [10]:
# add one to make it from 0~6 -> 1~7
df['weekday'] = df['Date_received'].apply(getWeekday)
# weekday_type (weekend = 1)
df['weekend'] = df['weekday'].apply(lambda x : 1 if x in [6,7] else 0 ) 

In [11]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [12]:
tmpdf = pd.get_dummies(df['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
df[weekdaycols] = tmpdf
df.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,weekday,weekend,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1439408,2632,8591.0,20:1,0.0,20160217.0,3,0,0,0,1,0,0,0,0
1,1439408,2632,1078.0,20:1,0.0,20160319.0,6,1,0,0,0,0,0,1,0
2,1832624,3381,7610.0,200:20,0.0,20160429.0,5,0,0,0,0,0,1,0,0
3,2029232,3381,11951.0,200:20,1.0,20160129.0,5,0,0,0,0,0,1,0,0
4,2223968,3381,9776.0,10:5,2.0,20160129.0,5,0,0,0,0,0,1,0,0


In [13]:
df = processData(df)
df.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,weekday,weekend,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_throld,discount_disct,discount_type
0,1439408,2632,8591.0,20:1,0.0,20160217.0,3,0,0,0,1,0,0,0,0,0.95,20,1,1
1,1439408,2632,1078.0,20:1,0.0,20160319.0,6,1,0,0,0,0,0,1,0,0.95,20,1,1
2,1832624,3381,7610.0,200:20,0.0,20160429.0,5,0,0,0,0,0,1,0,0,0.9,200,20,1
3,2029232,3381,11951.0,200:20,1.0,20160129.0,5,0,0,0,0,0,1,0,0,0.9,200,20,1
4,2223968,3381,9776.0,10:5,2.0,20160129.0,5,0,0,0,0,0,1,0,0,0.5,10,5,1


In [14]:
#df_train['Merchant_Counts'] = df_train.groupby(['Merchant_id'])['User_id'].transform('count')
#df_train['Merchant_Counts'] = df_train['Merchant_id'].map(df_train['Merchant_id'].value_counts())
#df_train['Merchant_UsedCounts'] = df_train['Merchant_id'].map(df_train.loc[df_train['Date'].notnull(),'Merchant_id'].value_counts())
#df_train['Merchant_UsedCounts'].fillna(0, inplace=True)
#df['Merchant_UsedRate'] = df_train.apply(lambda row: row['Merchant_UsedCounts'] / row['Merchant_Counts'], axis=1)
#df.head(20)

df['Merchant_Counts'] = df['Merchant_id'].map(df['Merchant_id'].value_counts())

In [29]:
df_train['Coupon_Counts'] = df_train['Coupon_id'].map(df_train['Coupon_id'].value_counts())
df_train['Coupon_UsedCounts'] = df_train['Coupon_id'].map(df_train.loc[df_train['Date'].notnull(),'Coupon_id'].value_counts())
df_train['Coupon_UsedCounts'].fillna(0, inplace=True)
df['Coupon_UsedRate'] = df_train.apply(lambda row: row['Coupon_UsedCounts'] / row['Coupon_Counts'], axis=1)
#df.head(20)

#df['Coupon_Counts'] = df['Coupon_id'].map(df['Coupon_id'].value_counts())

In [30]:
#df['Merchant_UsedRate'].fillna(0, inplace=True)
df['Coupon_UsedRate'].fillna(0, inplace=True)

In [31]:
# 檢查 DataFrame 空缺值的狀態
def na_check(df_data):
    data_na = (df_data.isnull().sum() / len(df_data)) * 100
    data_na = data_na.drop(data_na[data_na == 0].index).sort_values(ascending=False)
    missing_data = pd.DataFrame({'Missing Ratio' :data_na})
    display(missing_data.head(10))
na_check(df)

Unnamed: 0,Missing Ratio


In [32]:
df.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'weekday', 'weekend', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7',
       'discount_rate', 'discount_throld', 'discount_disct', 'discount_type',
       'Merchant_Counts', 'Coupon_Counts', 'Coupon_UsedRate'],
      dtype='object')

In [44]:
df_new = df[['Distance','weekday','weekend', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7','discount_rate'
            ,'discount_throld', 'discount_disct', 'discount_type',
       'Merchant_Counts', 'Coupon_Counts', 'Coupon_UsedRate']]
        #'Merchant_Counts', 'Coupon_Counts', 'Coupon_UsedRate'
        #,'discount_throld','discount_disct','discount_type','Merchant_UsedRate','Coupon_UsedRate'

In [19]:
from sklearn.model_selection import cross_val_score

#from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
#from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
#from sklearn.svm import SVC
#from sklearn.ensemble import GradientBoostingClassifier
#from xgboost import XGBClassifier

In [19]:
# 原始特徵 + 邏輯斯迴歸
train_X = df_new[:train_num].values
estimator = LogisticRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()



0.9513982508043771

In [22]:
# 原始特徵 + DecisionTreeClassifier
estimator = DecisionTreeClassifier()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.9513072161345191

In [23]:
# 原始特徵 + RandomForestClassifier
estimator = RandomForestClassifier(n_estimators = 100,random_state = 8)
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.951297844977411

In [24]:
# 原始特徵 + GradientBoostingClassifier
estimator = GradientBoostingClassifier()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.9513969120625265

In [None]:
#from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, Normalizer
#norm = Normalizer()
#norm_train_data = norm.fit_transform(df_train)
#MMEncoder = MinMaxScaler()
#SSEncoder = StandardScaler()

In [None]:
#df_MM = MinMaxScaler().fit_transform(df_new)

In [None]:
#df_new['Distance'] = SSEncoder.fit_transform(df_new['Distance'].values.reshape(-1, 1))
#df_new['discount_rate'] = SSEncoder.fit_transform(df_new['discount_rate'].values.reshape(-1, 1))
#df_new['discount_denominator'] = SSEncoder.fit_transform(df_new['discount_denominator'].values.reshape(-1, 1))
#df_new['discount_numerator'] = SSEncoder.fit_transform(df_new['discount_numerator'].values.reshape(-1, 1))

#df_new.head()

In [21]:
# 標準化 + 邏輯斯迴歸
train_X = df_new[:train_num].values
estimator = LogisticRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()



0.9513982508043771

In [45]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

In [46]:
df_new.head(10)

Unnamed: 0,Distance,weekday,weekend,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_throld,discount_disct,discount_type,Merchant_Counts,Coupon_Counts
0,0.0,3,0,0,0,1,0,0,0,0,0.95,20,1,1,43,31
1,0.0,6,1,0,0,0,0,0,1,0,0.95,20,1,1,43,12
2,0.0,5,0,0,0,0,0,1,0,0,0.9,200,20,1,122834,46729
3,1.0,5,0,0,0,0,0,1,0,0,0.9,200,20,1,122834,26035
4,2.0,5,0,0,0,0,0,1,0,0,0.5,10,5,1,122834,10345
5,99.0,7,1,0,0,0,0,0,0,1,0.9,100,10,1,16824,16824
6,10.0,4,0,0,0,0,1,0,0,0,0.85,200,30,1,33600,21402
7,10.0,6,1,0,0,0,0,0,1,0,0.9,200,20,1,8321,6495
8,2.0,2,0,0,1,0,0,0,0,0,0.9,200,20,1,122834,46729
9,0.0,7,1,0,0,0,0,0,0,1,0.75,20,5,1,1047,690


In [46]:
df_new['Distance'] = MinMaxScaler().fit_transform(df_new[['Distance']])
df_new['Merchant_Counts'] = MinMaxScaler().fit_transform(df_new[['Merchant_Counts']])
df_new['Coupon_Counts'] = MinMaxScaler().fit_transform(df_new[['Coupon_Counts']])
df_new.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  return self.partial_fit(X, y)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Distance,weekday,weekend,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_throld,discount_disct,discount_type,Merchant_Counts,Coupon_Counts,Coupon_UsedRate
0,0.0,3,0,0,0,1,0,0,0,0,0.95,20,1,1,0.000342,0.000642,0.125
1,0.0,6,1,0,0,0,0,0,1,0,0.95,20,1,1,0.000342,0.000235,0.0
2,0.0,5,0,0,0,0,0,1,0,0,0.9,200,20,1,1.0,1.0,0.008805
3,0.010101,5,0,0,0,0,0,1,0,0,0.9,200,20,1,1.0,0.557139,0.016785
4,0.020202,5,0,0,0,0,0,1,0,0,0.5,10,5,1,1.0,0.221366,0.093668
5,1.0,7,1,0,0,0,0,0,0,1,0.9,100,10,1,0.136958,0.36002,0.101343
6,0.10101,4,0,0,0,0,1,0,0,0,0.85,200,30,1,0.273534,0.457991,0.000982
7,0.10101,6,1,0,0,0,0,0,1,0,0.9,200,20,1,0.067734,0.138974,0.016782
8,0.020202,2,0,0,1,0,0,0,0,0,0.9,200,20,1,1.0,1.0,0.008805
9,0.0,7,1,0,0,0,0,0,0,1,0.75,20,5,1,0.008516,0.014745,0.192754


In [21]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

In [48]:
x_train, x_valid, y_train, y_valid = train_test_split(df_new[:train_num], train_Y, test_size=0.2, random_state=8)

In [52]:
rclf = RandomForestClassifier()

n_estimators = [50, 100, 150, 200]
max_depth = [3, 5, 10, 15]
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

grid_search = GridSearchCV(rclf, param_grid, scoring="roc_auc", n_jobs=-1, verbose=1, cv=5)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed: 15.2min finished


In [53]:
print("Best AUC: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best AUC: 0.901050 using {'max_depth': 10, 'n_estimators': 100}


In [54]:
y_pred = grid_result.predict_proba(x_valid)

In [55]:
auc_score = roc_auc_score(y_true=y_valid, y_score=y_pred[:,1])
acc = accuracy_score(y_true=y_valid, y_pred=y_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.900, Accuracy: 0.952


In [56]:
y_pred = grid_result.predict_proba(df_new[train_num:])

In [None]:
#gdbt = GradientBoostingClassifier(subsample=0.75, max_features=16,random_state=6)

#n_estimators = [50, 100, 150]
#max_depth = [5, 10, 20]
#param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)
#param_grid = {'max_depth':[3, 5, 10, 15], 'min_samples_leaf':[3, 5, 7, 10]}
#grid_search = GridSearchCV(gdbt, param_grid, scoring='roc_auc', n_jobs=-1, verbose=1, cv=5)

#grid_result = grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


In [None]:
#print("Best AUC: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#grid_result.grid_scores_, grid_result.best_score_, grid_result.best_params_

In [27]:
from sklearn.metrics import roc_auc_score, accuracy_score

In [23]:
def check_model(train_x, train_y):
    
    classifier = lambda: SGDClassifier(
        loss='log', 
        penalty='elasticnet', 
        fit_intercept=True, 
        max_iter=100, 
        shuffle=True, 
        n_jobs=1,
        class_weight=None)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier())
    ])

    parameters = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True)
    
    grid_search = GridSearchCV(
        model, 
        parameters, 
        cv=folder, 
        n_jobs=-1, 
        verbose=1)
    grid_search = grid_search.fit(train_x, train_y)
    
    return grid_search

In [57]:
model = check_model(x_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  27 | elapsed:   40.1s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   40.5s finished
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


In [58]:
model_y_pred = model.predict_proba(x_valid)

  Xt = transform.transform(Xt)


In [60]:
auc_score = roc_auc_score(y_true=y_valid, y_score=model_y_pred[:,1])
acc = accuracy_score(y_true=y_valid, y_pred=model_y_pred.argmax(axis=1))
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.874, Accuracy: 0.952


In [61]:
model_y_pred = model.predict_proba(df_new[train_num:])

  Xt = transform.transform(Xt)


Validation AUC: 0.780, Accuracy: 0.951


In [67]:
df_test_process = df.loc[train_num:,['Distance','weekday','weekend', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7','discount_rate'
            ,'discount_throld', 'discount_disct', 'discount_type',
       'Merchant_Counts', 'Coupon_Counts', 'Coupon_UsedRate']].copy()
#'Merchant_Counts', 'Coupon_Counts'
#,'discount_throld','discount_disct','discount_type','Merchant_UsedRate','Coupon_UsedRate'
df_test_process['pred_prob'] = (y_pred[:, 1] * 0.3) + (model_y_pred[:, 1] * 0.7)

In [68]:
output = pd.concat((df.loc[train_num:,["User_id", "Coupon_id", "Date_received"]], df_test_process['pred_prob']), axis=1)
print(output.shape)

(306313, 4)


In [69]:
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

In [70]:
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]

out.head()

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.045034
1,1000020_8192_20160513,0.042895
2,1000065_1455_20160527,0.03707
3,1000085_8067_20160513,0.042981
4,1000086_2418_20160613,0.032406


In [71]:
out.to_csv("ml100marathon-02-01_10th.csv", header=["uid", "label"], index=False) # submission format

Decision Tree 0.715
