<a href="https://colab.research.google.com/github/Squrro/DSAI-HW4/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Author : Paul-Antoine Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold


import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings("ignore")
# input資料夾位置
dir = '/content/drive/MyDrive/Colab Notebooks/資料科學/'


##讀取資料

In [2]:
priors = pd.read_csv(dir + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

In [3]:
train = pd.read_csv(dir + 'order_products__train.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

In [4]:
orders = pd.read_csv(dir + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

In [5]:
products = pd.read_csv(dir + 'products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

##產品feature

In [6]:
###

prods = pd.DataFrame()
prods['orders'] = priors.groupby(priors.product_id).size().astype(np.int32)
prods['reorders'] = priors['reordered'].groupby(priors.product_id).sum().astype(np.float32)
prods['reorder_rate'] = (prods.reorders / prods.orders).astype(np.float32)
prods['Order2Reorder_rate'] = (prods.orders * prods.orders * prods.reorder_rate).astype(np.float32)
products = products.join(prods, on='product_id')
products.set_index('product_id', drop=False, inplace=True)

In [7]:

orders.set_index('order_id', inplace=True, drop=False)
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)


In [None]:
cnt_srs = priors['product_id'].value_counts().reset_index()
cnt_srs.columns = ['product_id', 'frequency_count']
# TOP50最熱賣
most_sell = cnt_srs.product_id[0:49]

##使用者feature

In [8]:
### user features


usr = pd.DataFrame()
usr['average_days_between_orders'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr['nb_orders'] = orders.groupby('user_id').size().astype(np.int16)

users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['total_distinct_items'] = (users.all_products.map(len)).astype(np.int16)

users = users.join(usr)

users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)


##額外增加feature

In [10]:
# fast
userXproduct = priors.copy()
userXproduct['user_product'] = userXproduct.product_id + userXproduct.user_id * 100000
userXproduct = userXproduct.sort_values('order_number')
userXproduct = userXproduct \
    .groupby('user_product', sort=False) \
    .agg({'order_id': ['size', 'last'], 'add_to_cart_order': 'sum'})
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']
userXproduct.astype(
    {'nb_orders': np.int16, 'last_order_id': np.int32, 'sum_pos_in_cart': np.int16}, 
    # inplace=True
    )

Unnamed: 0_level_0,nb_orders,last_order_id,sum_pos_in_cart
user_product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8623906075,1,1520399,14
15429341329,1,2049062,3
15429323081,3,1489630,8
15429321527,3,2251505,14
15429335050,4,2251505,18
...,...,...,...
16999119311,1,51086,6
8998913176,1,2768756,3
8162545368,1,310464,13
8162535690,1,310464,14


In [11]:
### train / test orders ###
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

##feature process function
---
使train,test的data有相同欄位方便training&fitting

In [12]:
### build list of candidate products to reorder, with features ###

def features(selected_orders, labels_given=False):
    order_list = []
    product_list = []
    labels = []

    train_index = set(train.index)     

    for row in selected_orders.itertuples():
        user_id = row.user_id
        order_id = row.order_id
        user_prods = users['all_products'][user_id]
        product_list += user_prods
        order_list += [order_id] * len(user_prods)

        if labels_given:
            labels += [(order_id, prod) in train_index for prod in user_prods]
    
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)
    df['Order2Reorder_rate'] = df.product_id.map(products.Order2Reorder_rate)

    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)

    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
    return (df, labels)

##train data建立


In [13]:
df_train, labels = features(train_orders, labels_given=True)

f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate','Order2Reorder_rate',
       'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last'
       ] # 'dow', 'UP_same_dow_as_last_order'


print('formating for lgb')
d_train = lgb.Dataset(df_train[f_to_use],
                      label=labels,
                      categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'

formating for lgb


##Lightgbm模型訓練

In [14]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 100

bst = lgb.train(params, d_train, ROUNDS)
# del d_train


##模型Predict
---
預測該商品會不會再次購買  
將機率大於閾值(0.22)的加入  
反之不加入

In [15]:
### build candidates list for test ###

df_test, _ = features(test_orders)

preds = bst.predict(df_test[f_to_use])

df_test['pred'] = preds


In [16]:
TRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data

d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)


**將尚未填入的欄位用random填滿**  
隨機1~7項產品(平均每次order約4項)  
填入的產品為前50熱賣的商品  


In [17]:
import time 
import random
i = 0
thing = ''
for order in test_orders.order_id:
    thing = ''
    if order not in d:
      num = int(random.random()*397%8+1)
      for i in range(num):
        thing += ' ' + str(most_sell[int(random.expovariate(1/10)%49)])
      d[order] = thing


In [18]:
sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('submission.csv', index=False)

In [19]:
sub

Unnamed: 0,order_id,products
0,2774568,17668 21903 39190 47766 18599 43961 23650 24810
1,1528013,21903 38293
2,1376945,33572 17706 28465 27959 44632 24799 34658 1494...
3,1356845,11520 14992 7076 28134 10863 13176
4,2161313,11266 196 10441 12427 37710 48142 14715 27839
...,...,...
74995,474081,13176 28204
74996,2363962,27966
74997,2498703,24852 16797 47766
74998,783264,26209 49683 49235 49683 21137 16797
