In [2]:
# This code is a rewritten, modified fork of the baseline proposed by Paul-Antoine Nguyen, which is
# released under the Apache 2.0 License.
# Link: https://www.kaggle.com/paulantoine/light-gbm-benchmark-0-3692

# We improved the performance of some parts of the original baseline code (it was too slow) by vectorizing them
# to speed up the running time.

# We added/removed features for the lightGBM classifier in order to improve the accuracy and test our method.
# We added some features from the second place winner described in his blog post on Kaggle.

# The approach is to predict, for each product a user has previously purchased, how likely the user is to purchase
# it again (binary classification). The train data frame provides the labels and the priors provide the features.

from tqdm import tqdm
import lightgbm as lgb
import numpy as np
import pandas as pd

In [3]:
# ====================================
# Reading the dataset.
# ====================================

aisles = pd.read_csv('data/aisles.csv')

departments = pd.read_csv('data/departments.csv')

# Column types are declared up front so each table is stored as compactly as possible.
# The prior and train order-product files have the same four columns, so they share one dtype map.
order_products_dtypes = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,  # Position at which the item was added to the cart.
    'reordered': np.int8,  # Whether the item has been reordered in the past.
}

priors = pd.read_csv('data/order_products__prior.csv', dtype=order_products_dtypes)

train = pd.read_csv('data/order_products__train.csv', dtype=order_products_dtypes)

orders = pd.read_csv('data/orders.csv',
                     dtype={
                         'order_id': np.int32,
                         'user_id': np.int32,
                         'eval_set': 'category',  # Categorical column.
                         'order_number': np.int16,
                         'order_dow': np.int8,
                         'order_hour_of_day': np.int8,
                         'days_since_prior_order': np.float32},
                     )

# Only the three id columns are loaded; the product name is not needed by the model.
# No dtype is declared for 'order_id' here because it is not among the loaded columns.
products = pd.read_csv('data/products.csv',
                       dtype={
                           'product_id': np.uint16,
                           'aisle_id': np.uint8,
                           # np.uint is a platform-sized (usually 64-bit) integer, which defeats the
                           # purpose of declaring compact dtypes.
                           # NOTE(review): assumes department ids fit in 8 bits like the aisle ids
                           # (the Instacart data has about two dozen departments) - confirm.
                           'department_id': np.uint8},
                       usecols=['product_id', 'aisle_id', 'department_id']
                       )


In [4]:
# Print some information about the dataset.
# NOTE(review): priors is listed twice below; one entry was probably meant to be another table.
for table in (priors, train, products, priors, orders):
    print(table.shape)

# First seven rows of each table, each under its own banner.
previews = (
    ("\n=======Priors table head=======\n", priors),
    ("\n=======Train table head=======\n", train),
    ("\n=======Orders table head=======\n", orders),
    ("\n=======Products table head=======\n", products),
)
for banner, table in previews:
    print(banner)
    print(table.head(n=7))


(32434489, 4)
(1384617, 4)
(49688, 3)
(32434489, 4)
(3421083, 7)


   order_id  product_id  add_to_cart_order  reordered
0         2       33120                  1          1
1         2       28985                  2          1
2         2        9327                  3          0
3         2       45918                  4          1
4         2       30035                  5          0
5         2       17794                  6          1
6         2       40141                  7          1


   order_id  product_id  add_to_cart_order  reordered
0         1       49302                  1          1
1         1       11109                  2          1
2         1       10246                  3          0
3         1       49683                  4          0
4         1       43633                  5          1
5         1       13176                  6          0
6         1       47209                  7          0


   order_id  user_id eval_set  order_number  order_dow  order_hou

In [5]:
# ====================================
# Data Cleaning.
# ====================================

# Index orders by order_id while keeping order_id as a regular column as well, so that later
# cells can both look orders up by id and still read the id as a column.
orders = orders.set_index('order_id', drop=False)

# Checking and removing Nans if found: print the number of missing values per column of each table.
nan_checks = (
    ("\n=======Products table isNan checks =======\n", products),
    ("\n=======Train table isNan checks =======\n", train),
    ("\n=======Priors table isNan checks =======\n", priors),
    ("\n=======Orders table isNan checks =======\n", orders),
)
for banner, table in nan_checks:
    print(banner)
    # Explicit per-column count; np.sum(DataFrame) depends on an axis=None reduction that
    # newer pandas versions deprecate.
    print(table.isna().sum())

# Found Nans only in the days_since_prior_order column in the orders table and replacing.
# The percentage is measured on the data (before filling) instead of being typed in by hand.
days_since_prior = orders['days_since_prior_order']
print("Nans percentage: %.2f%%" % (days_since_prior.isna().mean() * 100))

# Missing values are replaced by the mean of the observed values (Series.mean skips NaNs).
orders['days_since_prior_order'] = days_since_prior.fillna(days_since_prior.mean())




product_id       0
aisle_id         0
department_id    0
dtype: int64




order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64




order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64




order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64
Nans percentage: 6.03%


In [6]:
# Approach Two
# ====================================

# ====================================
# Adding Features to Products.
# ====================================

# Add New Features to the Products table, all computed from the prior orders:
#   freq         - how many times the product was bought,
#   reorder_freq - how many of those purchases were re-orders,
#   reorder_rate - reorder_freq / freq.
reordered_by_product = priors.groupby('product_id')['reordered']
prod_features = pd.DataFrame({
    'freq': reordered_by_product.size().astype(np.int32),
    'reorder_freq': reordered_by_product.sum().astype(np.int32),
})
# np.float64 replaces the np.float alias, which was removed from NumPy (1.24+).
prod_features['reorder_rate'] = (prod_features.reorder_freq / prod_features.freq).astype(np.float64)

# Attach the new columns to products, then index products by product_id (keeping the column)
# so that later cells can look product attributes up by id.
products = products.join(prod_features, on='product_id')
products = products.set_index('product_id', drop=False)
del prod_features

# Join prior with orders: every prior row gets the attributes of its order (user_id, order_number, ...).
# orders is indexed by order_id; its order_id column is left out of the join so that no redundant,
# suffixed copy of it has to be created and dropped afterwards.
priors = priors.join(orders.drop(columns='order_id'), on='order_id')

Defaulting to column, but this will raise an ambiguity error in a future version
  rsuffix=rsuffix, sort=sort)


In [8]:
# ====================================
# Adding Features to Users.
# ====================================

# Per-user statistics taken from the orders table.
# Note: orders holds every eval_set, so orders_count also counts the user's train/test order.
orders_by_user = orders.groupby('user_id')
usrs = pd.DataFrame()
# np.float64 replaces the np.float alias, which was removed from NumPy (1.24+).
usrs['avg_between_interval'] = orders_by_user['days_since_prior_order'].mean().astype(np.float64)
usrs['orders_count'] = orders_by_user.size().astype(np.int16)
# The mean hour is truncated to a whole hour by the integer cast.
usrs['avg_hour'] = orders_by_user['order_hour_of_day'].mean().astype(np.int16)

# Per-user statistics taken from the prior purchases.
priors_by_user = priors.groupby('user_id')
users = pd.DataFrame()
users['total_items'] = priors_by_user.size().astype(np.int16)  # The count of all purchased items.
users['all_products'] = priors_by_user['product_id'].apply(set)  # A set of distinct products.
users['all_distinct_products'] = users.all_products.map(len).astype(np.int16)  # The count of distinct products.

users = users.join(usrs)
del usrs

# Average number of items per order.
users['average_basket'] = (users.total_items / users.orders_count).astype(np.float64)

In [9]:
# Add product id related to each user

# (Slow Method)
# priors['user_product'] = priors.product_id + priors.user_id * 1000000
# # Creating a list of unique products bought by each user and the last time the user has bought it.
# dic = dict()
# for record in priors.itertuples():
#     user_product_id = record.user_product
#     if user_product_id not in dic:
#         dic[user_product_id] = (1,  # means bought once
#                                 (record.order_number, record.order_id),
#                                 record.add_to_cart_order
#                                 )
#     else:
#         dic[user_product_id] = (dic[user_product_id][0] + 1,
#                                 max(dic[user_product_id][1], (record.order_number, record.order_id)),
#                                 dic[user_product_id][2] + record.add_to_cart_order
#                                 )
#
# user_product = pd.DataFrame.from_dict(dic,
#                                       orient='index')  # Index means that the keys of the dictionary should be the
# # rows not columns
# del dic
# user_product.columns = ['orders_count', 'last_order_id', 'sum_pos_in_cart']
# user_product.orders_count = user_product.orders_count.astype(np.int16)
# user_product.last_order_id = user_product.last_order_id.map(lambda x: x[1]).astype(np.int32)
# user_product.sum_pos_in_cart = user_product.sum_pos_in_cart.astype(np.int16)
# # We are not now in the need of the priors table.
# del priors

# Vectorized method.
user_product = priors.copy()
user_product['user_product'] = (user_product.product_id + user_product.user_id * 1000000).astype(np.int64)
user_product = user_product.sort_values('order_number')
user_product = user_product \
    .groupby('user_product', sort=False) \
    .agg({'order_id': ['size', 'last'], 'add_to_cart_order': 'sum'})
user_product.columns = ['orders_count', 'last_order_id', 'sum_pos_in_cart']
user_product.astype(
    {'orders_count': np.int16, 'last_order_id': np.int32, 'sum_pos_in_cart': np.int16},
    inplace=True)

del priors

# Printing some stats
print("UserProduct table shape and first 7 rows")
print(user_product.shape)
print(user_product.head(n=7))

In [None]:
# ====================================
# Creating Training and Testing splits.
# ====================================

train_orders = orders[orders.eval_set == 'train']
test_orders = orders[orders.eval_set == 'test']

train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

In [None]:
def features(selected_orders, labels_given=False):
    orders_l = []
    products_l = []
    labels = []

    for i, _record in enumerate(tqdm(selected_orders.itertuples())):
        # Get the order id
        order_id = _record.order_id
        user_id = _record.user_id
        user_prods = users.all_products[user_id]

        # Generate pairs of the previously purchased products of user and the current order id in the training table.
        products_l.extend(user_prods)
        orders_l.extend([order_id] * len(user_prods))

        if labels_given:
            # Generate a labels, put 1 in front of each previously purchased product of each user if it's found in the
            # training table product list.
            labels.extend([((order_id, prod_id) in train.index) for prod_id in user_prods])

    # Create the Input for our classifier. Where we should create
    x = pd.DataFrame({'order_id': orders_l, 'product_id': products_l})

    # Create the labels for each tuple for x data frame.
    labels = np.asarray(labels)

    del products_l
    del orders_l

    # Feature Engineering.

    # The user features TODO @Samir55 Add more
    x['user_id'] = x.order_id.map(orders.user_id)  # Add the user id column.
    x['user_total_items'] = x.user_id.map(users.total_items)
    x['user_total_orders'] = x.user_id.map(users.orders_count)
    x['user_avg_days_between_orders'] = x.user_id.map(users.avg_between_interval)
    x['user_avg_basket'] = x.user_id.map(users.average_basket)
    x['user_avg_hour_of_day'] = x.user_id.map(users.avg_hour)

    # The products features. TODO @Samir55 Add more
    x['aisle_id'] = x.product_id.map(products.aisle_id)
    x['department_id'] = x.product_id.map(products.department_id)
    x['product_freq'] = x.product_id.map(products.freq)
    x['product_reorder_freq'] = x.product_id.map(products.reorder_freq)
    x['product_reorder_rate'] = x.product_id.map(products.reorder_rate)

    # The order features. TODO @Samir55 Add more
    x['order_hour_of_day'] = x.order_id.map(orders.order_hour_of_day)
    x['order_day_of_week'] = x.order_id.map(orders.order_dow)
    x['days_since_prior_order'] = x.order_id.map(orders.days_since_prior_order)

    # The user_product features. TODO @Samir55 Add more
    x['user_product_id'] = x.user_id * 1000000 + x.product_id
    x.drop(['user_id'], inplace=True, axis=1)  # Remove user_id column and we will be using the user_product_id instead.
    x['user_product_orders_count'] = x.user_product_id.map(user_product.orders_count)
    # x['user_product_last_order_id'] = x.user_product_id.map(user_product.last_order_id)
    x['user_product_avg_pos_in_cart'] = (
            x.user_product_id.map(user_product.sum_pos_in_cart) / x.user_product_orders_count).astype(
        np.float)
    x['user_product_reorder_rate'] = (x.user_product_orders_count / x.user_total_orders).astype(np.float)

    print(x.memory_usage())
    return x, labels



In [None]:
# ====================================
# Training the lightGBM Model.
# ====================================

# One row per (train order, previously purchased product) with a boolean label per row.
df_train, labels = features(train_orders, labels_given=True)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 100  # Number of boosting rounds.

# Features to be trained on.
features_to_train_on = [
    # User Features
    'user_total_items',
    'user_total_orders',  # 'total_distinct_items',
    'user_avg_days_between_orders',
    'user_avg_basket',
    'user_avg_hour_of_day',

    # Product Features
    'aisle_id',
    'department_id',
    'product_freq',
    'product_reorder_freq',
    'product_reorder_rate',

    # Order Features
    'order_hour_of_day',
    'order_day_of_week',
    'days_since_prior_order',

    # User_product Features.
    'user_product_orders_count',
    'user_product_avg_pos_in_cart',
    # This entry used to repeat 'user_product_avg_pos_in_cart', which left the reorder rate
    # computed by features() unused and produced two columns with the same name.
    'user_product_reorder_rate'
]

# Preparing the input for the LightGBM model.
d_train = lgb.Dataset(df_train[features_to_train_on],
                      label=labels,
                      categorical_feature=['aisle_id', 'department_id'])
del df_train  # The raw frame is no longer needed once the Dataset has been built from it.

bst = lgb.train(params, d_train, ROUNDS)

In [None]:
# ====================================
# Testing the lightGBM Model.
# ====================================

df_test, _ = features(test_orders)

print('light GBM predict')
preds = bst.predict(df_test[features_to_train_on])

df_test['pred'] = preds

# Applying thresholds on the predictions.
TRESHOLD = 0.18  # by trials

# Collect, per order, the products whose predicted probability exceeds the threshold.
# Explicit grouping replaces the former bare `except:` used as control flow, which would also
# have hidden any unrelated error raised inside the loop.
selected = df_test[df_test.pred > TRESHOLD]
products_by_order = dict()
for order_id, product_id in zip(selected.order_id, selected.product_id):
    products_by_order.setdefault(order_id, []).append(str(product_id))

# d maps order_id -> space-separated product ids.
d = dict()
for order_id, product_ids in products_by_order.items():
    d[order_id] = ' '.join(product_ids)

# Test orders without any product above the threshold get the literal 'None'.
for order in test_orders.order_id:
    d.setdefault(order, 'None')


In [None]:
# Saving Submission.

# The keys of d become the index of the frame; resetting the index turns them back into a
# regular column, giving the two-column (order_id, products) layout that is written out.
sub = pd.DataFrame.from_dict(d, orient='index').reset_index()
sub.columns = ['order_id', 'products']
sub.to_csv('subs_samir.csv', index=False)