In [106]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
import scipy

In [2]:
# ====================================
# Reading the dataset.
# ====================================

aisles = pd.read_csv('data/aisles.csv')

departments = pd.read_csv('data/departments.csv')

# We are defining the type of each column to optimize the storage as far as we can.
priors = pd.read_csv('data/order_products__prior.csv',
                     dtype={
                         'order_id': np.int32,
                         'product_id': np.uint16,
                         'add_to_cart_order': np.int16,  # The order of an added item to the cart.
                         'reordered': np.int8}  # Whether the item has been reordered in the past.
                     )

train = pd.read_csv('data/order_products__train.csv',
                    dtype={
                        'order_id': np.int32,
                        'product_id': np.uint16,
                        'add_to_cart_order': np.int16,
                        'reordered': np.int8}
                    )

orders = pd.read_csv('data/orders.csv',
                     dtype={
                         'order_id': np.int32,
                         'user_id': np.int32,
                         'eval_set': 'category',  # Categorical column.
                         'order_number': np.int16,
                         'order_dow': np.int8,
                         'order_hour_of_day': np.int8,
                         'days_since_prior_order': np.float32},
                     )

products = pd.read_csv('data/products.csv',
                       dtype={
                           'product_id': np.uint16,
                           'order_id': np.int32,
                           'aisle_id': np.uint8,
                           'department_id': np.uint},
                       usecols=['product_id', 'aisle_id', 'department_id']  # Ignore the product Name., It's not 
                       # required in our case 
                       )


In [3]:
# Print some information about the dataset.
print(priors.shape)
print(train.shape)
print(products.shape)
print(priors.shape)
print(orders.shape)

print("\n=======Priors table head=======\n")
print(priors.head(n=7))

print("\n=======Train table head=======\n")
print(train.head(n=7))

print("\n=======Orders table head=======\n")
print(orders.head(n=7))

print("\n=======Products table head=======\n")
print(products.head(n=7))

(32434489, 4)
(1384617, 4)
(49688, 3)
(32434489, 4)
(3421083, 7)


   order_id  product_id  add_to_cart_order  reordered
0         2       33120                  1          1
1         2       28985                  2          1
2         2        9327                  3          0
3         2       45918                  4          1
4         2       30035                  5          0
5         2       17794                  6          1
6         2       40141                  7          1


   order_id  product_id  add_to_cart_order  reordered
0         1       49302                  1          1
1         1       11109                  2          1
2         1       10246                  3          0
3         1       49683                  4          0
4         1       43633                  5          1
5         1       13176                  6          0
6         1       47209                  7          0


   order_id  user_id eval_set  order_number  order_dow  order_hou

In [5]:
# ====================================
# Data Cleaning.
# ====================================

orders.set_index('order_id', drop=False, inplace=True)  # inplace means don't create a new object
products.set_index('product_id', drop=False, inplace=True)  # inplace means don't create a new object

# Checking and removing Nans if found.
print("\n=======Products table isNan checks =======\n")
print(np.sum(pd.isna(products)))

print("\n=======Train table isNan checks =======\n")
print(np.sum(pd.isna(train)))

print("\n=======Priors table isNan checks =======\n")
print(np.sum(pd.isna(priors)))

print("\n=======Orders table isNan checks =======\n")
print(np.sum(pd.isna(orders)))

# Found Nans only in the days_since_prior_order column in the orders table and replacing.
print("Nans percentage: %.2f%%" % (206209 / 3421083 * 100))
orders['days_since_prior_order'] = orders['days_since_prior_order'].fillna(np.mean(orders['days_since_prior_order']))






product_id       0
aisle_id         0
department_id    0
dtype: int64




order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64




order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64




order_id                  0
user_id                   0
eval_set                  0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64
Nans percentage: 6.03%


In [132]:
# Approach два
# ====================================

# ====================================
# Adding Features to Products.
# ====================================

# Add New Features to the Products table.
# The reorder rate.
prod_features = pd.DataFrame()
prod_features['freq'] = priors.groupby(priors.product_id).size().astype(np.int32)
prod_features['reorder_freq'] = priors.reordered.groupby(priors.product_id).sum().astype(np.int32)
prod_features['reorder_rate'] = (prod_features.reorder_freq / prod_features.freq).astype(np.float)

products = products.join(prod_features, on='product_id')

del prod_features


Defaulting to column, but this will raise an ambiguity error in a future version
  rsuffix=rsuffix, sort=sort)


In [133]:
# Join prior with orders
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', axis=1, inplace=True)  # Remove the order_id_ redundant column.


In [134]:
# ====================================
# Adding Features to Users.
# ====================================

usrs = pd.DataFrame()
usrs['avg_between_interval'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float)
usrs['orders_count'] = orders.groupby('user_id').size().astype(np.int16)
usrs['avg_hour'] = orders.groupby('user_id')['order_hour_of_day'].mean().astype(np.int16)

users = pd.DataFrame()
users['total_items'] = priors.groupby('user_id').size().astype(np.int16)
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['all_distinct_products'] = users.all_products.map(len).astype(np.int16)

users = users.join(usrs)
del usrs

users['average_basket'] = (users.total_items / users.orders_count).astype(np.float)


In [135]:
# Add product id related to each user
priors['user_product'] = priors.product_id + priors.user_id * 1000000

# Creating a list of unique products bought by each user and the last time the user has bought it.
dic = dict()
for record in priors.itertuples():
    user_product_id = record.user_product
    if user_product_id not in dic:
        dic[user_product_id] = (1,  # means bought once
                                (record.order_number, record.order_id),
                                record.add_to_cart_order
                                )
    else:
        dic[user_product_id] = (dic[user_product_id][0] + 1,
                                max(dic[user_product_id][1], (record.order_number, record.order_id)),
                                dic[user_product_id][2] + record.add_to_cart_order
                                )

user_product = pd.DataFrame.from_dict(dic,
                                      orient='index')  # Index means that the keys of the dictionary should be the 
# rows not columns

del dic


In [None]:
user_product.columns = ['orders_count', 'last_order_id', 'sum_pos_in_cart']
user_product.orders_count = user_product.orders_count.astype(np.int16)
user_product.last_order_id = user_product.last_order_id.map(lambda x: x[1]).astype(np.int32)
user_product.sum_pos_in_cart = user_product.sum_pos_in_cart.astype(np.int16)

# Printing some stats
print("UserProduct table shape and first 7 rows")
print(user_product.shape)
print(user_product.head(n=7))

# We are not now in the need of the priors table.
del priors


In [None]:
# ====================================
# Creating Training and Testing splits.
# ====================================

train = orders[orders.eval_set == 'train']
test = orders[orders.eval_set == 'test']

train.set_index(['order_id', 'prod'], inplace=True, drop=False)


In [None]:
def features(selected_orders, train=False):
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    i = 0

    for row in selected_orders.itertuples():
        i += 1
        if i % 10000 == 0: print('order row', i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]

    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list

    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] = df.user_id.map(users.average_basket)

    print('order related features')
    # df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders

    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print('user_X_product related features')
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(
        lambda x: min(x, 24 - x)).astype(np.int8)
    # df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
    #                                              df.order_id.map(orders.order_dow)

    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
    print(df.dtypes)
    print(df.memory_usage())
    return df, labels


In [None]:
# ====================================
# Training the lightGBM Model.
# ====================================

df_train, labels = features(train, labels_given=True)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 100

# Features to be trained on.
features_to_train_on = ['user_total_orders', 'user_total_items', 'total_distinct_items',
                        'user_average_days_between_orders', 'user_average_basket',
                        'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
                        'aisle_id', 'department_id', 'product_orders', 'product_reorders',
                        'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
                        'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
                        'UP_delta_hour_vs_last']

# Preparing the input for the LightGBM model.
d_train = lgb.Dataset(df_train[features_to_train_on],
                      label=labels,
                      categorical_feature=['aisle_id', 'department_id'])
del df_train

bst = lgb.train(params, d_train, ROUNDS)


In [None]:
# ====================================
# Testing the lightGBM Model.
# ====================================

df_test, _ = features(test)

print('light GBM predict')
preds = bst.predict(df_test[features_to_train_on])

df_test['pred'] = preds

# Applying thresholds on the predictions.
TRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data

d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in test.order_id:
    if order not in d:
        d[order] = 'None'


In [None]:
# Saving Submission.

sub = pd.DataFrame.from_dict(d, orient='index')
sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('subs_samir.csv', index=False)


In [None]:
# ====================================
# Approach три (Inspired by Second Place Approach Explanation in his blog post on Kaggle)
# Link: http://blog.kaggle.com/2017/09/21/instacart-market-basket-analysis-winners-interview-2nd-place-kazuki-onodera/
# ====================================
