In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
import scipy

In [None]:
# ====================================
# Reading the dataset.
# ====================================
# Every table is read with explicit, narrow dtypes so the frames take as little
# memory as possible.

aisles = pd.read_csv('data/aisles.csv')

departments = pd.read_csv('data/departments.csv')

# Products of all prior orders.
priors = pd.read_csv('data/order_products__prior.csv',
                     dtype={
                         'order_id': np.int32,
                         'product_id': np.uint16,
                         'add_to_cart_order': np.int16,  # Position at which the item was added to the cart.
                         'reordered': np.int8}  # Whether the item has been reordered in the past.
                     )

# Products of the training orders (same schema as the prior table).
train = pd.read_csv('data/order_products__train.csv',
                    dtype={
                        'order_id': np.int32,
                        'product_id': np.uint16,
                        'add_to_cart_order': np.int16,
                        'reordered': np.int8}
                    )

orders = pd.read_csv('data/orders.csv',
                     dtype={
                         'order_id': np.int32,
                         'user_id': np.int32,
                         'eval_set': 'category',  # Categorical column.
                         'order_number': np.int16,
                         'order_dow': np.int8,
                         'order_hour_of_day': np.int8,
                         # float32 (not an integer type) because this column contains NaNs.
                         'days_since_prior_order': np.float32},
                     )

# The product name is not needed here, so only the id columns are loaded.
# `department_id` uses uint8 like `aisle_id` (it was `np.uint`, a wide platform
# integer, which defeated the memory optimisation).
# NOTE(review): assumes department ids fit in 8 bits — confirm against departments.csv.
products = pd.read_csv('data/products.csv',
                       dtype={
                           'product_id': np.uint16,
                           'aisle_id': np.uint8,
                           'department_id': np.uint8},
                       usecols=['product_id', 'aisle_id', 'department_id']
                       )


In [None]:
# Print some information about the dataset.
# Each shape is labelled with its table name; the original printed `priors.shape`
# twice, unlabelled, and never showed the aisles / departments tables.
for table_name, table in (('aisles', aisles),
                          ('departments', departments),
                          ('priors', priors),
                          ('train', train),
                          ('orders', orders),
                          ('products', products)):
    print("%s shape: %s" % (table_name, table.shape))

print("\n=======Priors table head=======\n")
print(priors.head(n=7))

print("\n=======Train table head=======\n")
print(train.head(n=7))

print("\n=======Orders table head=======\n")
print(orders.head(n=7))

print("\n=======Products table head=======\n")
print(products.head(n=7))

In [None]:
# ====================================
# Data Cleaning.
# ====================================

# Index both tables by their key while keeping the key as a regular column
# (drop=False). Plain reassignment is used instead of inplace=True so that
# re-running the cell is harmless and the data flow stays explicit.
orders = orders.set_index('order_id', drop=False)
products = products.set_index('product_id', drop=False)

# Count the NaNs per column of every table.
# `DataFrame.isna().sum()` is used instead of `np.sum(DataFrame)`, which relies on
# pandas' deprecated axis=None reduction.
print("\n=======Products table isNan checks =======\n")
print(products.isna().sum())

print("\n=======Train table isNan checks =======\n")
print(train.isna().sum())

print("\n=======Priors table isNan checks =======\n")
print(priors.isna().sum())

print("\n=======Orders table isNan checks =======\n")
print(orders.isna().sum())

# The original analysis found NaNs only in the days_since_prior_order column of the
# orders table. The share of missing values is now computed from the data instead of
# from hard-coded row counts, and the gaps are filled with the column mean
# (Series.mean() skips NaNs).
days_since_prior = orders['days_since_prior_order']
print("Nans percentage: %.2f%%" % (days_since_prior.isna().mean() * 100))
orders['days_since_prior_order'] = days_since_prior.fillna(days_since_prior.mean())


In [None]:
# ====================================
# Approach one (Simple but not that efficient,  Score: 0.21337)
# ====================================
# Baseline: for every test order, predict a basket of n products (n drawn at random
# from 1..19, capped at the number of distinct products the user bought before),
# made of the user's most frequently bought products plus up to three other
# previously bought products chosen at random.
# NOTE(review): the score in the header was recorded before the fixes in this cell
# (seeding, off-by-one, duplicate picks) — re-score before quoting it.

# Attach the order metadata (user_id, eval_set, ...) to every prior order line.
# `orders` is expected to be indexed by order_id (done in the cleaning cell).
up = priors.join(orders, on='order_id', rsuffix='_')
up = up.drop('order_id_', axis=1)  # Remove the redundant order_id_ column.

# One row per user; the `product_id` cell holds the tuple
# (unique product ids, purchase count of each id) returned by np.unique.
upu = up.groupby('user_id')['product_id'].apply(np.unique, return_counts=True).reset_index()

tests = orders[orders.eval_set == 'test']
tests_order_ids = tests['order_id']
tests_user_ids = tests['user_id'].reset_index()

# user_id -> (product ids, counts). A dict lookup replaces the repeated
# boolean-mask scans of `upu` that made the original loop quadratic.
user_products = dict(zip(upu['user_id'], upu['product_id']))

# Seed BOTH generators: `random` drives the sampling below and `np.random` drives
# the basket size. Seeding only `random` left the run non-reproducible.
random.seed(33)
np.random.seed(33)

res = []
for order_id, user_id in zip(tests['order_id'], tests['user_id']):
    history = user_products.get(user_id)
    if history is None:
        # No prior purchases recorded for this user: predict an empty basket
        # instead of failing.
        res.append((order_id, []))
        continue

    product_ids, counts = history
    n = min(np.random.randint(1, 20), len(product_ids))
    n_top = max(n - 3, 1)  # How many of the most frequent products to keep.

    # Products ordered from most to least purchased. The stable sort followed by a
    # reversal keeps the original tie-breaking (higher product id first).
    # .tolist() yields plain Python ints so the CSV shows plain integers regardless
    # of how the installed NumPy prints its scalar types.
    ranked = product_ids[np.argsort(counts, kind='stable')][::-1].tolist()

    # Take exactly n_top top products (the original range(1, n_top) skipped one,
    # giving n - 1 items and an empty basket for n == 1) ...
    arr = ranked[:n_top]
    # ... and fill up to n from the remaining products only, so a random pick can
    # never duplicate one of the top products.
    arr.extend(random.sample(ranked[n_top:], n - n_top))
    res.append((order_id, arr))

# NOTE(review): `products` is written as the string form of a Python list; confirm
# that this matches the format expected by the consumer of the submission file.
sub_1 = pd.DataFrame(res)
sub_1.columns = ['order_id', 'products']
sub_1.to_csv('subs_samir_1.csv', index=False)
