Baseline : https://www.kaggle.com/code/paulantoine/light-gbm-benchmark-0-3692

Baseline 소스코드 분석용.

In [6]:
# Author : Paul-Antoine Nguyen

# This script considers all the products a user has ordered
#
# We train a model computing the probability of reorder on the "train" data
#
# For the submission, we keep the orders that have a probability of
# reorder higher than a threshold


import numpy as np
import pandas as pd
import lightgbm as lgb
IDIR = 'data/'

# order_products__prior.csv and order_products__train.csv share the same four
# columns, so their compact dtype mapping is declared once and reused.
ORDER_PRODUCTS_DTYPES = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'add_to_cart_order': np.int16,
    'reordered': np.int8,
}

print('loading prior')
priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype=ORDER_PRODUCTS_DTYPES)

print('loading train')
train = pd.read_csv(IDIR + 'order_products__train.csv', dtype=ORDER_PRODUCTS_DTYPES)

print('loading orders')
orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        # float32 rather than an integer type: the column contains NaN.
        'days_since_prior_order': np.float32})

print('loading products')
# Only the columns named in `usecols` are read, so the dtype mapping lists
# exactly those three (there is no 'order_id' column to type here).
products = pd.read_csv(IDIR + 'products.csv', dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

# Quick sanity check of what was loaded: shape and column names.
print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))

###

loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [7]:
priors

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


In [8]:
print('computing product f')
# Both aggregates are keyed by product_id and computed from the prior order lines.
order_counts = priors.groupby(priors.product_id).size()
reorder_counts = priors['reordered'].groupby(priors.product_id).sum()

prods = pd.DataFrame()
prods['orders'] = order_counts.astype(np.int32)
prods['reorders'] = reorder_counts.astype(np.float32)
# Share of a product's prior order lines that were flagged as reorders.
prods['reorder_rate'] = (prods['reorders'] / prods['orders']).astype(np.float32)

computing product f


In [9]:
len(priors['product_id'].unique())

49677

In [10]:
priors.groupby('product_id').size()

product_id
1        1852
2          90
3         277
4         329
5          15
         ... 
49684       9
49685      49
49686     120
49687      13
49688      89
Length: 49677, dtype: int64

In [11]:
prods

Unnamed: 0_level_0,orders,reorders,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1852,1136.0,0.613391
2,90,12.0,0.133333
3,277,203.0,0.732852
4,329,147.0,0.446809
5,15,9.0,0.600000
...,...,...,...
49684,9,1.0,0.111111
49685,49,6.0,0.122449
49686,120,84.0,0.700000
49687,13,6.0,0.461538


In [12]:
products

Unnamed: 0,product_id,aisle_id,department_id
0,1,61,19
1,2,104,13
2,3,94,7
3,4,38,1
4,5,5,13
...,...,...,...
49683,49684,124,5
49684,49685,42,1
49685,49686,112,3
49686,49687,41,8


In [13]:
products.index

RangeIndex(start=0, stop=49688, step=1)

In [14]:
# Attach the per-product aggregates from `prods` by matching the `product_id`
# column against the index of `prods`. DataFrame.join defaults to a left join,
# so products without a row in `prods` keep their row and get NaN aggregates.
products = products.join(prods, on='product_id')

In [15]:
products

Unnamed: 0,product_id,aisle_id,department_id,orders,reorders,reorder_rate
0,1,61,19,1852.0,1136.0,0.613391
1,2,104,13,90.0,12.0,0.133333
2,3,94,7,277.0,203.0,0.732852
3,4,38,1,329.0,147.0,0.446809
4,5,5,13,15.0,9.0,0.600000
...,...,...,...,...,...,...
49683,49684,124,5,9.0,1.0,0.111111
49684,49685,42,1,49.0,6.0,0.122449
49685,49686,112,3,120.0,84.0,0.700000
49686,49687,41,8,13.0,6.0,0.461538


In [16]:
# Index the product table by product_id while keeping product_id as a column too.
products = products.set_index('product_id', drop=False)
# Release the helper frame holding the per-product aggregates.
del prods

In [17]:
products

Unnamed: 0_level_0,product_id,aisle_id,department_id,orders,reorders,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,61,19,1852.0,1136.0,0.613391
2,2,104,13,90.0,12.0,0.133333
3,3,94,7,277.0,203.0,0.732852
4,4,38,1,329.0,147.0,0.446809
5,5,5,13,15.0,9.0,0.600000
...,...,...,...,...,...,...
49684,49684,124,5,9.0,1.0,0.111111
49685,49685,42,1,49.0,6.0,0.122449
49686,49686,112,3,120.0,84.0,0.700000
49687,49687,41,8,13.0,6.0,0.461538


In [18]:
products.isnull().sum()

product_id        0
aisle_id          0
department_id     0
orders           11
reorders         11
reorder_rate     11
dtype: int64

In [None]:
# orders, reorders, reorder_rate 의 결측치를 0 으로 입력해도 문제 없을듯!

In [19]:
products[products.orders.isna()]

Unnamed: 0_level_0,product_id,aisle_id,department_id,orders,reorders,reorder_rate
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3630,3630,57,14,,,
3718,3718,21,16,,,
7045,7045,88,13,,,
25383,25383,61,19,,,
27499,27499,100,21,,,
36233,36233,100,21,,,
37703,37703,75,17,,,
43725,43725,100,21,,,
45971,45971,101,17,,,
46625,46625,31,7,,,


In [20]:
orders

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0
...,...,...,...,...,...,...,...
3421078,2266710,206209,prior,10,5,18,29.0
3421079,1854736,206209,prior,11,4,10,30.0
3421080,626363,206209,prior,12,1,12,18.0
3421081,2977660,206209,prior,13,1,12,7.0


In [21]:
orders.isnull().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [22]:
orders[orders.days_since_prior_order.isnull()]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
11,2168274,2,prior,1,2,11,
26,1374495,3,prior,1,1,14,
39,3343014,4,prior,1,6,11,
45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...
3420930,969311,206205,prior,1,4,12,
3420934,3189322,206206,prior,1,3,18,
3421002,2166133,206207,prior,1,6,19,
3421019,2227043,206208,prior,1,1,15,


In [23]:
orders[(orders.eval_set == 'train') & (orders.order_number == 1)]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


In [24]:
orders[(orders.eval_set == 'test') & (orders.order_number == 1)]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order


In [25]:
orders[orders.user_id == 22]

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
257,2300537,22,prior,1,3,7,
258,144358,22,prior,2,3,7,30.0
259,1219654,22,prior,3,0,21,4.0
260,2922708,22,prior,4,3,11,30.0
261,3393222,22,prior,5,5,21,16.0
262,2327987,22,prior,6,1,19,17.0
263,2455079,22,prior,7,6,17,5.0
264,634494,22,prior,8,5,9,13.0
265,1014994,22,prior,9,5,15,28.0
266,3227049,22,prior,10,3,19,5.0


In [26]:
print('add order info to priors')
# Key `orders` by order_id (the column itself is kept) so it can be joined on.
orders = orders.set_index('order_id', drop=False)
# Columns of `orders` whose names clash with `priors` receive a trailing '_'.
priors = priors.join(orders, on='order_id', rsuffix='_')

add order info to priors


In [27]:
priors

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,order_id_,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,2,202279,prior,3,5,9,8.0
1,2,28985,2,1,2,202279,prior,3,5,9,8.0
2,2,9327,3,0,2,202279,prior,3,5,9,8.0
3,2,45918,4,1,2,202279,prior,3,5,9,8.0
4,2,30035,5,0,2,202279,prior,3,5,9,8.0
...,...,...,...,...,...,...,...,...,...,...,...
32434484,3421083,39678,6,1,3421083,25247,prior,24,2,6,21.0
32434485,3421083,11352,7,0,3421083,25247,prior,24,2,6,21.0
32434486,3421083,4600,8,0,3421083,25247,prior,24,2,6,21.0
32434487,3421083,24852,9,1,3421083,25247,prior,24,2,6,21.0


In [28]:
# `order_id_` is the suffixed copy of order_id that came in through the join.
priors = priors.drop(columns='order_id_')
priors

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0
...,...,...,...,...,...,...,...,...,...,...
32434484,3421083,39678,6,1,25247,prior,24,2,6,21.0
32434485,3421083,11352,7,0,25247,prior,24,2,6,21.0
32434486,3421083,4600,8,0,25247,prior,24,2,6,21.0
32434487,3421083,24852,9,1,25247,prior,24,2,6,21.0


In [29]:
# `orders` was joined onto `priors` by order_id, so as expected the only
# eval_set value that actually occurs here is 'prior'.

priors.eval_set.unique()

['prior']
Categories (3, object): ['prior', 'test', 'train']

In [30]:
### user features


print('computing user f')
# Per-user statistics taken from the `orders` table.
orders_by_user = orders.groupby('user_id')
usr = pd.DataFrame()
usr['average_days_between_orders'] = (
    orders_by_user['days_since_prior_order'].mean().astype(np.float32)
)
usr['nb_orders'] = orders_by_user.size().astype(np.int16)

# Per-user statistics taken from the prior order lines.
priors_by_user = priors.groupby('user_id')
users = pd.DataFrame()
users['total_items'] = priors_by_user.size().astype(np.int16)
# Set of every product_id the user has in their prior orders.
users['all_products'] = priors_by_user['product_id'].apply(set)

computing user f


In [31]:
usr

Unnamed: 0_level_0,average_days_between_orders,nb_orders
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,19.000000,11
2,16.285715,15
3,12.000000,13
4,17.000000,6
5,11.500000,5
...,...,...
206205,16.666666,4
206206,3.716418,68
206207,14.312500,17
206208,7.367347,50


In [32]:
users

Unnamed: 0_level_0,total_items,all_products
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088..."
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1..."
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324..."
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905..."
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398..."
...,...,...
206205,32,"{20995, 21137, 22035, 21910, 17691, 31404, 210..."
206206,285,"{16896, 44033, 18434, 16387, 21508, 45573, 102..."
206207,223,"{20995, 18441, 45578, 47626, 33806, 22035, 235..."
206208,677,"{1025, 20995, 47626, 8203, 5133, 38419, 27156,..."


In [33]:
(users.all_products.map(len)).astype(np.int16)

user_id
1          18
2         102
3          33
4          17
5          23
         ... 
206205     24
206206    150
206207     92
206208    198
206209     68
Name: all_products, Length: 206209, dtype: int16

In [34]:
# Look at the full (untruncated) product set of user 1 to confirm that it
# really holds 18 products. It does.
xx = list(users[users.index == 1].all_products)
xx

[{196,
  10258,
  10326,
  12427,
  13032,
  13176,
  14084,
  17122,
  25133,
  26088,
  26405,
  30450,
  35951,
  38928,
  39657,
  41787,
  46149,
  49235}]

In [35]:
# Number of different products per user: the size of the user's product set.
distinct_counts = users['all_products'].map(len)
users['total_distinct_items'] = distinct_counts.astype(np.int16)
users

Unnamed: 0_level_0,total_items,all_products,total_distinct_items
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23
...,...,...,...
206205,32,"{20995, 21137, 22035, 21910, 17691, 31404, 210...",24
206206,285,"{16896, 44033, 18434, 16387, 21508, 45573, 102...",150
206207,223,"{20995, 18441, 45578, 47626, 33806, 22035, 235...",92
206208,677,"{1025, 20995, 47626, 8203, 5133, 38419, 27156,...",198


In [36]:
# Add the columns of `usr` to `users`, aligning the two frames on their index.
users = users.join(usr)
# Release the helper frame now that its columns have been joined in.
del usr
users

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.000000,11
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285715,15
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.000000,13
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.000000,6
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.500000,5
...,...,...,...,...,...
206205,32,"{20995, 21137, 22035, 21910, 17691, 31404, 210...",24,16.666666,4
206206,285,"{16896, 44033, 18434, 16387, 21508, 45573, 102...",150,3.716418,68
206207,223,"{20995, 18441, 45578, 47626, 33806, 22035, 235...",92,14.312500,17
206208,677,"{1025, 20995, 47626, 8203, 5133, 38419, 27156,...",198,7.367347,50


In [37]:
# Items per order for each user.
# NOTE(review): total_items is counted from `priors` only, whereas nb_orders is
# counted from every row of `orders`, which also holds the 'train'/'test'
# orders. The denominator therefore looks one order too large (e.g. user 1:
# 59 items / 11 orders, where order 11 is the 'train' order) — confirm whether
# this is intended.
users['average_basket'] = (users.total_items / users.nb_orders).astype(np.float32)
print('user f', users.shape)

user f (206209, 6)


In [38]:
users

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.000000,11,5.363636
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285715,15,13.000000
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.000000,13,6.769231
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.000000,6,3.000000
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.500000,5,7.400000
...,...,...,...,...,...,...
206205,32,"{20995, 21137, 22035, 21910, 17691, 31404, 210...",24,16.666666,4,8.000000
206206,285,"{16896, 44033, 18434, 16387, 21508, 45573, 102...",150,3.716418,68,4.191176
206207,223,"{20995, 18441, 45578, 47626, 33806, 22035, 235...",92,14.312500,17,13.117647
206208,677,"{1025, 20995, 47626, 8203, 5133, 38419, 27156,...",198,7.367347,50,13.540000


In [39]:
priors.eval_set.unique()

['prior']
Categories (3, object): ['prior', 'test', 'train']

In [40]:
### userXproduct features

print('compute userXproduct f - this is long...')
# Pack (user_id, product_id) into a single integer key.
# NOTE(review): product_id is uint16 and user_id is int32, and priors.info()
# reports the result as int32, so user_id * 100000 wraps around for larger
# user_ids (hence the negative keys in the output). Wrapped keys are not
# guaranteed to be unique: 42950 * 100000 == 2**32 + 32704, so (u, p) and
# (u + 42950, p - 32704) produce the same int32 value. Casting user_id to
# int64 before multiplying would avoid this, but any other code that rebuilds
# this key must then be changed in the same way.
priors['user_product'] = priors.product_id + priors.user_id * 100000
priors

compute userXproduct f - this is long...


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,user_product
0,2,33120,1,1,202279,prior,3,5,9,8.0,-1246903360
1,2,28985,2,1,202279,prior,3,5,9,8.0,-1246907495
2,2,9327,3,0,202279,prior,3,5,9,8.0,-1246927153
3,2,45918,4,1,202279,prior,3,5,9,8.0,-1246890562
4,2,30035,5,0,202279,prior,3,5,9,8.0,-1246906445
...,...,...,...,...,...,...,...,...,...,...,...
32434484,3421083,39678,6,1,25247,prior,24,2,6,21.0,-1770227618
32434485,3421083,11352,7,0,25247,prior,24,2,6,21.0,-1770255944
32434486,3421083,4600,8,0,25247,prior,24,2,6,21.0,-1770262696
32434487,3421083,24852,9,1,25247,prior,24,2,6,21.0,-1770242444


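user_product 값을 보면 오버플로우가 발생했다! (음수 값이 나옴)

int32 로는 안 됨 (user_id 최대값 206209 × 100000 ≈ 2.06e10 으로 int32 범위를 넘는다). int64 로 계산하면 괜찮다.

주의: 오버플로우가 나도 다른 값과 겹치지 않을 거라고 생각했지만, 실제로는 겹칠 수 있다. 42950 × 100000 = 2^32 + 32704 이므로 (user_id, product_id) 와 (user_id + 42950, product_id - 32704) 는 int32 에서 같은 user_product 값이 된다. 따라서 int64 로 계산하는 것이 안전하다.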

In [41]:
priors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 11 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int32   
 1   product_id              uint16  
 2   add_to_cart_order       int16   
 3   reordered               int8    
 4   user_id                 int32   
 5   eval_set                category
 6   order_number            int16   
 7   order_dow               int8    
 8   order_hour_of_day       int8    
 9   days_since_prior_order  float32 
 10  user_product            int32   
dtypes: category(1), float32(1), int16(2), int32(3), int8(3), uint16(1)
memory usage: 804.2 MB


In [29]:
# This was too slow !!
#def last_order(order_group):
#    ix = order_group.order_number.idxmax
#    return order_group.shape[0], order_group.order_id[ix],  order_group.add_to_cart_order.mean()
#userXproduct = pd.DataFrame()
#userXproduct['tmp'] = df.groupby('user_product').apply(last_order)

In [30]:
# Tiny two-row frame for experimenting with row iteration.
keke_idx = ['row1', 'row2']
keke_data = dict(col1=[1, 2], col2=[3, 4])
keke_df = pd.DataFrame(keke_data, index=keke_idx)
print(keke_df)

      col1  col2
row1     1     3
row2     2     4


In [31]:
# Each tuple exposes the index label as `.Index` and every column by its name.
for record in keke_df.itertuples():
    for field in (record.Index, record.col1, record.col2):
        print(field)

row1
1
3
row2
2
4


In [42]:
priors.eval_set.unique()

['prior']
Categories (3, object): ['prior', 'test', 'train']

In [43]:
# Aggregate the prior order lines per user_product key. Only `priors` feeds
# this dict; rows of `train` are not included.
# TODO(author): should the rows of `train` be added as well?
#
# Each value is a 3-tuple:
#   [0] number of prior rows seen for the key,
#   [1] the largest (order_number, order_id) pair seen for the key, i.e. the
#       most recent order containing it,
#   [2] running sum of add_to_cart_order over those rows.

d = {}
for row in priors.itertuples():
    key = row.user_product
    this_order = (row.order_number, row.order_id)
    previous = d.get(key)
    if previous is None:
        # First row for this key.
        d[key] = (1, this_order, row.add_to_cart_order)
    else:
        count, latest_order, cart_pos_sum = previous
        d[key] = (count + 1,
                  max(latest_order, this_order),
                  cart_pos_sum + row.add_to_cart_order)

In [44]:
d

{-1246903360: (5, (8, 104690), 10),
 -1246907495: (5, (6, 132412), 16),
 -1246927153: (1, (3, 2), 3),
 -1246890562: (5, (7, 2382766), 24),
 -1246906445: (3, (7, 2382766), 14),
 -1246918686: (7, (7, 2382766), 25),
 -1246896339: (5, (6, 132412), 29),
 -1246934661: (2, (3, 2), 19),
 -1246892812: (3, (6, 132412), 20),
 -877802726: (17, (25, 368699), 86),
 -877811642: (14, (25, 368699), 58),
 -877818776: (13, (25, 368699), 72),
 -877814577: (14, (25, 368699), 55),
 -877818812: (6, (16, 3), 54),
 -877789813: (13, (25, 368699), 69),
 -877819019: (4, (24, 2214773), 35),
 -877803815: (6, (24, 2214773), 45),
 672177658: (17, (55, 1609396), 99),
 672157250: (4, (42, 2715951), 19),
 672170574: (19, (56, 1253720), 169),
 672158577: (49, (56, 1253720), 401),
 672140870: (29, (56, 1253720), 235),
 672152167: (33, (55, 1609396), 301),
 672153414: (12, (55, 1609396), 119),
 672165678: (16, (54, 385597), 124),
 672171101: (4, (43, 214045), 52),
 672148432: (19, (56, 1253720), 174),
 672155962: (21, (52,

In [45]:
len(d)

13293564

In [46]:
print('to dataframe (less memory)')
# One row per dict key (the key becomes the index); the three elements of each
# value tuple become columns 0, 1 and 2.
userXproduct = pd.DataFrame.from_dict(d, orient='index')
# Drop the dict so it does not stay in memory alongside the frame.
del d

to dataframe (less memory)


In [47]:
userXproduct

Unnamed: 0,0,1,2
-1246903360,5,"(8, 104690)",10
-1246907495,5,"(6, 132412)",16
-1246927153,1,"(3, 2)",3
-1246890562,5,"(7, 2382766)",24
-1246906445,3,"(7, 2382766)",14
...,...,...,...
-1770221987,1,"(24, 3421083)",2
-1770246134,1,"(24, 3421083)",3
-1770232085,1,"(24, 3421083)",5
-1770255944,1,"(24, 3421083)",7


In [48]:
userXproduct.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13293564 entries, -1246903360 to -1770262696
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   0       int64 
 1   1       object
 2   2       int64 
dtypes: int64(2), object(1)
memory usage: 405.7+ MB


In [49]:
# Give the three positional columns (0, 1, 2) descriptive names.
userXproduct = userXproduct.set_axis(
    ['nb_orders', 'last_order_id', 'sum_pos_in_cart'], axis=1)
userXproduct

Unnamed: 0,nb_orders,last_order_id,sum_pos_in_cart
-1246903360,5,"(8, 104690)",10
-1246907495,5,"(6, 132412)",16
-1246927153,1,"(3, 2)",3
-1246890562,5,"(7, 2382766)",24
-1246906445,3,"(7, 2382766)",14
...,...,...,...
-1770221987,1,"(24, 3421083)",2
-1770246134,1,"(24, 3421083)",3
-1770232085,1,"(24, 3421083)",5
-1770255944,1,"(24, 3421083)",7


In [50]:
userXproduct.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13293564 entries, -1246903360 to -1770262696
Data columns (total 3 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   nb_orders        int64 
 1   last_order_id    object
 2   sum_pos_in_cart  int64 
dtypes: int64(2), object(1)
memory usage: 405.7+ MB


In [51]:
# Store the per-key row counts as int16 to reduce memory.
userXproduct['nb_orders'] = userXproduct['nb_orders'].astype(np.int16)

In [52]:
userXproduct.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13293564 entries, -1246903360 to -1770262696
Data columns (total 3 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   nb_orders        int16 
 1   last_order_id    object
 2   sum_pos_in_cart  int64 
dtypes: int16(1), int64(1), object(1)
memory usage: 329.6+ MB


In [53]:
# Each cell holds an (order_number, order_id) tuple; keep only the order_id.
last_ids = userXproduct['last_order_id'].map(lambda pair: pair[1])
userXproduct['last_order_id'] = last_ids.astype(np.int32)
userXproduct

Unnamed: 0,nb_orders,last_order_id,sum_pos_in_cart
-1246903360,5,104690,10
-1246907495,5,132412,16
-1246927153,1,2,3
-1246890562,5,2382766,24
-1246906445,3,2382766,14
...,...,...,...
-1770221987,1,3421083,2
-1770246134,1,3421083,3
-1770232085,1,3421083,5
-1770255944,1,3421083,7


In [54]:
# Store the summed cart positions as int16 as well, then show the resulting dtypes.
userXproduct['sum_pos_in_cart'] = userXproduct['sum_pos_in_cart'].astype(np.int16)
userXproduct.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13293564 entries, -1246903360 to -1770262696
Data columns (total 3 columns):
 #   Column           Dtype
---  ------           -----
 0   nb_orders        int16
 1   last_order_id    int32
 2   sum_pos_in_cart  int16
dtypes: int16(2), int32(1)
memory usage: 202.8 MB


In [55]:
# Report how many distinct user_product keys were aggregated.
print('user X product f', len(userXproduct))

# Release the large `priors` frame.
del priors

user X product f 13293564


In [57]:
### train / test orders ###
print('split orders : train, test')
# Select the rows of `orders` belonging to each evaluation split.
train_orders = orders.loc[orders['eval_set'] == 'train']
test_orders = orders.loc[orders['eval_set'] == 'test']

split orders : train, test


In [58]:
test_orders

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2774568,2774568,3,test,13,5,15,11.0
329954,329954,4,test,6,3,12,30.0
1528013,1528013,6,test,4,3,16,22.0
1376945,1376945,11,test,8,6,11,8.0
1356845,1356845,12,test,6,1,20,30.0
...,...,...,...,...,...,...,...
2728930,2728930,206202,test,23,2,17,6.0
350108,350108,206204,test,5,4,14,14.0
1043943,1043943,206206,test,68,0,20,0.0
2821651,2821651,206207,test,17,2,13,14.0


In [59]:
train_orders

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1187899,1187899,1,train,11,4,8,14.0
1492625,1492625,2,train,15,1,11,30.0
2196797,2196797,5,train,5,0,11,6.0
525192,525192,7,train,21,2,11,6.0
880375,880375,8,train,4,1,14,10.0
...,...,...,...,...,...,...,...
2585586,2585586,206199,train,20,2,16,30.0
943915,943915,206200,train,24,6,19,6.0
2371631,2371631,206203,train,6,4,19,30.0
1716008,1716008,206205,train,4,1,16,10.0


In [60]:
train

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1
...,...,...,...,...
1384612,3421063,14233,3,1
1384613,3421063,35548,4,1
1384614,3421070,35951,1,1
1384615,3421070,16953,2,1


In [61]:
# Index `train` by (order_id, product_id), keeping both columns, so that a pair
# can be tested for membership with `(order_id, product_id) in train.index`.
train = train.set_index(['order_id', 'product_id'], drop=False)
train

Unnamed: 0_level_0,Unnamed: 1_level_0,order_id,product_id,add_to_cart_order,reordered
order_id,product_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,49302,1,49302,1,1
1,11109,1,11109,2,1
1,10246,1,10246,3,0
1,49683,1,49683,4,0
1,43633,1,43633,5,1
...,...,...,...,...,...
3421063,14233,3421063,14233,3,1
3421063,35548,3421063,35548,4,1
3421070,35951,3421070,35951,1,1
3421070,16953,3421070,16953,2,1


In [62]:
users

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.000000,11,5.363636
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285715,15,13.000000
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.000000,13,6.769231
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.000000,6,3.000000
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.500000,5,7.400000
...,...,...,...,...,...,...
206205,32,"{20995, 21137, 22035, 21910, 17691, 31404, 210...",24,16.666666,4,8.000000
206206,285,"{16896, 44033, 18434, 16387, 21508, 45573, 102...",150,3.716418,68,4.191176
206207,223,"{20995, 18441, 45578, 47626, 33806, 22035, 235...",92,14.312500,17,13.117647
206208,677,"{1025, 20995, 47626, 8203, 5133, 38419, 27156,...",198,7.367347,50,13.540000


In [63]:
orders.memory_usage()

Index                     13684332
order_id                  13684332
user_id                   13684332
eval_set                   3421215
order_number               6842166
order_dow                  3421083
order_hour_of_day          3421083
days_since_prior_order    13684332
dtype: int64

In [66]:
users.all_products[22]

{2452,
 4217,
 4421,
 5212,
 5450,
 7088,
 7948,
 8518,
 13176,
 14678,
 14966,
 15392,
 15984,
 16987,
 17794,
 21903,
 22115,
 22935,
 22963,
 24506,
 24964,
 27171,
 27845,
 32096,
 32655,
 35221,
 36311,
 36724,
 38312,
 39040,
 41950,
 44359,
 44968,
 49533}

In [67]:
### build list of candidate products to reorder, with features ###

def features(selected_orders, labels_given=False):
    """Build the candidate (order, product) rows and their model features.

    Every product found in ``users.all_products`` for an order's user becomes
    one candidate row for that order.  The candidates come from the user, not
    from the order itself, so a row may pair an order with a product that is
    not part of it (e.g. order 1187899 gets product 17122); such rows are
    labelled 0 when ``labels_given`` is True.

    Reads the notebook-level frames ``users``, ``orders``, ``products``,
    ``userXproduct`` and (only when ``labels_given``) ``train``.

    Parameters
    ----------
    selected_orders : DataFrame
        Orders to expand; its rows must expose ``order_id`` and ``user_id``.
    labels_given : bool, default False
        When True, a candidate is labelled 1 if ``(order_id, product_id)`` is
        in ``train.index`` and 0 otherwise.

    Returns
    -------
    (DataFrame, numpy.ndarray)
        The candidate frame with its feature columns, and the int8 label
        array (empty when ``labels_given`` is False).
    """
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    # Resolve the lookups once instead of on every iteration of the loop.
    products_by_user = users.all_products
    train_index = train.index if labels_given else None
    for i, row in enumerate(selected_orders.itertuples(), start=1):
        if i % 10000 == 0:
            print('order row', i)

        order_id = row.order_id
        # Candidates are looked up per user, not per order (see docstring).
        user_products = products_by_user[row.user_id]

        # user_products is a set; += appends each element individually.
        product_list += user_products
        order_list += [order_id] * len(user_products)

        if labels_given:
            labels += [(order_id, product) in train_index for product in user_products]

    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list

    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] = df.user_id.map(users.average_basket)

    print('order related features')
    # df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders

    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(products.reorders)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print('user_X_product related features')
    # Composite lookup key into userXproduct.  Presumably it must match the
    # way userXproduct's index was built, so the formula (and its dtype) is
    # kept exactly as is -- verify against the cell that creates userXproduct.
    df['z'] = df.user_id * 100000 + df.product_id
    df = df.drop(columns=['user_id'])
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    # NOTE(review): same expression as UP_orders_ratio, so the two columns are
    # identical -- confirm whether a different reorder-rate formula was meant.
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    # Circular distance between the two hours of day: a gap of x hours is the
    # same as 24 - x going the other way round the clock; keep the smaller.
    hour_gap = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day))
    df['UP_delta_hour_vs_last'] = np.minimum(hour_gap, 24 - hour_gap).astype(np.int8)
    #df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
    #                                              df.order_id.map(orders.order_dow)

    df = df.drop(columns=['UP_last_order_id', 'z'])
    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)

In [68]:
# Build the training candidates, their features and their 0/1 labels.
df_train, labels = features(train_orders, labels_given=True)

build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
UP_or

In [69]:
df_train[df_train.order_id == 1187899]

Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,...,department_id,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last
0,1187899,17122,11,59,18,19.0,5.363636,8,14.0,0.736842,...,4,13880,9377.0,0.675576,1,0.090909,6.0,0.090909,6,7
1,1187899,196,11,59,18,19.0,5.363636,8,14.0,0.736842,...,7,35791,27791.0,0.77648,10,0.909091,1.4,0.909091,1,0
2,1187899,26405,11,59,18,19.0,5.363636,8,14.0,0.736842,...,17,1214,536.0,0.441516,2,0.181818,5.0,0.181818,7,1
3,1187899,46149,11,59,18,19.0,5.363636,8,14.0,0.736842,...,7,8558,6953.0,0.812456,3,0.272727,3.0,0.272727,1,0
4,1187899,14084,11,59,18,19.0,5.363636,8,14.0,0.736842,...,16,15935,12923.0,0.810982,1,0.090909,2.0,0.090909,10,0
5,1187899,13032,11,59,18,19.0,5.363636,8,14.0,0.736842,...,14,3751,2465.0,0.657158,3,0.272727,6.333333,0.272727,1,0
6,1187899,26088,11,59,18,19.0,5.363636,8,14.0,0.736842,...,19,2523,1360.0,0.539041,2,0.181818,4.5,0.181818,9,1
7,1187899,39657,11,59,18,19.0,5.363636,8,14.0,0.736842,...,19,5019,3846.0,0.766288,1,0.090909,3.0,0.090909,1,0
8,1187899,12427,11,59,18,19.0,5.363636,8,14.0,0.736842,...,19,6476,4797.0,0.740735,10,0.909091,3.3,0.909091,1,0
9,1187899,25133,11,59,18,19.0,5.363636,8,14.0,0.736842,...,16,6196,4586.0,0.740155,8,0.727273,4.0,0.727273,1,0


In [70]:
labels

array([0, 1, 1, ..., 0, 0, 0], dtype=int8)

In [71]:
set(labels)

{0, 1}

In [72]:
print(df_train.shape)
print(labels.shape)

(8474661, 21)
(8474661,)


In [73]:
df_train.columns

Index(['order_id', 'product_id', 'user_total_orders', 'user_total_items',
       'total_distinct_items', 'user_average_days_between_orders',
       'user_average_basket', 'order_hour_of_day', 'days_since_prior_order',
       'days_since_ratio', 'aisle_id', 'department_id', 'product_orders',
       'product_reorders', 'product_reorder_rate', 'UP_orders',
       'UP_orders_ratio', 'UP_average_pos_in_cart', 'UP_reorder_rate',
       'UP_orders_since_last', 'UP_delta_hour_vs_last'],
      dtype='object')

In [74]:
# Feature columns passed to the model; order_id and product_id are left out.

f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last'] # 'dow', 'UP_same_dow_as_last_order'

In [75]:
print('formating for lgb')
# Wrap the selected feature columns and the labels in a LightGBM Dataset;
# aisle_id and department_id are declared as categorical features.
d_train = lgb.Dataset(df_train[f_to_use],
                      label=labels,
                      categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'
# Drop the notebook's reference to the (large) training feature frame.
del df_train

formating for lgb


In [76]:
d_train

<lightgbm.basic.Dataset at 0x1972eb3ed50>

In [77]:
# For the meaning of each parameter, see
# https://lightgbm.readthedocs.io/en/stable/Parameters.html

# Third argument handed to lgb.train in the training cell.
ROUNDS = 100

# LightGBM training configuration (binary objective, log-loss metric).
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='binary',
    metric={'binary_logloss'},
    num_leaves=96,
    max_depth=10,
    feature_fraction=0.9,
    bagging_fraction=0.95,
    bagging_freq=5,
)

In [83]:
np.unique(labels)

array([0, 1], dtype=int8)

In [85]:
# The 0 labels vastly outnumber the 1 labels.
# Count of non-zero (i.e. positive) labels:
np.count_nonzero(labels)

828824

In [86]:
print('light GBM train :-)')
# Fit the booster with the settings in `params`; the third positional
# argument of lgb.train is the number of boosting rounds.
bst = lgb.train(params, d_train, ROUNDS)
# lgb.plot_importance(bst, figsize=(9,20))
# Drop the notebook's reference to the training Dataset.
del d_train

light GBM train :-)
[LightGBM] [Info] Number of positive: 828824, number of negative: 7645837
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.162178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3280
[LightGBM] [Info] Number of data points in the train set: 8474661, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.097800 -> initscore=-2.221909
[LightGBM] [Info] Start training from score -2.221909


In [92]:
### build candidates list for test ###

# labels_given is left at its default (False), so the second return value is
# an empty label array and is discarded.
df_test, _ = features(test_orders)
df_test

build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
UP_orders                             int16
UP_orders_ratio                     float32
UP_average_pos_in

Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,...,department_id,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last
0,2774568,17668,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,16,2110,1220.0,0.578199,5,0.384615,3.600000,0.384615,2,3
1,2774568,44683,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,4,22275,11981.0,0.537868,2,0.153846,9.500000,0.153846,7,1
2,2774568,48523,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,1,5129,2376.0,0.463248,2,0.153846,6.500000,0.153846,4,1
3,2774568,21903,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,4,241921,186884.0,0.772500,8,0.615385,4.250000,0.615385,1,0
4,2774568,14992,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,4,29069,16942.0,0.582820,2,0.153846,7.000000,0.153846,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4833287,803273,44532,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,19,311,124.0,0.398714,1,0.020000,11.000000,0.020000,7,3
4833288,803273,46069,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,4,9240,5420.0,0.586580,7,0.140000,5.571429,0.140000,4,1
4833289,803273,12791,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,19,1393,755.0,0.541996,1,0.020000,12.000000,0.020000,7,3
4833290,803273,14332,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,13,6046,1424.0,0.235528,1,0.020000,9.000000,0.020000,8,8


In [93]:
print('light GBM predict')
# The model was trained with objective 'binary', so predict() yields one real
# number per candidate row (the predicted probability of label 1), not a hard
# 0/1 class; a threshold is applied to these values in a later cell.
preds = bst.predict(df_test[f_to_use])
preds

light GBM predict


array([0.38993316, 0.06928635, 0.09174503, ..., 0.02182454, 0.01425185,
       0.04716532])

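train 시킬 때 labels 에는 0, 1 값만 있었는데, preds 는 왜 0, 1 이 아닌 실수가 나오나?

→ objective 가 'binary' 인 LightGBM 모델의 predict() 는 클래스(0/1)가 아니라 label 이 1 일 확률(0~1 사이의 실수)을 반환하기 때문이다. 그래서 뒤에서 이 값을 TRESHOLD 와 비교해 재주문 여부를 결정한다.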

In [97]:
# Attach each prediction to its candidate row (adds a column to df_test in place).
df_test['pred'] = preds
df_test

Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,...,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last,pred
0,2774568,17668,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,2110,1220.0,0.578199,5,0.384615,3.600000,0.384615,2,3,0.389933
1,2774568,44683,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,22275,11981.0,0.537868,2,0.153846,9.500000,0.153846,7,1,0.069286
2,2774568,48523,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,5129,2376.0,0.463248,2,0.153846,6.500000,0.153846,4,1,0.091745
3,2774568,21903,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,241921,186884.0,0.772500,8,0.615385,4.250000,0.615385,1,0,0.639037
4,2774568,14992,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,29069,16942.0,0.582820,2,0.153846,7.000000,0.153846,6,0,0.080984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4833287,803273,44532,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,311,124.0,0.398714,1,0.020000,11.000000,0.020000,7,3,0.015784
4833288,803273,46069,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,9240,5420.0,0.586580,7,0.140000,5.571429,0.140000,4,1,0.165744
4833289,803273,12791,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,1393,755.0,0.541996,1,0.020000,12.000000,0.020000,7,3,0.021825
4833290,803273,14332,50,677,198,7.367347,13.540000,11,4.0,0.542936,...,6046,1424.0,0.235528,1,0.020000,9.000000,0.020000,8,8,0.014252


In [98]:
TRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data

# Keep only the candidates whose prediction is strictly above the threshold.
selected = df_test.loc[df_test.pred > TRESHOLD, ['order_id', 'product_id']]

# Group the selected product ids by order, preserving first-seen order of both
# the orders and the products.  Explicit grouping replaces the previous bare
# `except:` (which would also have hidden unrelated errors) and avoids
# re-building a growing string for every selected row.
products_by_order = {}
for order_id, product_id in zip(selected.order_id.tolist(), selected.product_id.tolist()):
    products_by_order.setdefault(order_id, []).append(str(product_id))

# d maps order_id -> space-separated product ids, e.g. {2774568: '17668 21903'}.
d = {order_id: ' '.join(product_ids) for order_id, product_ids in products_by_order.items()}

In [99]:
d

{2774568: '17668 21903 39190 47766 18599 43961 23650 24810',
 1528013: '21903 38293',
 1376945: '33572 28465 27959 44632 24799 34658 14947 8309 13176',
 1356845: '11520 14992 7076 28134 10863 13176',
 2161313: '11266 196 10441 12427 37710 48142 14715 27839',
 1416320: '5134 21903 21137 24852 17948 41950 24561',
 1735923: '17008 2192 9387 196 15599 31487 15131 35123 12108 34690 42913',
 1980631: '13575 6184 9387 46061 13914 41400 22362',
 139655: '27845',
 1411408: '26452 22008',
 2940603: '30592 19894 44632 10339 14947 18531 31615',
 1192143: '47626 27307 24852',
 280888: '19213 32566 41406',
 3202221: '49175 49215 10831 21137 43692 4793 45774 17630 24852 45364 13629 27966 11130 46979 9637 21927 39911',
 3222866: '40706 13187 37131 32912 7969 38690 33198 8501 32441 18894 34254 35921 14947 15718',
 707453: '45066 42585 48230 44142 21137 47766 694 18150 24852 21267 32030 44830 7969 8518 4942 39275 21903 6111 28156',
 1320132: '44632',
 882556: '28289 47626 42768 24852 23576 16797 25890 4

In [101]:
# Give every test order an entry in d: orders that have none yet get the
# literal string 'None'; existing entries are left untouched.
for order in test_orders.order_id:
    d.setdefault(order, 'None')

In [102]:
d

{2774568: '17668 21903 39190 47766 18599 43961 23650 24810',
 1528013: '21903 38293',
 1376945: '33572 28465 27959 44632 24799 34658 14947 8309 13176',
 1356845: '11520 14992 7076 28134 10863 13176',
 2161313: '11266 196 10441 12427 37710 48142 14715 27839',
 1416320: '5134 21903 21137 24852 17948 41950 24561',
 1735923: '17008 2192 9387 196 15599 31487 15131 35123 12108 34690 42913',
 1980631: '13575 6184 9387 46061 13914 41400 22362',
 139655: '27845',
 1411408: '26452 22008',
 2940603: '30592 19894 44632 10339 14947 18531 31615',
 1192143: '47626 27307 24852',
 280888: '19213 32566 41406',
 3202221: '49175 49215 10831 21137 43692 4793 45774 17630 24852 45364 13629 27966 11130 46979 9637 21927 39911',
 3222866: '40706 13187 37131 32912 7969 38690 33198 8501 32441 18894 34254 35921 14947 15718',
 707453: '45066 42585 48230 44142 21137 47766 694 18150 24852 21267 32030 44830 7969 8518 4942 39275 21903 6111 28156',
 1320132: '44632',
 882556: '28289 47626 42768 24852 23576 16797 25890 4

In [103]:
# One row per key of d: the order ids become the index and the joined product
# string lands in a single column named 0.
sub = pd.DataFrame.from_dict(d, orient='index')

In [104]:
sub

Unnamed: 0,0
2774568,17668 21903 39190 47766 18599 43961 23650 24810
1528013,21903 38293
1376945,33572 28465 27959 44632 24799 34658 14947 8309...
1356845,11520 14992 7076 28134 10863 13176
2161313,11266 196 10441 12427 37710 48142 14715 27839
...,...
2498703,
2769683,
783264,
2994962,


In [105]:
# Move the order-id index into a regular column.
# NOTE: this mutates sub in place, so running the cell a second time adds one
# more column; re-create sub first if the cell has to be re-run.
sub.reset_index(inplace=True)

In [106]:
sub

Unnamed: 0,index,0
0,2774568,17668 21903 39190 47766 18599 43961 23650 24810
1,1528013,21903 38293
2,1376945,33572 28465 27959 44632 24799 34658 14947 8309...
3,1356845,11520 14992 7076 28134 10863 13176
4,2161313,11266 196 10441 12427 37710 48142 14715 27839
...,...,...
74995,2498703,
74996,2769683,
74997,783264,
74998,2994962,


In [109]:
# Name the two columns (requires sub to have exactly two columns here).
sub.columns = ['order_id', 'products']
sub

Unnamed: 0,order_id,products
0,2774568,17668 21903 39190 47766 18599 43961 23650 24810
1,1528013,21903 38293
2,1376945,33572 28465 27959 44632 24799 34658 14947 8309...
3,1356845,11520 14992 7076 28134 10863 13176
4,2161313,11266 196 10441 12427 37710 48142 14715 27839
...,...,...
74995,2498703,
74996,2769683,
74997,783264,
74998,2994962,


In [112]:
# Write the submission file; index=False keeps the row index out of the CSV.
sub.to_csv('submission.csv', index=False)