In [365]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

## Read datasets

In [366]:
# orders.csv tells to which set (prior, train, test) an order belongs.
# Reordered items are predicted only for the test-set orders.

# COLUMNS IN THE FILE:
# order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
# Only the columns listed in `usecols` are loaded; order_id becomes the index.
orders = pd.read_csv('./data/orders.csv',index_col = 'order_id',usecols = ['order_id','user_id','eval_set','order_number','days_since_prior_order'])
# Missing gaps are treated as 0 days. Assign the filled column back explicitly:
# an in-place fillna on a `.loc` selection may act on a temporary copy and
# leave `orders` untouched.
orders['days_since_prior_order'] = orders['days_since_prior_order'].fillna(0)
# `date` is the running total of days_since_prior_order within each user,
# taken in the row order of the file.
orders = orders.assign(date = orders.groupby('user_id').days_since_prior_order.cumsum())
orders.head(11)

Unnamed: 0_level_0,user_id,eval_set,order_number,days_since_prior_order,date
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2539329,1,prior,1,0.0,0.0
2398795,1,prior,2,15.0,15.0
473747,1,prior,3,21.0,36.0
2254736,1,prior,4,29.0,65.0
431534,1,prior,5,28.0,93.0
3367565,1,prior,6,19.0,112.0
550135,1,prior,7,20.0,132.0
3108588,1,prior,8,14.0,146.0
2295261,1,prior,9,0.0,146.0
2550362,1,prior,10,30.0,176.0


In [368]:
# The order_products__*.csv files list which products were purchased in each order.
# order_products__prior.csv contains previous order contents for all customers.
# In those files 'reordered' indicates that the customer has a previous order
# that contains the product; some orders have no reordered items, and an
# explicit 'None' may be predicted for them (see the evaluation page for details).

# COLUMNS IN THE FILES
# order_id,product_id,add_to_cart_order,reordered
# Only order_id and product_id are loaded; train rows come first, then prior rows.
usecols = ['order_id','product_id']
items = pd.concat([
    pd.read_csv(csvPath, usecols = usecols)
    for csvPath in ('./data/order_products__train.csv',
                    './data/order_products__prior.csv')
])
items.head()

Unnamed: 0,order_id,product_id
0,1,49302
1,1,11109
2,1,10246
3,1,49683
4,1,43633


In [372]:
# Shape of the array of distinct product ids that occur in the loaded order rows.
np.unique(items['product_id'].values).shape

(49685L,)

In [4]:
# Lookup tables, each indexed by its own id column.
# addFeatures() joins products, aisles and departments onto the item rows, so
# all three must be loaded here; with the last two commented out the function
# fails with a NameError.
products    = pd.read_csv('./data/products.csv',index_col='product_id')
aisles      = pd.read_csv('./data/aisles.csv',index_col = 'aisle_id')
departments = pd.read_csv('./data/departments.csv', index_col = 'department_id')

In [None]:
def addFeatures(df):
    """Left-join the lookup tables onto a frame of order/product rows.

    The notebook-level frames `products`, `aisles`, `departments` and `orders`
    are merged in that order, each matched on its index against the named key
    column of the running result. `df` must therefore carry `product_id` and
    `order_id`; `aisle_id` and `department_id` have to be present by the time
    their merge runs (presumably supplied by `products` -- confirm against the
    CSV).

    Returns the merged frame.
    """
    lookups = (
        (products, 'product_id'),
        (aisles, 'aisle_id'),
        (departments, 'department_id'),
        (orders, 'order_id'),
    )
    enriched = df
    for table, keyColumn in lookups:
        enriched = enriched.merge(table, left_on = keyColumn, right_index = True,
                                  how = 'left', copy = False)
    return enriched

In [None]:
def basketSize(userGroup):
    """Return the mean number of rows per distinct `order_id` in `userGroup`."""
    rowsPerOrder = userGroup.groupby(['order_id'])['order_id'].size()
    return rowsPerOrder.mean()

In [None]:
def save2File(fName,df):
    """Write predictions to `fName` as a two-column CSV.

    fName -- path of the file to create (overwritten if it exists).
    df    -- mapping-like object (dict or pandas Series) whose values are
             pairs `(order_id, products)`, where `products` is an array-like
             of product ids; the keys are not written.

    The file starts with the header `order_id,products`, followed by one line
    per entry: the order id, a comma, and the product ids separated by single
    spaces.

    The ids are joined directly instead of going through `np.array2string`,
    which pads the ids, wraps long rows onto extra lines and elides very long
    arrays with `...` -- each of which corrupts the CSV. This also leaves the
    caller's numpy print options untouched.
    """
    # Python 2 dicts / older pandas expose iteritems(); Python 3 dicts and
    # recent pandas only expose items().
    pairs = df.iteritems() if hasattr(df, 'iteritems') else df.items()
    # The context manager closes the file even if a row fails to serialise.
    with open(fName,'w') as f:
        f.write('order_id,products\n')
        for _key, val in pairs:
            basket = ' '.join(str(p) for p in np.asarray(val[1]).ravel())
            f.write(str(val[0]) + ',' + basket + '\n')

In [None]:
def f1Score(y_true, y_pred):
    """Score one predicted basket against the true basket.

    y_true -- array-like of the product ids that were actually ordered.
    y_pred -- array-like of the predicted product ids.

    Both inputs are treated as sets: duplicates are dropped before counting,
    so the hit count and the denominators are measured on the same basis.

    Returns the list `[precision, recall, f1]`. A side that is empty
    contributes 0.0 instead of raising ZeroDivisionError, and f1 is 0.0 when
    precision and recall are both zero.
    """
    true_ids = np.unique(np.asarray(y_true))
    pred_ids = np.unique(np.asarray(y_pred))
    hits = np.intersect1d(true_ids, pred_ids).size

    precision = hits / float(pred_ids.size) if pred_ids.size else 0.0
    recall = hits / float(true_ids.size) if true_ids.size else 0.0

    if precision + recall == 0:
        return [precision, recall, 0.0]
    f1 = 2 * precision * recall / (precision + recall)
    return [precision, recall, f1]

In [322]:
# user_id values of the orders tagged 'test' and 'train' in eval_set, as numpy arrays.
testUserId = orders.loc[orders['eval_set'] == 'test', 'user_id'].values
trainUserId = orders.loc[orders['eval_set'] == 'train', 'user_id'].values

In [None]:
# Histograms of the per-order scores stored in `res`. Columns 2, 3 and 4 are
# the precision, recall and f1 values stacked by the evaluation loop (columns
# 0 and 1 are user_id and order_id), so `res` must exist before this cell runs.
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
for ax, column, label in zip(axes, (2, 3, 4), ('precision', 'recall', 'f1')):
    ax.hist(res[:, column], bins = 20)
    ax.set_title(label)

In [329]:
# Empty frame declaring the per-product feature columns.
newDf = pd.DataFrame(
    columns = [
        'product_id',
        'periodicity',
        'absCount',
        'lastOrder',
        'wasOrdered',
        'decay',
    ]
)

In [356]:
# Build one row of features per (user, product) from the user's prior orders
# and append it to the 'features' table of newFeatures.hdf.
# NOTE: to_hdf(..., append=True) means re-running this cell appends the same
# rows again.
# NOTE(review): `items` is merged in full for every user, which is slow on
# large inputs -- consider joining once outside the loop.
newDf = pd.DataFrame(columns = ['product_id','periodicity','absCount','lastOrder','wasOrdered','decay'])
for tId, userGroup in orders.groupby('user_id'):
    print(tId)
    # This is computed once per user: the non-prior (train/test) order and
    # its day offset. .item() returns a plain scalar and raises unless the
    # user has exactly one such order.
    trainOrder = userGroup.query("eval_set != 'prior'")
    trainOrderDay = trainOrder.date.values.item()

    # Prior orders of this user, joined with the products they contain.
    preOrders = userGroup.query("eval_set == 'prior' ").fillna(0).merge(
        items,left_index=True,right_on='order_id',copy = False,how='inner')

    # periodicity: mean difference between consecutive `date` values of the
    #              product's rows (NaN for a single row; zeroed below).
    #              Assumes the rows are in chronological order.
    # absCount:    number of prior rows for the product.
    r2 = preOrders.groupby('product_id').agg(
        {'date': lambda x: np.diff(x.values).mean(),
         'order_number': lambda x: x.shape[0]}
        ).rename(columns={'date': 'periodicity','order_number':'absCount'})

    # lastOrder: days between the product's last prior row and the target order.
    r3 = preOrders.groupby('product_id').agg(
        {'date': lambda x: int(trainOrderDay - x.iloc[-1])}
        ).rename(columns={'date':'lastOrder'})

    r2 = r2.merge(r3,left_index=True,right_index=True).reset_index()

    # Products of the target order. `userGroup` already holds a single user,
    # so no extra user_id filter is needed; the former filter referred to a
    # variable that this cell never defines.
    trainBasket = list(trainOrder.merge(
        items,left_index=True,right_on='order_id',copy = False,how='inner').product_id.unique())

    r2 = r2.assign(wasOrdered = r2.product_id.isin(trainBasket))
    # decay: time since the last purchase, measured in units of the product's
    # own periodicity. NaN and inf (zero or undefined periodicity) become 0.
    r2 = r2.assign(decay = r2.lastOrder/r2.periodicity)
    r2 = r2.fillna(0).replace(np.inf,0)
    r2.to_hdf('newFeatures.hdf', 'features', format = 'table',append = True)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


KeyboardInterrupt: 

In [None]:
# Score every prediction in `resTrain` against the products of its order.
# Each value of `resTrain` is indexed as (order_id, predicted product ids);
# `train` must hold order_id / product_id rows for those orders.
# The rows are collected in a list and converted once at the end: growing the
# array with np.vstack inside the loop copies it on every iteration.
scoreRows = []
# Python 2 dicts / older pandas expose iteritems(); Python 3 dicts and recent
# pandas only expose items().
pairs = resTrain.iteritems() if hasattr(resTrain, 'iteritems') else resTrain.items()
for user_id, val in pairs:
    order_id = val[0]
    y_pr = val[1]
    y_true = train.query('order_id == @order_id').product_id.values
    [precession, recall, f1] = f1Score(y_true,y_pr)
    scoreRows.append([user_id, order_id, precession, recall, f1])
# Columns: user_id, order_id, precision, recall, f1; shape (0, 5) when empty.
res = np.array(scoreRows, dtype = float).reshape(-1, 5)

In [373]:
# Load the saved feature table and show only a short preview instead of
# rendering the whole frame.
df = pd.read_hdf('newFeatures2.hdf')
df.head(10)

Unnamed: 0,product_id,periodicity,absCount,lastOrder,wasOrdered,userId
0,196,19.555556,10,14.0,True,1
1,10258,20.125000,9,14.0,True,1
2,10326,0.000000,1,97.0,False,1
3,12427,19.555556,10,14.0,False,1
4,13032,80.500000,3,14.0,True,1
5,13176,78.000000,2,97.0,False,1
6,14084,0.000000,1,190.0,False,1
7,17122,0.000000,1,97.0,False,1
8,25133,20.000000,8,14.0,True,1
9,26088,15.000000,2,175.0,True,1


In [364]:
# Feature rows of user 7 for the products flagged wasOrdered.
df[(df['userId'] == 7) & df['wasOrdered']]

Unnamed: 0,product_id,periodicity,absCount,lastOrder,wasOrdered,userId
13,13198,24.714286,8,6.0,True,7
20,17638,15.6,11,23.0,True,7
40,29894,0.0,1,16.0,True,7
50,37999,49.0,5,13.0,True,7
53,40852,14.416667,13,6.0,True,7
58,43967,19.6,6,81.0,True,7
60,45066,64.333333,4,16.0,True,7
64,47272,44.666667,4,6.0,True,7
