In [44]:
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold 
from tqdm import tqdm

In [2]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

In [3]:
# Folder that holds the Instacart CSV files (string ends with a slash so that
# file names can be appended directly).
folder_path = '/content/drive/MyDrive/python_data/kaggle/instacart/data/'

# File names in a fixed order; the loading cell picks them by position.
files = [
  'aisles.csv',
  'departments.csv',
  'order_products__prior.csv',
  'order_products__train.csv',
  'orders.csv',
  'products.csv',
]

In [4]:
# files[0] (aisles) and files[1] (departments) are intentionally not loaded.
# The remaining four names are, in order: prior order lines, train order
# lines, the orders table and the product catalogue.
prior, train, orders, products = (
  pd.read_csv(folder_path + file_name) for file_name in files[2:]
)

# Orders that belong to the 'test' evaluation set.
test = orders[orders['eval_set'] == 'test']

In [5]:
# Row count of the product table, followed by a two-row preview.
print(len(products))
products.head(2)

49688


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13


In [6]:
# Two-row preview of the orders table.
orders.head(2)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0


In [7]:
# Two-row preview of the prior order lines.
prior.head(2)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1


In [8]:
# Row count of the train order lines, followed by a two-row preview.
print(len(train))
train.head(2)

1384617


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1


In [9]:
# Sanity check: the 'train' and 'test' evaluation sets should not share any
# user_id, so the overlap count displayed below is expected to be 0.
train_user_ids = set(orders.loc[orders['eval_set'] == 'train', 'user_id'])
test_user_ids = set(orders.loc[orders['eval_set'] == 'test', 'user_id'])
len(train_user_ids & test_user_ids)

0

# Preprocessing
* 將 prior 訂單資訊依 key 彙總（aggregate），語法範例：`df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})`

## Hour2cat

In [10]:
def hour2cat(x):
  """Bucket an hour of the day into a coarse time-of-day category.

  Args:
    x: Hour value to classify (a scalar number).

  Returns:
    0 for morning (6 <= x <= 12), 1 for afternoon (12 < x < 18) and
    2 for every other value (evening / night).
  """
  # Chained comparisons express the ranges directly; the bitwise `&` used
  # before only worked because both operands happened to be booleans.
  if 6 <= x <= 12:  # morning
    return 0
  if 12 < x < 18:  # afternoon
    return 1
  return 2  # evening / night

In [11]:
# Replace the raw hour with its time-of-day bucket from hour2cat.
# NOTE: the column is overwritten in place, so running this cell a second
# time would bucket the already-bucketed values (0/1/2) again.
orders['order_hour_of_day'] = orders['order_hour_of_day'].apply(hour2cat)

## Merging 

In [12]:
# Split the orders by evaluation set, drop the now-constant eval_set column
# and prefix every remaining column so both sides stay distinguishable after
# they are merged.
is_train = orders['eval_set'] == 'train'
is_prior = orders['eval_set'] == 'prior'
train_order = orders[is_train].drop(columns='eval_set').add_prefix('train_')
prior_order = orders[is_prior].drop(columns='eval_set').add_prefix('prior_')

In [13]:
# Pair every train order with all prior orders of the same user (inner join).
train_prior = train_order.merge(
  prior_order,
  left_on='train_user_id',
  right_on='prior_user_id',
)
train_prior.head(2)

Unnamed: 0,train_order_id,train_user_id,train_order_number,train_order_dow,train_order_hour_of_day,train_days_since_prior_order,prior_order_id,prior_user_id,prior_order_number,prior_order_dow,prior_order_hour_of_day,prior_days_since_prior_order
0,1187899,1,11,4,0,14.0,2539329,1,1,2,0,
1,1187899,1,11,4,0,14.0,2398795,1,2,3,0,15.0


In [14]:
# Attach the product lines of each prior order, then the product attributes.
train_X = (
  train_prior
  .merge(prior, left_on='prior_order_id', right_on='order_id')
  .merge(products, on='product_id')
)

## 觀察
* 對於同個train order id且相同product，可能會有許多不同prior order id的訊息，即相同的商品可能買過很多次，要如何組成一組feature？
* 平均每個product的prior order數: 2.44
* 最大每個product的prior order數: 99

In [15]:
# Row count of the merged frame, followed by a three-row preview.
print(len(train_X))
train_X.head(3)

20641991


Unnamed: 0,train_order_id,train_user_id,train_order_number,train_order_dow,train_order_hour_of_day,train_days_since_prior_order,prior_order_id,prior_user_id,prior_order_number,prior_order_dow,prior_order_hour_of_day,prior_days_since_prior_order,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id
0,1187899,1,11,4,0,14.0,2539329,1,1,2,0,,2539329,196,1,0,Soda,77,7
1,1187899,1,11,4,0,14.0,2398795,1,2,3,0,15.0,2398795,196,1,1,Soda,77,7
2,1187899,1,11,4,0,14.0,473747,1,3,3,0,21.0,473747,196,1,1,Soda,77,7


In [16]:
# Number of prior-order rows for every (train order, product) pair. The
# groupby is computed once and reused for both summary statistics.
prior_orders_per_product = (
  train_X.groupby(['train_order_id', 'product_id'])['prior_order_id'].count()
)
avg_n_prior_order = np.mean(prior_orders_per_product)
max_n_prior_order = np.max(prior_orders_per_product)
print(f'平均每個product的prior order數: {round(avg_n_prior_order, 2)}')
print(f'最大每個product的prior order數: {round(max_n_prior_order, 2)}')

平均每個product的prior order數: 2.44
最大每個product的prior order數: 99


In [17]:
# Grouping keys: one (train order, product) candidate together with the
# order-level and product-level attributes carried along with it.
cols = [
  'train_order_id',
  'train_user_id',
  'train_order_dow',
  'train_order_hour_of_day',
  'train_days_since_prior_order',
  'product_id',
  'aisle_id',
  'department_id',
]

# Collapse the rows of each candidate into one row holding the number of
# prior_order_id entries in the group.
grouped_train_X = (
  train_X
  .groupby(cols)['prior_order_id']
  .count()
  .rename('prior_order_id_Count')
  .reset_index()
)

In [18]:
# Row count of the aggregated frame, followed by a three-row preview.
print(len(grouped_train_X))
grouped_train_X.head(3)

8474661


Unnamed: 0,train_order_id,train_user_id,train_order_dow,train_order_hour_of_day,train_days_since_prior_order,product_id,aisle_id,department_id,prior_order_id_Count
0,1,112108,4,0,9.0,2067,3,19,1
1,1,112108,4,0,9.0,5707,3,19,2
2,1,112108,4,0,9.0,11109,108,16,2


## 產生Y 
* by merging grouped X and train data
* 有 reordered（y=1）的樣本僅占約 10%（9.78%），類別明顯不平衡

In [19]:
# train_array= np.array(train[['order_id', 'product_id']])
# grouped_train_array= np.array(grouped_train_X[['train_order_id', 'product_id']])

In [20]:
# Left-join the train order lines onto the candidates; candidates that are
# absent from `train` keep NaN in the columns coming from the right side.
train_labels = train.rename(columns={'reordered': 'label'})
full_X = grouped_train_X.merge(
  train_labels,
  how='left',
  left_on=['train_order_id', 'product_id'],
  right_on=['order_id', 'product_id'],
)

In [21]:
drop_cols = ['train_user_id', 'train_order_id', 'order_id', 'add_to_cart_order']

# Drop the identifier columns and fill the NaNs left by the left merge with 0
# exactly once (candidates missing from `train` therefore get label 0), then
# split features and label by column name instead of by position.
model_frame = full_X.drop(columns=drop_cols).fillna(0)
# NOTE: `train_X` is rebound here from the merged frame to the feature matrix.
train_X = model_frame.drop(columns='label')
train_y = model_frame['label']

In [22]:
# Share of each label value, in percent.
train_y.value_counts()*100/len(train_y)

0.0    90.219975
1.0     9.780025
Name: label, dtype: float64

In [23]:
# Three-row preview of the feature frame.
train_X.head(3)

Unnamed: 0,train_order_dow,train_order_hour_of_day,train_days_since_prior_order,product_id,aisle_id,department_id,prior_order_id_Count
0,4,0,9.0,2067,3,19,1
1,4,0,9.0,5707,3,19,2
2,4,0,9.0,11109,108,16,2


## OneHotEncoding
* 先fit好 encoder, 在batch training時將cat data做transform即可，降低記憶體使用

In [24]:
# Columns kept as numeric (dense) inputs; the remaining feature columns are
# handed to the one-hot encoder.
dense_cols= ['train_days_since_prior_order', 'prior_order_id_Count']

In [25]:
# Categorical columns (everything except the dense ones) and dense columns as
# separate NumPy arrays.
train_X_cat = train_X.drop(columns=dense_cols).to_numpy()
train_X_dense = train_X[dense_cols].to_numpy()
# np.asarray accepts both a Series and an ndarray, so re-running this cell
# after `train_y` has already been converted no longer raises.
train_y = np.asarray(train_y)

In [26]:
# Dense (non-sparse) one-hot output. The keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was removed in 1.4, so
# try the new spelling first and fall back for older installations.
try:
  encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
  encoder = OneHotEncoder(sparse=False)
# fit() returns the encoder itself; `enc_fitted` is the name later cells use.
enc_fitted = encoder.fit(train_X_cat)

In [27]:
# Smoke test: one-hot encode the first 32 rows and show the resulting shape.
batch= train_X_cat[:32]
enc_fitted.transform(batch).shape

(32, 49633)

## Fields

In [28]:
# One entry per categorical column (its number of distinct values), plus a
# final entry for the block of dense columns.
n_categories = [np.unique(column).size for column in train_X_cat.T]
fields = n_categories + [len(dense_cols)]

In [29]:
# Show the width of every field.
fields

[7, 3, 49468, 134, 21, 2]

# Custom Dataset

In [40]:
class custom_dataset(Dataset):
  """In-memory dataset that serves (categorical, dense, label) triples.

  All three inputs are copied into float32 tensors when the dataset is built.
  """

  def __init__(self, X_cat, X_dense, y):
    """Convert the three array-likes to float tensors and store them."""
    self.X_cat, self.X_dense, self.y = (
      torch.tensor(values, dtype= torch.float32) for values in (X_cat, X_dense, y)
    )

  def __len__(self):
    """Number of samples, taken from the label tensor."""
    return len(self.y)

  def __getitem__(self, idx):
    """Return the categorical row, dense row and label stored at `idx`."""
    return tuple(part[idx] for part in (self.X_cat, self.X_dense, self.y))

In [41]:
# Wrap the raw (not yet one-hot encoded) categorical ids, the dense features
# and the labels in a dataset.
train_dataset = custom_dataset(train_X_cat, train_X_dense, train_y)

# Model 

In [56]:
class DeepFM(nn.Module):
  """DeepFM-style model over a concatenated, field-structured input row.

  Every input row is the concatenation of several fields, field `i` being
  `fields[i]` columns wide. The output is the sigmoid of the sum of three
  parts: a bias-free linear term over the raw input, a factorization-machine
  (FM) pairwise-interaction term over per-field embeddings, and an MLP over
  the same embeddings.

  Args:
    fields: Width (number of input columns) of each field, in input order.
    k: Embedding size shared by all fields.
    hidden_dims: Sizes of the hidden MLP layers; may be empty, in which case
      the MLP is a single output layer.
    dropout: Dropout probability applied after every hidden layer.
    n_class: Number of output units.
  """

  def __init__(self, fields, k= 5, hidden_dims= (16, 16), dropout= 0.2, n_class= 1):
    super().__init__()
    self.fields = fields
    self.k = k
    # Copy so the module never aliases the caller's list or the default value.
    self.hidden_dims = list(hidden_dims)

    # Linear (first-order) term over the full input row.
    self.linear = nn.Linear(sum(fields), n_class, bias= False)

    # One bias-free projection per field, mapping its columns to a k-dim embedding.
    self.embedding_ws = nn.ModuleList(
      [nn.Linear(width, k, bias= False) for width in fields]
    )

    # MLP over the flattened embeddings.
    layers = []
    input_dim = k * len(fields)
    for hidden_dim in self.hidden_dims:
      layers.append(nn.Linear(input_dim, hidden_dim))
      layers.append(nn.BatchNorm1d(hidden_dim))
      layers.append(nn.ReLU())
      layers.append(nn.Dropout(p= dropout))
      input_dim = hidden_dim
    # `input_dim` is the width of the last layer built so far, so the output
    # layer is also valid when `hidden_dims` is empty.
    layers.append(nn.Linear(input_dim, n_class))
    self.dnn = nn.Sequential(*layers)

  def Dense_Embedding(self, X):
    """Embed every field slice of `X` ([n, sum(fields)]) into [n, n_fields, k]."""
    embeddings = []
    start = 0
    for width, embed in zip(self.fields, self.embedding_ws):
      embeddings.append(embed(X[:, start:start + width]))  # [n, k]
      start += width
    return torch.stack(embeddings, dim= 1)  # [n, n_fields, k]

  def FM(self, X):
    """Second-order FM term for embeddings `X` ([n, n_fields, k]); returns [n, 1].

    Uses the identity
      sum_{i<j} <e_i, e_j> = 0.5 * sum_k ((sum_i e_ik)^2 - sum_i e_ik^2).
    """
    square_of_sum = torch.sum(X, dim= 1) ** 2  # [n, k]
    sum_of_square = torch.sum(X ** 2, dim= 1)  # [n, k]
    return 0.5 * torch.sum(square_of_sum - sum_of_square, dim= 1, keepdim= True)

  def DNN(self, X):
    """Flatten embeddings `X` ([n, n_fields, k]) and run them through the MLP."""
    # reshape (unlike view) also accepts non-contiguous input.
    flat = X.reshape(-1, self.k * len(self.fields))  # [n, k * n_fields]
    return self.dnn(flat)

  def forward(self, X):
    """Return sigmoid(linear + FM + MLP) for input rows `X` ([n, sum(fields)])."""
    dense_X = self.Dense_Embedding(X)
    logit = self.linear(X) + self.FM(dense_X) + self.DNN(dense_X)
    return torch.sigmoid(logit)

# Training

In [58]:
# Mini-batch size handed to the DataLoader.
batch_size= 512
# Learning rate of the Adam optimizer.
lr = 1e-3
# Number of passes of the outer training loop.
n_epoch = 50
# Embedding size per field passed to DeepFM.
k = 20
# Dropout probability passed to DeepFM.
p = 0.2
# Hidden layer sizes of the DeepFM MLP.
hidden_dims = [32, 32]
# Number of model output units.
n_class = 1

In [59]:
# Shuffled mini-batches of the training dataset, loaded with two workers.
train_loader = DataLoader(train_dataset, batch_size= batch_size, shuffle= True, num_workers=2)

In [60]:
# Seed PyTorch so that the weight initialisation below is repeatable.
torch.manual_seed(42)

# 5-fold splitter. It is kept under its own name: binding it to `test` would
# replace the test-order DataFrame created when the CSV files were loaded.
kfold = KFold(n_splits=5, shuffle= True, random_state=42)

model = DeepFM(fields= fields, k= k, hidden_dims= hidden_dims, dropout= p, n_class= n_class).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr= lr)
# The model output already went through a sigmoid, so binary cross-entropy on
# probabilities is used.
criterion = nn.BCELoss()

## Process

In [None]:
# Cap on the number of batches processed per epoch. The loop used to stop
# after batch index 50 (i.e. 51 batches); the same limit is kept under a name.
# Set it to None to iterate over the whole loader.
max_batches_per_epoch = 51

for epoch in range(n_epoch):
  model.train()
  total_loss = []  # per-batch losses of the current epoch
  for i, (X_cat, X_dense, y) in enumerate(train_loader):
    # One-hot encode only the current batch; the encoder works on NumPy
    # arrays, so the tensor is converted explicitly.
    X_cat_onehot = torch.tensor(enc_fitted.transform(X_cat.numpy()), dtype= torch.float)
    X = torch.cat([X_cat_onehot, X_dense], dim= 1).to(device)
    target = y.unsqueeze(dim= 1).to(device)  # [n] -> [n, 1], same shape as the model output

    optimizer.zero_grad()
    output = model(X)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    total_loss.append(loss.item())

    if max_batches_per_epoch is not None and i + 1 >= max_batches_per_epoch:
      break
  print(f'avg loss: {round(np.mean(total_loss), 4)}')

avg loss: 3.9621
avg loss: 0.996
avg loss: 0.4318
avg loss: 0.3671
avg loss: 0.3465
avg loss: 0.3446
avg loss: 0.3374
avg loss: 0.3329
avg loss: 0.3285
