In [1]:
import numpy as np 
from scipy import stats
import pandas as pd 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold 
from tqdm import tqdm
from collections import Counter

In [2]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The model, the loss weight and every training batch are moved to this device.
device= 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# folder_path = '/content/drive/MyDrive/python_data/kaggle/instacart/data/'
# files = ['aisles.csv', 'departments.csv', 'order_products__prior.csv', 'order_products__train.csv', 'orders.csv', 'products.csv']
# aisle = pd.read_csv(folder_path+files[0])
# dep = pd.read_csv(folder_path+files[1])
# prior = pd.read_csv(folder_path+files[2])
# train = pd.read_csv(folder_path+files[3])
# orders = pd.read_csv(folder_path+files[4])
# products = pd.read_csv(folder_path+files[5])
# test = orders[orders['eval_set'] == 'test']

In [4]:
# print(len(products))
# products.head(2)

In [5]:
# orders.head(2)

In [6]:
# prior.head(2)

In [7]:
'''train和test資料的user_id不重複'''

# len(set(orders[orders['eval_set']=='train']['user_id']).intersection(set(orders[orders['eval_set']=='test']['user_id'])))

'train和test資料的user_id不重複'

# Preprocessing
* 將prior資訊 agg: `df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})`

## Hour2cat

In [8]:
def hour2cat(x):
  """Bucket an hour of day into a coarse part-of-day category.

  Returns 0 for 6 <= x <= 12 (morning), 1 for 12 < x < 18 (afternoon)
  and 2 for every other value (evening / night).
  """
  if 6 <= x <= 12:
    return 0  # morning
  if 12 < x < 18:
    return 1  # afternoon
  return 2  # evening / night

In [9]:
# orders['order_hour_of_day'] = orders['order_hour_of_day'].apply(lambda x: hour2cat(x))

# Get Data

In [10]:
def _most_common(values):
  """Return the most frequent element of ``values``."""
  return Counter(values).most_common(1)[0][0]


def _candidate_features(eval_set):
  """Build one feature row per (target order, previously purchased product).

  Reads the module-level ``orders``, ``prior`` and ``products`` DataFrames.

  Args:
    eval_set: 'train' or 'test'; selects the target orders and is used as the
      prefix of the target-order columns (e.g. 'train_order_dow').

  Returns:
    DataFrame holding the prefixed target-order columns, product_id / aisle_id /
    department_id, the prior_* aggregates over the user's prior orders of that
    product, and the user_dep_ratio / user_aisle_ratio columns.
  """
  prefix = f'{eval_set}_'
  user_col = prefix + 'user_id'

  target_order = orders[orders['eval_set'] == eval_set].drop(columns=['eval_set']).add_prefix(prefix)
  prior_order = orders[orders['eval_set'] == 'prior'].drop(columns=['eval_set']).add_prefix('prior_')

  # target order -> all prior orders of the same user -> their product lines -> product metadata
  history = pd.merge(target_order, prior_order, left_on=user_col, right_on='prior_user_id')
  history = pd.merge(history, prior, left_on='prior_order_id', right_on='order_id')
  history = pd.merge(history, products, on='product_id')

  # Aggregate the order history per (target order, product).
  group_cols = [prefix + col for col in ('order_id', 'user_id', 'order_dow', 'order_hour_of_day', 'days_since_prior_order')]
  group_cols += ['product_id', 'aisle_id', 'department_id']
  grouped = history.groupby(group_cols, dropna=False).agg(
      prior_order_count=('prior_order_id', 'count'),
      prior_dow_mode=('prior_order_dow', _most_common),
      prior_hod_mode=('prior_order_hour_of_day', _most_common),
      prior_dspo_mean=('prior_days_since_prior_order', 'mean'),
      prior_dspo_var=('prior_days_since_prior_order', 'var'),
  ).reset_index().fillna(0)

  # Share of each user's prior purchases that fall in the row's department / aisle.
  for level_col, ratio_col in (('department_id', 'user_dep_ratio'), ('aisle_id', 'user_aisle_ratio')):
    ratio = history.groupby([user_col, level_col])[level_col].count() / history.groupby([user_col])[level_col].count()
    grouped[ratio_col] = ratio[pd.MultiIndex.from_frame(grouped[[user_col, level_col]])].values

  return grouped


def get_data(train_bool= True):
  """Assemble model inputs from the raw Instacart tables.

  Relies on the module-level DataFrames ``orders``, ``prior``, ``products``
  and (for training) ``train`` being loaded beforehand.

  Args:
    train_bool: True to build the training set, False to build the test set.

  Returns:
    If ``train_bool``: ``(train_X, train_y)`` where ``train_y`` is the 'label'
    column (``reordered`` from ``train``, 0 for candidates absent from the
    train order) and ``train_X`` holds the remaining feature columns.
    Otherwise: ``(test_X, test_order_id)`` where ``test_order_id[i]`` is the
    order id of row ``i`` of ``test_X``.
  """
  if train_bool:
    grouped = _candidate_features('train')

    # Left join so that candidates that were not bought in the train order survive (label -> 0).
    labels = train.rename(columns={'reordered': 'label'})
    full_X = pd.merge(grouped, labels, how='left', left_on=['train_order_id', 'product_id'], right_on=['order_id', 'product_id'])

    full_X = full_X.drop(columns=['train_user_id', 'train_order_id', 'order_id', 'add_to_cart_order']).fillna(0)
    # Select the label by name instead of relying on it being the last column.
    train_X = full_X.drop(columns=['label'])
    train_y = full_X['label']
    return train_X, train_y

  grouped = _candidate_features('test')
  # Take the ids from the grouped frame so they line up row-for-row with test_X
  # (the un-grouped merge has one row per prior order line, not per feature row).
  test_order_id = grouped['test_order_id'].values
  test_X = grouped.drop(columns=['test_user_id', 'test_order_id']).fillna(0)
  return test_X, test_order_id

## 觀察
* 對於同個train order id且相同product，可能會有許多不同prior order id的訊息，即相同的商品可能買過很多次，要如何組成一組feature？
* 平均每個product的prior order數: 2.44
* 最大每個product的prior order數: 99

In [11]:
# avg_n_prior_order = np.mean(train_X.groupby(['train_order_id', 'product_id'])['prior_order_id'].count())
# max_n_prior_order = np.max(train_X.groupby(['train_order_id', 'product_id'])['prior_order_id'].count())
# print(f'平均每個product的prior order數: {round(avg_n_prior_order, 2)}')
# print(f'最大每個product的prior order數: {round(max_n_prior_order, 2)}')

In [12]:
'''
prior_order_dow: mode
prior_order_hour_of_day: mode
prior_days_since_prior_order: mean, var
user_dep: ratio 
user_aisle: ratio  
'''

'\nprior_order_dow: mode\nprior_order_hour_of_day: mode\nprior_days_since_prior_order: mean, var\nuser_dep: ratio \nuser_aisle: ratio  \n'

In [13]:
# grouped_train_X.head(3)

In [14]:
# print(len(grouped_train_X))
# print(len(test_X))
# grouped_train_X.head(3)

## 產生Y 
* by merging grouped X and train data
* 有reordered(y=1)僅占10%

In [15]:
# full_X = pd.merge(grouped_train_X, train.rename(columns={'reordered':'label'}), how= 'left', left_on=['train_order_id','product_id'], right_on= ['order_id', 'product_id'])

In [16]:
# drop_cols= ['train_user_id', 'train_order_id', 'order_id', 'add_to_cart_order']
# train_X = full_X.drop(columns=drop_cols, axis= 1).fillna(0).iloc[:, :-1]
# train_y = full_X.drop(columns=drop_cols, axis= 1).fillna(0).iloc[:, -1]
# test_X = test_X.drop(columns=['test_user_id', 'test_order_id'], axis= 1).fillna(0)

In [17]:
# train_y.value_counts()*100/len(train_y)

# OneHotEncoding & load data
* 先fit好 encoder, 在batch training時將cat data做transform即可，降低記憶體使用

In [18]:
'''helper generator'''
# train_X, train_y= get_data(train_bool=True)
# test_X, test_order_id= get_data(train_bool=False)

'''load pkl from data'''
train_X= pd.read_pickle('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_X.pkl')
train_y= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_y.pkl', allow_pickle=True) # Series
test_X= pd.read_pickle('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_X.pkl')

'''save test_X as dataframe for refering user_id and products'''
# train_X.to_pickle('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_X.pkl')
# train_y.to_pickle('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_y.pkl')
# test_X.to_pickle('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_X.pkl')

'save test_X as dataframe for refering user_id and products'

In [19]:
# Numeric (non one-hot) feature columns. Train and test differ only in the prefix
# of the days-since-prior-order column of the target order.
_shared_dense_cols = ['prior_order_count', 'prior_dspo_mean', 'prior_dspo_var', 'user_dep_ratio', 'user_aisle_ratio']
train_dense_cols = ['train_days_since_prior_order'] + _shared_dense_cols
test_dense_cols = ['test_days_since_prior_order'] + _shared_dense_cols

In [20]:
# Split every frame into categorical columns (one-hot encoded per batch later)
# and dense numeric columns.
train_X_cat= train_X.drop(train_dense_cols, axis= 1).to_numpy()
train_X_dense= train_X[train_dense_cols].to_numpy()
# np.asarray accepts both a Series and an ndarray, so re-running this cell does not
# fail the way `train_y.to_numpy()` does once train_y has already been converted.
train_y= np.asarray(train_y)

test_X_cat= test_X.drop(test_dense_cols, axis= 1).to_numpy()
test_X_dense= test_X[test_dense_cols].to_numpy()

# Width of each input field of the model: the number of distinct training values
# of every categorical column, followed by one field holding all dense columns.
fields = [len(np.unique(train_X_cat[:, i])) for i in range(train_X_cat.shape[1])] + [len(train_dense_cols)]

In [21]:
# np.save('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_X_cat.npy', train_X_cat)
# np.save('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_X_dense.npy', train_X_dense)
# np.save('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_y.npy', train_y)
# np.save('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_X_cat.npy', test_X_cat)
# np.save('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_X_dense.npy', test_X_dense)
# np.save('/content/drive/MyDrive/python_data/kaggle/instacart/data/fields.npy', fields)
# np.save('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_order_id.npy', test_order_id)

In [22]:
# train_X_cat= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_X_cat.npy')
# train_X_dense= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_X_dense.npy')
# train_y= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_y.npy')
# test_X_cat= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_X_cat.npy')
# test_X_dense= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_X_dense.npy')
# fields= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/fields.npy')
# test_order_id= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_order_id.npy')
# test_X= pd.read_pickle('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_X.pkl')

In [23]:
# Dense one-hot output; categories unseen during fit are encoded as all zeros.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the old
# keyword, so try the new name first and fall back for older installations.
try:
  encoder= OneHotEncoder(sparse_output= False, handle_unknown='ignore')
except TypeError:
  encoder= OneHotEncoder(sparse= False, handle_unknown='ignore')
enc_fitted= encoder.fit(train_X_cat)

In [24]:
# batch= train_X_cat[:32]
# enc_fitted.transform(batch).shape

# Custom Dataset

In [25]:
class custom_dataset(Dataset):
  """Dataset over categorical features, dense features and (optionally) labels.

  Args:
    X_cat: array-like of categorical features, stored as a float tensor.
    X_dense: array-like of dense features, stored as a float tensor.
    y: array-like of labels; only read when ``if_y`` is truthy.
    if_y: whether samples carry a label.
  """

  def __init__(self, X_cat, X_dense, y= None, if_y= False):
    self.if_y = if_y
    self.X_cat = torch.tensor(X_cat, dtype=torch.float)
    self.X_dense = torch.tensor(X_dense, dtype=torch.float)
    if self.if_y:
      # The label tensor only exists for labelled datasets.
      self.y = torch.tensor(y, dtype=torch.float)

  def __len__(self):
    """Number of samples (rows of the categorical feature tensor)."""
    return len(self.X_cat)

  def __getitem__(self, idx):
    """Return ``(X_cat, X_dense)`` or, for labelled data, ``(X_cat, X_dense, y)``."""
    sample = (self.X_cat[idx], self.X_dense[idx])
    if not self.if_y:
      return sample
    return sample + (self.y[idx],)

# Model 
* Evaluation: mean F1 score

In [26]:
class DeepFM(nn.Module):
  """DeepFM: first-order linear term + FM pairwise interactions + DNN.

  The input is a 2-D tensor whose columns are the concatenation of all fields;
  field ``i`` occupies ``fields[i]`` consecutive columns. Every field is projected
  to a ``k``-dimensional embedding by its own bias-free linear layer.

  Args:
    fields: iterable with the width of every input field.
    k: embedding size per field.
    hidden_dims: sizes of the hidden layers of the DNN part; may be empty, in
      which case the DNN is a single linear layer on the flattened embeddings.
    dropout: dropout probability applied after every hidden layer.
    n_class: output size of the linear and DNN parts.
  """

  def __init__(self, fields, k= 5, hidden_dims= [16, 16], dropout= 0.2, n_class= 1):
    super(DeepFM, self).__init__()
    # Plain ints so the widths can be used for slicing / layer sizes whatever
    # container they arrive in.
    self.fields = [int(field) for field in fields]
    self.k = k
    # Copy so the shared default list (or the caller's list) is never aliased.
    self.hidden_dims = list(hidden_dims)

    """Linear"""
    d = sum(self.fields)
    self.linear = nn.Linear(d, n_class, bias= False)

    """FM"""
    # One bias-free projection per field: [n, field_width] -> [n, k].
    self.embedding_ws = nn.ModuleList([nn.Linear(field, k, bias= False) for field in self.fields])

    """DNN"""
    layers = []
    input_dim = k * len(self.fields)
    for hidden_dim in self.hidden_dims:
      layers.append(nn.Linear(input_dim, hidden_dim))
      layers.append(nn.BatchNorm1d(hidden_dim))
      layers.append(nn.ReLU())
      layers.append(nn.Dropout(p=dropout))
      input_dim = hidden_dim

    # `input_dim` is the width of the last hidden layer, or of the flattened
    # embeddings when there are no hidden layers (hidden_dims[-1] would raise).
    layers.append(nn.Linear(input_dim, n_class))
    self.dnn = nn.Sequential(*layers)

  def Dense_Embedding(self, X):
    """Embed every field of ``X`` ([n, sum(fields)]) and stack to [n, n_fields, k]."""
    es = []
    start = 0
    for i, field in enumerate(self.fields):
      ei = self.embedding_ws[i](X[:, start:start+field]).unsqueeze(dim= 1) # ei: [n, 1, k]
      start += field
      es.append(ei)

    return torch.cat(es, dim= 1) # [n, n_fields, k]

  def FM(self, X):
    """Second-order FM term of the embeddings ``X`` ([n, n_fields, k]) -> [n, 1].

    Uses 0.5 * sum_k((sum_i e_i)^2 - sum_i e_i^2), which equals the sum of the
    dot products of all pairs of distinct field embeddings.
    """
    square_of_sum = torch.sum(X, dim= 1)**2 # [n, k]
    sum_of_square = torch.sum(X**2, dim= 1) # [n, k]
    ix = square_of_sum - sum_of_square
    FM_out = 0.5 * torch.sum(ix, dim= 1, keepdim= True) # [n, 1]
    return FM_out

  def DNN(self, X):
    """Flatten the embeddings ([n, n_fields, k]) and run them through the MLP."""
    X = X.reshape(-1, self.k * len(self.fields)) # [n, k*n_fields]
    return self.dnn(X)

  def forward(self, X):
    """Return raw logits ([n, n_class]); no sigmoid is applied here."""
    dense_X = self.Dense_Embedding(X)
    FM_y = self.FM(dense_X)
    DNN_y = self.DNN(dense_X)
    y = self.linear(X) + FM_y + DNN_y

    # Logits are returned as-is for use with nn.BCEWithLogitsLoss;
    # apply a sigmoid at prediction time to obtain probabilities.
    return y

# Training

In [27]:
# Training / model hyperparameters used by the cells below.
batch_size= 512  # samples per DataLoader batch
lr = 1e-3  # Adam learning rate
n_epoch = 2  # passes over the training loader
k = 20  # embedding size per field (DeepFM `k`)
p = 0.3  # dropout probability (DeepFM `dropout`)
hidden_dims = [128, 128]  # hidden layer sizes of the DNN part
n_class = 1  # single output logit

In [28]:
# Wrap the numpy arrays in datasets; only the training set carries labels.
train_dataset = custom_dataset(train_X_cat, train_X_dense, train_y, if_y= True)
test_dataset = custom_dataset(test_X_cat, test_X_dense, if_y= False)

# Training batches are shuffled; test batches keep their order (shuffle=False).
train_loader = DataLoader(train_dataset, batch_size= batch_size, shuffle= True, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size= batch_size, shuffle= False, num_workers=2)

In [None]:
# NOTE(review): `kf` is not used by any cell visible in this notebook.
kf = KFold(n_splits=5, shuffle= True, random_state=42)
# Build the model on the selected device and optimise all of its parameters with Adam.
model = DeepFM(fields= fields, k= k, hidden_dims= hidden_dims, dropout= p, n_class= n_class).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr= lr)
# weight= torch.tensor([1, 10], device= device)
# criterion = nn.BCELoss(weight= weight, reduction= 'sum')
# The model outputs raw logits, so the loss applies the sigmoid itself.
# pos_weight=9 up-weights positive samples; presumably chosen because positives
# are about 10% of the labels (see the note in the label section) - confirm.
criterion= nn.BCEWithLogitsLoss(pos_weight=torch.tensor(9, device= device))

## Process

In [None]:
# Train for `n_epoch` passes over `train_loader` and print the mean batch loss of each epoch.
for epoch in range(n_epoch):
  model.train()
  total_loss= list()  # per-batch loss values of the current epoch
  for i, (X_cat, X_dense, y) in enumerate(tqdm(train_loader)):
  # for i, (X_cat, X_dense, y) in enumerate(train_loader):
    # One-hot encode the categorical columns per batch with the pre-fitted encoder,
    # so the full one-hot matrix is never held in memory at once.
    X_cat_onehot = torch.tensor(enc_fitted.transform(X_cat), dtype= torch.float)
    # Model input = [one-hot categorical fields | dense features], moved to the training device.
    X= torch.cat([X_cat_onehot, X_dense], dim= 1).to(device)
    optimizer.zero_grad()
    output= model(X)
    # Labels are reshaped from [n] to [n, 1] to match the model output.
    loss= criterion(output, y.unsqueeze(dim= 1).to(device))
    loss.backward()
    optimizer.step()
    total_loss.append(loss.item())
    # if i == 200:
    #   break
  print(f'avg loss: {round(np.mean(total_loss), 4)}')

In [None]:
# Serialise the whole model object (not just its state_dict) to the working directory.
torch.save(model, 'instacart.pt')

In [None]:
# Copy the saved model file into the project folder under /content/drive.
!cp 'instacart.pt' '/content/drive/MyDrive/python_data/kaggle/instacart'

# Create Prediction

In [None]:
import csv

In [None]:
# model= torch.load('/content/drive/MyDrive/python_data/kaggle/instacart/instacart.pt', map_location=torch.device('cpu'))

In [None]:
'''
if nn.BCEWithLogitsLoss(pos_weight=9) is used, 
then output from the model should pass a sigmoid function to represent probability
'''
# preds= []
# model.eval()
# with torch.no_grad():
#   for i, (X_cat, X_dense) in enumerate(tqdm(test_loader)):
#     X_cat_onehot = torch.tensor(enc_fitted.transform(X_cat), dtype= torch.float)
#     X= torch.cat([X_cat_onehot, X_dense], dim= 1).to(device)
#     output= model(X)
#     output= nn.Sigmoid()(output) # Careful 
#     preds.extend(output.squeeze(dim=1).detach().cpu().numpy())  

In [None]:
# preds = np.array(preds)
# np.save('/content/drive/MyDrive/python_data/kaggle/instacart/data/preds.npy', preds)

In [None]:
# Load the cached predictions written by the (commented-out) save cell above.
preds= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/preds.npy')

In [None]:
# `test_order_id` is only produced by cells that are commented out in this notebook,
# so on a fresh kernel it has to be restored from the cached array before use.
test_order_id = np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_order_id.npy')
# One order id and one prediction per candidate row; anything else would misalign the columns below.
assert len(test_order_id) == len(test_X) == len(preds), 'test_order_id / preds are not aligned with the rows of test_X'
test_X['order_id']= test_order_id
test_X['pred']= preds

In [None]:
# (test_X_cat[:, 2] == -1).any()

In [None]:
def rule(x):
  """Threshold a score at 0.5: return 1 when ``x >= 0.5``, otherwise 0."""
  return 1 if x >= 0.5 else 0

In [None]:
# Binarise every predicted value with `rule`.
test_X['pred_binary'] = test_X['pred'].map(rule)

In [None]:
# Show how many candidate rows fall into each binary prediction value.
test_X['pred_binary'].value_counts()

In [None]:
# Collect, per order, the products predicted as reordered: {order_id: [product_id, ...]}.
# Zipping the two columns avoids iterrows(), which builds a Series for every row.
submission_dict= {}
positive_rows = test_X[test_X['pred_binary']==1]
for order_id, product_id in zip(positive_rows['order_id'], positive_rows['product_id']):
  submission_dict.setdefault(int(order_id), []).append(int(product_id))

In [None]:
# Orders that have negative rows but no product list yet are submitted as the literal 'None'.
for order_id in test_X.loc[test_X['pred_binary']==0, 'order_id'].unique():
  if order_id not in submission_dict:
    submission_dict[order_id]= 'None'

In [None]:
# Display the collected values (product-id lists, or 'None' for orders without predictions).
submission_dict.values()

In [None]:
# Write the submission file: a header row, then one row per order holding either
# 'None' or the space-separated ids of the predicted products.
with open('submission.csv', 'w', newline='') as csvfile:
  writer = csv.writer(csvfile, delimiter=',')
  writer.writerow(['order_id', 'products'])

  for order_id, product_ids in submission_dict.items():
    if product_ids == 'None':
      # No product was predicted for this order.
      writer.writerow([order_id, 'None'])
      continue
    writer.writerow([str(order_id), ' '.join(str(product_id) for product_id in product_ids)])

In [None]:
# Number of orders in the submission.
len(submission_dict.keys())