In [1]:
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from tqdm import tqdm
from collections import Counter

In [2]:
device= 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
# folder_path = '/content/drive/MyDrive/python_data/kaggle/instacart/data/'
# files = ['aisles.csv', 'departments.csv', 'order_products__prior.csv', 'order_products__train.csv', 'orders.csv', 'products.csv']
# aisle = pd.read_csv(folder_path+files[0])
# dep = pd.read_csv(folder_path+files[1])
# prior_product = pd.read_csv(folder_path+files[2])
# train_product = pd.read_csv(folder_path+files[3])
# orders = pd.read_csv(folder_path+files[4])
# products = pd.read_csv(folder_path+files[5]).drop(columns= ['product_name'])

# print('training size:', len(train_product))
# print('reodered %: ', round(100*train_product['reordered'].value_counts()[1]/len(train_product), 1))

# 概念 & Utils
* 138萬training data都有其對應訂單資訊 > merge
* 每個training data都是對應一個product_id -> 該user對該product_id歷史資訊
* 該user對該product_id歷史資訊要前處理-> agg
* 歷史資訊要考慮對該product是否有reordered和add_to_cart_order
* We have the training data for new orders in order_products__train.csv but only the metadata for the test orders

In [4]:
def get_data(eval_set= 'train'):

  '''Build one feature row per (order, user, product) for the given eval set.

  Reads the module-level frames `orders`, `prior_product`, `train_product`
  and `products`; they must already be loaded before this is called.

  Args:
    eval_set: value matched against `orders['eval_set']`. 'train' joins the
      labelled rows of `train_product` to each user's prior history of the
      same product; any other value takes the else-branch, which pairs the
      order with every product found in that user's prior orders.

  Returns:
    DataFrame with the `basic_cols` identifiers, the aggregated prior-order
    statistics, the columns of `products`, and `user_dep_ratio` /
    `user_aisle_ratio`. Identifier columns (and `reordered` for 'train') are
    left in place; the caller is expected to split them off.

  Side effects: prints a few rows for debugging.
  '''
  # All prior-order information.
  order_info_prior = orders[orders['eval_set']== 'prior']
  prior_info = pd.merge(prior_product, order_info_prior, on= 'order_id', how= 'left').drop(columns= ['eval_set', 'order_id']) # a (user_id, product_id) pair may appear more than once
  prior_info.rename(columns= {'reordered':'reordered_prior'}, inplace= True)
  # prior_info.head(3)
  # Metadata of the orders being predicted ("now").
  '''order now資訊'''
  order_info_now = orders[orders['eval_set']== eval_set].drop(columns= ['eval_set'])

  if eval_set == 'train':
    # Attach order metadata to every labelled (order, product) row.
    order_info_now = pd.merge(train_product.drop(columns=['add_to_cart_order']), order_info_now, on= 'order_id', how= 'left')
    print(order_info_now.head(1))
    basic_cols = ['order_id', 
            'user_id', 
            'product_id', 
            'reordered', 
            'order_number_now', 
            'order_dow_now', 
            'order_hour_of_day_now', 
            'days_since_prior_order_now']
    # Left join: products the user never bought before get no prior row and are filled with 0.
    X = pd.merge(order_info_now, prior_info, on= ['user_id', 'product_id'], how= 'left', suffixes= ['_now', '_prior']).fillna(0) # all basic information
  else:
    # Merge the user's order with their prior_info (joined on user_id only).
    '''user和prior_info 合併'''

    basic_cols = ['order_id', 
            'user_id', 
            'product_id', 
            'order_number_now', 
            'order_dow_now', 
            'order_hour_of_day_now', 
            'days_since_prior_order_now']
    X = pd.merge(order_info_now, prior_info, on= 'user_id', how= 'left', suffixes= ['_now', '_prior']).fillna(0) # all basic information

  print('group前', X.head(3))
  # Collapse the prior rows of each (order, user, product) into one feature row.
  # The Counter lambdas return the most frequent value (the mode) of the group.
  # NOTE(review): prior-less train rows were filled with 0 above and still form one
  # row per group, so order_count looks like 1 (not 0) for them — confirm intent.
  '''same for train and test'''
  X = X.groupby(basic_cols).agg(add2cart_mode= ('add_to_cart_order', lambda x: Counter(x).most_common(1)[0][0]),
              reorder_max= ('reordered_prior', 'max'), 
              order_number_max= ('order_number_prior', 'max'),
              order_dow_mode= ('order_dow_prior', lambda x: Counter(x).most_common(1)[0][0]),
              order_hod_mode= ('order_hour_of_day_prior', lambda x: Counter(x).most_common(1)[0][0]), # transformed already to 1, 2, 3
              days_since_mean= ('days_since_prior_order_prior', 'mean'),
              days_since_std= ('days_since_prior_order_prior', 'std'),
              order_count= ('order_number_prior', 'size')
              ).reset_index().fillna(0)
  print('group後', X.head(3))

  # Add product info and the per-user department / aisle share features.
  # Each ratio = rows of X for (user, department|aisle) / rows of X for that user.
  '''加入product info以及user_product_feature'''
  X = pd.merge(X, products, on='product_id', how= 'left')
  user_dep_ratio= X.groupby(['user_id', 'department_id'])['department_id'].count()/X.groupby(['user_id'])['department_id'].count()
  user_aisle_ratio= X.groupby(['user_id', 'aisle_id'])['aisle_id'].count()/X.groupby(['user_id'])['aisle_id'].count()
  # Look the ratios up row by row through a MultiIndex built from X itself.
  X['user_dep_ratio'] = user_dep_ratio[pd.MultiIndex.from_frame(X[['user_id', 'department_id']])].values
  X['user_aisle_ratio'] = user_aisle_ratio[pd.MultiIndex.from_frame(X[['user_id', 'aisle_id']])].values
  
  # Drop unneeded columns — currently disabled; both branches return X unchanged.
  '''drop不需要的columns'''
  if eval_set == 'train':
    # train_y = X['reordered']
    # X.drop(columns=['order_id', 'user_id', 'reordered'], inplace=True)
    return X
  else:
    # order_ids = X['order_id']
    # X.drop(columns=['order_id', 'user_id'], inplace=True)
    return X

In [5]:
def hour2cat(x):
  """Bucket an hour of day into a part-of-day code.

  Returns 0 for 6 <= x <= 12 (morning), 1 for 12 < x < 18 (afternoon)
  and 2 for everything else (evening / night).
  """
  if 6 <= x <= 12:
    return 0
  if 12 < x < 18:
    return 1
  return 2

In [6]:
'''train和test資料的user_id不重複'''

# len(set(orders[orders['eval_set']=='train']['user_id']).intersection(set(orders[orders['eval_set']=='test']['user_id'])))

'train和test資料的user_id不重複'

## 觀察
* 對於同個train order id且相同product，可能會有許多不同prior order id的訊息，即相同的商品可能買過很多次，要如何組成一組feature？
* 平均每個product的prior order數: 2.44
* 最大每個product的prior order數: 99
* 有reordered(y=1)僅占10%

In [7]:
# avg_n_prior_order = np.mean(train_X.groupby(['train_order_id', 'product_id'])['prior_order_id'].count())
# max_n_prior_order = np.max(train_X.groupby(['train_order_id', 'product_id'])['prior_order_id'].count())
# print(f'平均每個product的prior order數: {round(avg_n_prior_order, 2)}')
# print(f'最大每個product的prior order數: {round(max_n_prior_order, 2)}')

# OneHotEncoding & load data 
* 先fit好 encoder, 在batch training時將cat data做transform即可，降低記憶體使用

In [None]:
# train_X = get_data(eval_set= 'train')
# test_X = get_data(eval_set= 'test')
# train_X.to_pickle('/content/drive/MyDrive/python_data/kaggle/instacart/data/new_train_X.pkl')
# test_X.to_pickle('/content/drive/MyDrive/python_data/kaggle/instacart/data/new_test_X.pkl')
train_X = pd.read_pickle('/content/drive/MyDrive/python_data/kaggle/instacart/data/new_train_X.pkl')
test_X = pd.read_pickle('/content/drive/MyDrive/python_data/kaggle/instacart/data/new_test_X.pkl')

In [None]:
# Set the label and the test order ids aside, then remove the identifier
# columns so only model features remain in the two frames.
train_y = train_X['reordered']
order_ids = test_X['order_id']
train_X = train_X.drop(columns=['order_id', 'user_id', 'reordered'])
test_X = test_X.drop(columns=['order_id', 'user_id'])

In [None]:
'''轉小時成類別'''
train_X['order_hour_of_day_now'] = train_X['order_hour_of_day_now'].apply(lambda x: hour2cat(x))
test_X['order_hour_of_day_now'] = test_X['order_hour_of_day_now'].apply(lambda x: hour2cat(x))
train_X['order_hod_mode'] = train_X['order_hod_mode'].apply(lambda x: hour2cat(x))
test_X['order_hod_mode'] = test_X['order_hod_mode'].apply(lambda x: hour2cat(x))

In [None]:
# Continuous features; every other remaining column is treated as categorical.
dense_features = ['order_number_now',
        'days_since_prior_order_now',
        'add2cart_mode',
        'order_number_max',
        'days_since_mean',
        'days_since_std',
        'order_count',
        'user_dep_ratio',
        'user_aisle_ratio']

train_dense, test_dense = train_X[dense_features], test_X[dense_features]
train_cat, test_cat = train_X.drop(columns= dense_features), test_X.drop(columns= dense_features)

# Min-max scale the continuous features. The scaler is fitted on train and test
# stacked together, then the result is split back at the train/test boundary.
scaler = MinMaxScaler()
train_size = len(train_dense)
dense = pd.concat([train_dense, test_dense], axis= 0)
dense = scaler.fit_transform(dense)
train_dense, test_dense = dense[:train_size, :], dense[train_size:, :]

# From DataFrame to numpy array (the encoder is later applied to array batches).
train_cat, test_cat = train_cat.to_numpy(), test_cat.to_numpy()

# One-hot encoder for the categorical features, fitted once and applied per batch.
# handle_unknown='ignore' encodes categories that only occur in the test set as
# all-zero instead of raising inside transform().
try:
  enc = OneHotEncoder(sparse_output= False, handle_unknown= 'ignore')
except TypeError:  # older scikit-learn releases spell the argument `sparse`
  enc = OneHotEncoder(sparse= False, handle_unknown= 'ignore')
enc_fitted = enc.fit(train_cat)

# Field widths for the model: one per categorical column (number of distinct
# training values = one-hot width) plus one field holding all dense features.
fields = [len(np.unique(train_cat[:, i])) for i in range(train_cat.shape[1])] + [len(dense_features)]

In [None]:
print(len(train_cat), len(test_cat))
# enc_fitted.transform(train_cat)
train_y.value_counts()/len(train_cat)

# Custom Dataset

In [None]:
class custom_dataset(Dataset):
  """In-memory dataset of (categorical, dense[, label]) rows stored as float tensors.

  Args:
    X_cat: array-like of raw (not yet one-hot encoded) categorical features, one row per sample.
    X_dense: array-like of dense features, same number of rows as X_cat.
    y: labels, required when if_y is True; ignored otherwise.
    if_y: when True each item is (X_cat, X_dense, y), otherwise (X_cat, X_dense).

  Raises:
    ValueError: if if_y is True but y is None, or if the inputs differ in length.
  """
  def __init__(self, X_cat, X_dense, y= None, if_y= False):
    if if_y and y is None:
      raise ValueError('y must be provided when if_y=True')

    # Go through np.asarray first: torch.tensor() indexes a pandas Series by
    # position (series[0], series[1], ...), which fails when its index is not 0..n-1.
    self.X_cat = torch.tensor(np.asarray(X_cat), dtype= torch.float)
    self.X_dense = torch.tensor(np.asarray(X_dense), dtype= torch.float)
    if len(self.X_cat) != len(self.X_dense):
      raise ValueError('X_cat and X_dense must have the same number of rows')

    self.if_y= if_y
    if if_y:
      self.y = torch.tensor(np.asarray(y), dtype= torch.float)
      if len(self.y) != len(self.X_cat):
        raise ValueError('y must have the same number of rows as X_cat')

  def __len__(self):
    return len(self.X_cat)

  def __getitem__(self, idx):
    if self.if_y:
      return self.X_cat[idx], self.X_dense[idx], self.y[idx]
    return self.X_cat[idx], self.X_dense[idx]

# Model 
* Evaluation: mean F1 score

In [None]:
class DeepFM(nn.Module):
  """DeepFM: first-order linear term + FM pairwise term + MLP over shared field embeddings.

  The input row is the concatenation of all fields (one-hot blocks, then the
  dense block); `fields` gives the width of each block in that order.

  Args:
    fields: sequence with the input width of every field.
    k: embedding size of each field.
    hidden_dims: sizes of the MLP hidden layers; may be empty.
    dropout: dropout probability (one Dropout module is shared by all three parts).
    n_class: output width of the linear term and of the MLP.

  forward() returns raw logits (no sigmoid), suitable for nn.BCEWithLogitsLoss.
  """
  def __init__(self, fields, k= 5, hidden_dims= (16, 16), dropout= 0.2, n_class= 1):
    super(DeepFM, self).__init__()
    self.fields = list(fields)
    self.k = k
    self.hidden_dims = list(hidden_dims)  # copied so a shared default is never mutated
    self.dropout= nn.Dropout(p=dropout)

    # Linear (first-order) term over the full concatenated input.
    d = sum(self.fields)
    self.linear = nn.Linear(d, n_class, bias= False)

    # FM: one bias-free projection per field, mapping its input slice to a k-dim embedding.
    self.embedding_ws = nn.ModuleList()
    for width in self.fields:
      self.embedding_ws.append(nn.Linear(width, k, bias= False))

    # DNN over the flattened field embeddings.
    layers = []
    input_dim = k * len(self.fields)

    for hidden_dim in self.hidden_dims:
      layers.append(nn.Linear(input_dim, hidden_dim))
      layers.append(nn.BatchNorm1d(hidden_dim))
      layers.append(nn.ReLU())
      layers.append(self.dropout)
      input_dim = hidden_dim

    # input_dim equals the last hidden size, or k * n_fields when hidden_dims is empty.
    layers.append(nn.Linear(input_dim, n_class))
    self.dnn = nn.Sequential(*layers)

  def Dense_Embedding(self, X):
    """Embed each field's slice of X [n, sum(fields)] and stack to [n, n_fields, k]."""
    es = []
    start= 0
    for embed, width in zip(self.embedding_ws, self.fields):
      es.append(embed(X[:, start:start+width]).unsqueeze(dim= 1)) # [n, 1, k]
      start += width

    return torch.cat(es, dim= 1) # [n, n_fields, k]

  def FM(self, X):
    """Second-order FM term: 0.5 * sum_k((sum_i e_i)^2 - sum_i e_i^2) -> [n, 1]."""
    square_of_sum = torch.sum(X, dim= 1)**2 # [n, k]
    sum_of_square = torch.sum(X**2, dim= 1) # [n, k]
    ix = square_of_sum - sum_of_square
    FM_out = 0.5 * torch.sum(ix, dim= 1, keepdim= True) # [n, 1]
    return self.dropout(FM_out)

  def DNN(self, X):
    """Flatten the embeddings [n, n_fields, k] and run them through the MLP."""
    X = X.reshape(-1, self.k * len(self.fields)) # [n, k*n_fields]
    return self.dnn(X)

  def forward(self, X):
    """Return logits [n, n_class] = dropout(linear(X)) + FM + DNN."""
    dense_X = self.Dense_Embedding(X)
    FM_y = self.FM(dense_X)
    DNN_y = self.DNN(dense_X)
    y = self.dropout(self.linear(X)) + FM_y + DNN_y

    # Raw logits: apply a sigmoid outside the model to obtain probabilities.
    return y

# Training

In [None]:
batch_size= 512
lr = 1e-3
n_epoch = 2
k = 10
p = 0.5
hidden_dims = [64, 64]
n_class = 1
threshold= 0.6

In [None]:
train_dataset = custom_dataset(train_cat, train_dense, train_y, if_y= True)
# train_size= int(0.8*len(data_dataset))
# val_size= len(data_dataset)- train_size

'''train, val'''
# train_dataset, val_dataset= random_split(data_dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(train_dataset, batch_size= batch_size, shuffle= True, num_workers=2)
# val_loader = DataLoader(val_dataset, batch_size= batch_size, shuffle= False, num_workers=2)

'''test'''
test_dataset = custom_dataset(test_cat, test_dense, if_y= False)
test_loader = DataLoader(test_dataset, batch_size= 512, shuffle= False, num_workers=2)

In [None]:
model= DeepFM(fields= fields, k= k, hidden_dims= hidden_dims, dropout= p, n_class= n_class).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr= lr)
criterion= nn.BCEWithLogitsLoss()
# criterion= nn.BCEWithLogitsLoss(pos_weight=torch.tensor(9, device= device))

## Process

In [None]:
for epoch in range(n_epoch):

  model.train()
  train_loss= 0.0   # running sum of the per-batch mean losses
  train_score= 0    # number of correctly classified training samples
  val_score= 0      # kept for the (currently disabled) validation pass
  n_batches= 0

  # Train: the categorical block is one-hot encoded per batch to keep memory low.
  for X_cat, X_dense, y in tqdm(train_loader):
    X_cat_onehot = torch.tensor(enc_fitted.transform(X_cat.numpy()), dtype= torch.float)
    X= torch.cat([X_cat_onehot, X_dense], dim= 1).to(device)
    y= y.unsqueeze(dim= 1).to(device)
    optimizer.zero_grad()
    output= model(X)
    loss= criterion(output, y)
    loss.backward()
    optimizer.step()
    train_loss += loss.item()
    n_batches += 1
    with torch.no_grad():
      pred= (torch.sigmoid(output) > threshold).float()
      train_score += (pred == y).sum().item()

  # The criterion already averages within a batch, so average over batches;
  # accuracy is the share of correct predictions over all training rows.
  print(f'train loss: {round(train_loss/max(n_batches, 1), 3)}| train accu: {round(train_score/train_size, 3)}')

  # '''val'''
  # model.eval()
  # for i, (X_cat, X_dense, y) in enumerate(tqdm(val_loader)):
  #   X_cat_onehot = torch.tensor(enc_fitted.transform(X_cat), dtype= torch.float)
  #   X= torch.cat([X_cat_onehot, X_dense], dim= 1).to(device)
  #   with torch.no_grad():
  #     output= model(X)
  #   val_score += sum(nn.Sigmoid()(output)>threshold)
  # print(f'val accu: {round(val_score.item()/val_size, 3)}')


In [None]:
# torch.save(model, f'instacart.pt')

In [None]:
# !cp 'instacart.pt' '/content/drive/MyDrive/python_data/kaggle/instacart'

# Create Prediction

In [None]:
import csv

In [None]:
model= torch.load('/content/drive/MyDrive/python_data/kaggle/instacart/instacart.pt', map_location=torch.device(device))
# model= torch.load('/content/drive/MyDrive/python_data/kaggle/instacart/instacart.pt')

In [None]:
# The model returns raw logits (it is trained with nn.BCEWithLogitsLoss), so the
# output is passed through a sigmoid here to get a probability per test row.
preds= []
model.eval()
with torch.no_grad():
  for cat_batch, dense_batch in tqdm(test_loader):
    onehot_batch = torch.tensor(enc_fitted.transform(cat_batch), dtype= torch.float)
    batch_input = torch.cat([onehot_batch, dense_batch], dim= 1).to(device)
    probs = torch.sigmoid(model(batch_input))
    preds.extend(probs.squeeze(dim=1).detach().cpu().numpy())

In [None]:
preds = np.array(preds)
np.save('/content/drive/MyDrive/python_data/kaggle/instacart/data/preds.npy', preds)

In [None]:
preds= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/preds.npy')

In [None]:
# Re-attach the order ids that were set aside as `order_ids` before `order_id`
# was dropped from test_X, and add one predicted probability per test row.
assert len(preds) == len(test_X), 'expected exactly one prediction per test row'
test_X['order_id']= order_ids
test_X['pred']= preds

In [None]:
# (test_X_cat[:, 2] == -1).any()

In [None]:
def rule(x, threshold= 0.6):
  """Binarize a predicted probability: 1 if x >= threshold, otherwise 0.

  Args:
    x: predicted probability.
    threshold: decision cut-off; defaults to the 0.6 this function always used.
  """
  return 1 if x >= threshold else 0

In [None]:
test_X['pred_binary'] = test_X['pred'].apply(rule)

In [None]:
test_X['pred_binary'].value_counts()

In [None]:
# Collect, per order, the products predicted as reordered (in row order).
submission_dict= {}
positive_rows = test_X[test_X['pred_binary']==1]
for order_id, product_id in zip(positive_rows['order_id'], positive_rows['product_id']):
  submission_dict.setdefault(int(order_id), []).append(int(product_id))

In [None]:
# Orders that have no product predicted as reordered are submitted as 'None'.
for order_id in test_X[test_X['pred_binary']==0]['order_id'].unique():
  if order_id not in submission_dict:
    submission_dict[order_id]= 'None'

In [None]:
submission_dict.values()

In [None]:
with open('submission.csv', 'w', newline='') as csvfile:
  # Create the CSV writer.
  writer = csv.writer(csvfile, delimiter=',')

  # Header row.
  writer.writerow(['order_id', 'products'])

  # One row per order: either the literal 'None' or the space-separated product ids.
  for order_key, order_products in submission_dict.items():
    if order_products == 'None':
      writer.writerow([order_key, 'None'])
      continue
    writer.writerow([str(order_key), ' '.join(str(product) for product in order_products)])

In [None]:
len(submission_dict.keys())

# Trivial

In [None]:
! pip install pyarrow

In [None]:
import pyarrow.parquet as pq

In [None]:
def load_data(from_file= True):
	"""Return (train_X, train_y, test_X, test_order_id).

	Args:
		from_file: when True, read the cached parquet / npy files from Drive
			(train_X is the concatenation of its two cached parts). When False,
			rebuild the frames with get_data() and split off the label and ids.

	Returns:
		train_X and test_X as DataFrames, train_y and test_order_id as arrays.
	"""
	if from_file:
		# Load the cached features and labels.
		# train_X1= pd.read_parquet('./data/train_X1.parquet', engine='pyarrow')
		# train_X2= pd.read_parquet('./data/train_X2.parquet', engine='pyarrow')
		# test_X= pd.read_parquet('./data/test_X.parquet', engine='pyarrow')

		train_y= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_y.npy') # allow_pickle=True Series
		train_X1= pq.read_table('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_X1.parquet').to_pandas()
		train_X2= pq.read_table('/content/drive/MyDrive/python_data/kaggle/instacart/data/train_X2.parquet').to_pandas()
		test_X= pq.read_table('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_X.parquet').to_pandas()
		test_order_id= np.load('/content/drive/MyDrive/python_data/kaggle/instacart/data/test_order_id.npy')
		train_X= pd.concat([train_X1, train_X2], axis= 0)
	else:
		# get_data() takes `eval_set` and returns a single frame that still holds
		# the label and identifier columns, so they are split off here.
		# NOTE(review): the column names get_data() produces differ from those the
		# cached parquet files appear to use downstream — confirm before mixing paths.
		train_X= get_data(eval_set= 'train')
		train_y= train_X['reordered'].to_numpy()
		train_X= train_X.drop(columns= ['order_id', 'user_id', 'reordered'])
		test_X= get_data(eval_set= 'test')
		test_order_id= test_X['order_id'].to_numpy()
		test_X= test_X.drop(columns= ['order_id', 'user_id'])
	return train_X, train_y, test_X, test_order_id

In [None]:
# Force CPU inference for this cell (the original assigned to a misspelled name,
# so the override never took effect).
device= 'cpu'
print(f'using {device}')

train_X, train_y, test_X, test_order_id= load_data(from_file= True)
print('Data Loaded!!!')

train_dense_cols= ['train_days_since_prior_order', 'prior_order_count', 'prior_dspo_mean', 'prior_dspo_var', 'user_dep_ratio', 'user_aisle_ratio']
test_dense_cols= ['test_days_since_prior_order', 'prior_order_count', 'prior_dspo_mean', 'prior_dspo_var', 'user_dep_ratio', 'user_aisle_ratio']

train_X_cat= train_X.drop(train_dense_cols, axis= 1).to_numpy()
train_X_dense= train_X[train_dense_cols].to_numpy()
# train_y= train_y.to_numpy()

test_X_cat= test_X.drop(test_dense_cols, axis= 1).to_numpy()
test_X_dense= test_X[test_dense_cols].to_numpy()

# Field widths: one per categorical column plus one field for the dense block.
fields = [len(np.unique(train_X_cat[:, i])) for i in range(train_X_cat.shape[1])] + [len(train_dense_cols)]

# Categories unseen during fit are encoded as all-zero rather than raising.
try:
	encoder= OneHotEncoder(sparse_output= False, handle_unknown='ignore')
except TypeError:  # older scikit-learn releases spell the argument `sparse`
	encoder= OneHotEncoder(sparse= False, handle_unknown='ignore')
enc_fitted= encoder.fit(train_X_cat)

# Test dataloader.
test_dataset = custom_dataset(test_X_cat, test_X_dense, if_y= False)
test_loader = DataLoader(test_dataset, batch_size= 512, shuffle= False, num_workers=2)

# Load the model from the saved .pt file.
print('Loading model...')
model= torch.load('/content/drive/MyDrive/python_data/kaggle/instacart/instacart.pt', map_location=torch.device(device))


# Testing phase: the model returns logits, so apply a sigmoid to get probabilities.
print('start test phase...')
preds= []
model.eval()
with torch.no_grad():
	for i, (X_cat, X_dense) in enumerate(tqdm(test_loader)):
		X_cat_onehot = torch.tensor(enc_fitted.transform(X_cat), dtype= torch.float)
		X= torch.cat([X_cat_onehot, X_dense], dim= 1).to(device)
		output= model(X)
		output= torch.sigmoid(output)
		preds.extend(output.squeeze(dim=1).detach().cpu().numpy())