In [None]:
# Make the project folder the working directory so the relative paths used by the
# later cells (./data/..., ./model1, ./v1.csv) resolve against it.
# NOTE(review): presumably a Google Drive mount in Colab (/content/drive) - the
# mount step is not in this notebook, so confirm it has been done before running.
%cd /content/drive/MyDrive/DataScience

In [3]:
#=======================execute this=======================
#require: pytorch
import os
import numpy as np
import time
import datetime
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd


In [None]:
# Scratch check: read the sample submission file and display it as the cell output.
# Nothing below in this notebook reads this `df`; the next scratch cell rebinds it.
df = pd.read_csv("./data/sample_submission.csv")
df

In [None]:
# Scratch check: read the processed test file and order its rows by `ncodpers`,
# mirroring the sort that myDataset applies when it loads a CSV.
df = pd.read_csv("./data/processed/test_v1.csv").sort_values(by=['ncodpers'])

In [None]:

import sys

# Print the interpreter version and the PyTorch version, one per line.
for version_text in (sys.version, torch.__version__):
  print(version_text)

In [4]:
#=======================execute this=======================
def infinite_iter(iterable):
  """Yield the items of ``iterable`` over and over, restarting after each pass.

  A fresh iterator is requested from ``iterable`` at the start of every pass, so
  re-iterable inputs (lists, DataLoaders, ...) cycle forever.

  If a pass produces no items at all - an empty input, or a one-shot iterator
  that is already exhausted - the generator stops. Without this guard the loop
  would spin forever without ever yielding, hanging the caller.
  """
  while True:
    produced_any = False
    for item in iterable:
      produced_any = True
      yield item
    if not produced_any:
      # Restarting would only repeat the same empty pass.
      return

class myDataset(Dataset):
  """Row-wise dataset over a preprocessed CSV file.

  Indexing returns ``(x, code)``:
    * ``x``    - 1-D float64 tensor built from every column except ``ncodpers``.
                 Columns listed in ``ONEHOT_WIDTHS`` are expanded to a one-hot
                 vector; every other column contributes one float.
    * ``code`` - the row's ``ncodpers`` value as a Python int.
  """

  # One-hot vector width for each categorical column. Kept at class level so the
  # table is built once instead of once per row.
  ONEHOT_WIDTHS = {'ind_empleado': 5, 'pais_residencia': 118, 'indrel_1mes': 4,
                   'tiprel_1mes': 4, 'canal_entrada': 162, 'cod_prov': 52}

  def __init__(self, csv_file):
    """Read ``csv_file`` and order its rows by ``ncodpers``."""
    self.df_test = pd.read_csv(csv_file).sort_values(by=['ncodpers']) #during test
    # Expected feature length (original note: 19 columns, some one-hot encoded).
    # Not read anywhere in this class; kept for external users of the attribute.
    self.x_size = 358

  def __len__(self):
    """Number of rows in the loaded frame."""
    return len(self.df_test)

  def dfrow2tensor(self, row):
    """Encode one dataframe row.

    Args:
      row: mapping/Series indexed by the column names of ``self.df_test``.

    Returns:
      ``(features, ncodpers)`` where ``features`` is a 1-D float64 numpy array
      following the column order of ``self.df_test`` (``ncodpers`` skipped) and
      ``ncodpers`` is the raw value taken from the row.
    """
    # Start with an empty float array so a frame with no feature columns still
    # yields a zero-length float64 vector.
    pieces = [np.zeros(0)]
    for col in self.df_test.columns:
      if col == 'ncodpers':
        continue
      width = self.ONEHOT_WIDTHS.get(col)
      if width is None:
        pieces.append(np.array([float(row[col])]))
      else:
        # NOTE(review): a negative category index wraps around (NumPy indexing)
        # and a NaN makes int() raise - confirm preprocessing emits neither.
        onehot = np.zeros(width)
        onehot[int(row[col])] = 1
        pieces.append(onehot)
    # Single concatenation instead of re-copying the growing vector per column.
    return np.concatenate(pieces, axis=None), row['ncodpers']

  def __getitem__(self, idx):
    """Return ``(feature_tensor, int(ncodpers))`` for the row at position ``idx``."""
    row = self.df_test.iloc[idx]
    features, code = self.dfrow2tensor(row)
    return torch.from_numpy(features), int(code)

In [5]:
#=======================execute this=======================
class LogisticRegression(nn.Module):
  """Single linear layer followed by an element-wise sigmoid.

  Args:
    in_features: length of the input feature vector (default 358).
    out_features: number of independent sigmoid outputs (default 24).

  The defaults reproduce the original fixed 358 -> 24 layer, and the submodule
  names (``linear``, ``act``) are unchanged, so existing checkpoints still load.
  """
  def __init__(self, in_features=358, out_features=24):
    super().__init__()
    self.linear = nn.Linear(in_features, out_features)
    self.act = nn.Sigmoid()

  def forward(self, x):
    """Return ``sigmoid(linear(x))``; the last dim of ``x`` must be ``in_features``."""
    return self.act(self.linear(x))

In [6]:
#=======================execute this=======================
def save_model(dir, model, opt, iter):
  """Write the model and optimizer state dicts into directory ``dir``.

  Args:
    dir: target directory; created (including missing parents) if absent.
    model: module whose ``state_dict()`` is saved.
    opt: optimizer whose ``state_dict()`` is saved.
    iter: checkpoint tag. A negative value writes ``model.ckpt`` / ``opt.opt``;
      otherwise the files are ``model.ckpt-<iter>`` / ``opt.opt-<iter>``.

  The parameter names shadow the builtins ``dir`` and ``iter``; they are kept
  so keyword callers continue to work.
  """
  # makedirs handles nested paths and avoids the exists()/mkdir() race.
  os.makedirs(dir, exist_ok=True)
  suffix = '' if iter < 0 else f'-{iter}'
  torch.save(model.state_dict(), f'{dir}/model.ckpt{suffix}')
  torch.save(opt.state_dict(), f'{dir}/opt.opt{suffix}')

def load_model(dir, model, opt, iter):
  """Restore model and optimizer state from directory ``dir``.

  Args:
    dir: directory holding the checkpoint files.
    model: module to receive the saved parameters (modified in place).
    opt: optimizer to receive the saved state (modified in place).
    iter: checkpoint tag. A negative value reads ``model.ckpt`` / ``opt.opt``;
      otherwise ``model.ckpt-<iter>`` / ``opt.opt-<iter>``.

  Returns:
    The same ``(model, opt)`` objects that were passed in.

  The parameter names shadow the builtins ``dir`` and ``iter``; they are kept
  so keyword callers continue to work.
  """
  suffix = '' if iter < 0 else f'-{iter}'
  # Deserialize onto the CPU so a checkpoint saved from a GPU run can be opened
  # on a CPU-only machine; load_state_dict then copies the values into the
  # existing parameters on whatever device they live on.
  model.load_state_dict(torch.load(f'{dir}/model.ckpt{suffix}', map_location='cpu'))
  opt.load_state_dict(torch.load(f'{dir}/opt.opt{suffix}', map_location='cpu'))
  return model, opt

In [None]:
#=======================execute this=======================
print("loading data...")
s_t = time.time()

# NOTE(review): the scratch cell earlier reads ./data/processed/test_v1.csv while
# this cell reads test.csv - confirm which file is the intended inference input.
test_dataset = myDataset('./data/processed/test.csv')
print(len(test_dataset))
# No DataLoader is built here: the inference cell indexes test_dataset directly.

print("data prepared!")
et = time.time() - s_t
# Truncate to whole seconds numerically. Slicing str(timedelta) by a fixed width
# breaks when the fractional part is absent (the string then has no ".ffffff").
et = str(datetime.timedelta(seconds=int(et)))
print("[{}]".format(et))

In [8]:
#=======================execute this=======================
# Lookup from model output index (0..23) to the product column name written to
# the submission file. The name shadows the builtin `map`; it is kept because
# the inference cell indexes this dict as `map[...]`.
map = dict(enumerate((
    "ind_ahor_fin_ult1",
    "ind_aval_fin_ult1",
    "ind_cco_fin_ult1",
    "ind_cder_fin_ult1",
    "ind_cno_fin_ult1",
    "ind_ctju_fin_ult1",
    "ind_ctma_fin_ult1",
    "ind_ctop_fin_ult1",
    "ind_ctpp_fin_ult1",
    "ind_deco_fin_ult1",
    "ind_deme_fin_ult1",
    "ind_dela_fin_ult1",
    "ind_ecue_fin_ult1",
    "ind_fond_fin_ult1",
    "ind_hip_fin_ult1",
    "ind_plan_fin_ult1",
    "ind_pres_fin_ult1",
    "ind_reca_fin_ult1",
    "ind_tjcr_fin_ult1",
    "ind_valo_fin_ult1",
    "ind_viv_fin_ult1",
    "ind_nomina_ult1",
    "ind_nom_pens_ult1",
    "ind_recibo_ult1",
)))

In [None]:
#=======================execute this=======================
import csv

# ---- configuration ----
model_dir = "./model1" #model path
use_gpu = False #not needed, cpu is enough here
out_path = "./v1.csv" #output path and name
top_k = 7 # number of highest-scoring products written per row

# ---- restore the trained model ----
# load_model requires an optimizer argument, so one is created even though no
# training step happens in this cell.
model = LogisticRegression()
opt = torch.optim.Adam(model.parameters(), lr = 1e-3)
model, opt = load_model(model_dir, model, opt, -1)
if use_gpu:
  model = model.to('cuda')
model.eval()
print("start testing")

# ---- score every row and write "ncodpers, added_products" lines ----
with open(out_path, 'w', newline='') as csvfile:
  writer = csv.writer(csvfile)
  writer.writerow(["ncodpers", "added_products"])
  s_t = time.time()
  total = len(test_dataset)
  # Inference only: skip autograd bookkeeping for every forward pass.
  with torch.no_grad():
    for i in range(total):
      x, code = test_dataset[i]
      x = x.float()  # dataset yields float64; the model parameters are float32
      if use_gpu:
        x = x.to('cuda')
      # .cpu() is required before .numpy() when the model ran on CUDA.
      y_pred = model(x).cpu().numpy()
      # Indices of the top_k largest scores, best first.
      y_pred = y_pred.argsort()[-top_k:][::-1]
      # Space-separated product names, without a trailing separator.
      product = " ".join(map[int(e)] for e in y_pred)
      writer.writerow([code, product])
      if i % 10000 == 0:
        # Whole-second elapsed time; avoids slicing the timedelta string, which
        # breaks when the fractional part is absent.
        current_t = str(datetime.timedelta(seconds=int(time.time() - s_t)))
        print("[{}]progress: {}/{}".format(current_t, i, total))
print("finish")


In [None]:
# Sanity check: read back the CSV written by the inference cell (./v1.csv) and
# display it as the cell output.
df_o = pd.read_csv("./v1.csv")
df_o