In [None]:
#setup and building blocks: dependencies, imports, dataset, model, loss and checkpoint helpers

In [None]:
#install tensorboard if you want to record training log
!pip install tensorboardX

In [None]:
#Google Colab only: switch to the project folder on the mounted Drive so that the
#relative paths used below (./data/processed, ./model1, ./log) resolve.
#Skip this cell when running outside Colab.
%cd /content/drive/MyDrive/DataScience

In [3]:
#require: pytorch
import os
import numpy as np
import time
import datetime
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import pandas as pd


In [4]:
def infinite_iter(iterable):
  """Yield the items of `iterable` forever, restarting it each time it is exhausted.

  `iter(iterable)` is called again for every pass, so a re-iterable object such as a
  DataLoader produces a fresh pass each time.

  If a complete pass produces no item at all (empty iterable), the generator stops
  instead of spinning forever without ever yielding.
  """
  while True:
    produced_any = False
    for item in iterable:
      produced_any = True
      yield item
    if not produced_any:
      # Nothing to cycle over: restarting again would be a busy loop that never yields.
      return

class myDataset(Dataset):
  """Dataset backed by a CSV file; each row becomes an (x, y) pair of 1-D tensors.

  Every column except 'ncodpers' is encoded in column order: columns listed in
  ONEHOT_SIZES are expanded to a one-hot vector, all other columns are kept as a
  single float. The first `x_size` entries of the encoded row are the features `x`,
  the remaining entries are the targets `y`.
  """

  #number of one-hot slots per categorical column (345 slots in total)
  ONEHOT_SIZES = {'ind_empleado': 5, 'pais_residencia': 118, 'indrel_1mes': 4,
                  'tiprel_1mes': 4, 'canal_entrada': 162, 'cod_prov': 52}

  def __init__(self, csv_file, x_size=358):
    """
    csv_file: path of the CSV file, read with pandas.read_csv.
    x_size:   length of the encoded feature part of a row. The default 358 is the
              value previously hard-coded here (original note: 19 columns, some of
              them one-hot encoded).
    """
    self.df_train = pd.read_csv(csv_file)
    self.x_size = x_size

  def __len__(self):
    """Number of rows in the CSV file."""
    return len(self.df_train)

  def dfrow2tensor(self, row):
    """Encode one DataFrame row as a flat float64 numpy array.

    Raises IndexError when a one-hot column holds a value >= its size in ONEHOT_SIZES.
    """
    #collect the pieces and concatenate once instead of re-allocating per column;
    #the empty seed keeps the result a float64 array even when no column is encoded
    pieces = [np.zeros(0)]
    for col in self.df_train.columns:
      if col == 'ncodpers':
        continue
      width = self.ONEHOT_SIZES.get(col)
      if width is None:
        pieces.append(np.array([float(row[col])]))
      else:
        onehot = np.zeros(width)
        onehot[int(row[col])] = 1
        pieces.append(onehot)
    return np.concatenate(pieces, axis=None)

  def __getitem__(self, idx):
    """Return (x, y) for row `idx` as float64 tensors split at `x_size`."""
    encoded = self.dfrow2tensor(self.df_train.iloc[idx])
    x = torch.from_numpy(encoded[:self.x_size])
    y = torch.from_numpy(encoded[self.x_size:])
    return (x, y)

In [5]:
class LogisticRegression(nn.Module):
  """Single linear layer followed by an element-wise sigmoid.

  in_features / out_features default to the sizes that were previously hard-coded
  (358 inputs, 24 outputs), so `LogisticRegression()` builds the same model as before.
  The sub-module names `linear` and `act` are unchanged, so existing state dicts load.
  """
  def __init__(self, in_features=358, out_features=24):
    super(LogisticRegression, self).__init__()
    self.linear = torch.nn.Linear(in_features, out_features)
    self.act = torch.nn.Sigmoid()

  def forward(self, x):
    """Map x of shape (..., in_features) to values in (0, 1) of shape (..., out_features)."""
    return self.act(self.linear(x))


In [6]:
def get_loss(y, label):
  """Return the L1 loss (mean absolute difference over all elements) of `y` vs `label`."""
  return nn.functional.l1_loss(y, label)

def save_model(dir, model, opt, iter):
  """Save the state dicts of `model` and `opt` into directory `dir`.

  dir:   target directory; created (including missing parent directories) if needed.
  iter:  negative -> files are named 'model.ckpt' and 'opt.opt';
         otherwise -> 'model.ckpt-{iter}' and 'opt.opt-{iter}'.
  """
  #makedirs with exist_ok handles nested paths and avoids the check-then-create race of os.mkdir
  os.makedirs(dir, exist_ok=True)
  suffix = '' if iter < 0 else f'-{iter}'
  torch.save(model.state_dict(), f'{dir}/model.ckpt{suffix}')
  torch.save(opt.state_dict(), f'{dir}/opt.opt{suffix}')

def load_model(dir, model, opt, iter):
  """Load state dicts written by save_model from `dir` into `model` and `opt`.

  iter:  negative -> reads 'model.ckpt' and 'opt.opt';
         otherwise -> reads 'model.ckpt-{iter}' and 'opt.opt-{iter}'.

  Returns the same (model, opt) objects, updated in place.
  Raises FileNotFoundError when the requested files do not exist.
  """
  suffix = '' if iter < 0 else f'-{iter}'
  #map_location='cpu' lets a checkpoint saved from CUDA tensors be read on a CPU-only
  #machine; load_state_dict then copies the values onto the device of model/opt
  model.load_state_dict(torch.load(f'{dir}/model.ckpt{suffix}', map_location='cpu'))
  opt.load_state_dict(torch.load(f'{dir}/opt.opt{suffix}', map_location='cpu'))
  return model, opt


In [None]:
#load train data

In [None]:
#Build the training pipeline: CSV -> Dataset -> shuffled batches of 128 -> endless iterator
print("loading data...")
s_t = time.time()
train_dataset = myDataset('./data/processed/train.csv')
train_loader = DataLoader(
  train_dataset,
  batch_size=128,
  shuffle=True,
  num_workers=2,
)
train_iterator = infinite_iter(train_loader)

#the test split is currently not loaded:
#test_dataset = myDataset('./data/processed/test.csv')
#test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=2)
#test_iterator = infinite_iter(test_loader)

print("data prepared!")
#elapsed time as H:MM:SS (the slice drops the 7-character microsecond suffix)
et = str(datetime.timedelta(seconds=time.time() - s_t))[:-7]
print("[{}]".format(et))


In [None]:
#train

In [8]:
#Training configuration
model_dir = "./model1"
use_gpu = False #not needed, cpu is enough here
log = True #require tensorboard
log_dir = './log'

model = LogisticRegression()
opt = torch.optim.Adam(model.parameters(), lr = 1e-3)
#uncomment to resume from the latest checkpoint in model_dir
#model, opt = load_model(model_dir, model, opt, -1)
if use_gpu:
  model = model.to('cuda')
if log:
  #imported only when logging is enabled, so log = False works without tensorboardX installed
  from tensorboardX import SummaryWriter
  logger = SummaryWriter(log_dir)
model.train()
print("start training")
s_t = time.time()
for iteration in range(0, 3000):
  x, y = next(train_iterator)
  x = x.float()
  if use_gpu:
    x = x.to('cuda')
    y = y.to('cuda')
  y_pred = model(x)
  #prediction first, target second (get_loss(y, label)); L1 is symmetric so the value is unchanged
  loss = get_loss(y_pred, y)
  opt.zero_grad()
  loss.backward()
  opt.step()
  #report every 100 iterations
  if ((iteration+1) % 100 == 0):
    et = time.time() - s_t
    et = str(datetime.timedelta(seconds=et))[:-7]
    print("[{}]iteration {} loss: {}".format(et, iteration+1, loss.item()))
    if log:
      #log the plain float, not the tensor that is still attached to the autograd graph
      logger.add_scalar('L1 loss', loss.item(), iteration+1)
  #checkpoint every 1000 iterations: refresh the "latest" files and keep a numbered snapshot
  if ((iteration+1) % 1000 == 0):
    save_model(model_dir, model, opt, -1)
    save_model(model_dir, model, opt, iteration+1)
if log:
  #flush pending events to disk
  logger.close()

start training
[0:00:08]iteration 100 loss: 0.2631032794161001
[0:00:15]iteration 200 loss: 0.1610944625499542
[0:00:22]iteration 300 loss: 0.11242317681656762
[0:00:28]iteration 400 loss: 0.09508434998012187
[0:00:35]iteration 500 loss: 0.07859054910113628
[0:00:42]iteration 600 loss: 0.060412948532454415
[0:00:49]iteration 700 loss: 0.06284219680674141
[0:00:56]iteration 800 loss: 0.05895390882862254
[0:01:02]iteration 900 loss: 0.06398943880049046
[0:01:09]iteration 1000 loss: 0.057787239741855956
[0:01:19]iteration 1100 loss: 0.06239357281545684
[0:01:26]iteration 1200 loss: 0.06253886151527392
[0:01:33]iteration 1300 loss: 0.046585924793930644
[0:01:40]iteration 1400 loss: 0.05955934148467653
[0:01:47]iteration 1500 loss: 0.06110280457168452
[0:01:54]iteration 1600 loss: 0.05117822363013147
[0:02:01]iteration 1700 loss: 0.05139798370572635
[0:02:08]iteration 1800 loss: 0.0413441271754967
[0:02:15]iteration 1900 loss: 0.057608185894575094
[0:02:22]iteration 2000 loss: 0.05255925753

In [None]:
#load dev data

In [None]:
#Build the dev pipeline: CSV -> Dataset -> one sample per batch, in file order -> endless iterator
print("loading data...")
s_t = time.time()

dev_dataset = myDataset('./data/processed/dev.csv')
dev_loader = DataLoader(
  dev_dataset,
  batch_size=1,
  shuffle=False,
  num_workers=2,
)
dev_iterator = infinite_iter(dev_loader)

print("data prepared!")
#elapsed time as H:MM:SS (the slice drops the 7-character microsecond suffix)
et = str(datetime.timedelta(seconds=time.time() - s_t))[:-7]
print("[{}]".format(et))

In [31]:
def map7(pred, label, k=7): #pred, label: numpy 1d arrays of the same length
  """Average precision at k for one sample (k defaults to 7).

  pred:   1-D array of scores, one per class; higher means more likely.
  label:  1-D array of the same length; entries equal to 1 mark the relevant classes.
  k:      number of top-scored classes that are evaluated.

  Returns sum over hit ranks r of (hits so far / r), divided by min(k, number of
  relevant classes); 0.0 when no class is relevant.
  Raises ValueError when k is not positive.
  """
  if k <= 0:
    raise ValueError("k must be a positive integer")

  relevant = {i for i, v in enumerate(label) if v == 1}
  n_relevant = min(k, len(relevant))
  if n_relevant == 0:
    return 0.0

  #indices of the k highest scores, best first; shorter than k when pred has fewer entries
  top = np.asarray(pred).argsort()[-k:][::-1]

  score = 0.0
  hits = 0
  for rank, idx in enumerate(top, start=1):
    if int(idx) in relevant:
      hits += 1
      score += hits / rank
  return score / n_relevant

In [None]:
#validation

In [None]:
model_dir = './model1'
#-1 loads the latest checkpoint (model.ckpt / opt.opt). The training cell above runs
#3000 iterations, so numbered snapshots exist only for 1000, 2000 and 3000;
#set a positive number here to evaluate one of those snapshots instead.
ckpt_iter = -1

model = LogisticRegression()
opt = torch.optim.Adam(model.parameters(), lr = 1e-3)
model, opt = load_model(model_dir, model, opt, ckpt_iter)
model.eval()

score = 0
#evaluate at most 10000 samples, and never more than the dev set holds, so no sample is counted twice
dev_sample = min(10000, len(dev_loader))
#iterate dev_loader directly: every run of this cell starts from the first dev sample
with torch.no_grad():
  for i, (x, y) in enumerate(dev_loader):
    if i >= dev_sample:
      break
    x = x.float()
    pred = model(x)
    #batch_size is 1 in dev_loader, so squeeze() leaves one 1-D vector per sample
    _score = map7(pred.squeeze().numpy(), y.squeeze().numpy())
    score += _score
    if (i+1) % 1000 == 0:
      print("progress: {}/{}".format(i+1, dev_sample))

print("MAP@7 score:", score / dev_sample if dev_sample else float('nan'))


In [None]:
#Inspect one training batch: print and count the samples whose target vector sums to more than 1
_c = 0
for iteration in range(0, 1):
  x, y = next(train_iterator)
  #use the actual batch size: the last batch of an epoch can hold fewer than 128 samples
  for i in range(0, y.size(0)):
    _y = y[i]
    s = torch.sum(_y)
    if s > 1:
      _c += 1
      print(_y)
print(_c)

In [None]:
#Drop the DataFrame held by train_dataset. __len__ and __getitem__ read self.df_train,
#so train_dataset (and train_loader / train_iterator built on it) cannot be used after this cell.
del train_dataset.df_train

In [None]:
#Display the training DataFrame.
#NOTE(review): raises AttributeError once `del train_dataset.df_train` has run in this kernel.
train_dataset.df_train

In [None]:
#Read ./data/processed/ver1_1.csv into `a`; `a` is not used by any other cell visible in this notebook.
a = pd.read_csv("./data/processed/ver1_1.csv")