In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import gc
from collections import OrderedDict

import torch as t
from torch.utils.data import DataLoader
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [2]:
%%time
# Labels come from two user.csv files that are stacked into one frame with a
# fresh 0..N-1 index.
label_1 = pd.read_csv('./data/train_preliminary/user.csv')
label_2 = pd.read_csv('./data/train_semi_final/user.csv')
label = pd.concat([label_1, label_2], axis=0).reset_index(drop=True)

# One pre-built input matrix per feature column, kept in the same column order
# for the train and test lists.
mats_train, mats_test = [], []
for col in tqdm(['creative_id', 'ad_id', 'advertiser_id', 'product_id', 'industry']):
    train_path = './inputs_new/{}_inputs_train.npy'.format(col)
    test_path = './inputs_new/{}_inputs_test.npy'.format(col)
    mats_train.append(np.load(train_path))
    mats_test.append(np.load(test_path))

100%|██████████| 5/5 [03:09<00:00, 37.85s/it]

CPU times: user 496 ms, sys: 7.93 s, total: 8.43 s
Wall time: 3min 9s





In [3]:
import logging

def get_logger(filename, verbosity=1, name=None):
    """Create (or reconfigure) a logger that writes to a file and to the console.

    Args:
        filename: Path of the log file. It is opened in ``"w"`` mode, so an
            existing file is truncated.
        verbosity: ``0`` -> DEBUG, ``1`` -> INFO, ``2`` -> WARNING.
        name: Name passed to ``logging.getLogger``; ``None`` selects the root
            logger.

    Returns:
        The configured ``logging.Logger``.

    Raises:
        KeyError: If ``verbosity`` is not 0, 1 or 2.
    """
    level_dict = {0: logging.DEBUG, 1: logging.INFO, 2: logging.WARNING}
    formatter = logging.Formatter(
        "[%(asctime)s][%(filename)s][line:%(lineno)d][%(levelname)s] %(message)s"
    )
    logger = logging.getLogger(name)
    logger.setLevel(level_dict[verbosity])

    # logging.getLogger returns the same object for the same name, so calling
    # this function again (e.g. re-running a notebook cell) used to stack a new
    # pair of handlers on top of the old ones: every message was then emitted
    # several times and the old file handles were never closed. Remove only the
    # handlers installed by a previous call; foreign handlers are left alone.
    for old_handler in list(logger.handlers):
        if getattr(old_handler, "_installed_by_get_logger", False):
            logger.removeHandler(old_handler)
            old_handler.close()

    fh = logging.FileHandler(filename, "w")
    sh = logging.StreamHandler()
    for handler in (fh, sh):
        handler.setFormatter(formatter)
        handler._installed_by_get_logger = True
        logger.addHandler(handler)

    return logger

In [4]:
class Inception(nn.Module):
    """Four parallel 1-D convolution branches concatenated along the channel axis.

    Every branch produces ``co // 4`` channels, and all convolutions use
    stride 1 with padding chosen so the sequence length is unchanged. The
    concatenated ``co``-channel tensor is optionally passed through
    BatchNorm1d and ReLU.

    Args:
        cin: Number of input channels.
        co: Number of output channels; must be divisible by 4.
        relu: Append a ReLU after the concatenation.
        norm: Append a BatchNorm1d(co) after the concatenation (before ReLU).
    """

    def __init__(self,cin,co,relu=True,norm=True):
        super().__init__()
        assert co % 4 == 0
        width = co // 4

        def stack(*named_layers):
            # Named layers keep the sub-module names stable in state_dict.
            return nn.Sequential(OrderedDict(named_layers))

        # Post-processing applied to the concatenated branch outputs.
        post = nn.Sequential()
        if norm:
            post.add_module('norm', nn.BatchNorm1d(co))
        if relu:
            post.add_module('relu', nn.ReLU(True))
        self.activa = post

        # Branch 1: a single kernel-1 convolution.
        self.branch1 = stack(
            ('conv1', nn.Conv1d(cin, width, 1, stride=1)),
        )
        # Branch 2: kernel-1 convolution, then kernel-3.
        self.branch2 = stack(
            ('conv1', nn.Conv1d(cin, width, 1)),
            ('norm1', nn.BatchNorm1d(width)),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(width, width, 3, stride=1, padding=1)),
        )
        # Branch 3: kernel-3 convolution, then kernel-5 (layer names are kept
        # as in the original even though they do not match the kernel sizes).
        self.branch3 = stack(
            ('conv1', nn.Conv1d(cin, width, 3, padding=1)),
            ('norm1', nn.BatchNorm1d(width)),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(width, width, 5, stride=1, padding=2)),
        )
        # Branch 4: a single kernel-3 convolution.
        self.branch4 = stack(
            ('conv3', nn.Conv1d(cin, width, 3, stride=1, padding=1)),
        )

    def forward(self,x):
        """Apply all four branches to ``x`` and merge them on dimension 1."""
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        merged = t.cat([branch(x) for branch in branches], 1)
        return self.activa(merged)

In [5]:
class CNN(nn.Module):
    """Sequence classifier built on frozen pretrained embeddings and two Inception blocks.

    For each of the five id columns two pretrained embedding tables are loaded
    (one from ``./w2v_256_120`` and one from ``./w2v_128_60``). All embeddings
    are frozen. The looked-up vectors are concatenated per position, passed
    through dropout and two ``Inception`` blocks, max-pooled over the sequence
    axis and classified by a small MLP.

    Args:
        num_classes: Size of the output layer. When omitted, the notebook-level
            global ``n_cls`` is used, which keeps ``CNN()`` working as before.
    """

    # Width of the Inception output and of the hidden classifier layer.
    _CONV_DIM = 1024

    @staticmethod
    def _load_frozen_embedding(path):
        """Load an embedding matrix from ``path`` as a frozen ``nn.Embedding``.

        One all-zero row is appended and registered as ``padding_idx``, so the
        padding index is "vocabulary size" (the last row).
        """
        We = np.load(path)
        # Build the zero row in the matrix's own dtype and width instead of a
        # hard-coded float64 vector, so a float32 matrix is not upcast (and
        # temporarily doubled in memory) by vstack.
        We = np.vstack([We, np.zeros((1, We.shape[1]), dtype=We.dtype)])
        # Public replacement for the private `_weight=` constructor argument;
        # freeze=True sets requires_grad=False on the weight.
        return nn.Embedding.from_pretrained(
            t.as_tensor(We, dtype=t.float32),
            freeze=True,
            padding_idx=len(We) - 1,
        )

    def __init__(self, num_classes=None):
        super(CNN, self).__init__()
        if num_classes is None:
            # Backward-compatible fallback to the global defined in the training cell.
            num_classes = n_cls

        cols = ['creative_id', 'ad_id', 'advertiser_id', 'product_id', 'industry']
        path_templates = (
            './w2v_256_120/{}_embedding_weight.npy',
            './w2v_128_60/{}_embedding_weight.npy',
        )
        # Order matters: encoders[0:len(cols)] come from the first directory and
        # encoders[len(cols):] from the second; forward() relies on this layout.
        emb_outputs = []
        for template in path_templates:
            for col in cols:
                emb_outputs.append(self._load_frozen_embedding(template.format(col)))
        gc.collect()

        self.encoders = nn.ModuleList(emb_outputs)
        # Total per-position feature width (5*256 + 5*128 = 1920 when the
        # tables are 256- and 128-dimensional).
        emb_dim = sum(enc.embedding_dim for enc in self.encoders)

        self.emb_drop = nn.Dropout(p=0.2)
        self.embed_conv=nn.Sequential(
            Inception(emb_dim, self._CONV_DIM),
            Inception(self._CONV_DIM, self._CONV_DIM),
        )
        # forward() max-pools the conv output over the sequence axis, which
        # yields exactly _CONV_DIM features per sample. The first linear layer
        # was previously declared with 1024*2 inputs, which does not match that
        # tensor and makes the first forward pass fail with a shape error.
        self.fc = nn.Sequential(
            nn.Linear(self._CONV_DIM, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.2),
            nn.Linear(1024, num_classes)
        )

    def forward(self, xs):
        """Return class logits for a list of index tensors, one per id column.

        Each element of ``xs`` is looked up in two encoders, so all elements
        must share the same (batch, sequence) shape for the concatenation on
        dimension 2 to work.
        """
        # Offset of the second embedding family inside self.encoders.
        n_in = len(self.encoders) // 2
        inp = [self.encoders[i](x) for i, x in enumerate(xs)] + [self.encoders[i + n_in](x) for i, x in enumerate(xs)]
        x = t.cat(inp, 2)
        x = self.emb_drop(x)
        # Conv1d expects channels on dimension 1.
        x = self.embed_conv(x.permute(0,2,1))
        # Global max over the sequence axis -> (batch, channels).
        x = t.max(x, dim=2)[0]
        logits = self.fc(x)
        return logits

In [6]:
def clip_gradient(optimizer, grad_clip):
    """Clamp every gradient element of the optimizer's parameters in place.

    Args:
        optimizer: Optimizer whose ``param_groups`` hold the parameters to clip.
        grad_clip: Bound; each gradient value is limited to
            ``[-grad_clip, grad_clip]``.

    Parameters that have no gradient yet (``param.grad is None``, e.g. a
    parameter that did not take part in the last backward pass) are skipped
    instead of raising ``AttributeError``.
    """
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is None:
                continue
            param.grad.data.clamp_(-grad_clip, grad_clip)

In [7]:
class Dataset(t.utils.data.Dataset):
    """Map-style dataset over several parallel input matrices and one label array.

    Args:
        xs: List of array-likes indexed by sample; ``len(xs[0])`` defines the
            dataset size.
        y: Labels, indexed by sample.
        shuffle: When true, a sample's sequences are randomly permuted with
            probability 0.8, using the same permutation for every input so
            positions stay aligned across inputs.
    """

    def __init__(self, xs, y, shuffle=False):
        self.xs = xs
        self.y = y
        self.size = len(xs[0])
        self.shuffle = shuffle

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` for sample ``idx``; inputs are int64 copies."""
        # astype() copies, so the permutation below never touches self.xs.
        xs = [x[idx].astype(np.int64) for x in self.xs]
        y = self.y[idx]
        # Randomness is drawn from torch's RNG rather than NumPy's global RNG:
        # DataLoader seeds torch differently in every worker process, whereas
        # forked workers inherit identical NumPy state and would therefore all
        # replay the same "random" shuffles.
        if self.shuffle and xs and t.rand(1).item() < 0.8:
            perm = t.randperm(len(xs[0])).numpy()
            xs = [x[perm] for x in xs]
        return xs, y

In [8]:
def train(model, loader, optimizer, criterion):
    """Run one training epoch on the GPU and print progress and epoch accuracy.

    Args:
        model: Network called with a list of input tensors; put into train mode.
        loader: Iterable of ``(xs, y)`` batches that also supports ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, targets)``.

    Gradients are clamped to ``[-0.1, 0.1]`` via ``clip_gradient`` before each
    optimizer step. The accuracy over all batches of the epoch is printed; the
    function itself returns ``None``.
    """
    started = time.time()
    targets, preds = [], []
    model.train()
    n_batches = len(loader)
    for step, (xs, y) in enumerate(loader, start=1):
        optimizer.zero_grad()
        device_xs = [t.LongTensor(x).cuda() for x in xs]
        logits = model(device_xs)
        loss = criterion(logits, t.LongTensor(y).cuda())
        loss.backward()
        clip_gradient(optimizer, 0.1)
        optimizer.step()
        # Predicted class = arg-max of the softmax probabilities.
        probs = t.softmax(logits, 1).detach().cpu().numpy()
        preds.append(np.argmax(probs, axis=1))
        targets.append(y)
        print('process: [%d/%d]' % (step, n_batches), end='\r')
    y_true = np.hstack(targets)
    y_pred = np.hstack(preds)
    score = accuracy_score(y_true, y_pred)
    print('process: [%d/%d], score: %f, dtime: %ds' % (step, n_batches, score, time.time()-started), end='\n')
    return None

def val(model, loader):
    """Evaluate ``model`` on ``loader`` (GPU, no gradients) and report accuracy.

    Args:
        model: Network called with a list of input tensors.
        loader: Iterable of ``(xs, y)`` batches that also supports ``len()``.

    Returns:
        ``(y_pred, score)``: the predicted class index per sample and the
        accuracy against the labels yielded by the loader.

    The model is switched to eval mode for the pass and back to train mode
    afterwards.
    """
    started = time.time()
    targets, preds = [], []
    model.eval()
    n_batches = len(loader)
    with t.no_grad():
        for step, (xs, y) in enumerate(loader, start=1):
            device_xs = [t.LongTensor(x).cuda() for x in xs]
            probs = t.softmax(model(device_xs), 1)
            preds.append(np.argmax(probs.cpu().numpy(), axis=1))
            targets.append(y)
            print('process: [%d/%d]' % (step, n_batches), end='\r')
    model.train()
    y_true = np.hstack(targets)
    y_pred = np.hstack(preds)
    score = accuracy_score(y_true, y_pred)
    print('process: [%d/%d], score: %f, dtime: %ds' % (step, n_batches, score, time.time()-started), end='\n')
    return y_pred, score

def test(model, loader, y_test):
    t0 = time.time()
    y_true = []
    y_pred = []
    model.eval()
    with t.no_grad():
        for i, (xs,y) in enumerate(loader):
            xs = [t.LongTensor(x).cuda() for x in xs]
            yp = model(xs)
            yp = t.softmax(yp, 1)
            y_test[i*batch_size:(i+1)*batch_size] = yp.cpu().numpy()
            print('process: [%d/%d]' % (i + 1, len(loader)), end='\r')
    model.train() 
    print('process: [%d/%d], dtime: %ds' % (i + 1, len(loader), time.time()-t0), end='\n')
    return y_test

In [None]:
# 5-fold cross-validated training. For every fold the test-set probabilities of
# the epoch with the best validation accuracy are kept and summed into y_test.
tmp_data = pd.read_csv('./data/train_preliminary/user.csv')  # NOTE(review): not used in this cell
y = label['age'].values-1  # shift the 'age' labels down by one to index the n_cls outputs
n_cls = 10
batch_size = 1024
n_folds = 5
n_epochs = 20
y_test = np.zeros([mats_test[0].shape[0], n_cls])
loader_te = DataLoader(Dataset(mats_test, np.zeros(mats_test[0].shape[0])), batch_size=batch_size, shuffle=False, num_workers=4)
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=1995)
best_scores = []
logger = get_logger('./torch2.log')

# `fold` and `epoch` are separate names: the inner loop used to reuse `i` and
# silently overwrote the fold index.
for fold, (idx_trn, idx_val) in enumerate(kfold.split(np.zeros((mats_train[0].shape[0], 1)))):
    logger.info('fold {} start training!'.format(fold))
    x_test = np.zeros([mats_test[0].shape[0], n_cls])
    x_trn = [mat[idx_trn] for mat in mats_train]
    x_val = [mat[idx_val] for mat in mats_train]
    y_trn, y_val = y[idx_trn], y[idx_val]

    # Release the previous fold's model (and the optimizer state that refers to
    # it) before the next one is moved to the GPU; otherwise both copies of the
    # embedding tables are resident at the same time.
    model = optimizer = scheduler = None
    gc.collect()
    t.cuda.empty_cache()

    model = CNN().cuda()
    optimizer = t.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr = 0.001)
    scheduler = t.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    criterion = nn.CrossEntropyLoss()
    loader_trn = DataLoader(Dataset(x_trn, y_trn, shuffle=True), batch_size=batch_size, shuffle=True, num_workers=4)
    loader_val = DataLoader(Dataset(x_val, y_val), batch_size=batch_size, shuffle=False, num_workers=4)
    best_score = 0.0
    for epoch in range(n_epochs):
        # Constant learning rate for epochs 0-10, then multiplied by 0.9 before
        # every following epoch.
        if epoch > 10:
            scheduler.step()
        train(model, loader_trn, optimizer, criterion)
        result, score = val(model, loader_val)
        logger.info('Epoch:[{}/{}]\t score={:.5f}\t'.format(epoch , n_epochs, score))
        if score > best_score:
            # New best epoch for this fold: overwrite the fold's test predictions.
            x_test = test(model, loader_te, x_test)
            best_score = score
            print('best score: %.5f' % score)
    best_scores.append(best_score)
    y_test += x_test
print('best scores:', best_scores)

[2020-07-19 03:50:02,444][<ipython-input-10-6dec02038b62>][line:12][INFO] fold 0 start training!


process: [2344/2344], score: 0.438579, dtime: 1300s
process: [586/586]

[2020-07-19 04:14:05,076][<ipython-input-10-6dec02038b62>][line:29][INFO] Epoch:[0/20]	 score=0.47281	


process: [586/586], score: 0.472813, dtime: 115s
process: [977/977], dtime: 190s
best score: 0.47281
process: [2344/2344], score: 0.470395, dtime: 1296s
process: [586/586]

[2020-07-19 04:40:47,735][<ipython-input-10-6dec02038b62>][line:29][INFO] Epoch:[1/20]	 score=0.47964	


process: [586/586], score: 0.479637, dtime: 115s
process: [977/977], dtime: 190s
best score: 0.47964
process: [2344/2344], score: 0.480679, dtime: 1295s
process: [586/586]

[2020-07-19 05:07:28,655][<ipython-input-10-6dec02038b62>][line:29][INFO] Epoch:[2/20]	 score=0.48953	


process: [586/586], score: 0.489527, dtime: 114s
process: [324/977]

In [11]:
# np.save does not create missing parent directories; make sure the target
# folder exists so the predictions are not lost after the long training run.
os.makedirs('./age_res', exist_ok=True)
# y_test holds the sum of the per-fold probabilities; dividing by 5 (the
# n_splits of the KFold used for training) turns it into their mean.
np.save('./age_res/torch_cnn2_test_pred.npy', y_test/5)

In [11]:
# Mean over folds of the best validation accuracy reached in each fold.
np.mean(best_scores)

0.508359

In [16]:
# Sanity check: the fold-averaged class probabilities of the first test row
# should sum to approximately 1.
np.sum(y_test[0]/5)

0.9999999922224582