## 模型

In [1]:

import torch as t
import time

class BasicModule(t.nn.Module):
    '''
    Thin wrapper around ``nn.Module`` that adds checkpoint helpers
    (``save`` / ``load``) and a two-group Adam optimizer factory.

    Subclasses are expected to expose an ``encoder`` sub-module;
    ``get_optimizer`` gives it its own learning rate.
    '''

    def __init__(self):
        super(BasicModule,self).__init__()
        self.model_name=str(type(self))# default name; subclasses overwrite it

    def load(self, path,change_opt=True):
        '''
        Restore the weights stored at ``path`` and return the model.

        The checkpoint is deserialised onto the CPU first so that loading
        also works on machines without a GPU; the model is moved to CUDA
        only when CUDA is available.

        ``change_opt`` is accepted for backward compatibility and ignored.
        '''
        print("Loading model from " + path)
        # load_state_dict copies into the existing parameters, so reading the
        # checkpoint on the CPU is safe whatever device the model lives on.
        data = t.load(path, map_location='cpu')
        self.load_state_dict(data)
        return self.cuda() if t.cuda.is_available() else self

    def save(self, name=None,new=False):
        '''
        Write the state dict to ``../ckpt/<model_name>_<name>.pt``.

        When ``name`` is omitted a timestamp is used. The checkpoint
        directory is created if it is missing. Returns the path written.
        ``new`` is accepted for backward compatibility and ignored.
        '''
        import os

        prefix = '../ckpt/' + self.model_name + '_'
        if name is None:
            name = time.strftime('%m%d_%H:%M:%S.pth')
        path = prefix+name +'.pt'
        # t.save does not create parent directories on its own.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        t.save(self.state_dict(), path)
        print("Saving model to "+ path)
        return path

    def get_optimizer(self,lr1,lr2=0,weight_decay = 0):
        '''
        Build an Adam optimizer with two parameter groups.

        * every parameter outside ``self.encoder``: learning rate ``lr1``
          and the given ``weight_decay``;
        * the parameters of ``self.encoder``: learning rate ``lr2``
          (``lr1 * 0.5`` when ``lr2`` is None) and Adam's default decay.
        '''
        encoder_params = list(self.encoder.parameters())
        # A set keeps the membership test O(1) per parameter.
        encoder_ids = {id(p) for p in encoder_params}
        base_params = [p for p in self.parameters() if id(p) not in encoder_ids]
        if lr2 is None:
            lr2 = lr1*0.5
        return t.optim.Adam([
            dict(params=base_params, weight_decay=weight_decay, lr=lr1),
            {'params': encoder_params, 'lr': lr2},
        ])


In [2]:
from torch import nn
from collections import OrderedDict

class Inception(nn.Module):
    '''
    Inception-style 1-D convolution block.

    Four parallel branches each map ``cin`` input channels to ``co // 4``
    output channels; their outputs are concatenated along the channel axis
    (``co`` channels in total) and passed through an optional BatchNorm
    and an optional ReLU. Every branch uses kernel/padding pairs that keep
    the sequence length unchanged.

    Args:
        cin: number of input channels.
        co: number of output channels; must be divisible by 4.
        relu: append a ReLU after the concatenation.
        norm: append a BatchNorm1d after the concatenation.
    '''
    def __init__(self,cin,co,relu=True,norm=True):
        super(Inception, self).__init__()
        assert(co%4==0)
        cos=[co//4]*4
        # Post-concatenation activation; an empty Sequential is the identity.
        self.activa=nn.Sequential()
        if norm:self.activa.add_module('norm',nn.BatchNorm1d(co))
        if relu:self.activa.add_module('relu',nn.ReLU(True))
        # Branch 1: single 1x1 convolution.
        self.branch1 =nn.Sequential(OrderedDict([
            ('conv1', nn.Conv1d(cin,cos[0], 1,stride=1)),
            ]))
        # Branch 2: 1x1 convolution followed by a width-3 convolution.
        self.branch2 =nn.Sequential(OrderedDict([
            ('conv1', nn.Conv1d(cin,cos[1], 1)),
            ('norm1', nn.BatchNorm1d(cos[1])),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(cos[1],cos[1], 3,stride=1,padding=1)),
            ]))
        # Branch 3: width-3 convolution followed by a width-5 convolution.
        self.branch3 =nn.Sequential(OrderedDict([
            ('conv1', nn.Conv1d(cin,cos[2], 3,padding=1)),
            ('norm1', nn.BatchNorm1d(cos[2])),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(cos[2],cos[2], 5,stride=1,padding=2)),
            ]))
        # Branch 4: single width-3 convolution (the pooling layer is disabled).
        self.branch4 =nn.Sequential(OrderedDict([
            #('pool',nn.MaxPool1d(2)),
            ('conv3', nn.Conv1d(cin,cos[3], 3,stride=1,padding=1)),
            ]))
    def forward(self,x):
        '''Run all four branches on ``x`` and return the activated concatenation.'''
        branch1=self.branch1(x)
        branch2=self.branch2(x)
        branch3=self.branch3(x)
        branch4=self.branch4(x)
        # Use the ``t`` alias imported at the top of the notebook; the bare
        # name ``torch`` is only imported in a much later cell.
        result=self.activa(t.cat((branch1,branch2,branch3,branch4),1))
        return result
    
    


In [3]:
import copy
import pickle as pkl
class CNNText_inception(BasicModule):
    '''
    Text classifier: embedding -> two stacked Inception blocks -> max-pool
    over a window of ``opt.content_seq_len`` -> two-layer fully connected head.

    ``opt`` supplies the hyper-parameters; ``model_pre`` is appended to the
    model name used for checkpoint files.
    '''
    def __init__(self, opt, model_pre=''):
        super(CNNText_inception, self).__init__()
        channels = opt.inception_dim
        self.model_name = 'CNNText_inception' + model_pre
        self.opt = opt

        self.encoder = nn.Embedding(opt.vocab_size, opt.embedding_dim)

        # Two Inception blocks, then one max-pool whose window is
        # content_seq_len positions wide.
        conv_stack = [
            Inception(opt.embedding_dim, channels),
            Inception(channels, channels),
            nn.MaxPool1d(opt.content_seq_len),
        ]
        self.content_conv = nn.Sequential(*conv_stack)

        head = [
            nn.Linear(channels, opt.linear_hidden_size),
            nn.BatchNorm1d(opt.linear_hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(opt.linear_hidden_size, opt.num_classes),
        ]
        self.fc = nn.Sequential(*head)

        if opt.embedding_path:
            # Overwrite the random embedding table with the stored vectors.
            print('load embedding')
            pretrained = np.load(opt.embedding_path)['vector']
            self.encoder.weight.data.copy_(t.from_numpy(pretrained))

    def forward(self, content):
        '''Map a batch of token-id sequences to class scores.'''
        embedded = self.encoder(content)
        # Conv1d wants channels before positions, hence the permute.
        pooled = self.content_conv(embedded.permute(0, 2, 1))
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)

In [4]:
import torch as t
import numpy as np
from torch import nn


def kmax_pooling(x, dim, k):
    '''
    Keep the ``k`` largest entries of ``x`` along ``dim``.

    The selected entries are returned in their original order along
    ``dim`` (their indices are sorted before gathering).
    '''
    _, top_idx = x.topk(k, dim=dim)
    ordered_idx, _ = top_idx.sort(dim=dim)
    return x.gather(dim, ordered_idx)

class RCNN(BasicModule):
    '''
    Recurrent-convolutional text classifier.

    Token embeddings go through a bidirectional LSTM; the LSTM outputs are
    concatenated with the embeddings on the channel axis, passed through
    two Conv1d/BatchNorm/ReLU stages, reduced with k-max pooling and
    classified by a two-layer fully connected head.

    ``title_lstm`` is constructed but not used by ``forward``.
    '''
    def __init__(self, opt, model_pre=''):
        super(RCNN, self).__init__()
        self.model_name = 'RCNN' + model_pre
        self.opt = opt
        kernel_size = opt.kernel_size
        self.encoder = nn.Embedding(opt.vocab_size, opt.embedding_dim)

        # Settings common to both LSTMs; only title_lstm adds dropout.
        lstm_args = dict(
            input_size=opt.embedding_dim,
            hidden_size=opt.hidden_size,
            num_layers=opt.num_layers,
            bias=True,
            batch_first=False,
            bidirectional=True,
        )
        self.title_lstm = nn.LSTM(dropout=0.5, **lstm_args)
        self.content_lstm = nn.LSTM(**lstm_args)

        # Input channels: both LSTM directions plus the raw embedding.
        conv_in = opt.hidden_size*2 + opt.embedding_dim
        self.content_conv = nn.Sequential(
            nn.Conv1d(in_channels=conv_in,
                      out_channels=opt.content_dim,
                      kernel_size=kernel_size),
            nn.BatchNorm1d(opt.content_dim),
            nn.ReLU(inplace=True),

            nn.Conv1d(in_channels=opt.content_dim,
                      out_channels=opt.content_dim,
                      kernel_size=kernel_size),
            nn.BatchNorm1d(opt.content_dim),
            nn.ReLU(inplace=True),
        )

        # k-max pooling keeps kmax_pooling positions for each conv channel.
        fc_in = opt.kmax_pooling*(opt.content_dim)
        self.fc = nn.Sequential(
            nn.Linear(fc_in, opt.linear_hidden_size),
            nn.BatchNorm1d(opt.linear_hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(opt.linear_hidden_size, opt.num_classes),
        )

    def forward(self, content):
        '''Map a batch of token-id sequences to class scores.'''
        embedded = self.encoder(content)

        # The LSTM is sequence-first (batch_first=False): permute in, then
        # permute its output so the feature axis sits at dim 1 for Conv1d.
        lstm_out = self.content_lstm(embedded.permute(1, 0, 2))[0]
        recurrent = lstm_out.permute(1, 2, 0)
        raw = embedded.permute(0, 2, 1)
        features = t.cat((recurrent, raw), dim=1)

        pooled = kmax_pooling(self.content_conv(features), 2, self.opt.kmax_pooling)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)

## config

In [5]:
#coding:utf8
import time
import warnings

class ModelConfig:
    '''
    Hyper-parameters shared by the models in this notebook.

    Not every setting is used by every model; each model reads only the
    attributes it needs.
    '''
    content_dim = 200            # number of convolution filters for the content
    num_classes = 4              # number of target classes
    embedding_dim = 300          # embedding size
    linear_hidden_size = 1024    # hidden units in the fully connected layer
    kmax_pooling = 2             # k used by k-max pooling
    hidden_size = 256            # LSTM hidden size
    num_layers = 2               # number of LSTM layers
    inception_dim = 512          # number of filters in the Inception blocks

    vocab_size = 65462
    kernel_size = 3              # kernel size for single-scale convolution
    kernel_sizes = [2, 3, 4]     # kernel sizes for multi-scale convolution
    content_seq_len = 200
    embedding_path = '../inputs/fasttextwordvec.npz'  # embedding archive

In [7]:
# Instantiate the hyper-parameter container defined above.
modelopt = ModelConfig()

## Dataset

## 导入数据集

In [8]:
import torch.utils.data as D
import numpy as np
import pandas as pd

In [9]:
# Locations of the feature / label archives for each split.
X_train_f = "../inputs/X_train.npz"
Y_train_f = "../inputs/Y_train.npz"
X_valid_f = "../inputs/X_valid.npz"
Y_valid_f = "../inputs/Y_valid.npz"
X_test_f = "../inputs/X_test.npz"

# Load the 'X' array of every split and convert it to a LongTensor.
X_train_tensor, X_valid_tensor, X_test_tensor = (
    t.from_numpy(np.load(npz_path)['X']).long()
    for npz_path in (X_train_f, X_valid_f, X_test_f)
)



In [10]:
# Label archives; they are indexed by label column name further down.
Y_train_dict = np.load(Y_train_f)
Y_valid_dict = np.load(Y_valid_f)

In [11]:
import pickle as pkl
# Label column names; iterated below to train one model per column.
# The context manager closes the file handle deterministically.
with open("../inputs/columns.pkl", 'rb') as columns_file:
    columns = pkl.load(columns_file)

In [12]:
# Notebook display: number of label columns.
len(columns)

20

In [13]:
from sklearn import metrics

In [15]:
def eval(model, valiloader,y_true, use_cuda=True):
    '''
    Return the macro F1 score of ``model`` on a validation loader.

    Args:
        model: module whose output has one score per class along dim 1.
        valiloader: iterable of input batches, in the same order as ``y_true``.
        y_true: array with one row per sample; the argmax of each row is
            taken as the true class.
        use_cuda: move every batch to the GPU before the forward pass.

    NOTE: the name shadows the ``eval`` builtin; it is kept because other
    cells call it under this name.
    '''
    model.eval()
    y_pred = []
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        # Iterate the loader passed in, not a same-named global.
        for bx in valiloader:
            if use_cuda:
                bx = bx.cuda()
            y_pre = model(bx)
            y_label = torch.max(y_pre, 1)[1]
            y_pred.extend(y_label.tolist())
    y_true_ = np.argmax(y_true, 1)
    f1 = metrics.f1_score(y_true_, y_pred, average='macro')
    return f1

In [16]:
def pred_test(model, testloader, use_cuda=True):
    '''
    Predict a class index for every sample yielded by ``testloader``.

    Args:
        model: module whose output has one score per class along dim 1.
        testloader: iterable of input batches.
        use_cuda: move every batch to the GPU before the forward pass.

    Returns:
        A flat list of predicted class indices, in loader order.
    '''
    model.eval()
    y_pred = []
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        for bx in testloader:
            if use_cuda:
                bx = bx.cuda()
            y_pre = model(bx)
            y_label = torch.max(y_pre, 1)[1]
            y_pred.extend(y_label.tolist())
    return y_pred

In [None]:
class FGSentimetDataset(D.Dataset):
    '''
    Dataset over a feature archive and a multi-label target archive.

    Args:
        X_npz: path of an ``.npz`` file holding the inputs under key ``'X'``.
        Y_npz: path of an ``.npz`` file holding one array per label name.
        label_pkl: path of a pickle holding the list of label names.
        augument: enable random augmentation (training mode only).
        training: when true ``__getitem__`` returns ``(data, label_dict)``,
            otherwise only ``data``.
        dropout_rate: fraction of positions drawn for zeroing by ``dropout``.
        augument_rate: threshold of the augmentation draw; a draw above it
            applies ``dropout``, otherwise ``shuffle``.
    '''
    def __init__(self, X_npz, Y_npz, label_pkl, augument=False, training=False, dropout_rate=0.3, augument_rate=0.4):
        self.dropout_rate = dropout_rate
        self.augument_rate = augument_rate
        self.augument=augument
        self.training = training
        # Close the pickle file deterministically.
        with open(label_pkl, 'rb') as label_file:
            self.label_list = pkl.load(label_file)
        self.X = np.load(X_npz)['X']
        dataset = np.load(Y_npz)
        self.Y = {col: dataset[col] for col in self.label_list}
        self._len = self.X.shape[0]

    def shuffle(self,d):
        '''Return a randomly permuted copy of ``d``.'''
        return np.random.permutation(d.tolist())

    def dropout(self,d,p=0.5):
        '''
        Zero ``int(len(d) * p)`` randomly drawn positions of ``d`` in place.

        Positions are drawn with replacement, so fewer distinct positions
        may end up zeroed. Returns ``d``.
        '''
        len_ = len(d)
        index = np.random.choice(len_,int(len_*p))
        d[index]=0
        return d

    def __getitem__(self,index):
        import random

        content = self.X[index]
        if not self.training:
            return t.from_numpy(content).long()

        if self.augument:
            if random.random() > self.augument_rate:
                # dropout() writes in place; hand it a copy so the stored
                # sample in self.X is not progressively zeroed across epochs.
                content = self.dropout(content.copy(), p=self.dropout_rate)
            else:
                content = self.shuffle(content)

        data = t.from_numpy(content).long()
        label_dict = {label: t.from_numpy(self.Y[label][index]).long()
                      for label in self.label_list}
        return data, label_dict

    def __len__(self):
        return self._len

In [24]:
import torch
import torch.utils.data 
loss_function = nn.CrossEntropyLoss()
# The validation and test inputs are the same for every label column, so
# their loaders are built once, outside the per-column loop.
Valid_X = torch.utils.data.DataLoader(X_valid_tensor,
                            batch_size = 128,
                            shuffle = False,
                            num_workers = 12,
                            pin_memory = True)
Test_X = torch.utils.data.DataLoader(X_test_tensor,
                            batch_size = 128,
                            shuffle = False,
                            num_workers = 12,
                            pin_memory = True)
# Predicted test labels per column.
Test_pre_dict = {}
for col in columns:
    print(f"Training model for col: {col}")
    Y_train_tensor = t.from_numpy(Y_train_dict[col]).long()
    model =  CNNText_inception(ModelConfig, col)
    # NOTE(review): TrainDataSet is not defined in the visible notebook, and
    # the recorded traceback below comes from collating its items. It must
    # yield (content, target) pairs that default_collate can stack.
    Train_loader = torch.utils.data.DataLoader(TrainDataSet(X_train_tensor, Y_train_tensor),
                                batch_size = 128,
                                shuffle = True,
                                num_workers = 12,
                                pin_memory = True)
    Valid_Y = Y_valid_dict[col]
    best_score = 0
    lr = 5e-3
    lr2 = 1e-3
    lr_decay = 0.9
    i = 0
    # NOTE(review): the third positional argument of get_optimizer is
    # weight_decay, so lr_decay (0.9) is applied as weight decay here while
    # later re-creations pass 0 - confirm this is intended.
    optimizer = model.get_optimizer(lr, lr2,lr_decay)
    model.cuda()

    for epoch in range(50):
        model.train()
        print(f"training epoch {epoch}")
        for ii,(content, true) in enumerate(Train_loader):
            content = content.cuda()
            optimizer.zero_grad()
            # The model returns a single score tensor; bind it to the name
            # the loss uses (it was bound to pred_dict, leaving pred undefined).
            pred = model(content)
            # Targets are reduced to class indices via argmax over dim 1.
            loss = loss_function(pred, torch.max(true.cuda(), 1)[1])
            loss.backward()
            optimizer.step()

        scores = eval(model,Valid_X, Valid_Y)
        print(f"epoch: {epoch} LR: {lr}, F1_score:{scores}")
        if scores>best_score:
            print(f"F1-score improved from {best_score} to {scores}")
            i = 0
            best_score = scores
            best_path = model.save(name = str(scores),new=True)
        if scores < best_score:
            # Roll back to the best checkpoint and continue with smaller
            # learning rates and a fresh optimizer.
            i += 1
            print(f"F1-score did not improved from {best_score} for {i} epochs")
            model.load(best_path,change_opt=False)
            lr = lr * lr_decay
            lr2= 2e-4 if lr2==0 else  lr2*0.8
            optimizer = model.get_optimizer(lr,lr2,0)


    Test_pre_dict[col] =  pred_test(model, Test_X, use_cuda=True)

    
    

Training model for col: location_traffic_convenience
load embedding
training epoch 0


RuntimeError: Traceback (most recent call last):
  File "/home/xq/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 114, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/home/xq/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 175, in default_collate
    return torch.stack(batch, 0, out=out)
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 200 and 4 in dimension 2 at /home/xq/packages/pytorch/aten/src/TH/generic/THTensorMoreMath.cpp:1317


In [None]:
def eval_metrics(y_pred_dict, y_true_dict):
    '''
    Compute the macro F1 score of every label column.

    Both arguments map a column name to a sequence of class indices.
    Returns a dict ``{column: f1}`` covering the columns of ``y_pred_dict``.
    '''
    return {
        label: metrics.f1_score(y_true_dict[label], predicted, average='macro')
        for label, predicted in y_pred_dict.items()
    }

def eval(model, valiloader,y_true, use_cuda=True):
    '''
    Evaluate a multi-label model and return its mean macro F1 score.

    Args:
        model: module exposing ``label_list`` whose output is indexable by
            label name, each entry holding one score per class along dim 1.
        valiloader: iterable of input batches, in the same order as ``y_true``.
        y_true: mapping from label name to an array with one row per sample;
            the argmax of each row is taken as the true class.
        use_cuda: move every batch to the GPU before the forward pass.

    The per-label scores are printed as JSON; the return value is their mean.

    NOTE: the name shadows the ``eval`` builtin; it is kept because other
    cells call it under this name.
    '''
    import json
    from collections import defaultdict

    model.eval()
    # The ground truth does not depend on the batch, so decode it once.
    y_true_ = {col: np.argmax(y_true[col], 1) for col in model.label_list}
    y_pred = defaultdict(list)
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        # Iterate the loader passed in, not a same-named global.
        for bx in valiloader:
            if use_cuda:
                bx = bx.cuda()
            y_pre = model(bx)
            for col in model.label_list:
                y_label = torch.max(y_pre[col], 1)[1]
                y_pred[col].extend(y_label.tolist())
    f1s = eval_metrics(y_pred, y_true_)
    print(json.dumps(f1s, indent=4, sort_keys=True))
    return np.mean(list(f1s.values()))

#scores = eval(model,validloader, Valid_dataset.Y)

In [None]:
import tqdm
import random
# Training state for the multi-label model. `model`, `trainloader`,
# `validloader` and `Valid_dataset` must already exist in the session.
best_score = 0
lr, lr2 = 5e-3, 1e-3
lr_decay = 0.9
early_stops = 20
i = 0
# NOTE(review): the third positional argument of get_optimizer is
# weight_decay, so lr_decay (0.9) is used as weight decay here - confirm.
optimizer = model.get_optimizer(lr, lr2,lr_decay)
model.cuda()

for epoch in range(1000):
    model.train()
    print(f"training epoch {epoch}")
    for ii,(content, true_dict) in enumerate(trainloader):
        content = content.cuda()
        optimizer.zero_grad()
        pred_dict = model(content)
        # One cross-entropy term per label column, averaged into one loss.
        loss = torch.stack([
            loss_function(pred, torch.max(true_dict[col].cuda(), 1)[1])
            for col, pred in pred_dict.items()
        ]).mean()
        loss.backward()
        optimizer.step()

    scores = eval(model,validloader, Valid_dataset.Y)
    print(f"epoch: {epoch} LR: {lr}, F1_score:{scores}")
    if scores>best_score:
        # New best: reset the patience counter and checkpoint the model.
        print(f"F1-score improved from {best_score} to {scores}")
        i = 0
        best_score = scores
        best_path = model.save(name = str(scores),new=True)
    elif scores < best_score:
        # Worse than the best: restore the best checkpoint, stop once the
        # patience is exhausted, otherwise shrink both learning rates.
        i += 1
        print(f"F1-score did not improved from {best_score} for {i} epochs")
        model.load(best_path,change_opt=False)
        if i > early_stops:
            print("Stop training")
            break
        lr = lr * lr_decay
        lr2 = 2e-4 if lr2==0 else lr2*0.8
        optimizer = model.get_optimizer(lr,lr2,0)


In [None]:
# Open the test-set archive; its 'X' entry is read in the cells below.
test_arr = np.load("../inputs/X_test.npz")

In [None]:
# Notebook display: the 'X' array of the test archive.
test_arr['X']

In [None]:
# Batches of test inputs in file order (no shuffling) for prediction.
# Uses torch.utils.data explicitly: the bare name `data` is never imported
# in this notebook.
test_loader = torch.utils.data.DataLoader(torch.from_numpy(test_arr['X']).long(), batch_size=1024, shuffle=False, num_workers=8)

In [None]:
def pred_test(model, testloader, use_cuda=True):
    '''
    Predict a class index per label column for every test sample.

    Args:
        model: module exposing ``label_list`` whose output is indexable by
            label name, each entry holding one score per class along dim 1.
        testloader: iterable of input batches.
        use_cuda: move every batch to the GPU before the forward pass.

    Returns:
        A ``defaultdict(list)`` mapping each label name to the flat list of
        predicted class indices, in loader order.
    '''
    from collections import defaultdict

    model.eval()
    y_pred = defaultdict(list)
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        for bx in testloader:
            if use_cuda:
                bx = bx.cuda()
            y_pre = model(bx)
            for col in model.label_list:
                y_label = torch.max(y_pre[col], 1)[1]
                y_pred[col].extend(y_label.tolist())
    return y_pred
            



In [None]:
# Run prediction on the test loader.
# NOTE(review): this rebinds the name `pred_test` from the function to its
# result, so the cell cannot be re-run without re-defining the function;
# a later cell reads the result through this same name.
pred_test = pred_test(model, test_loader)

In [None]:
import pandas as pd
def load_data_from_csv(file_name, header=0, encoding="utf-8"):
    '''
    Read a CSV file into a pandas DataFrame.

    ``header`` and ``encoding`` are forwarded to ``pd.read_csv`` unchanged.
    '''
    return pd.read_csv(file_name, header=header, encoding=encoding)

In [None]:
# Load the test-set CSV; the prediction columns are written into it below.
test = load_data_from_csv("../inputs/sentiment_analysis_testa.csv")

In [None]:
# Write each label's predictions into the matching column of the test frame.
for col, pred in pred_test.items():
    test[col] = pred
    # Shift the predicted class indices down by 2 (presumably mapping
    # indices 0..3 back to the original label encoding - confirm).
    test[col] -= 2

In [None]:
# Save the frame with predictions; "utf_8_sig" writes a UTF-8 byte-order
# mark, and the row index is left out.
test.to_csv("../output/RCNN1.csv", encoding="utf_8_sig", index=False)

In [19]:
# Notebook display: shows which torch.utils.data.dataloader module is loaded.
D.dataloader

<module 'torch.utils.data.dataloader' from '/home/xq/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py'>