## 数据处理, 数据保存到npz中

1. train, vali, testa 的词id np array
2. train, vali, test 的标签保存为 np array
3. word2id保存为pickle(`../inputs/word2id.pkl`)
4. embedding 保存为 np array

In [1]:
import pandas as pd
import numpy as np
# Input locations and the three raw CSV splits.
EMBEDDING_FILE = '../inputs/fasttextwordvec.vec'
train = pd.read_csv("../inputs/train.csv")
test = pd.read_csv("../inputs/testa.csv")
val = pd.read_csv("../inputs/vali.csv")

# Text column of each split: missing entries become the placeholder "无",
# then everything is lower-cased.
X_train, X_val, X_test = (
    frame["content"].fillna("无").str.lower() for frame in (train, val, test)
)

In [2]:
import pickle

## 处理词向量

In [3]:
# Load the token -> id vocabulary. The context manager closes the file
# handle as soon as the pickle has been read.
with open("../inputs/word2id.pkl", 'rb') as f:
    word2id = pickle.load(f)

In [5]:
# Data parameters:
#   max_features - number of rows of the embedding matrix (ids >= this are skipped)
#   maxlen       - length every id sequence is padded / truncated to
#   embed_size   - width of one embedding vector
max_features, maxlen, embed_size = 65462, 200, 300

In [6]:
# Parse the text-format word vectors into {token: float32 vector}.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        # Split from the right so a token that itself contains spaces stays
        # in one piece: the last `embed_size` fields are the coefficients.
        values = line.rstrip().rsplit(' ', embed_size)
        # Skip anything that is not "<token> <embed_size floats>", e.g. the
        # "<count> <dim>" header line of a fastText .vec file. Storing such a
        # short vector would later be broadcast into a whole matrix row.
        if len(values) != embed_size + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
word_index = word2id
# prepare embedding matrix
num_words = max_features
    

In [19]:
# Every row starts as small Gaussian noise (std 0.01); rows whose token has a
# pretrained vector are then overwritten with that vector.
embedding_matrix = np.random.randn(num_words, embed_size) * 0.01
for word, i in word_index.items():
    # Ids outside the matrix are ignored.
    if i < max_features:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Tokens without a pretrained vector keep their random row.
            embedding_matrix[i] = embedding_vector

In [24]:
# Save the embedding matrix as a compressed npz archive under the key "vector".
np.savez_compressed("../inputs/fasttextwordvec.npz",vector=embedding_matrix)

## 保存Y

In [27]:
from tensorflow.keras.utils import to_categorical

from typing import Dict, List

# Label columns: every column of train.csv after the first two.
columns = train.columns.tolist()[2:]


def get_y(df: pd.DataFrame, cols: List[str] = columns) -> Dict[str, np.ndarray]:
    """One-hot encode every label column of ``df``.

    Each column is shifted by +2 and encoded with 4 classes.
    NOTE(review): this assumes the raw labels lie in [-2, 1] -- confirm
    against the CSV.

    Args:
        df: Frame containing all columns named in ``cols``.
        cols: Label column names. Defaults to ``columns`` as it was when this
            function was defined.

    Returns:
        Dict mapping each column name to its one-hot array with 4 columns.
    """
    return {
        col: to_categorical(df[col].values + 2, num_classes=4)
        for col in cols
    }

In [28]:
# One-hot label dicts for the training and validation splits.
y_train, y_val = get_y(train), get_y(val)

In [33]:
# Write each label dict to its own compressed archive, one array per label column.
for npz_path, label_arrays in (
    ("../inputs/Y_train.npz", y_train),
    ("../inputs/Y_valid.npz", y_val),
):
    np.savez_compressed(npz_path, **label_arrays)

In [35]:
# Persist the label column names. The context manager guarantees the file is
# flushed and closed once the dump finishes.
with open("../inputs/columns.pkl", 'wb') as f:
    pickle.dump(columns, f)

## 保存X

In [39]:
import tensorflow as tf

def transform_data_to_id(x_arr, word2id, unk_id=1):
    """Convert sentences into lists of vocabulary ids.

    Each sentence is iterated item by item. An item found in ``word2id`` maps
    to its id; otherwise it is split into characters, and each character maps
    to its own id, or to ``unk_id`` when the character is unknown too.

    Iterating a plain ``str`` yields single characters, so an un-tokenised
    string is encoded character by character.

    Args:
        x_arr: Iterable of sentences; each sentence is an iterable of tokens.
        word2id: Mapping from token (or single character) to integer id.
        unk_id: Id emitted for characters missing from ``word2id``. Defaults
            to 1, the value that was previously hard-coded.

    Returns:
        A list holding one list of ids per sentence (lengths may differ).
    """
    def map_word_to_id(word):
        # Whole-token hit first; otherwise fall back to per-character lookup.
        if word in word2id:
            return [word2id[word]]
        return [word2id[char] if char in word2id else unk_id for char in word]

    return [
        [token_id for word in sent for token_id in map_word_to_id(word)]
        for sent in x_arr
    ]

In [43]:
# Encode the validation texts as id sequences (the same encoding is recomputed
# into X_val in a later cell).
x_val = transform_data_to_id(X_val, word2id)

In [45]:
import keras
# Short alias for Keras' sequence padding helper.
pad_sequence = keras.preprocessing.sequence.pad_sequences

Using TensorFlow backend.


In [49]:
# Encode each split as id sequences, then left-pad / left-truncate to `maxlen`
# with 0 as the padding id. Order of processing: val, train, test.
X_val, X_train, X_test = (
    pad_sequence(
        transform_data_to_id(texts, word2id),
        maxlen=maxlen,
        padding='pre',
        truncating='pre',
        value=0,
    )
    for texts in (X_val, X_train, X_test)
)

In [50]:
# Store each padded id matrix in its own compressed archive under the key "X".
for npz_path, id_matrix in (
    ("../inputs/X_train.npz", X_train),
    ("../inputs/X_valid.npz", X_val),
    ("../inputs/X_test.npz", X_test),
):
    np.savez_compressed(npz_path, X=id_matrix)

In [51]:
columns

['location_traffic_convenience',
 'location_distance_from_business_district',
 'location_easy_to_find',
 'service_wait_time',
 'service_waiters_attitude',
 'service_parking_convenience',
 'service_serving_speed',
 'price_level',
 'price_cost_effective',
 'price_discount',
 'environment_decoration',
 'environment_noise',
 'environment_space',
 'environment_cleaness',
 'dish_portion',
 'dish_taste',
 'dish_look',
 'dish_recommendation',
 'others_overall_experience',
 'others_willing_to_consume_again']

In [1]:
import torch
import torch.utils.data
# train = torch.utils.data.TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
# val = torch.utils.data.TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))

## 模型

In [2]:

import torch as t
import time

class BasicModule(t.nn.Module):
    """Thin ``nn.Module`` wrapper that adds ``save`` / ``load`` helpers.

    The methods read ``self.opt`` (a config object) and ``self.encoder``
    (a sub-module); this base class defines neither, so subclasses must.
    """

    def __init__(self):
        super().__init__()
        # Default name; subclasses may overwrite it.
        self.model_name = str(type(self))

    def load(self, path, change_opt=True):
        """Load a checkpoint from ``path`` and return the model moved to CUDA.

        Two layouts are accepted: a bare state dict, or a dict holding the
        config under ``'opt'`` and the weights under ``'d'``. For the second
        layout with ``change_opt`` true, the stored config is parsed into
        ``self.opt``, ``embedding_path`` is cleared and the module is
        re-initialised before the weights are loaded.
        """
        print(path)
        checkpoint = t.load(path)
        if 'opt' not in checkpoint:
            # Bare state dict.
            self.load_state_dict(checkpoint)
            return self.cuda()

        if change_opt:
            self.opt.parse(checkpoint['opt'], print_=False)
            self.opt.embedding_path = None
            self.__init__(self.opt)
        self.load_state_dict(checkpoint['d'])
        return self.cuda()

    def save(self, name=None, new=False):
        """Write a checkpoint under ``../ckpt/`` and return its path.

        Args:
            name: File-name suffix; defaults to a month/day/time stamp.
            new: When true, store both the config (``'opt'``) and the weights
                (``'d'``); otherwise store the bare state dict.
        """
        prefix = '../ckpt/' + self.model_name + '_' + self.opt.type_ + '_'
        if name is None:
            name = time.strftime('%m%d_%H:%M:%S.pth')
        path = prefix + name

        payload = (
            {'opt': self.opt.state_dict(), 'd': self.state_dict()}
            if new
            else self.state_dict()
        )
        t.save(payload, path)
        return path

    def get_optimizer(self, lr1, lr2=0, weight_decay=0):
        """Build an Adam optimizer with a separate group for ``self.encoder``.

        Args:
            lr1: Learning rate for every parameter outside the encoder.
            lr2: Learning rate for the encoder; ``None`` means ``lr1 * 0.5``.
            weight_decay: Weight decay applied to the non-encoder group only.
        """
        encoder_ids = {id(p) for p in self.encoder.parameters()}
        other_params = [p for p in self.parameters() if id(p) not in encoder_ids]
        if lr2 is None:
            lr2 = lr1 * 0.5
        return t.optim.Adam([
            {'params': other_params, 'weight_decay': weight_decay, 'lr': lr1},
            {'params': self.encoder.parameters(), 'lr': lr2},
        ])


In [3]:
from torch import nn
from collections import OrderedDict

class Inception(nn.Module):
    """Four parallel Conv1d branches concatenated along the channel axis.

    Every branch produces ``co // 4`` channels and keeps the sequence length
    (stride 1 with matching padding), so the output has ``co`` channels.
    The concatenation is optionally followed by BatchNorm1d and ReLU.

    Args:
        cin: Number of input channels.
        co: Number of output channels; must be divisible by 4.
        relu: Append a ReLU after the concatenation.
        norm: Append a BatchNorm1d after the concatenation.
    """

    @staticmethod
    def _named_sequential(*named_layers):
        """Build an ``nn.Sequential`` from ``(name, layer)`` pairs."""
        return nn.Sequential(OrderedDict(named_layers))

    def __init__(self, cin, co, relu=True, norm=True):
        super().__init__()
        assert(co%4==0)
        width = co // 4

        # Post-concatenation stage; left empty when both flags are off.
        post = nn.Sequential()
        if norm:
            post.add_module('norm', nn.BatchNorm1d(co))
        if relu:
            post.add_module('relu', nn.ReLU(True))
        self.activa = post

        # Branch 1: a single 1-wide convolution.
        self.branch1 = self._named_sequential(
            ('conv1', nn.Conv1d(cin, width, 1, stride=1)),
        )
        # Branch 2: 1-wide convolution, then a 3-wide one.
        self.branch2 = self._named_sequential(
            ('conv1', nn.Conv1d(cin, width, 1)),
            ('norm1', nn.BatchNorm1d(width)),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(width, width, 3, stride=1, padding=1)),
        )
        # Branch 3: 3-wide convolution, then a 5-wide one. The layer names are
        # kept as-is because they are part of the state-dict keys.
        self.branch3 = self._named_sequential(
            ('conv1', nn.Conv1d(cin, width, 3, padding=1)),
            ('norm1', nn.BatchNorm1d(width)),
            ('relu1', nn.ReLU(inplace=True)),
            ('conv3', nn.Conv1d(width, width, 5, stride=1, padding=2)),
        )
        # Branch 4: a single 3-wide convolution.
        self.branch4 = self._named_sequential(
            ('conv3', nn.Conv1d(cin, width, 3, stride=1, padding=1)),
        )

    def forward(self, x):
        """Run all branches on ``x`` and return the activated concatenation."""
        branches = (self.branch1, self.branch2, self.branch3, self.branch4)
        stacked = torch.cat([branch(x) for branch in branches], 1)
        return self.activa(stacked)
    
    


In [6]:
import copy
import pickle as pkl
class CNNText_inception(BasicModule):
    """Inception-style text CNN with one classification head per label.

    Pipeline: embedding -> two ``Inception`` blocks -> max-pool over the
    sequence -> one small fully connected head per entry of the label list.

    Args:
        opt: Config object. The attributes read here are ``inception_dim``,
            ``label_list_pkl``, ``vocab_size``, ``embedding_dim``,
            ``content_seq_len``, ``linear_hidden_size``, ``num_classes`` and
            ``embedding_path``.
    """

    def __init__(self, opt):
        super(CNNText_inception, self).__init__()
        incept_dim = opt.inception_dim
        # Read the label names inside a context manager so the file handle is
        # closed instead of being left to the garbage collector.
        with open(opt.label_list_pkl, "rb") as label_file:
            self.label_list = pkl.load(label_file)
        self.model_name = 'CNNText_inception'
        self.opt = opt
        self.encoder = nn.Embedding(opt.vocab_size, opt.embedding_dim)
        self.content_conv = nn.Sequential(
            # Both Inception blocks keep the sequence length:
            # (batch, channels, seq_len) -> (batch, incept_dim, seq_len).
            Inception(opt.embedding_dim, incept_dim),
            Inception(incept_dim, incept_dim),
            # Pool over a window of content_seq_len positions.
            nn.MaxPool1d(opt.content_seq_len)
        )
        # Template head. It stays registered as a sub-module so the
        # parameter names (state-dict keys) are unchanged; forward() only
        # uses the per-label deep copies stored in self.fc.
        self.fc_sub = nn.Sequential(
            nn.Linear(incept_dim, opt.linear_hidden_size),
            nn.BatchNorm1d(opt.linear_hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(opt.linear_hidden_size, opt.num_classes)
        )
        self.fc = nn.ModuleDict(
            {label: copy.deepcopy(self.fc_sub) for label in self.label_list}
        )
        if opt.embedding_path:
            print('load embedding')
            # The archive is expected to hold the matrix under the key 'vector'.
            pretrained = np.load(opt.embedding_path)['vector']
            self.encoder.weight.data.copy_(t.from_numpy(pretrained))

    def forward(self, content):
        """Return a dict mapping each label name to that head's output.

        Args:
            content: Integer id tensor of shape (batch, seq_len).
                NOTE(review): the flatten below feeds a Linear layer with
                ``inception_dim`` inputs, which presumes seq_len equals
                ``opt.content_seq_len`` -- confirm.
        """
        content = self.encoder(content)
        # Conv1d wants channels first: (batch, seq, emb) -> (batch, emb, seq).
        content_out = self.content_conv(content.permute(0, 2, 1))
        out = content_out.view(content_out.size(0), -1)
        out_dict = {label: self.fc[label](out) for label in self.label_list}
        return out_dict

## config

In [7]:
#coding:utf8
import time
import warnings

class ModelConfig(object):
    '''
    Not every setting here takes effect; at run time each model only reads
    the parameters it needs.
    '''
    model='CNNText' 
    content_dim = 200 # number of conv kernels for the content (description)
    # NOTE(review): get_y one-hot encodes labels with 4 classes, while
    # CNNText_inception gives every per-label head num_classes outputs --
    # confirm that 80 is intended here.
    num_classes = 80 # number of classes
    embedding_dim = 300 # embedding size
    linear_hidden_size = 4 # number of hidden units in the fully connected layer
    kmax_pooling = 2# k
    hidden_size = 256 #LSTM hidden size
    num_layers=2 #LSTM layers
    inception_dim = 512 # number of conv kernels in the inception blocks
    
    vocab_size = 65462
    kernel_size = 3 # conv kernel size (single scale)
    kernel_sizes = [2,3,4] # conv kernel sizes (multi scale)
    content_seq_len = 200 
    label_list_pkl = "../inputs/columns.pkl"
    embedding_path = '../inputs/fasttextwordvec.npz' # Embedding

In [8]:
# Config instance handed to the model constructor.
modelopt = ModelConfig()

In [10]:
import numpy as np
# Build the model; the constructor copies the pretrained embedding into the
# encoder because modelopt.embedding_path is set.
model = CNNText_inception(modelopt)

load embedding


## Dataset

In [11]:
from torch.utils import data
class FGSentimetDataset(data.Dataset):
    def __init__(self, X_npz, Y_npz, label_pkl, augument=False, training=True):
        self.augument=augument
        self.training = training
        self.label_list = pickle.load(open(label_pkl, 'rb'))
        self.X = np.load(X_npz)['X']
        self.Y = np.load(Y_npz)
        self._len = self.X.shape[0]
    def shuffle(self,d):
        return np.random.permutation(d.tolist())

    def dropout(self,d,p=0.5):
        len_ = len(d)
        index = np.random.choice(len_,int(len_*p))
        d[index]=0
        return d     


    def __getitem__(self,index):
        
        content =  self.X[index]
    
        if self.training:  
            if self.augument :
                augument=random.random()

                if augument>0.5:
                    content = self.dropout(content,p=0.3)
                else:
                    content = self.shuffle(content)

            data =t.from_numpy(content).long()
            label_dict = {label:t.from_numpy(self.Y[label][index]).long() for label in self.label_list}
            return data, label_dict
        else:
            return t.from_numpy(content).long()

    def __len__(self):
        return self.len_        
  

## 验证函数

In [13]:
from collections import defaultdict    
def val(model, dataset, val_opt):
    """Score ``model`` on a validation dataset.

    The model is switched to eval mode and left in it.

    Args:
        model: Module whose forward pass returns a dict keyed by label name
            and which exposes ``label_list``.
        dataset: Dataset to evaluate; its ``Y`` attribute is passed to
            ``get_score`` as the ground truth.
        val_opt: Options object providing ``batch_size`` and ``num_workers``.

    Returns:
        Whatever ``get_score`` returns for the collected predictions.
    """
    opt = val_opt
    model.eval()
    dataloader = data.DataLoader(dataset,
                    batch_size = opt.batch_size,
                    shuffle = False,
                    num_workers = opt.num_workers,
                    pin_memory = True
                    )

    predict_dict = defaultdict(list)
    # No gradients are needed for scoring.
    with t.no_grad():
        for ii, batch in tqdm.tqdm(enumerate(dataloader)):
            # A dataset in training mode yields (ids, labels) pairs; in that
            # case keep only the ids.
            content = batch[0] if isinstance(batch, (list, tuple)) else batch
            content = content.cuda()
            # Keep the batch output in its own name so the accumulator is not
            # overwritten by the model's result.
            batch_out = model(content)
            for col_name in model.label_list:
                predict_dict[col_name].extend(batch_out[col_name].cpu().tolist())
    scores = get_score(dataset.Y, predict_dict)
    return scores

In [None]:
import tqdm
# Training DataLoader.
# NOTE(review): no dataset named `train` is built in the visible cells of this
# session (the TensorDataset lines above are commented out) -- confirm what
# is meant to be passed here.
dataloader = torch.utils.data.DataLoader(train,
                batch_size = 256,
                shuffle = True,
                num_workers = 8,
                pin_memory = True
                )
# Arguments are (lr1, lr2, weight_decay): lr1 for the non-encoder parameters,
# lr2 for the encoder, weight decay on the non-encoder group only.
# NOTE(review): a weight decay of 0.99 looks unusually large -- confirm.
optimizer = model.get_optimizer(5e-3, 1e-3, 0.99)
best_score = 0
loss_function = torch.nn.MultiLabelSoftMarginLoss()
model.cuda()
for epoch in range(100):
    for ii,(content,label) in tqdm.tqdm(enumerate(dataloader)):
        # Training step: update the parameters.
        # NOTE(review): CNNText_inception.forward returns a dict keyed by
        # label and FGSentimetDataset yields a dict of label tensors, so
        # `label.long()` and `loss_function(score, ...)` presumably need a
        # per-label loop -- confirm the intended loss aggregation.
        content,label = content.long().cuda(),label.long().cuda()
        optimizer.zero_grad()
        score = model(content)
        loss = loss_function(score,label.float())
        loss.backward()
        optimizer.step()