### [(LSTM + leaky feature) + handcrafted_Feature] -> xgboost

In [1]:
import torch.nn as nn
import torch
import numpy as np
import pandas as pd
import datetime, time, json
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import os
import torch.nn.functional as F
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

from config import *

In [2]:
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
RNG_SEED = 1996
NB_EPOCHS = 25
DROPOUT = 0.2
BATCH_SIZE = 1024
LSTM_HIDDEN = 128
NUM_LAYERS = 2
MODEL_NAME = 'best_model.pt'
MODEL_PATH = 'data/model'
DENSE_HIDDEN = 128
FEATURE_NUM = 3 # leaky_feature = 3 , all magic_features = 15 , all feature = 18

In [3]:
# 判断gpu是否可用
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
DEVICE = torch.device(device)

### 加载已保存好的数据集

In [4]:
train_q1_data = np.load(open(Q1_TRAINING_DATA_FILE, 'rb'))
train_q2_data = np.load(open(Q2_TRAINING_DATA_FILE, 'rb'))
nlp_feature_train = pd.read_csv(FEATURE_TRAIN).values
leaky_feature_train = np.load(open(LEAKY_FEATURE_TRAIN, 'rb'))

test_q1_data = np.load(open(Q1_TEST_DATA_FILE, 'rb'))
test_q2_data = np.load(open(Q2_TEST_DATA_FILE, 'rb'))
nlp_feature_test = pd.read_csv(FEATURE_TEST).values
leaky_feature_test = np.load(open(LEAKY_FEATURE_TEST, 'rb'))

labels = np.load(open(LABEL_TRAINING_DATA_FILE, 'rb'))
word_embedding_matrix = np.load(open(WORD_EMBEDDING_MATRIX_FILE, 'rb'))
with open(NB_WORDS_DATA_FILE, 'r') as f:
    NB_WORDS = json.load(f)['nb_words']

### 获取最后预测集

In [5]:
predict_X = np.stack((test_q1_data, test_q2_data), axis=1)

predict_X_Q1 = predict_X[:,0]
predict_X_Q2 = predict_X[:,1]
nlp_predict_feature = nlp_feature_test
predict_leaky_feature = leaky_feature_test
predict_feature = np.concatenate((nlp_predict_feature,predict_leaky_feature),axis = 1)

### 分割验证集

In [6]:
part1_x = np.vstack((train_q1_data, train_q2_data)) 
part2_x = np.vstack((train_q2_data, train_q1_data)) #反对称数据集

X = np.stack((part1_x, part2_x), axis=1)
y = np.concatenate((labels,labels))
feature_train = np.concatenate((nlp_feature_train,nlp_feature_train))
leaky_feature_train = np.concatenate((leaky_feature_train,leaky_feature_train))
concate_feature_train = np.concatenate((feature_train,leaky_feature_train),axis = 1)


# normalize the features
ss = StandardScaler()
ss.fit(np.vstack((concate_feature_train, predict_feature)))
concate_feature_train = ss.transform(concate_feature_train)
predict_feature = ss.transform(predict_feature)


Q1_all = X[:,0]
Q2_all = X[:,1]
y_all = y

indices = np.arange(X.shape[0])

X_train, X_test, y_train, y_test,idx_train, idx_test = train_test_split(X, y,indices, test_size = TEST_SPLIT, random_state = RNG_SEED)
Q1_train = X_train[:,0]
Q2_train = X_train[:,1]
F_train = feature_train[idx_train]
F_test = feature_train[idx_test]
leaky_F_train = leaky_feature_train[idx_train]
leaky_F_test = leaky_feature_train[idx_test]
all_feature_train = concate_feature_train[idx_train]
all_feature_test = concate_feature_train[idx_test]

Q1_test = X_test[:,0]
Q2_test = X_test[:,1]

TRAIN_SIZE = X_train.shape[0]
TEST_SIZE = X_test.shape[0]

### 创建数据集

In [7]:
class PointDataSet(Dataset):
    def __init__(self,Q1_train,Q2_train,feature,y_train = None):
        self.Q1_train=Q1_train
        self.Q2_train=Q2_train
        self.feature=feature
        self.labels=y_train
        self.lenth=Q1_train.shape[0]
    def __getitem__(self, index):
        if self.labels is not None:
            return self.Q1_train[index],self.Q2_train[index],self.feature[index],self.labels[index]
        else:
            return self.Q1_train[index],self.Q2_train[index],self.feature[index]
    def __len__(self):
        return self.lenth

In [8]:
########################################
## leaky features
########################################

# train
train_dl_data = PointDataSet(Q1_train, Q2_train,leaky_F_train, y_train)
train_dl = DataLoader(dataset=train_dl_data, batch_size=BATCH_SIZE,shuffle=True)
# test
test_dl_data = PointDataSet(Q1_test, Q2_test,leaky_F_test, y_test)
test_dl = DataLoader(dataset=test_dl_data, batch_size=BATCH_SIZE,shuffle=True)
#all
all_dl_data = PointDataSet(Q1_all, Q2_all,leaky_feature_train, y_all)
all_dl = DataLoader(dataset=all_dl_data, batch_size=2048,shuffle=False)
# predict
predict_data = PointDataSet(predict_X_Q1, predict_X_Q2,predict_leaky_feature)
predict_dl = DataLoader(dataset=predict_data, batch_size=2048,shuffle=False)

### LSTM模型

### LSTM + feature

In [9]:
class LSTM_dis_angle(nn.Module):
    def __init__(self,pretrained_weight = None,pretrained_embed = False):
        super(LSTM_dis_angle, self).__init__()
        
        if pretrained_embed:
            pretrained_weight = torch.FloatTensor(pretrained_weight).to(DEVICE)
            self.embed = nn.Embedding.from_pretrained(pretrained_weight,freeze=True)
        else:
            self.embed = nn.Embedding(NB_WORDS + 1, EMBEDDING_DIM).cuda()
        
        # input shape: 词向量维度，hidden个数，lstm层数
        self.LSTM_stack = nn.LSTM(EMBEDDING_DIM, LSTM_HIDDEN, num_layers=NUM_LAYERS)
        for name, param in self.LSTM_stack.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            elif 'weight' in name:
                nn.init.xavier_normal_(param)
                
        self.fc1 = nn.Linear(FEATURE_NUM, DENSE_HIDDEN//2)  ##  (max sentence length * hidden layer, 256)
        
        self.concat_bn = nn.BatchNorm1d(num_features=DENSE_HIDDEN//2 + NUM_LAYERS*2*DENSE_HIDDEN)
        self.dp = nn.Dropout(DROPOUT)
        
        self.fc2 = nn.Linear(DENSE_HIDDEN//2 + NUM_LAYERS*2*DENSE_HIDDEN, DENSE_HIDDEN)
        self.bn2 = nn.BatchNorm1d(num_features=DENSE_HIDDEN)
        self.dp2 = nn.Dropout(DROPOUT)
        
        self.fc3 = nn.Linear(DENSE_HIDDEN, 2)

    def exponent_neg_manhattan_distance(self, x1, x2):
        ''' Helper function for the similarity estimate of the LSTMs outputs '''
        return torch.exp(-torch.sum(torch.abs(x1 - x2), dim=1))

    def angle(self, x1, x2):
        ''' Helper function for the similarity estimate of the LSTMs outputs '''
        return torch.bmm(x1,x2)

    def forward(self, x1, x2, feature):
        # feature : batchsize x 15
        x1 = self.embed(x1)
        x2 = self.embed(x2)
        batch_size = x1.shape[0]
        x1 = x1.transpose(0, 1) # L,B,E
        x2 = x2.transpose(0, 1) # L,B,E

        x1, (hidden_1,_) = self.LSTM_stack(x1)  # L,B,H / NUM_LAYERS,B,H
        x2, (hidden_2,_) = self.LSTM_stack(x2)  # L,B,H / NUM_LAYERS,B,H
        
        hidden_1 = hidden_1.transpose(0,1).contiguous().view(batch_size,-1) # B, NUM_LAYERS * H
        hidden_2 = hidden_1.transpose(0,1).contiguous().view(batch_size,-1)# B,  NUM_LAYERS* H
        
        f = self.fc1(feature) # B,2H
        x = torch.cat((hidden_1, hidden_2, f), 1) # (B, DENSE_HIDDEN//2 + NUM_LAYERS*2*DENSE_HIDDEN)
        hidden_x = x
        x = self.concat_bn(x)
        x = self.dp(x)
        
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.dp2(x)
        x = self.fc3(x)
        
        return x,hidden_x

### 模型保存

In [10]:
def save_model(network, path = MODEL_PATH, name=MODEL_NAME):
#     torch.save(network, os.path.join(path, name))
    state = model.state_dict()
    for key in state:
        state[key] = state[key].clone().cpu()
    torch.save(network.state_dict(),os.path.join(path,name))

### 模型预测

In [11]:
def predict(network, data_iter):
    network = network.to(DEVICE)
    network.eval()
    predictions = []
    shape = 2*NUM_LAYERS*DENSE_HIDDEN + DENSE_HIDDEN//2
    F_data = np.array([]).reshape(0,shape)
    for each in tqdm(data_iter):
        q1 = each[0]
        q2 = each[1]
        feature = each[2]
        q1 = q1.long().to(DEVICE)
        q2 = q2.long().to(DEVICE)
        feature = feature.float().to(DEVICE)
        outputs_1,q1_F = network(q1,q2,feature)
        outputs_2,q2_F = network(q2,q1,feature)
        
        m = nn.Softmax(dim=1)
        outputs_1 = m(outputs_1)
        outputs_2 = m(outputs_2)
        
        pred_y_1 = outputs_1.data.cpu().numpy()
        pred_y_1 = pred_y_1[:,1] #把是duplicate的概率
        
        pred_y_2 = outputs_2.data.cpu().numpy()
        pred_y_2 = pred_y_2[:,1] #把是duplicate的概率
        
        pred_y = ((pred_y_1 + pred_y_2)/2).tolist()
        predictions += pred_y
        q1_F = q1_F.data.cpu().numpy()
        q2_F = q2_F.data.cpu().numpy()

        avg_F = ((q1_F + q2_F)/2)
        F_data = np.vstack((F_data, avg_F))
        
    return predictions,F_data

### 训练函数

In [12]:
def train(net,EPOCH,loss_func,optimizer,data_iter,valid_dl,use_valid = True):
    
    def evaluate_lossAndAcc(data_iter, net):
        net.eval()
        l_sum ,acc = 0.0 , 0.0
        count = 0
        for q1,q2,feature,y in data_iter:
            #cuda
            q1 = q1.long().to(DEVICE)
            q2 = q2.long().to(DEVICE)
            feature = feature.float().to(DEVICE)
            y = y.cuda()
            
            outputs,_ = net(q1,q2,feature)

            #calculate acc
            _, pred_y = torch.max(outputs.data, 1)
            #calculate sum
            l_sum += loss_func(outputs, y).data.cpu().numpy()
            
            pred_y = pred_y.data.cpu().numpy()
            accuracy = float((pred_y == y.data.cpu().numpy()).astype(int).sum()) / float(y.size(0))
            acc += accuracy

            count+=1
        return l_sum / count, acc / count
        
    best_loss = 100
    for epoch in range(1,EPOCH+1):
        start = time.time()
        train_lsum, n , acc = 0.0, 0, 0.0
        for q1,q2,feature,y in data_iter:
            net.train()
            q1 = q1.long().to(DEVICE)
            q2 = q2.long().to(DEVICE)
            feature = feature.float().to(DEVICE)
            y = y.cuda()
            outputs,_ = net(q1, q2, feature)
            _, prediction = torch.max(outputs.data, 1)
            
            optimizer.zero_grad()
            loss = loss_func(outputs,y)
            loss.backward()
            optimizer.step()
            train_lsum += loss.data.cpu().numpy()
            n+=1
            
            pred_y = prediction.data.cpu().numpy()
            accuracy = float((pred_y == y.data.cpu().numpy()).astype(int).sum()) / float(y.size(0))
            acc += accuracy
            
        if epoch % 1 == 0:
            valid_loss,valid_acc = evaluate_lossAndAcc(valid_dl, net)
            print("epoch %d | train loss : %.4f | valid loss : %.4f | train_acc : %.4f | valid_acc : %.4f | time :%.4f sec" 
                       % (epoch, train_lsum / n, valid_loss, acc / n, valid_acc, (time.time() - start)))
            if valid_loss < best_loss:
                best_loss = valid_loss
                save_model(net)

### 训练

In [13]:
print('training on %d samples, validate on %d samples' %(TRAIN_SIZE,TEST_SIZE))
loss_func = nn.CrossEntropyLoss(weight = torch.tensor([1.309028344,0.472001959]).to(DEVICE))
model = LSTM_dis_angle(pretrained_weight = word_embedding_matrix,pretrained_embed = True).to(DEVICE)
EPOCH = 50
optimizer = Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
train(model,EPOCH,loss_func,optimizer,train_dl,test_dl,use_valid = False)

training on 727722 samples, validate on 80858 samples
epoch 1 | train loss : 0.2821 | valid loss : 0.2688 | train_acc : 0.8071 | valid_acc : 0.8253 | time :16.9390 sec
epoch 2 | train loss : 0.2686 | valid loss : 0.2623 | train_acc : 0.8184 | valid_acc : 0.8231 | time :16.9248 sec
epoch 3 | train loss : 0.2654 | valid loss : 0.2617 | train_acc : 0.8216 | valid_acc : 0.8246 | time :17.0675 sec
epoch 4 | train loss : 0.2634 | valid loss : 0.2600 | train_acc : 0.8231 | valid_acc : 0.8269 | time :16.9671 sec
epoch 5 | train loss : 0.2622 | valid loss : 0.2592 | train_acc : 0.8239 | valid_acc : 0.8227 | time :17.1785 sec
epoch 6 | train loss : 0.2608 | valid loss : 0.2630 | train_acc : 0.8249 | valid_acc : 0.8334 | time :17.2071 sec
epoch 7 | train loss : 0.2599 | valid loss : 0.2594 | train_acc : 0.8253 | valid_acc : 0.8309 | time :17.3195 sec
epoch 8 | train loss : 0.2592 | valid loss : 0.2608 | train_acc : 0.8260 | valid_acc : 0.8205 | time :17.2913 sec
epoch 9 | train loss : 0.2584 | va

KeyboardInterrupt: 

### 提取所有训练数据的LSTM的中间表达


In [14]:
model_dict = torch.load(os.path.join(MODEL_PATH, MODEL_NAME))
model = LSTM_dis_angle()
model.load_state_dict(model_dict)
_,F_train = predict(model, all_dl)
_,F_test = predict(model, predict_dl)

100%|██████████| 395/395 [04:56<00:00,  1.44s/it]
100%|██████████| 1146/1146 [40:31<00:00,  4.22s/it]


In [15]:
np.save(open('data/tmp/train_dl_feature', 'wb'), F_train)
np.save(open('data/tmp/test_dl_feature', 'wb'), F_test)