定义加载数据的方法

In [1]:
import time
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import itertools
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize

def process_data(path):
    """Load title tokens and labels for a train/test split plus the submit set.

    Reads the first 2,500,000 rows of the headerless CSV at ``path`` and
    splits them positionally: the first 2,000,000 rows are the training part
    and the following 500,000 rows the held-out part. The submit set is read
    in full from a fixed test.csv location.

    Args:
        path: Location of the training CSV with columns
            query_id, query, query_title_id, title, label (no header row).

    Returns:
        A 6-tuple ``(train_text, test_text, train_y, test_y, submit_text,
        submit_ids)`` where the ``*_text`` values are lists of whitespace-split
        title tokens, the ``*_y`` values are label Series slices and
        ``submit_ids`` is the ``query_id`` / ``query_title_id`` frame of the
        submit set.
    """
    n_train, n_test = 2000000, 500000
    column_names = ['query_id', 'query', 'query_title_id', 'title', 'label']

    # NOTE(review): the original comment mentioned experimenting with 10M
    # rows, but only n_train + n_test (2.5M) rows are actually read.
    reader = pd.read_csv(path, header=None, names=column_names, iterator=True)
    frame = reader.get_chunk(n_train + n_test)

    split_end = n_train + n_test
    titles = frame['title']
    labels = frame['label']

    train_text = [title.split() for title in titles[:n_train]]
    test_text = [title.split() for title in titles[n_train:split_end]]
    train_y = labels[:n_train]
    test_y = labels[n_train:split_end]
    # Release the raw chunk before loading the submit file.
    del frame, titles, labels

    submit_data = pd.read_csv(
        "/home/kesci/input/bytedance/first-round/test.csv",
        header=None,
        names=column_names[:4],
    )
    submit_text = [title.split() for title in submit_data['title']]
    submit_ids = submit_data[['query_id', 'query_title_id']]

    return train_text, test_text, train_y, test_y, submit_text, submit_ids


def pad_sentences(sentences, sequence_length=20, padding_word="<PAD/>"):
    """Pad or truncate every sentence to exactly ``sequence_length`` tokens.

    Sentences shorter than ``sequence_length`` are right-padded with
    ``padding_word``; longer sentences are cut to their first
    ``sequence_length`` tokens. The input sentences are not modified.

    Args:
        sentences: Iterable of token lists.
        sequence_length: Target length of every returned sentence.
        padding_word: Token appended to short sentences.

    Returns:
        A tuple ``(padded_sentences, sequence_length)`` where
        ``padded_sentences`` is a new list of token lists.
    """
    padded_sentences = []
    # Iterate directly instead of indexing by position so that any iterable
    # of token lists works, not only 0..n-1 indexable containers.
    for sentence in sentences:
        num_padding = sequence_length - len(sentence)
        if num_padding >= 0:
            padded_sentences.append(sentence + [padding_word] * num_padding)
        else:
            padded_sentences.append(sentence[:sequence_length])
    return padded_sentences, sequence_length


def build_vocab(sentences, min_count=100):
    """Build word<->index mappings from tokenised sentences.

    Args:
        sentences: Iterable of token lists.
        min_count: Minimum number of occurrences a token needs to be kept.

    Returns:
        A two-element list ``[vocabulary, vocabulary_inv]``:
        ``vocabulary_inv`` lists the kept tokens from most to least frequent
        and ``vocabulary`` maps each kept token to its position in that list.
    """
    frequencies = Counter(itertools.chain.from_iterable(sentences))

    index_to_word = []
    for token, occurrences in frequencies.most_common():
        if occurrences >= min_count:
            index_to_word.append(token)

    word_to_index = dict(zip(index_to_word, range(len(index_to_word))))
    return [word_to_index, index_to_word]


def build_input_data_generator(sentences, labels, vocabulary, batch_size=50000, samples_per_epoch=2000000):
    """Endlessly yield ``(x, y)`` batches of index-encoded sentences.

    Each pass walks ``int(samples_per_epoch / batch_size)`` consecutive
    windows of ``batch_size`` items over ``sentences`` / ``labels`` and then
    starts again from the beginning.

    Args:
        sentences: Sliceable sequence of token lists.
        labels: Sliceable sequence of labels aligned with ``sentences``.
        vocabulary: Mapping from token to integer index; tokens missing from
            it are encoded as 0.
        batch_size: Number of items per yielded batch.
        samples_per_epoch: Number of items covered by one pass.

    Yields:
        Tuples ``(x, y)`` of NumPy arrays built from one window.
    """
    steps_per_epoch = int(samples_per_epoch / batch_size)
    while True:
        for step in range(steps_per_epoch):
            start = step * batch_size
            stop = start + batch_size
            encoded = []
            for sentence in sentences[start:stop]:
                encoded.append([vocabulary.get(word, 0) for word in sentence])
            yield (np.array(encoded), np.array(labels[start:stop]))
            

def build_test_4_input(vocabulary, len1, len2, len3, len4):
    """Build the four index-encoded test inputs and the submission id frame.

    Reads the raw test CSV (query, title) and the derived
    ``test_and_diff_text.csv`` (andText, diffText), pads/truncates each text
    with ``pad_sentences`` and maps tokens through ``vocabulary`` (tokens
    missing from it become 0).

    Args:
        vocabulary: Mapping from token to integer index.
        len1: Sequence length for the query input.
        len2: Sequence length for the title input.
        len3: Sequence length for the andText input.
        len4: Sequence length for the diffText input.

    Returns:
        ``(x, sub_info)`` where ``x`` is ``[query, title, andText, diffText]``
        as NumPy arrays and ``sub_info`` holds the ``query_id`` and
        ``query_title_id`` columns of the test CSV.
    """
    def to_ids(padded):
        return np.array([[vocabulary.get(word, 0) for word in sentence] for sentence in padded])

    raw = pd.read_csv(
        "/home/kesci/input/bytedance/first-round/test.csv",
        header=None,
        names=['query_id', 'query', 'query_title_id', 'title'],
    )

    print('loading 4 input test data .....')
    query_padded = pad_sentences([s.split() for s in raw['query']], len1)[0]
    title_padded = pad_sentences([s.split() for s in raw['title']], len2)[0]
    sub_info = raw[['query_id', 'query_title_id']]
    # Drop the raw frame early to keep peak memory down.
    del raw

    derived = pd.read_csv("/home/kesci/work/test_and_diff_text.csv")
    # str() guards against non-string cells — presumably empty texts that
    # come back from the CSV as NaN; confirm against the generated file.
    and_tokens = [str(s).split() for s in derived['andText']]
    print('padding 2-------')
    and_padded = pad_sentences(and_tokens, len3)[0]
    del and_tokens
    diff_tokens = [str(s).split() for s in derived['diffText']]
    del derived
    diff_padded = pad_sentences(diff_tokens, len4)[0]
    del diff_tokens

    print('transfer ------')
    x = []
    x.append(to_ids(query_padded))
    del query_padded
    x.append(to_ids(title_padded))
    del title_padded
    x.append(to_ids(and_padded))
    del and_padded
    x.append(to_ids(diff_padded))
    del diff_padded
    return x, sub_info


def generator_4_input(vocabulary, len1, len2, len3, len4, batch_size=50000, samples=1000000):
    """Endlessly yield 4-input training batches streamed from the CSV files.

    Every pass re-opens the raw training CSV and the derived
    ``15M_and_diff_text.csv`` from the start and reads
    ``int(samples / batch_size)`` chunks of ``batch_size`` rows from each,
    in lock-step.

    Args:
        vocabulary: Mapping from token to integer index; unknown tokens
            are encoded as 0.
        len1: Sequence length for the query input.
        len2: Sequence length for the title input.
        len3: Sequence length for the andText input.
        len4: Sequence length for the diffText input.
        batch_size: Rows per yielded batch.
        samples: Rows consumed per pass before the files are re-opened.

    Yields:
        ``(x, y)`` where ``x`` is ``[query, title, andText, diffText]`` as
        NumPy arrays and ``y`` is the label array of the same rows.
    """
    steps_per_epoch = int(samples / batch_size)

    def encode(token_lists, length):
        padded, _ = pad_sentences(token_lists, length)
        return np.array([[vocabulary.get(word, 0) for word in sentence]
                         for sentence in padded])

    while True:
        raw_reader = pd.read_csv(
            "/home/kesci/input/bytedance/first-round/train.csv",
            header=None,
            names=['query_id', 'query', 'query_title_id', 'title', 'label'],
            iterator=True,
        )
        derived_reader = pd.read_csv(
            "/home/kesci/work/15M_and_diff_text.csv",
            iterator=True,
        )
        for _ in range(steps_per_epoch):
            raw = raw_reader.get_chunk(batch_size)
            derived = derived_reader.get_chunk(batch_size)

            features = [
                encode([s.split() for s in raw['query']], len1),
                encode([s.split() for s in raw['title']], len2),
                # str() guards against non-string cells in the derived file.
                encode([str(s).split() for s in derived['andText']], len3),
                encode([str(s).split() for s in derived['diffText']], len4),
            ]
            yield (features, np.array(raw['label']))
         
        
def build_valid_4_input(vocabulary, len1, len2, len3, len4):
    """Build the 4-input validation set from rows 14M-15M of the train data.

    Skips the first 14 chunks of 1,000,000 rows in both the raw training CSV
    and the derived ``15M_and_diff_text.csv``, then encodes the next
    1,000,000 rows.

    Args:
        vocabulary: Mapping from token to integer index; unknown tokens
            are encoded as 0.
        len1: Sequence length for the query input.
        len2: Sequence length for the title input.
        len3: Sequence length for the andText input.
        len4: Sequence length for the diffText input.

    Returns:
        ``(x, y)`` where ``x`` is ``[query, title, andText, diffText]`` as
        NumPy arrays and ``y`` is the label array.
    """
    def encode(token_lists, length):
        padded, _ = pad_sentences(token_lists, length)
        return np.array([[vocabulary.get(word, 0) for word in sentence]
                         for sentence in padded])

    raw_reader = pd.read_csv(
        "/home/kesci/input/bytedance/first-round/train.csv",
        header=None,
        names=['query_id', 'query', 'query_title_id', 'title', 'label'],
        iterator=True,
    )
    derived_reader = pd.read_csv(
        "/home/kesci/work/15M_and_diff_text.csv",
        iterator=True,
    )

    # Discard the first 14M rows of both files; they are kept for training.
    skipped = 0
    while skipped < 14:
        raw_reader.get_chunk(1000000)
        derived_reader.get_chunk(1000000)
        skipped += 1

    print('从1400万开始取valid data ......')

    # Rows 14M..15M form the validation split.
    raw = raw_reader.get_chunk(1000000)
    derived = derived_reader.get_chunk(1000000)

    features = [
        encode([s.split() for s in raw['query']], len1),
        encode([s.split() for s in raw['title']], len2),
        encode([str(s).split() for s in derived['andText']], len3),
        encode([str(s).split() for s in derived['diffText']], len4),
    ]
    targets = np.array(raw['label'])
    print('build valid data over ......')
    return (features, targets)


def build_test_2_input(vocabulary):
    """Build the two index-encoded test inputs (andText, diffText).

    The raw test CSV is read only to obtain the submission id columns; the
    model inputs come from ``test_and_diff_text.csv``. Sequence lengths are
    fixed at 10 (andText) and 15 (diffText).

    Args:
        vocabulary: Mapping from token to integer index; unknown tokens
            are encoded as 0.

    Returns:
        ``(x, sub_info)`` where ``x`` is ``[andText, diffText]`` as NumPy
        arrays and ``sub_info`` holds ``query_id`` and ``query_title_id``.
    """
    and_length, diff_length = 10, 15

    def to_ids(padded):
        return np.array([[vocabulary.get(word, 0) for word in sentence] for sentence in padded])

    raw = pd.read_csv(
        "/home/kesci/input/bytedance/first-round/test.csv",
        header=None,
        names=['query_id', 'query', 'query_title_id', 'title'],
    )

    print('loding')
    sub_info = raw[['query_id', 'query_title_id']]
    del raw

    derived = pd.read_csv("/home/kesci/work/test_and_diff_text.csv")
    and_tokens = [str(s).split() for s in derived['andText']]
    print('padding 2-------')
    and_padded = pad_sentences(and_tokens, and_length)[0]
    del and_tokens
    diff_tokens = [str(s).split() for s in derived['diffText']]
    del derived
    diff_padded = pad_sentences(diff_tokens, diff_length)[0]
    del diff_tokens

    print('transfer ------')
    and_ids = to_ids(and_padded)
    del and_padded
    diff_ids = to_ids(diff_padded)
    del diff_padded
    return [and_ids, diff_ids], sub_info
    
    
def build_test_3_input(vocabulary, len1, len2, len3):
    """Build the three index-encoded test inputs (query, andText, diffText).

    Args:
        vocabulary: Mapping from token to integer index; unknown tokens
            are encoded as 0.
        len1: Sequence length for the query input.
        len2: Sequence length for the andText input.
        len3: Sequence length for the diffText input.

    Returns:
        ``(x, sub_info)`` where ``x`` is ``[query, andText, diffText]`` as
        NumPy arrays and ``sub_info`` holds ``query_id`` and
        ``query_title_id`` from the raw test CSV.
    """
    def to_ids(padded):
        return np.array([[vocabulary.get(word, 0) for word in sentence] for sentence in padded])

    raw = pd.read_csv(
        "/home/kesci/input/bytedance/first-round/test.csv",
        header=None,
        names=['query_id', 'query', 'query_title_id', 'title'],
    )

    print('loding')
    query_padded = pad_sentences([s.split() for s in raw['query']], len1)[0]
    sub_info = raw[['query_id', 'query_title_id']]
    # Free the raw frame before loading the derived texts.
    del raw

    derived = pd.read_csv("/home/kesci/work/test_and_diff_text.csv")
    and_tokens = [str(s).split() for s in derived['andText']]
    print('padding 2-------')
    and_padded = pad_sentences(and_tokens, len2)[0]
    del and_tokens
    diff_tokens = [str(s).split() for s in derived['diffText']]
    del derived
    diff_padded = pad_sentences(diff_tokens, len3)[0]
    del diff_tokens

    print('transfer ------')
    query_ids = to_ids(query_padded)
    del query_padded
    and_ids = to_ids(and_padded)
    del and_padded
    diff_ids = to_ids(diff_padded)
    del diff_padded
    return [query_ids, and_ids, diff_ids], sub_info



def generator_3_input(vocabulary, len1, len2, len3, batch_size=50000, samples=1000000):
    """Endlessly yield 3-input training batches (query, andText, diffText).

    Every pass re-opens the raw training CSV and the derived
    ``15M_and_diff_text.csv`` and reads ``int(samples / batch_size)`` chunks
    of ``batch_size`` rows from each, in lock-step.

    Args:
        vocabulary: Mapping from token to integer index; unknown tokens
            are encoded as 0.
        len1: Sequence length for the query input.
        len2: Sequence length for the andText input.
        len3: Sequence length for the diffText input.
        batch_size: Rows per yielded batch.
        samples: Rows consumed per pass before the files are re-opened.

    Yields:
        ``(x, y)`` where ``x`` is ``[query, andText, diffText]`` as NumPy
        arrays and ``y`` is the label array of the same rows.
    """
    steps_per_epoch = int(samples / batch_size)

    def encode(token_lists, length):
        padded, _ = pad_sentences(token_lists, length)
        return np.array([[vocabulary.get(word, 0) for word in sentence]
                         for sentence in padded])

    while True:
        raw_reader = pd.read_csv(
            "/home/kesci/input/bytedance/first-round/train.csv",
            header=None,
            names=['query_id', 'query', 'query_title_id', 'title', 'label'],
            iterator=True,
        )
        derived_reader = pd.read_csv(
            "/home/kesci/work/15M_and_diff_text.csv",
            iterator=True,
        )
        for _ in range(steps_per_epoch):
            raw = raw_reader.get_chunk(batch_size)
            derived = derived_reader.get_chunk(batch_size)

            features = [
                encode([s.split() for s in raw['query']], len1),
                encode([str(s).split() for s in derived['andText']], len2),
                encode([str(s).split() for s in derived['diffText']], len3),
            ]
            yield (features, np.array(raw['label']))

            
def generator_2_input(vocabulary, batch_size=50000, samples=1000000):
    """Endlessly yield 2-input training batches (andText, diffText).

    Labels come from the raw training CSV, the two text inputs from the
    derived ``15M_and_diff_text.csv``; both files are re-opened at the start
    of every pass and read in lock-step. Sequence lengths are fixed at
    10 (andText) and 15 (diffText).

    Args:
        vocabulary: Mapping from token to integer index; unknown tokens
            are encoded as 0.
        batch_size: Rows per yielded batch.
        samples: Rows consumed per pass before the files are re-opened.

    Yields:
        ``(x, y)`` where ``x`` is ``[andText, diffText]`` as NumPy arrays and
        ``y`` is the label array of the same rows.
    """
    steps_per_epoch = int(samples / batch_size)
    and_length, diff_length = 10, 15

    def encode(token_lists, length):
        padded, _ = pad_sentences(token_lists, length)
        return np.array([[vocabulary.get(word, 0) for word in sentence]
                         for sentence in padded])

    while True:
        raw_reader = pd.read_csv(
            "/home/kesci/input/bytedance/first-round/train.csv",
            header=None,
            names=['query_id', 'query', 'query_title_id', 'title', 'label'],
            iterator=True,
        )
        derived_reader = pd.read_csv(
            "/home/kesci/work/15M_and_diff_text.csv",
            iterator=True,
        )
        for _ in range(steps_per_epoch):
            targets = raw_reader.get_chunk(batch_size)['label']
            derived = derived_reader.get_chunk(batch_size)

            features = [
                encode([str(s).split() for s in derived['andText']], and_length),
                encode([str(s).split() for s in derived['diffText']], diff_length),
            ]
            yield (features, np.array(targets))
    

定义NN层，Keras based on tensorflow

In [2]:
import tensorflow as tf
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, SpatialDropout1D
from keras.engine import Layer
from keras.layers import Activation, Add, Bidirectional, Conv1D, Dense, Dropout, Embedding, Flatten
from keras.layers import concatenate, GRU, Input, K, LSTM, MaxPooling1D
from keras.models import Model
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import MaxPool1D
from keras.engine import InputSpec
from keras.layers.pooling import MaxPool1D, AveragePooling1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from sklearn.metrics import roc_curve
from sklearn.metrics import auc


def TextCNN(sequence_length,
            num_filters,
            voc,
            dense_size,
            drop_out_rate=0.4,
            embedding_size=200,
            filter_sizes=(3, 4, 5)
            ):
    """Build and compile a multi-width 1-D convolutional text classifier.

    Args:
        sequence_length: Number of token ids per input sample.
        num_filters: Number of convolution filters per filter size.
        voc: Vocabulary; only ``len(voc)`` is used, as the embedding input size.
        dense_size: Number of units in the sigmoid output layer.
        drop_out_rate: Dropout rate applied before the output layer.
        embedding_size: Dimensionality of the trainable embedding.
        filter_sizes: Convolution kernel widths, one conv+pool branch each.
            An immutable tuple is used as the default so it cannot be shared
            and mutated across calls.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, Adam, and the
        ``auc_score`` and accuracy metrics.
    """
    # input layer
    input1 = Input(shape=(sequence_length,))
    embed_layer = Embedding(len(voc), embedding_size, trainable=True)(input1)
    embed_layer = SpatialDropout1D(0.3)(embed_layer)

    # create a convolution + maxpool layer for each filter size
    pooled_outputs = []
    for filter_size in filter_sizes:
        x = Conv1D(num_filters, filter_size, activation='relu')(embed_layer)
        # Pool window equals the full conv output length, i.e. one max per filter.
        x = MaxPool1D(int(x.shape[1]))(x)
        pooled_outputs.append(x)

    merged = concatenate(pooled_outputs)
    x = Flatten()(merged)
    drop = Dropout(drop_out_rate)(x)
    outputs = Dense(dense_size, activation='sigmoid')(drop)

    model = Model(input1, outputs)
    model.compile(loss="binary_crossentropy", optimizer='adam', metrics=[auc_score, 'accuracy'])
    return model


class KMaxPooling(Layer):
    """K-max pooling over the second axis of a 3-D input.

    For every position along the last axis of the input, the ``k`` largest
    values along the second axis are kept (sorted in descending order) and
    the result is flattened to 2-D.
    Assumes the input is laid out as (batch, steps, features) — TODO confirm
    against callers.
    """

    def __init__(self, k=1, **kwargs):
        """Create the layer.

        Args:
            k: Number of top values kept per feature.
            **kwargs: Forwarded to the Keras ``Layer`` constructor.
        """
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k

    def compute_output_shape(self, input_shape):
        """Return ``(batch, last_axis_size * k)`` for the flattened output."""
        return (input_shape[0], (input_shape[2] * self.k))

    def call(self, inputs):
        """Return the flattened top-k values of ``inputs``."""
        # tf.nn.top_k works on the last axis, so move the second axis there.
        shifted_input = tf.transpose(inputs, [0, 2, 1])
        top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0]
        # return flattened output
        return Flatten()(top_k)

    def get_config(self):
        """Return the layer config including ``k``.

        Without this, a model saved and reloaded from its config would
        rebuild the layer with the default ``k``.
        """
        config = super().get_config()
        config['k'] = self.k
        return config

    
def simple_2_input(len_2, len_3, voc, dropout_p, num_class=1, drop_out_p=0.5, embedding_size=128):
    """Build a two-input pooling model over the intersection and difference texts.

    Each input has its own trainable embedding; the average- and max-pooled
    embeddings of both inputs are concatenated and fed through a 100-unit
    dense layer, dropout and a sigmoid output layer.

    Args:
        len_2: Sequence length of the first input.
        len_3: Sequence length of the second input.
        voc: Vocabulary; only ``len(voc)`` is used.
        dropout_p: Accepted but not used by this function.
        num_class: Number of output units.
        drop_out_p: Dropout rate after the hidden dense layer.
        embedding_size: Dimensionality of both embeddings.

    Returns:
        A compiled Keras ``Model`` taking ``[input2, input3]``.
    """
    vocab_size = len(voc)

    first_in = Input(shape=(len_2,))
    second_in = Input(shape=(len_3,))

    # Separate (non-shared) embedding tables for the two inputs.
    first_embedding = Embedding(vocab_size, embedding_size, trainable=True)
    second_embedding = Embedding(vocab_size, embedding_size, trainable=True)
    first_seq = first_embedding(first_in)
    second_seq = second_embedding(second_in)

    first_avg = GlobalAveragePooling1D()(first_seq)
    second_avg = GlobalAveragePooling1D()(second_seq)

    first_max = GlobalMaxPooling1D()(first_seq)
    second_max = GlobalMaxPooling1D()(second_seq)

    features = concatenate([first_avg, first_max, second_avg, second_max])
    hidden = Dense(100)(features)
    hidden = Dropout(drop_out_p)(hidden)
    prediction = Dense(num_class, activation='sigmoid')(hidden)

    net = Model(inputs=[first_in, second_in], outputs=prediction)
    net.compile(loss="binary_crossentropy", optimizer='adam', metrics=[auc_score, 'accuracy'])
    return net


def simple_4_input(len_1, len_2, len_3, len_4, voc, dropout_p, num_class=1, drop_out_p=0.5, embedding_size=128):
    """Build a four-input pooling model (query, title, intersection, difference).

    All inputs share one trainable embedding. The average-pooled embeddings
    of the four inputs followed by their max-pooled embeddings are
    concatenated and fed through a 100-unit dense layer, dropout and a
    sigmoid output layer.

    Args:
        len_1: Sequence length of the first input.
        len_2: Sequence length of the second input.
        len_3: Sequence length of the third input.
        len_4: Sequence length of the fourth input.
        voc: Vocabulary; only ``len(voc)`` is used.
        dropout_p: Accepted but not used by this function.
        num_class: Number of output units.
        drop_out_p: Dropout rate after the hidden dense layer.
        embedding_size: Dimensionality of the shared embedding.

    Returns:
        A compiled Keras ``Model`` taking the four inputs in order.
    """
    model_inputs = [Input(shape=(length,)) for length in (len_1, len_2, len_3, len_4)]

    shared_embedding = Embedding(len(voc), embedding_size, trainable=True)
    sequences = [shared_embedding(tensor) for tensor in model_inputs]

    averages = [GlobalAveragePooling1D()(seq) for seq in sequences]
    maxima = [GlobalMaxPooling1D()(seq) for seq in sequences]

    # Feature order: all four averages first, then all four maxima.
    features = concatenate(averages + maxima)
    hidden = Dense(100)(features)
    hidden = Dropout(drop_out_p)(hidden)
    prediction = Dense(num_class, activation='sigmoid')(hidden)

    net = Model(inputs=model_inputs, outputs=prediction)
    net.compile(loss="binary_crossentropy", optimizer='adam', metrics=[auc_score, 'accuracy'])
    return net

def simple_3_input(len_1, len_2, len_3, voc, dropout_p, num_class=1, drop_out_p=0.5, embedding_size=128):
    """Build a three-input pooling model (query, intersection, difference).

    All inputs share one trainable embedding. For each input the average-
    and max-pooled embeddings are concatenated (average then max, input by
    input) and fed through a 100-unit dense layer, dropout and a sigmoid
    output layer.

    Args:
        len_1: Sequence length of the first input.
        len_2: Sequence length of the second input.
        len_3: Sequence length of the third input.
        voc: Vocabulary; only ``len(voc)`` is used.
        dropout_p: Accepted but not used by this function.
        num_class: Number of output units.
        drop_out_p: Dropout rate after the hidden dense layer.
        embedding_size: Dimensionality of the shared embedding.

    Returns:
        A compiled Keras ``Model`` taking the three inputs in order.
    """
    model_inputs = [Input(shape=(length,)) for length in (len_1, len_2, len_3)]

    shared_embedding = Embedding(len(voc), embedding_size, trainable=True)
    sequences = [shared_embedding(tensor) for tensor in model_inputs]

    averages = [GlobalAveragePooling1D()(seq) for seq in sequences]
    maxima = [GlobalMaxPooling1D()(seq) for seq in sequences]

    # Feature order: ave1, max1, ave2, max2, ave3, max3.
    interleaved = [pooled for pair in zip(averages, maxima) for pooled in pair]
    features = concatenate(interleaved)
    hidden = Dense(100)(features)
    hidden = Dropout(drop_out_p)(hidden)
    prediction = Dense(num_class, activation='sigmoid')(hidden)

    net = Model(inputs=model_inputs, outputs=prediction)
    net.compile(loss="binary_crossentropy", optimizer='adam', metrics=[auc_score, 'accuracy'])
    return net
    
    
def simpleLayer(sequence_length, voc, dropout_p, num_class=1, drop_out_p=0.5, embedding_size=128, chose='1'):
    """Build a single-input pooling model over an embedded token sequence.

    The embedded sequence is average-pooled and combined with a second
    pooled view selected by ``chose``; the concatenation goes through a
    64-unit dense layer, dropout and a sigmoid output layer.

    Args:
        sequence_length: Number of token ids per input sample.
        voc: Vocabulary; only ``len(voc)`` is used.
        dropout_p: Dropout rate applied to each pooled feature vector.
        num_class: Number of output units.
        drop_out_p: Dropout rate after the hidden dense layer.
        embedding_size: Dimensionality of the trainable embedding.
        chose: Feature combination. ``1`` / ``'1'`` concatenates average and
            max pooling; ``2`` / ``'2'`` concatenates average and variance
            pooling.

    Returns:
        A compiled Keras ``Model``.

    Raises:
        ValueError: If ``chose`` is not one of the supported selections.
    """
    # Accept both the int and the str spelling: the default is the string
    # '1', which the previous int-only comparison never matched.
    if chose in (1, '1'):
        use_variance = False
    elif chose in (2, '2'):
        use_variance = True
    else:
        raise ValueError("chose must be 1 or 2 (int or str), got {!r}".format(chose))

    main_input = Input(shape=(sequence_length,), dtype='float64')
    # Trainable word embedding.
    embedder = Embedding(len(voc), embedding_size, input_length=sequence_length,
                         trainable=True)
    embed = embedder(main_input)

    new_ave = GlobalAveragePooling1D()(embed)
    new_ave = Dropout(dropout_p)(new_ave)

    # Only build the branch that is actually used by the selected mode.
    if use_variance:
        # NOTE(review): VariancePooling is not defined in the visible part of
        # this file; it must be provided elsewhere for chose == 2 to work.
        second = VariancePooling()(embed)
    else:
        second = GlobalMaxPooling1D()(embed)
    second = Dropout(dropout_p)(second)

    con = concatenate([new_ave, second])
    output = Dense(64)(con)
    output = Dropout(drop_out_p)(output)
    main_output = Dense(num_class, activation='sigmoid')(output)
    model = Model(inputs=main_input, outputs=main_output)
    # NOTE(review): this model is compiled with "mse" while the other models
    # in this file use "binary_crossentropy"; left unchanged.
    model.compile(loss="mse", optimizer='adam', metrics=[auc_score, 'accuracy'])
    return model


def auc_score(y_true, y_pred):
    """Keras metric wrapper around ``tf.metrics.auc``.

    Returns the second element of the ``tf.metrics.auc`` result (the update
    op per the TensorFlow 1.x API). The metric's local variables are
    initialised in the current Keras session right after it is created.
    """
    _, auc_update = tf.metrics.auc(y_true, y_pred)
    session = K.get_session()
    session.run(tf.local_variables_initializer())
    return auc_update

ImportError: No module named tensorflow

生成训练集的query与title交集与差集文件，我先生成了前1500W的

In [None]:
# Build the intersection / difference token files for the training set.
# For every row: andText = tokens present in both query and title,
# diffText = title tokens that do not occur in the query.
import time

path = '/home/kesci/input/bytedance/first-round/train.csv'
tra_data = pd.read_csv(
        path,
        header=None,
        names=['query_id', 'query', 'query_title_id', 'title', 'label'],
        iterator=True
    )

# Only the first 15M rows are processed.
chunks = tra_data.get_chunk(15000000)
t1 = time.time()
print('begin----')

and_list = []  # intersection
diff_list = []  # difference
# Iterate the two columns in lock-step instead of label-indexing each row:
# this is much faster and does not fail when fewer than 15M rows were read.
for i, (query, title) in enumerate(zip(chunks['query'], chunks['title'])):
    if i % 500000 == 0:
        print(i, round(time.time() - t1, 4))
    que_set = set(query.split())
    tit_set = set(title.split())
    and_list.append(' '.join(que_set & tit_set))
    diff_list.append(' '.join(tit_set - que_set))
print()
print('生成并集与交集文件总共耗费了{}秒'.format( round(time.time() - t1, 4) ))
print('and_list长度为{}'.format(len(and_list)))
print('diff_list长度为{}'.format(len(diff_list)))

df = pd.DataFrame({
    'andText':and_list,
    'diffText':diff_list
})
print(df.shape)

# Write the 15M-row intersection/difference file into the current working
# directory (presumably /home/kesci/work, where the loaders read it from).
df.to_csv('15M_and_diff_text.csv')
print(time.time() - t1)

生成测试集的交集与差集文件

In [None]:
# Build the intersection / difference token file for the test set.
# For every row: andText = tokens present in both query and title,
# diffText = title tokens that do not occur in the query.
path = '/home/kesci/input/bytedance/first-round/test.csv'
test_data = pd.read_csv(
        path,
        header=None,
        names=['query_id', 'query', 'query_title_id', 'title']
    )

chunks = test_data
t1 = time.time()
print('begin----')
and_list = []  # intersection
diff_list = []  # difference
# Iterate the two columns in lock-step; avoids a label lookup per row.
for i, (query, title) in enumerate(zip(chunks['query'], chunks['title'])):
    if i % 500000 == 0:
        print(i, round(time.time() - t1, 4))
    que_set = set(query.split())
    tit_set = set(title.split())
    and_list.append(' '.join(que_set & tit_set))
    diff_list.append(' '.join(tit_set - que_set))
print()

print('总共耗费了{}秒'.format( round(time.time() - t1, 4) ))
print('and_list长度为{}'.format(len(and_list)))
print('diff_list长度为{}'.format(len(diff_list)))

df = pd.DataFrame({
    'andText':and_list,
    'diffText':diff_list
})
print(df.shape)
df.to_csv('test_and_diff_text.csv')

构建词典，loop阈值确定了用多少训练集生成字典，例如：loop<15，代表使用1500W数据

In [None]:
import time
import pandas as pd
from collections import Counter
import numpy as np
import itertools

# Build the vocabulary from the titles of the first chunks of the train file.
path = "/home/kesci/input/bytedance/first-round/"
ts = time.time()

tra_data = pd.read_csv(
    path + "train.csv",
    header=None,
    names=['query_id', 'query', 'query_title_id', 'title', 'label'],
    iterator=True,
)

loop = 0
chunkSize = 1000000
min_count = 10  # minimum token frequency to enter the vocabulary
voc_init = Counter([])
voc_inv = ["<PAD/>"]  # the padding token occupies index 0
nums = 0.
t_ = time.time()

# Each iteration consumes one chunk of chunkSize rows; 15 iterations = 15M rows.
while loop < 15:
    loop += 1
    try:
        chunk = tra_data.get_chunk(chunkSize)
    except StopIteration:
        # File exhausted: push the counter past the limit to leave the loop.
        loop = 56
        print("Iteration is stopped.")
        continue
    sentens = [title.split() for title in chunk['title']]
    voc_init.update(itertools.chain.from_iterable(sentens))
    nums += chunk.shape[0]

# Release the last chunk before building the final mappings.
del chunk, sentens

# Index -> word: padding token first, then tokens by descending frequency.
voc_inv.extend(item[0] for item in voc_init.most_common() if item[1] >= min_count)
# Word -> index.
voc_ = dict(zip(voc_inv, range(len(voc_inv))))
print('build vocabulary over----, 用时 {} 秒'.format(round(time.time() - t_, 4)))

In [None]:
# Peek at the first ten vocabulary entries and report the vocabulary size.
first_words = list(voc_)[:10]
print(first_words)
print(len(voc_))

In [None]:
# Hyper-parameters and model construction.
voc = voc_
embedding_size = 100
# Sequence lengths of the four inputs passed to simple_4_input / the loaders.
len_1, len_2, len_3, len_4 = 20, 25, 8, 15

model = simple_4_input(
    len_1, len_2, len_3, len_4, voc,
    dropout_p=0.2,
    num_class=1,
    drop_out_p=0.2,
    embedding_size=embedding_size,
)
print('build model over ......')

In [None]:
# Validation data: encoded with the same vocabulary and sequence lengths
# as the model inputs.
valid_data = build_valid_4_input(
    vocabulary=voc, len1=len_1, len2=len_2, len3=len_3, len4=len_4)

In [None]:
classify = model
# Train on the first 14M rows: steps_per_epoch * batch_size = 14,000,000.
# Use a smaller train_samples for a quick trial run.
#
# The generator must wrap around after exactly the rows used for training.
# It previously cycled over 15M rows (300 batches) while Keras consumed 280
# batches per epoch, so from the second epoch on the training stream included
# rows 14M-15M, i.e. the validation rows built by build_valid_4_input.
train_samples = 14000000
train_batch_size = 50000
classify.fit_generator(
            generator_4_input(voc, len_1, len_2, len_3, len_4,
                              batch_size=train_batch_size, samples=train_samples),
            steps_per_epoch=train_samples // train_batch_size,
            validation_data=valid_data,
            epochs=12)

In [None]:
# Build the submission inputs and run the model on them. sub_info already
# carries the first two submission columns (query_id and query_title_id).
sub, sub_info = build_test_4_input(
    vocabulary=voc, len1=len_1, len2=len_2, len3=len_3, len4=len_4)
print('load over')
y_sub = classify.predict(sub)
print(y_sub.shape)
print('predict over -----')

In [None]:
# Write the submission file: query_id, query_title_id, prediction.
# sub_info was produced by column selection from another frame, so work on an
# explicit copy before adding a column, and flatten the (N, 1) prediction
# array to one value per row.
sub_info = sub_info.copy()
sub_info['prediction'] = np.ravel(y_sub)
sub_info.to_csv('AveMax_150embed_4_input_14M_v2.csv', header=False, index=False)