In [1]:
import re,os,json
from keras import layers,models,utils
import numpy as np
import pandas as pd
import keras.backend as K
import tensorflow as tf
from keras.utils import plot_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def reset_everything():
    tf.reset_default_graph()

In [3]:
VOCAB_SIZE = 250000
EMBEDDING_SIZE = 100
MAX_DOC_LEN = 128
MIN_DOC_LEN = 12
# xml_7z = utils.get_file(
#     fname='travel.stackexchange.com.7z',
#     origin='https://ia800107.us.archive.org/27/items/stackexchange/travel.stackexchange.com.7z',
# )


In [4]:
def extract_stackexchange(filename, limit=100000):
    json_file = filename + 'limit=%s.json' % limit

    rows = []
#     f = os.popen('7z x -so "%s" Posts.xml' % filename) #解压后得到Posts.xml文件
    with open(filename) as f:
        for i, line in enumerate(f):
            line = str(line)
            if not line.startswith('  <row'):
                continue

            if i % 1000 == 0:
                print('\r%05d/%05d' % (i, limit), end='', flush=True)

            parts = line[6:-5].split('"')
            record = {}
            for i in range(0, len(parts), 2):
                k = parts[i].replace('=', '').strip()
                v = parts[i+1].strip()
                record[k] = v
            rows.append(record)

            if len(rows) > limit:
                break
    
    with open(json_file, 'w') as fout:
        json.dump(rows, fout)
    
    return rows

rows = extract_stackexchange('data/Posts.xml')

95000/100000

In [5]:
df = pd.DataFrame.from_records(rows)
df = df.set_index('Id', drop=False)
df['Title'] = df['Title'].fillna('').astype('str')
df['Tags'] = df['Tags'].fillna('').astype('str')
df['Body'] = df['Body'].fillna('').astype('str')
df['Id'] = df['Id'].astype('int')
df['PostTypeId'] = df['PostTypeId'].astype('int')
df['ViewCount'] = df['ViewCount'].astype('float')

### tokenizer

In [6]:
# df[df['ViewCount'] > 500]['Title']
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words = VOCAB_SIZE)
tokenizer.fit_on_texts(df['Body'] +' '+ df['Title'])

### compute  word tf/idf value

In [7]:
total_count = sum(tokenizer.word_counts.values())
idf = {k:np.log(total_count/v) for k,v in tokenizer.word_counts.items()}

In [8]:
df['title_tokens'] = tokenizer.texts_to_sequences(df['Title']) # token-> id
df['body_tokens'] = tokenizer.texts_to_sequences(df['Body'])

### 数据增强，生成样本

In [9]:
import random
def data_genator(batch_size, negative_sample = 1):
    """
    data augment 用来产生样本对[(a,b,0),(a,b,1)]
    """
    questions = df[df['PostTypeId'] == 1]
    all_q_id = list(questions.index) # 所有问题的idx
    batch_x_a = []
    batch_x_b = []
    batch_y = []
    
    def _add(x_a,x_b,y):
        batch_x_a.append(x_a[:MAX_DOC_LEN])
        batch_x_b.append(x_b[:MAX_DOC_LEN])
        batch_y.append(y)
        
    while True:
        questions = questions.sample(frac=1.0)
        for i, q in questions.iterrows():
            _add(q['title_tokens'], q['body_tokens'], 1)
            negative_q = random.sample(all_q_id, negative_sample)
            for nq_id in negative_q:
                _add(q['title_tokens'], df.at[nq_id, 'body_tokens'], 0) # 负样本
            if len(batch_y) >= batch_size:
                yield({
                    'title':pad_sequences(batch_x_a, maxlen=MAX_DOC_LEN), # 默认向前pad
                    'body':pad_sequences(batch_x_b, maxlen=MAX_DOC_LEN)
                }, np.array(batch_y))
                
                batch_x_a = []
                batch_x_b = []
                batch_y = []


### a simple model

In [10]:

def sum_model_v1(embedding_size=EMBEDDING_SIZE,vocab_size=VOCAB_SIZE):
    """
    返回的为两个模型,1、分类使用 2、embedding
    """
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')

    # This layer can only be used as the first layer in a model
    embedding = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=embedding_size)
    mask = layers.Masking(mask_value=0)
    def _combine_sum(v):
        return K.sum(v, axis=2)
    sum_layer = layers.Lambda(_combine_sum)
    title_sum = sum_layer(mask(embedding(title)))
    body_sum = sum_layer(mask(embedding(body)))
    sim = layers.dot([title_sum, body_sum], normalize=True, axes=1)
    sim_model = models.Model(input=[title, body], outputs=[sim])
    sim_model.compile(loss='mae', optimizer='rmsprop')
    sim_model.summary()
    
    embedding_model = models.Model(input=[title], outputs=[title_sum])
    
    return sim_model,embedding_model

### 查看模型的结构

In [11]:
sim_model,embedding_model = sum_model_v1()
# plot_model(sim_model,'model.png') # 绘制出模型的

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
body (InputLayer)               (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 100)    25000000    title[0][0]                      
                                                                 body[0][0]                       
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 100)    0           embedding_1[0][0]                
          



### Train model

In [12]:
sim_model.fit_generator(data_genator(batch_size=128), epochs=10, steps_per_epoch=100)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0xb40cfc438>

In [13]:
x, y = next(data_genator(batch_size=4096))
sim_model.evaluate(x,y)



0.48628981900401413

In [14]:
questions = df[df['PostTypeId'] == 1]['Title'].reset_index(drop=True)
question_tokens = pad_sequences(tokenizer.texts_to_sequences(questions))

class EmbeddingWrapper(object):
    def __init__(self, model):
        self._r = questions
        self._i = {i:s for (i,s) in enumerate(questions)}
        self._w = model.predict({'title':question_tokens}, verbose=1, batch_size=1024)
        self._model = model
        self._norm = np.sqrt(np.sum(self._w **2 + le-5, axis=1))
    def nearest(self, sentence, n=10):
        x = tokenizer.texts_to_sequences([sentence])
        if len(x[0]) < MIN_DOC_LEN:
            x[0] += [0] * (MIN_DOC_LEN - len(x))
        e = self._model.predict(np.asarray(x))[0]
        norm_e = np.sqrt(np.dot(e, e))
        dist = np.dot(self._w, e) / (norm_e * self._norm)

        top_idx = np.argsort(dist)[-n:]
        return pd.DataFrame.from_records([
            {'question': self._r[i], 'dist': float(dist[i])}
            for i in top_idx
        ])


In [None]:
lookup = EmbeddingWrapper(model=embedding_model)
lookup.nearest('Python Postgres object relational model')



### weighted model # todo

In [15]:
def sum_model(embedding_size, vocab_size, embedding_weight = None,idf_weight=None):
    title = layers.Input(shape=(None,), dtype='int32', name='title')
    body = layers.Input(shape=(None,), dtype='int32', name='body')
    def make_embedding(name):
        if embedding_weight is not None:
            embedding = layers.Embedding(mask_zero=True, 
                                         input_dim=vocab_size, 
                                         output_dim=w2v_weights.shape[1], 
                                         weights=[w2v_weights],
                                        trainable=False,name=f'{name}/embedding')
            
        else:
            embedding = layers.Embedding(mask_zero=True,
                                        input_dim=vocab_size,
                                        output_dim=embedding_size,
                                        name=f'{name}/embedding')
            
            
        if idf_weight is not None:
            idf = layers.Embedding(mask_zero=True,
                                   input_dim=vocab_size, 
                                   output_dim=1,
                                   weights=[idf_weight],
                                   trainable=False,
                                   name=f'{name}/idf')
        else:
            idf = layers.Embedding(mask_zero=True, input_dim=vocab_size, output_dim=1, name=f'{name}/idf')
        return embedding, idf
    embedding_a, idf_a = make_embedding('a')
    embedding_b, idf_b = make_embedding('b') # 需要共享参数
    
    mask = layers.Masking(mask_value=0)
    def _combine_and_sum(aegs):
        embedding, idf = args
        return K.sum(embedding * K.abs(idf), axis=1)
    
    
        