Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DSSM #141

Merged
merged 13 commits into from
Jul 13, 2017
Merged

DSSM #141

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
430 changes: 430 additions & 0 deletions dssm/README.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions dssm/data/classification/test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
苹果 苹果 6s 0
汽车 驾驶 驾校 培训 1
2 changes: 2 additions & 0 deletions dssm/data/classification/train.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
苹果 六 袋 苹果 6s 0
新手 汽车 驾驶 驾校 培训 1
2 changes: 2 additions & 0 deletions dssm/data/rank/test.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
苹果 六 袋 苹果 6s 新手 汽车 驾驶 1
新手 汽车 驾驶 驾校 培训 苹果 6s 0
2 changes: 2 additions & 0 deletions dssm/data/rank/train.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
苹果 六 袋 苹果 6s 新手 汽车 驾驶 1
新手 汽车 驾驶 驾校 培训 苹果 6s 1
10 changes: 10 additions & 0 deletions dssm/data/vocab.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
UNK
苹果
6s
新手
汽车
驾驶
驾校
培训
Binary file added dssm/images/dssm.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added dssm/images/dssm.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added dssm/images/dssm2.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added dssm/images/dssm2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added dssm/images/dssm3.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file added dssm/infer.py
Empty file.
307 changes: 307 additions & 0 deletions dssm/network_conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,307 @@
from paddle import v2 as paddle
from paddle.v2.attr import ParamAttr
from utils import TaskType, logger, ModelType, ModelArch


class DSSM(object):
def __init__(self,
dnn_dims=[],
vocab_sizes=[],
model_type=ModelType.create_classification(),
model_arch=ModelArch.create_cnn(),
share_semantic_generator=False,
class_num=None,
share_embed=False):
'''
@dnn_dims: list of int
dimentions of each layer in semantic vector generator.
@vocab_sizes: 2-d tuple
size of both left and right items.
@model_type: int
type of task, should be 'rank: 0', 'regression: 1' or 'classification: 2'
@model_arch: int
model architecture
@share_semantic_generator: bool
whether to share the semantic vector generator for both left and right.
@share_embed: bool
whether to share the embeddings between left and right.
@class_num: int
number of categories.
'''
assert len(
vocab_sizes
) == 2, "vocab_sizes specify the sizes left and right inputs, and dim should be 2."
assert len(dnn_dims) > 1, "more than two layers is needed."

self.dnn_dims = dnn_dims
self.vocab_sizes = vocab_sizes
self.share_semantic_generator = share_semantic_generator
self.share_embed = share_embed
self.model_type = ModelType(model_type)
self.model_arch = ModelArch(model_arch)
self.class_num = class_num
logger.warning("build DSSM model with config of %s, %s" %
(self.model_type, self.model_arch))
logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))

# bind model architecture
_model_arch = {
'cnn': self.create_cnn,
'fc': self.create_fc,
'rnn': self.create_rnn,
}

def _model_arch_creater(emb, prefix=''):
sent_vec = _model_arch.get(str(model_arch))(emb, prefix)
dnn = self.create_dnn(sent_vec, prefix)
return dnn

self.model_arch_creater = _model_arch_creater

# build model type
_model_type = {
'classification': self._build_classification_model,
'rank': self._build_rank_model,
'regression': self._build_regression_model,
}
print 'model type: ', str(self.model_type)
self.model_type_creater = _model_type[str(self.model_type)]

def __call__(self):
# if self.model_type.is_classification():
# return self._build_classification_model()
# return self._build_rank_model()
return self.model_type_creater()

def create_embedding(self, input, prefix=''):
'''
Create an embedding table whose name has a `prefix`.
'''
logger.info("create embedding table [%s] which dimention is %d" %
(prefix, self.dnn_dims[0]))
emb = paddle.layer.embedding(
input=input,
size=self.dnn_dims[0],
param_attr=ParamAttr(name='%s_emb.w' % prefix))
return emb

def create_fc(self, emb, prefix=''):
'''
A multi-layer fully connected neural networks.

@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `fc` parts.
'''
_input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max())
fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
return fc

def create_rnn(self, emb, prefix=''):
'''
A GRU sentence vector learner.
'''
gru = paddle.layer.gru_memory(
input=emb, )
sent_vec = paddle.layer.last_seq(gru)
return sent_vec

def create_cnn(self, emb, prefix=''):
'''
A multi-layer CNN.

@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `cnn` parts.
'''

def create_conv(context_len, hidden_size, prefix):
key = "%s_%d_%d" % (prefix, context_len, hidden_size)
conv = paddle.networks.sequence_conv_pool(
input=emb,
context_len=context_len,
hidden_size=hidden_size,
# set parameter attr for parameter sharing
context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
fc_param_attr=ParamAttr(name=key + '_fc.w'),
fc_bias_attr=ParamAttr(name=key + '_fc.b'),
pool_bias_attr=ParamAttr(name=key + '_pool.b'))
return conv

logger.info('create a sequence_conv_pool which context width is 3')
conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
logger.info('create a sequence_conv_pool which context width is 4')
conv_4 = create_conv(4, self.dnn_dims[1], "cnn")

return conv_3, conv_4

def create_dnn(self, sent_vec, prefix):
# if more than three layers, than a fc layer will be added.
if len(self.dnn_dims) > 1:
_input_layer = sent_vec
for id, dim in enumerate(self.dnn_dims[1:]):
name = "%s_fc_%d_%d" % (prefix, id, dim)
logger.info("create fc layer [%s] which dimention is %d" %
(name, dim))
fc = paddle.layer.fc(
name=name,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

如果不是显示地引用名字,去掉指定名字。

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

input=_input_layer,
size=dim,
act=paddle.activation.Tanh(),
param_attr=ParamAttr(name='%s.w' % name),
bias_attr=ParamAttr(name='%s.b' % name))
_input_layer = fc
return _input_layer

def _build_classification_model(self):
logger.info("build classification model")
assert self.model_type.is_classification()
return self._build_classification_or_regression_model(
is_classification=True)

def _build_regression_model(self):
logger.info("build regression model")
assert self.model_type.is_regression()
return self._build_classification_or_regression_model(
is_classification=False)

def _build_rank_model(self):
'''
Build a pairwise rank model, and the cost is returned.

A pairwise rank model has 3 inputs:
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
'''
logger.info("build rank model")
assert self.model_type.is_rank()
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))

prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source left right'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()

word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)

semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)

# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])

# rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score
# but this operator is not supported currently.
# so AUC will not used.
return cost, None, None

def _build_classification_or_regression_model(self, is_classification):
'''
Build a classification/regression model, and the cost is returned.

A Classification has 3 inputs:
- source sentence
- target sentence
- classification label

'''
if is_classification:
# prepare inputs.
assert self.class_num

source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_vector(1))

prefixs = '_ _'.split(
) if self.share_semantic_generator else 'left right'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'left right'.split()

word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)

semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)

if is_classification:
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(
input=prediction, label=label)
else:
prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.mse_cost(prediction, label)
return cost, prediction, label


class RankMetrics(object):
'''
A custom metrics to calculate AUC.

Paddle's rank model do not support auc evaluator directly,
to make it, infer all the outputs and use python to calculate
the metrics.
'''

def __init__(self, model_parameters, left_score_layer, right_score_layer,
label):
'''
@model_parameters: dict
model's parameters
@left_score_layer: paddle.layer
left part's score
@right_score_laeyr: paddle.layer
right part's score
@label: paddle.data_layer
label input
'''
self.inferer = paddle.inference.Inference(
output_layer=[left_score_layer, right_score_layer],
parameters=model_parameters)

def test(self, input):
scores = []
for id, rcd in enumerate(input()):
# output [left_score, right_score, label]
res = self.inferer(input=input)
scores.append(res)
print scores
Loading