Skip to content

Commit

Permalink
ReplicatedPG: Don't cache recovery and scrub data
Browse files Browse the repository at this point in the history
Signed-off-by: Haomai Wang <haomaiwang@gmail.com>
  • Loading branch information
yuyuyu101 committed Aug 14, 2015
1 parent 1092ed9 commit fabd635
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 20 deletions.
5 changes: 4 additions & 1 deletion src/osd/ECBackend.cc
Expand Up @@ -1778,6 +1778,9 @@ void ECBackend::be_deep_scrub(
if (stride % sinfo.get_chunk_size())
stride += sinfo.get_chunk_size() - (stride % sinfo.get_chunk_size());
uint64_t pos = 0;

uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;

while (true) {
bufferlist bl;
handle.reset_tp_timeout();
Expand All @@ -1787,7 +1790,7 @@ void ECBackend::be_deep_scrub(
poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
pos,
stride, bl,
true);
fadvise_flags, true);
if (r < 0)
break;
if (bl.length() % sinfo.get_chunk_size()) {
Expand Down
3 changes: 3 additions & 0 deletions src/osd/PGBackend.h
Expand Up @@ -250,6 +250,9 @@
* the pending recovery operations.
*/
struct RecoveryHandle {
// Cache hint carried with a batch of recovery operations. In the
// ReplicatedBackend code that consumes it, a true value causes
// CEPH_OSD_OP_FLAG_FADVISE_DONTNEED to be added to the object store
// reads (build_push_op) and writes (submit_push_data) issued for the
// recovery; a false value leaves that flag off.
// NOTE(review): the constructor defaults this to false, and the two
// visible callers (wait_for_unreadable_object / wait_for_degraded_object)
// also assign false explicitly -- confirm where background recovery is
// expected to turn the hint on.
bool cache_dont_need;

// Starts with the DONTNEED hint disabled.
RecoveryHandle(): cache_dont_need(false) {}
// Virtual: handles are held and passed around as RecoveryHandle*.
virtual ~RecoveryHandle() {}
};

Expand Down
42 changes: 26 additions & 16 deletions src/osd/ReplicatedBackend.cc
Expand Up @@ -742,13 +742,16 @@ void ReplicatedBackend::be_deep_scrub(
bufferlist bl, hdrbl;
int r;
__u64 pos = 0;

uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;

while ( (r = store->read(
coll,
ghobject_t(
poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
pos,
cct->_conf->osd_deep_scrub_stride, bl,
true)) > 0) {
coll,
ghobject_t(
poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
pos,
cct->_conf->osd_deep_scrub_stride, bl,
fadvise_flags, true)) > 0) {
handle.reset_tp_timeout();
h << bl;
pos += bl.length();
Expand Down Expand Up @@ -1518,6 +1521,7 @@ void ReplicatedBackend::prepare_pull(
pi.head_ctx = headctx;
pi.recovery_info = op.recovery_info;
pi.recovery_progress = op.recovery_progress;
pi.cache_dont_need = h->cache_dont_need;
}

/*
Expand All @@ -1526,7 +1530,7 @@ void ReplicatedBackend::prepare_pull(
*/
void ReplicatedBackend::prep_push_to_replica(
ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
PushOp *pop)
PushOp *pop, bool cache_dont_need)
{
const object_info_t& oi = obc->obs.oi;
uint64_t size = obc->obs.oi.size;
Expand Down Expand Up @@ -1582,7 +1586,7 @@ void ReplicatedBackend::prep_push_to_replica(
data_subset, clone_subsets);
}

prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop);
prep_push(obc, soid, peer, oi.version, data_subset, clone_subsets, pop, cache_dont_need);
}

void ReplicatedBackend::prep_push(ObjectContextRef obc,
Expand All @@ -1605,7 +1609,7 @@ void ReplicatedBackend::prep_push(
eversion_t version,
interval_set<uint64_t> &data_subset,
map<hobject_t, interval_set<uint64_t> >& clone_subsets,
PushOp *pop)
PushOp *pop, bool cache_dont_need)
{
get_parent()->begin_peer_recover(peer, soid);
// take note.
Expand All @@ -1627,7 +1631,7 @@ void ReplicatedBackend::prep_push(
pi.recovery_progress,
&new_progress,
pop,
&(pi.stat));
&(pi.stat), cache_dont_need);
assert(r == 0);
pi.recovery_progress = new_progress;
}
Expand Down Expand Up @@ -1671,6 +1675,7 @@ void ReplicatedBackend::submit_push_data(
ObjectRecoveryInfo &recovery_info,
bool first,
bool complete,
bool cache_dont_need,
const interval_set<uint64_t> &intervals_included,
bufferlist data_included,
bufferlist omap_header,
Expand Down Expand Up @@ -1698,13 +1703,16 @@ void ReplicatedBackend::submit_push_data(
t->omap_setheader(coll, ghobject_t(target_oid), omap_header);
}
uint64_t off = 0;
uint32_t fadvise_flags = CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL;
if (cache_dont_need)
fadvise_flags |= CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
for (interval_set<uint64_t>::const_iterator p = intervals_included.begin();
p != intervals_included.end();
++p) {
bufferlist bit;
bit.substr_of(data_included, off, p.get_len());
t->write(coll, ghobject_t(target_oid),
p.get_start(), p.get_len(), bit);
p.get_start(), p.get_len(), bit, fadvise_flags);
off += p.get_len();
}

Expand Down Expand Up @@ -1827,7 +1835,7 @@ bool ReplicatedBackend::handle_pull_response(
bool complete = pi.is_complete();

submit_push_data(pi.recovery_info, first,
complete,
complete, pi.cache_dont_need,
data_included, data,
pop.omap_header,
pop.attrset,
Expand Down Expand Up @@ -1871,6 +1879,7 @@ void ReplicatedBackend::handle_push(
submit_push_data(pop.recovery_info,
first,
complete,
true, // must be a replica
pop.data_included,
data,
pop.omap_header,
Expand Down Expand Up @@ -1950,7 +1959,8 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
const ObjectRecoveryProgress &progress,
ObjectRecoveryProgress *out_progress,
PushOp *out_op,
object_stat_sum_t *stat)
object_stat_sum_t *stat,
bool cache_dont_need)
{
ObjectRecoveryProgress _new_progress;
if (!out_progress)
Expand Down Expand Up @@ -2042,7 +2052,8 @@ int ReplicatedBackend::build_push_op(const ObjectRecoveryInfo &recovery_info,
++p) {
bufferlist bit;
store->read(coll, ghobject_t(recovery_info.soid),
p.get_start(), p.get_len(), bit);
p.get_start(), p.get_len(), bit,
cache_dont_need ? CEPH_OSD_OP_FLAG_FADVISE_DONTNEED: 0);
if (p.get_len() != bit.length()) {
dout(10) << " extent " << p.get_start() << "~" << p.get_len()
<< " is actually " << p.get_start() << "~" << bit.length()
Expand Down Expand Up @@ -2387,8 +2398,7 @@ int ReplicatedBackend::start_pushes(
++pushes;
h->pushes[peer].push_back(PushOp());
prep_push_to_replica(obc, soid, peer,
&(h->pushes[peer].back())
);
&(h->pushes[peer].back()), h->cache_dont_need);
}
}
return pushes;
Expand Down
10 changes: 7 additions & 3 deletions src/osd/ReplicatedBackend.h
Expand Up @@ -191,6 +191,7 @@ class ReplicatedBackend : public PGBackend {
ObjectContextRef head_ctx;
ObjectContextRef obc;
object_stat_sum_t stat;
bool cache_dont_need;

void dump(Formatter *f) const {
{
Expand Down Expand Up @@ -262,10 +263,12 @@ class ReplicatedBackend : public PGBackend {
const ObjectRecoveryProgress &progress,
ObjectRecoveryProgress *out_progress,
PushOp *out_op,
object_stat_sum_t *stat = 0);
object_stat_sum_t *stat = 0,
bool cache_dont_need = true);
void submit_push_data(ObjectRecoveryInfo &recovery_info,
bool first,
bool complete,
bool cache_dont_need,
const interval_set<uint64_t> &intervals_included,
bufferlist data_included,
bufferlist omap_header,
Expand All @@ -291,7 +294,7 @@ class ReplicatedBackend : public PGBackend {
RPGHandle *h);
// Fills *pop with a push of soid destined for peer. cache_dont_need is
// forwarded unchanged to prep_push(), which hands it on to build_push_op().
// When omitted it defaults to true; start_pushes() passes the value stored
// in the recovery handle instead of relying on the default.
void prep_push_to_replica(
ObjectContextRef obc, const hobject_t& soid, pg_shard_t peer,
PushOp *pop, bool cache_dont_need = true);
// Short overload of prep_push: takes only the object, destination shard and
// output PushOp.
// NOTE(review): unlike the longer overload declared next, this one has no
// cache_dont_need parameter -- confirm which cache hint it ends up using.
void prep_push(ObjectContextRef obc,
const hobject_t& oid, pg_shard_t dest,
PushOp *op);
Expand All @@ -300,7 +303,8 @@ class ReplicatedBackend : public PGBackend {
eversion_t version,
interval_set<uint64_t> &data_subset,
map<hobject_t, interval_set<uint64_t> >& clone_subsets,
PushOp *op);
PushOp *op,
bool cache = false);
void calc_head_subsets(ObjectContextRef obc, SnapSet& snapset, const hobject_t& head,
const pg_missing_t& missing,
const hobject_t &last_backfill,
Expand Down
2 changes: 2 additions & 0 deletions src/osd/ReplicatedPG.cc
Expand Up @@ -388,6 +388,7 @@ void ReplicatedPG::wait_for_unreadable_object(
} else {
dout(7) << "missing " << soid << " v " << v << ", recovering." << dendl;
PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
h->cache_dont_need = false;
if (is_missing_object(soid)) {
recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
} else {
Expand Down Expand Up @@ -464,6 +465,7 @@ void ReplicatedPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef
}
}
PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
h->cache_dont_need = false;
prep_object_replica_pushes(soid, v, h);
pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
}
Expand Down

0 comments on commit fabd635

Please sign in to comment.