Skip to content
Browse files

Merge tag 'md-3.5' of git://neil.brown.name/md

Pull md updates from NeilBrown:
 "It's been a busy cycle for md - lots of fun stuff here..  if you like
  this kind of thing :-)

  Main features:
   - RAID10 arrays can be reshaped - adding and removing devices and
     changing chunks (not 'far' array though)
   - allow RAID5 arrays to be reshaped with a backup file (not tested
     yet, but the principle works fine for RAID10).
   - arrays can be reshaped while a bitmap is present - you no longer
     need to remove it first
   - SSSE3 support for RAID6 syndrome calculations

  and of course a number of minor fixes etc."

* tag 'md-3.5' of git://neil.brown.name/md: (56 commits)
  md/bitmap: record the space available for the bitmap in the superblock.
  md/raid10: Remove extras after reshape to smaller number of devices.
  md/raid5: improve removal of extra devices after reshape.
  md: check the return of mddev_find()
  MD RAID1: Further conditionalize 'fullsync'
  DM RAID: Use md_error() in place of simply setting Faulty bit
  DM RAID: Record and handle missing devices
  DM RAID: Set recovery flags on resume
  md/raid5: Allow reshape while a bitmap is present.
  md/raid10: resize bitmap when required during reshape.
  md: allow array to be resized while bitmap is present.
  md/bitmap: make sure reshape request are reflected in superblock.
  md/bitmap: add bitmap_resize function to allow bitmap resizing.
  md/bitmap: use DIV_ROUND_UP instead of open-code
  md/bitmap: create a 'struct bitmap_counts' substructure of 'struct bitmap'
  md/bitmap: make bitmap bitops atomic.
  md/bitmap: make _page_attr bitops atomic.
  md/bitmap: merge bitmap_file_unmap and bitmap_file_put.
  md/bitmap: remove async freeing of bitmap file.
  md/bitmap: convert some spin_lock_irqsave to spin_lock_irq
  ...
  • Loading branch information...
2 parents 2c13bc0 + 1dff2b8 commit c80ddb526331a72c9e9d1480f85f6fd7c74e3d2d @torvalds torvalds committed
View
5 arch/x86/Makefile
@@ -115,9 +115,10 @@ cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTI
# does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
+avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
+KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
+KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr)
LDFLAGS := -m elf_$(UTS_MACHINE)
View
6 arch/x86/include/asm/xor_32.h
@@ -861,6 +861,9 @@ static struct xor_block_template xor_block_pIII_sse = {
.do_5 = xor_sse_5,
};
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
/* Also try the generic routines. */
#include <asm-generic/xor.h>
@@ -871,6 +874,7 @@ do { \
xor_speed(&xor_block_8regs_p); \
xor_speed(&xor_block_32regs); \
xor_speed(&xor_block_32regs_p); \
+ AVX_XOR_SPEED; \
if (cpu_has_xmm) \
xor_speed(&xor_block_pIII_sse); \
if (cpu_has_mmx) { \
@@ -883,6 +887,6 @@ do { \
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
#define XOR_SELECT_TEMPLATE(FASTEST) \
- (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+ AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
#endif /* _ASM_X86_XOR_32_H */
View
8 arch/x86/include/asm/xor_64.h
@@ -347,15 +347,21 @@ static struct xor_block_template xor_block_sse = {
.do_5 = xor_sse_5,
};
+
+/* Also try the AVX routines */
+#include "xor_avx.h"
+
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES \
do { \
+ AVX_XOR_SPEED; \
xor_speed(&xor_block_sse); \
} while (0)
/* We force the use of the SSE xor block because it can write around L2.
We may also be able to load into the L1 only depending on how the cpu
deals with a load to a line that is being prefetched. */
-#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+ AVX_SELECT(&xor_block_sse)
#endif /* _ASM_X86_XOR_64_H */
View
214 arch/x86/include/asm/xor_avx.h
@@ -0,0 +1,214 @@
+#ifndef _ASM_X86_XOR_AVX_H
+#define _ASM_X86_XOR_AVX_H
+
+/*
+ * Optimized RAID-5 checksumming functions for AVX
+ *
+ * Copyright (C) 2012 Intel Corporation
+ * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
+ *
+ * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#ifdef CONFIG_AS_AVX
+
+#include <linux/compiler.h>
+#include <asm/i387.h>
+
+#define ALIGN32 __aligned(32)
+
+#define YMM_SAVED_REGS 4
+
+#define YMMS_SAVE \
+do { \
+ preempt_disable(); \
+ cr0 = read_cr0(); \
+ clts(); \
+ asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
+ asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
+ asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
+ asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
+} while (0);
+
+#define YMMS_RESTORE \
+do { \
+ asm volatile("sfence" : : : "memory"); \
+ asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
+ asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
+ asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
+ asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
+ write_cr0(cr0); \
+ preempt_enable(); \
+} while (0);
+
+#define BLOCK4(i) \
+ BLOCK(32 * i, 0) \
+ BLOCK(32 * (i + 1), 1) \
+ BLOCK(32 * (i + 2), 2) \
+ BLOCK(32 * (i + 3), 3)
+
+#define BLOCK16() \
+ BLOCK4(0) \
+ BLOCK4(4) \
+ BLOCK4(8) \
+ BLOCK4(12)
+
+static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16();
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ p3 = (unsigned long *)((uintptr_t)p3 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
+ unsigned long *p2, unsigned long *p3, unsigned long *p4)
+{
+ unsigned long cr0, lines = bytes >> 9;
+ char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;
+
+ YMMS_SAVE
+
+ while (lines--) {
+#undef BLOCK
+#define BLOCK(i, reg) \
+do { \
+ asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p3[i / sizeof(*p3)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p2[i / sizeof(*p2)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p1[i / sizeof(*p1)])); \
+ asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
+ "m" (p0[i / sizeof(*p0)])); \
+ asm volatile("vmovdqa %%ymm" #reg ", %0" : \
+ "=m" (p0[i / sizeof(*p0)])); \
+} while (0);
+
+ BLOCK16()
+
+ p0 = (unsigned long *)((uintptr_t)p0 + 512);
+ p1 = (unsigned long *)((uintptr_t)p1 + 512);
+ p2 = (unsigned long *)((uintptr_t)p2 + 512);
+ p3 = (unsigned long *)((uintptr_t)p3 + 512);
+ p4 = (unsigned long *)((uintptr_t)p4 + 512);
+ }
+
+ YMMS_RESTORE
+}
+
+static struct xor_block_template xor_block_avx = {
+ .name = "avx",
+ .do_2 = xor_avx_2,
+ .do_3 = xor_avx_3,
+ .do_4 = xor_avx_4,
+ .do_5 = xor_avx_5,
+};
+
+#define AVX_XOR_SPEED \
+do { \
+ if (cpu_has_avx) \
+ xor_speed(&xor_block_avx); \
+} while (0)
+
+#define AVX_SELECT(FASTEST) \
+ (cpu_has_avx ? &xor_block_avx : FASTEST)
+
+#else
+
+#define AVX_XOR_SPEED {}
+
+#define AVX_SELECT(FASTEST) (FASTEST)
+
+#endif
+#endif
View
13 crypto/xor.c
@@ -21,6 +21,7 @@
#include <linux/gfp.h>
#include <linux/raid/xor.h>
#include <linux/jiffies.h>
+#include <linux/preempt.h>
#include <asm/xor.h>
/* The xor routines to use. */
@@ -63,12 +64,14 @@ static void
do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
{
int speed;
- unsigned long now;
+ unsigned long now, j;
int i, count, max;
tmpl->next = template_list;
template_list = tmpl;
+ preempt_disable();
+
/*
* Count the number of XORs done during a whole jiffy, and use
* this to calculate the speed of checksumming. We use a 2-page
@@ -76,9 +79,11 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
*/
max = 0;
for (i = 0; i < 5; i++) {
- now = jiffies;
+ j = jiffies;
count = 0;
- while (jiffies == now) {
+ while ((now = jiffies) == j)
+ cpu_relax();
+ while (time_before(jiffies, now + 1)) {
mb(); /* prevent loop optimzation */
tmpl->do_2(BENCH_SIZE, b1, b2);
mb();
@@ -89,6 +94,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
max = count;
}
+ preempt_enable();
+
speed = max * (HZ * BENCH_SIZE / 1024);
tmpl->speed = speed;
View
1,100 drivers/md/bitmap.c
632 additions, 468 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
60 drivers/md/bitmap.h
@@ -111,9 +111,9 @@ typedef __u16 bitmap_counter_t;
/* use these for bitmap->flags and bitmap->sb->state bit-fields */
enum bitmap_state {
- BITMAP_STALE = 0x002, /* the bitmap file is out of date or had -EIO */
- BITMAP_WRITE_ERROR = 0x004, /* A write error has occurred */
- BITMAP_HOSTENDIAN = 0x8000,
+ BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
+ BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
+ BITMAP_HOSTENDIAN =15,
};
/* the superblock at the front of the bitmap file -- little endian */
@@ -128,8 +128,10 @@ typedef struct bitmap_super_s {
__le32 chunksize; /* 52 the bitmap chunk size in bytes */
__le32 daemon_sleep; /* 56 seconds between disk flushes */
__le32 write_behind; /* 60 number of outstanding write-behind writes */
+ __le32 sectors_reserved; /* 64 number of 512-byte sectors that are
+ * reserved for the bitmap. */
- __u8 pad[256 - 64]; /* set to zero */
+ __u8 pad[256 - 68]; /* set to zero */
} bitmap_super_t;
/* notes:
@@ -160,35 +162,48 @@ struct bitmap_page {
*/
unsigned int hijacked:1;
/*
+ * If any counter in this page is '1' or '2' - and so could be
+ * cleared then that page is marked as 'pending'
+ */
+ unsigned int pending:1;
+ /*
* count of dirty bits on the page
*/
- unsigned int count:31;
+ unsigned int count:30;
};
/* the main bitmap structure - one per mddev */
struct bitmap {
- struct bitmap_page *bp;
- unsigned long pages; /* total number of pages in the bitmap */
- unsigned long missing_pages; /* number of pages not yet allocated */
- struct mddev *mddev; /* the md device that the bitmap is for */
+ struct bitmap_counts {
+ spinlock_t lock;
+ struct bitmap_page *bp;
+ unsigned long pages; /* total number of pages
+ * in the bitmap */
+ unsigned long missing_pages; /* number of pages
+ * not yet allocated */
+ unsigned long chunkshift; /* chunksize = 2^chunkshift
+ * (for bitops) */
+ unsigned long chunks; /* Total number of data
+ * chunks for the array */
+ } counts;
- /* bitmap chunksize -- how much data does each bit represent? */
- unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
- unsigned long chunks; /* total number of data chunks for the array */
+ struct mddev *mddev; /* the md device that the bitmap is for */
__u64 events_cleared;
int need_sync;
- /* bitmap spinlock */
- spinlock_t lock;
-
- struct file *file; /* backing disk file */
- struct page *sb_page; /* cached copy of the bitmap file superblock */
- struct page **filemap; /* list of cache pages for the file */
- unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
- unsigned long file_pages; /* number of pages in the file */
- int last_page_size; /* bytes in the last page */
+ struct bitmap_storage {
+ struct file *file; /* backing disk file */
+ struct page *sb_page; /* cached copy of the bitmap
+ * file superblock */
+ struct page **filemap; /* list of cache pages for
+ * the file */
+ unsigned long *filemap_attr; /* attributes associated
+ * w/ filemap pages */
+ unsigned long file_pages; /* number of pages in the file*/
+ unsigned long bytes; /* total bytes in the bitmap */
+ } storage;
unsigned long flags;
@@ -242,6 +257,9 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
void bitmap_unplug(struct bitmap *bitmap);
void bitmap_daemon_work(struct mddev *mddev);
+
+int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
+ int chunksize, int init);
#endif
#endif
View
22 drivers/md/dm-raid.c
@@ -155,10 +155,7 @@ static void context_free(struct raid_set *rs)
for (i = 0; i < rs->md.raid_disks; i++) {
if (rs->dev[i].meta_dev)
dm_put_device(rs->ti, rs->dev[i].meta_dev);
- if (rs->dev[i].rdev.sb_page)
- put_page(rs->dev[i].rdev.sb_page);
- rs->dev[i].rdev.sb_page = NULL;
- rs->dev[i].rdev.sb_loaded = 0;
+ md_rdev_clear(&rs->dev[i].rdev);
if (rs->dev[i].data_dev)
dm_put_device(rs->ti, rs->dev[i].data_dev);
}
@@ -606,7 +603,7 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
DMERR("Failed to read superblock of device at position %d",
rdev->raid_disk);
- set_bit(Faulty, &rdev->flags);
+ md_error(rdev->mddev, rdev);
return -EINVAL;
}
@@ -617,16 +614,18 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
{
- struct md_rdev *r;
+ int i;
uint64_t failed_devices;
struct dm_raid_superblock *sb;
+ struct raid_set *rs = container_of(mddev, struct raid_set, md);
sb = page_address(rdev->sb_page);
failed_devices = le64_to_cpu(sb->failed_devices);
- rdev_for_each(r, mddev)
- if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
- failed_devices |= (1ULL << r->raid_disk);
+ for (i = 0; i < mddev->raid_disks; i++)
+ if (!rs->dev[i].data_dev ||
+ test_bit(Faulty, &(rs->dev[i].rdev.flags)))
+ failed_devices |= (1ULL << i);
memset(sb, 0, sizeof(*sb));
@@ -1252,12 +1251,13 @@ static void raid_resume(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
+ set_bit(MD_CHANGE_DEVS, &rs->md.flags);
if (!rs->bitmap_loaded) {
bitmap_load(&rs->md);
rs->bitmap_loaded = 1;
- } else
- md_wakeup_thread(rs->md.thread);
+ }
+ clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
mddev_resume(&rs->md);
}
View
370 drivers/md/md.c
@@ -402,6 +402,7 @@ void mddev_resume(struct mddev *mddev)
wake_up(&mddev->sb_wait);
mddev->pers->quiesce(mddev, 0);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
@@ -452,7 +453,7 @@ static void submit_flushes(struct work_struct *ws)
atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
- bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
+ bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
bi->bi_bdev = rdev->bdev;
@@ -607,6 +608,7 @@ void mddev_init(struct mddev *mddev)
init_waitqueue_head(&mddev->sb_wait);
init_waitqueue_head(&mddev->recovery_wait);
mddev->reshape_position = MaxSector;
+ mddev->reshape_backwards = 0;
mddev->resync_min = 0;
mddev->resync_max = MaxSector;
mddev->level = LEVEL_NONE;
@@ -802,7 +804,7 @@ static int alloc_disk_sb(struct md_rdev * rdev)
return 0;
}
-static void free_disk_sb(struct md_rdev * rdev)
+void md_rdev_clear(struct md_rdev *rdev)
{
if (rdev->sb_page) {
put_page(rdev->sb_page);
@@ -815,8 +817,10 @@ static void free_disk_sb(struct md_rdev * rdev)
put_page(rdev->bb_page);
rdev->bb_page = NULL;
}
+ kfree(rdev->badblocks.page);
+ rdev->badblocks.page = NULL;
}
-
+EXPORT_SYMBOL_GPL(md_rdev_clear);
static void super_written(struct bio *bio, int error)
{
@@ -887,6 +891,10 @@ int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
rdev->meta_bdev : rdev->bdev;
if (metadata_op)
bio->bi_sector = sector + rdev->sb_start;
+ else if (rdev->mddev->reshape_position != MaxSector &&
+ (rdev->mddev->reshape_backwards ==
+ (sector >= rdev->mddev->reshape_position)))
+ bio->bi_sector = sector + rdev->new_data_offset;
else
bio->bi_sector = sector + rdev->data_offset;
bio_add_page(bio, page, size, 0);
@@ -1034,12 +1042,17 @@ static unsigned int calc_sb_csum(mdp_super_t * sb)
struct super_type {
char *name;
struct module *owner;
- int (*load_super)(struct md_rdev *rdev, struct md_rdev *refdev,
+ int (*load_super)(struct md_rdev *rdev,
+ struct md_rdev *refdev,
int minor_version);
- int (*validate_super)(struct mddev *mddev, struct md_rdev *rdev);
- void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
+ int (*validate_super)(struct mddev *mddev,
+ struct md_rdev *rdev);
+ void (*sync_super)(struct mddev *mddev,
+ struct md_rdev *rdev);
unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
sector_t num_sectors);
+ int (*allow_new_offset)(struct md_rdev *rdev,
+ unsigned long long new_offset);
};
/*
@@ -1111,6 +1124,7 @@ static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor
rdev->preferred_minor = sb->md_minor;
rdev->data_offset = 0;
+ rdev->new_data_offset = 0;
rdev->sb_size = MD_SB_BYTES;
rdev->badblocks.shift = -1;
@@ -1184,7 +1198,11 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->dev_sectors = ((sector_t)sb->size) * 2;
mddev->events = ev1;
mddev->bitmap_info.offset = 0;
+ mddev->bitmap_info.space = 0;
+ /* bitmap can use 60 K after the 4K superblocks */
mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
+ mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
+ mddev->reshape_backwards = 0;
if (mddev->minor_version >= 91) {
mddev->reshape_position = sb->reshape_position;
@@ -1192,6 +1210,8 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_level = sb->new_level;
mddev->new_layout = sb->new_layout;
mddev->new_chunk_sectors = sb->new_chunk >> 9;
+ if (mddev->delta_disks < 0)
+ mddev->reshape_backwards = 1;
} else {
mddev->reshape_position = MaxSector;
mddev->delta_disks = 0;
@@ -1218,9 +1238,12 @@ static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->max_disks = MD_SB_DISKS;
if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
- mddev->bitmap_info.file == NULL)
+ mddev->bitmap_info.file == NULL) {
mddev->bitmap_info.offset =
mddev->bitmap_info.default_offset;
+ /* bitmap sits at default_offset, so it gets default_space */
+ mddev->bitmap_info.space =
+ mddev->bitmap_info.default_space;
+ }
} else if (mddev->pers == NULL) {
/* Insist on good event counter while assembling, except
@@ -1434,6 +1457,12 @@ super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
return num_sectors;
}
+static int
+super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
+{
+ /* non-zero offset changes not possible with v0.90 */
+ return new_offset == 0;
+}
/*
* version 1 superblock
@@ -1469,6 +1498,7 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
struct mdp_superblock_1 *sb;
int ret;
sector_t sb_start;
+ sector_t sectors;
char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
int bmask;
@@ -1523,9 +1553,18 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
bdevname(rdev->bdev,b));
return -EINVAL;
}
+ if (sb->pad0 ||
+ sb->pad3[0] ||
+ memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
+ /* Some padding is non-zero, might be a new feature */
+ return -EINVAL;
rdev->preferred_minor = 0xffff;
rdev->data_offset = le64_to_cpu(sb->data_offset);
+ rdev->new_data_offset = rdev->data_offset;
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
+ (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
+ rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
@@ -1536,6 +1575,9 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
if (minor_version
&& rdev->data_offset < sb_start + (rdev->sb_size/512))
return -EINVAL;
+ if (minor_version
+ && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
+ return -EINVAL;
if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
rdev->desc_nr = -1;
@@ -1607,16 +1649,14 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
else
ret = 0;
}
- if (minor_version)
- rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
- le64_to_cpu(sb->data_offset);
- else
- rdev->sectors = rdev->sb_start;
- if (rdev->sectors < le64_to_cpu(sb->data_size))
+ if (minor_version) {
+ sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
+ sectors -= rdev->data_offset;
+ } else
+ sectors = rdev->sb_start;
+ if (sectors < le64_to_cpu(sb->data_size))
return -EINVAL;
rdev->sectors = le64_to_cpu(sb->data_size);
- if (le64_to_cpu(sb->size) > rdev->sectors)
- return -EINVAL;
return ret;
}
@@ -1644,17 +1684,37 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->dev_sectors = le64_to_cpu(sb->size);
mddev->events = ev1;
mddev->bitmap_info.offset = 0;
+ mddev->bitmap_info.space = 0;
+ /* Default location for bitmap is 1K after superblock
+ * using 3K - total of 4K
+ */
mddev->bitmap_info.default_offset = 1024 >> 9;
-
+ mddev->bitmap_info.default_space = (4096-1024) >> 9;
+ mddev->reshape_backwards = 0;
+
mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
memcpy(mddev->uuid, sb->set_uuid, 16);
mddev->max_disks = (4096-256)/2;
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
- mddev->bitmap_info.file == NULL )
+ mddev->bitmap_info.file == NULL) {
mddev->bitmap_info.offset =
(__s32)le32_to_cpu(sb->bitmap_offset);
+ /* Metadata doesn't record how much space is available.
+ * For 1.0, we assume we can use up to the superblock
+ * if before, else to 4K beyond superblock.
+ * For others, assume no change is possible.
+ */
+ if (mddev->minor_version > 0)
+ mddev->bitmap_info.space = 0;
+ else if (mddev->bitmap_info.offset > 0)
+ mddev->bitmap_info.space =
+ 8 - mddev->bitmap_info.offset;
+ else
+ mddev->bitmap_info.space =
+ -mddev->bitmap_info.offset;
+ }
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
mddev->reshape_position = le64_to_cpu(sb->reshape_position);
@@ -1662,6 +1722,11 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
mddev->new_level = le32_to_cpu(sb->new_level);
mddev->new_layout = le32_to_cpu(sb->new_layout);
mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
+ if (mddev->delta_disks < 0 ||
+ (mddev->delta_disks == 0 &&
+ (le32_to_cpu(sb->feature_map)
+ & MD_FEATURE_RESHAPE_BACKWARDS)))
+ mddev->reshape_backwards = 1;
} else {
mddev->reshape_position = MaxSector;
mddev->delta_disks = 0;
@@ -1735,7 +1800,6 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map = 0;
sb->pad0 = 0;
sb->recovery_offset = cpu_to_le64(0);
- memset(sb->pad1, 0, sizeof(sb->pad1));
memset(sb->pad3, 0, sizeof(sb->pad3));
sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1757,6 +1821,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->devflags |= WriteMostly1;
else
sb->devflags &= ~WriteMostly1;
+ sb->data_offset = cpu_to_le64(rdev->data_offset);
+ sb->data_size = cpu_to_le64(rdev->sectors);
if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
@@ -1781,6 +1847,16 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->delta_disks = cpu_to_le32(mddev->delta_disks);
sb->new_level = cpu_to_le32(mddev->new_level);
sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
+ if (mddev->delta_disks == 0 &&
+ mddev->reshape_backwards)
+ sb->feature_map
+ |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
+ if (rdev->new_data_offset != rdev->data_offset) {
+ sb->feature_map
+ |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
+ sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
+ - rdev->data_offset));
+ }
}
if (rdev->badblocks.count == 0)
@@ -1857,6 +1933,8 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
sector_t max_sectors;
if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
return 0; /* component must fit device */
+ if (rdev->data_offset != rdev->new_data_offset)
+ return 0; /* too confusing */
if (rdev->sb_start < rdev->data_offset) {
/* minor versions 1 and 2; superblock before data */
max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
@@ -1884,6 +1962,40 @@ super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
rdev->sb_page);
md_super_wait(rdev->mddev);
return num_sectors;
+
+}
+
+static int
+super_1_allow_new_offset(struct md_rdev *rdev,
+ unsigned long long new_offset)
+{
+ /* All necessary checks on new >= old have been done */
+ struct bitmap *bitmap;
+ if (new_offset >= rdev->data_offset)
+ return 1;
+
+ /* with 1.0 metadata, there is no metadata to tread on
+ * so we can always move back */
+ if (rdev->mddev->minor_version == 0)
+ return 1;
+
+ /* otherwise we must be sure not to step on
+ * any metadata, so stay:
+ * 36K beyond start of superblock
+ * beyond end of badblocks
+ * beyond write-intent bitmap
+ */
+ if (rdev->sb_start + (32+4)*2 > new_offset)
+ return 0;
+ bitmap = rdev->mddev->bitmap;
+ if (bitmap && !rdev->mddev->bitmap_info.file &&
+ rdev->sb_start + rdev->mddev->bitmap_info.offset +
+ bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
+ return 0;
+ if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
+ return 0;
+
+ return 1;
}
static struct super_type super_types[] = {
@@ -1894,6 +2006,7 @@ static struct super_type super_types[] = {
.validate_super = super_90_validate,
.sync_super = super_90_sync,
.rdev_size_change = super_90_rdev_size_change,
+ .allow_new_offset = super_90_allow_new_offset,
},
[1] = {
.name = "md-1",
@@ -1902,6 +2015,7 @@ static struct super_type super_types[] = {
.validate_super = super_1_validate,
.sync_super = super_1_sync,
.rdev_size_change = super_1_rdev_size_change,
+ .allow_new_offset = super_1_allow_new_offset,
},
};
@@ -2105,9 +2219,7 @@ static void unbind_rdev_from_array(struct md_rdev * rdev)
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
rdev->sysfs_state = NULL;
- kfree(rdev->badblocks.page);
rdev->badblocks.count = 0;
- rdev->badblocks.page = NULL;
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state". We also need
* to delay it due to rcu usage.
@@ -2158,7 +2270,7 @@ static void export_rdev(struct md_rdev * rdev)
bdevname(rdev->bdev,b));
if (rdev->mddev)
MD_BUG();
- free_disk_sb(rdev);
+ md_rdev_clear(rdev);
#ifndef MODULE
if (test_bit(AutoDetected, &rdev->flags))
md_autodetect_dev(rdev->bdev->bd_dev);
@@ -2809,9 +2921,8 @@ offset_show(struct md_rdev *rdev, char *page)
static ssize_t
offset_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
- unsigned long long offset = simple_strtoull(buf, &e, 10);
- if (e==buf || (*e && *e != '\n'))
+ unsigned long long offset;
+ if (strict_strtoull(buf, 10, &offset) < 0)
return -EINVAL;
if (rdev->mddev->pers && rdev->raid_disk >= 0)
return -EBUSY;
@@ -2826,6 +2937,63 @@ offset_store(struct md_rdev *rdev, const char *buf, size_t len)
static struct rdev_sysfs_entry rdev_offset =
__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
+static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
+{
+ return sprintf(page, "%llu\n",
+ (unsigned long long)rdev->new_data_offset);
+}
+
+static ssize_t new_offset_store(struct md_rdev *rdev,
+ const char *buf, size_t len)
+{
+ unsigned long long new_offset;
+ struct mddev *mddev = rdev->mddev;
+
+ if (strict_strtoull(buf, 10, &new_offset) < 0)
+ return -EINVAL;
+
+ if (mddev->sync_thread)
+ return -EBUSY;
+ if (new_offset == rdev->data_offset)
+ /* reset is always permitted */
+ ;
+ else if (new_offset > rdev->data_offset) {
+ /* must not push array size beyond rdev_sectors */
+ if (new_offset - rdev->data_offset
+ + mddev->dev_sectors > rdev->sectors)
+ return -E2BIG;
+ }
+ /* Metadata worries about other space details. */
+
+ /* decreasing the offset is inconsistent with a backwards
+ * reshape.
+ */
+ if (new_offset < rdev->data_offset &&
+ mddev->reshape_backwards)
+ return -EINVAL;
+ /* Increasing offset is inconsistent with forwards
+ * reshape. reshape_direction should be set to
+ * 'backwards' first.
+ */
+ if (new_offset > rdev->data_offset &&
+ !mddev->reshape_backwards)
+ return -EINVAL;
+
+ if (mddev->pers && mddev->persistent &&
+ !super_types[mddev->major_version]
+ .allow_new_offset(rdev, new_offset))
+ return -E2BIG;
+ rdev->new_data_offset = new_offset;
+ if (new_offset > rdev->data_offset)
+ mddev->reshape_backwards = 1;
+ else if (new_offset < rdev->data_offset)
+ mddev->reshape_backwards = 0;
+
+ return len;
+}
+static struct rdev_sysfs_entry rdev_new_offset =
+__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
+
static ssize_t
rdev_size_show(struct md_rdev *rdev, char *page)
{
@@ -2870,6 +3038,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
if (strict_blocks_to_sectors(buf, &sectors) < 0)
return -EINVAL;
+ if (rdev->data_offset != rdev->new_data_offset)
+ return -EINVAL; /* too confusing */
if (my_mddev->pers && rdev->raid_disk >= 0) {
if (my_mddev->persistent) {
sectors = super_types[my_mddev->major_version].
@@ -3006,6 +3176,7 @@ static struct attribute *rdev_default_attrs[] = {
&rdev_errors.attr,
&rdev_slot.attr,
&rdev_offset.attr,
+ &rdev_new_offset.attr,
&rdev_size.attr,
&rdev_recovery_start.attr,
&rdev_bad_blocks.attr,
@@ -3080,6 +3251,7 @@ int md_rdev_init(struct md_rdev *rdev)
rdev->raid_disk = -1;
rdev->flags = 0;
rdev->data_offset = 0;
+ rdev->new_data_offset = 0;
rdev->sb_events = 0;
rdev->last_read_error.tv_sec = 0;
rdev->last_read_error.tv_nsec = 0;
@@ -3178,8 +3350,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
abort_free:
if (rdev->bdev)
unlock_rdev(rdev);
- free_disk_sb(rdev);
- kfree(rdev->badblocks.page);
+ md_rdev_clear(rdev);
kfree(rdev);
return ERR_PTR(err);
}
@@ -3419,6 +3590,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->new_chunk_sectors = mddev->chunk_sectors;
mddev->raid_disks -= mddev->delta_disks;
mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
module_put(pers->owner);
printk(KERN_WARNING "md: %s: %s would not accept array\n",
mdname(mddev), clevel);
@@ -3492,6 +3664,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->layout = mddev->new_layout;
mddev->chunk_sectors = mddev->new_chunk_sectors;
mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
mddev->degraded = 0;
if (mddev->pers->sync_request == NULL) {
/* this is now an array without redundancy, so
@@ -3501,10 +3674,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
del_timer_sync(&mddev->safemode_timer);
}
pers->run(mddev);
- mddev_resume(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
+ mddev_resume(mddev);
sysfs_notify(&mddev->kobj, NULL, "level");
md_new_event(mddev);
return rv;
@@ -3582,9 +3753,20 @@ raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers)
rv = update_raid_disks(mddev, n);
else if (mddev->reshape_position != MaxSector) {
+ struct md_rdev *rdev;
int olddisks = mddev->raid_disks - mddev->delta_disks;
+
+ rdev_for_each(rdev, mddev) {
+ if (olddisks < n &&
+ rdev->data_offset < rdev->new_data_offset)
+ return -EINVAL;
+ if (olddisks > n &&
+ rdev->data_offset > rdev->new_data_offset)
+ return -EINVAL;
+ }
mddev->delta_disks = n - olddisks;
mddev->raid_disks = n;
+ mddev->reshape_backwards = (mddev->delta_disks < 0);
} else
mddev->raid_disks = n;
return rv ? rv : len;
@@ -4266,7 +4448,8 @@ sync_completed_show(struct mddev *mddev, char *page)
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
@@ -4428,6 +4611,7 @@ reshape_position_show(struct mddev *mddev, char *page)
static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
+ struct md_rdev *rdev;
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
if (mddev->pers)
@@ -4436,9 +4620,12 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
return -EINVAL;
mddev->reshape_position = new;
mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
mddev->new_level = mddev->level;
mddev->new_layout = mddev->layout;
mddev->new_chunk_sectors = mddev->chunk_sectors;
+ rdev_for_each(rdev, mddev)
+ rdev->new_data_offset = rdev->data_offset;
return len;
}
@@ -4447,6 +4634,42 @@ __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
reshape_position_store);
static ssize_t
+reshape_direction_show(struct mddev *mddev, char *page)
+{ /* 'show' handler for the reshape_direction attribute: writes
+   * "backwards\n" into page when mddev->reshape_backwards is non-zero,
+   * "forwards\n" otherwise, and returns sprintf()'s result.
+   */
+ return sprintf(page, "%s\n",
+ mddev->reshape_backwards ? "backwards" : "forwards");
+}
+
+static ssize_t
+reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
+{ /* 'store' handler for the reshape_direction attribute.
+   * Accepts "forwards" or "backwards" (matched with cmd_match()) and
+   * records the choice in mddev->reshape_backwards.
+   * Returns len on success, including when the requested direction is
+   * already in effect; -EINVAL for unrecognised input or for a
+   * persistent array with major_version 0; -EBUSY when delta_disks is
+   * non-zero.
+   */
+ int backwards = 0;
+ if (cmd_match(buf, "forwards"))
+ backwards = 0;
+ else if (cmd_match(buf, "backwards"))
+ backwards = 1;
+ else
+ return -EINVAL;
+ if (mddev->reshape_backwards == backwards)
+ return len;
+
+ /* check if we are allowed to change: a pending change in the
+  * number of devices (delta_disks != 0) blocks an explicit override.
+  */
+ if (mddev->delta_disks)
+ return -EBUSY;
+
+ if (mddev->persistent &&
+ mddev->major_version == 0) /* NOTE(review): presumably version-0 metadata cannot record this - confirm */
+ return -EINVAL;
+
+ mddev->reshape_backwards = backwards;
+ return len;
+}
+
+static struct md_sysfs_entry md_reshape_direction =
+__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
+ reshape_direction_store);
+
+static ssize_t
array_size_show(struct mddev *mddev, char *page)
{
if (mddev->external_size)
@@ -4501,6 +4724,7 @@ static struct attribute *md_default_attrs[] = {
&md_safe_delay.attr,
&md_array_state.attr,
&md_reshape_position.attr,
+ &md_reshape_direction.attr,
&md_array_size.attr,
&max_corr_read_errors.attr,
NULL,
@@ -4914,7 +5138,8 @@ int md_run(struct mddev *mddev)
err = -EINVAL;
mddev->pers->stop(mddev);
}
- if (err == 0 && mddev->pers->sync_request) {
+ if (err == 0 && mddev->pers->sync_request &&
+ (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
err = bitmap_create(mddev);
if (err) {
printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
@@ -5064,6 +5289,7 @@ static void md_clean(struct mddev *mddev)
mddev->events = 0;
mddev->can_decrease_events = 0;
mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
mddev->new_level = LEVEL_NONE;
mddev->new_layout = 0;
mddev->new_chunk_sectors = 0;
@@ -5079,6 +5305,7 @@ static void md_clean(struct mddev *mddev)
mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
+ mddev->bitmap_info.default_space = 0;
mddev->bitmap_info.chunksize = 0;
mddev->bitmap_info.daemon_sleep = 0;
mddev->bitmap_info.max_write_behind = 0;
@@ -5421,7 +5648,7 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
goto out;
/* bitmap disabled, zero the first byte and copy out */
- if (!mddev->bitmap || !mddev->bitmap->file) {
+ if (!mddev->bitmap || !mddev->bitmap->storage.file) {
file->pathname[0] = '\0';
goto copy_out;
}
@@ -5430,7 +5657,8 @@ static int get_bitmap_file(struct mddev * mddev, void __user * arg)
if (!buf)
goto out;
- ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
+ ptr = d_path(&mddev->bitmap->storage.file->f_path,
+ buf, sizeof(file->pathname));
if (IS_ERR(ptr))
goto out;
@@ -5875,6 +6103,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
+ mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
mddev->bitmap_info.offset = 0;
mddev->reshape_position = MaxSector;
@@ -5888,6 +6117,7 @@ static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
mddev->new_chunk_sectors = mddev->chunk_sectors;
mddev->new_layout = mddev->layout;
mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
return 0;
}
@@ -5922,11 +6152,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
*/
if (mddev->sync_thread)
return -EBUSY;
- if (mddev->bitmap)
- /* Sorry, cannot grow a bitmap yet, just remove it,
- * grow, and re-add.
- */
- return -EBUSY;
+
rdev_for_each(rdev, mddev) {
sector_t avail = rdev->sectors;
@@ -5944,6 +6170,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
static int update_raid_disks(struct mddev *mddev, int raid_disks)
{
int rv;
+ struct md_rdev *rdev;
/* change the number of raid disks */
if (mddev->pers->check_reshape == NULL)
return -EINVAL;
@@ -5952,11 +6179,27 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
return -EINVAL;
if (mddev->sync_thread || mddev->reshape_position != MaxSector)
return -EBUSY;
+
+ rdev_for_each(rdev, mddev) {
+ if (mddev->raid_disks < raid_disks &&
+ rdev->data_offset < rdev->new_data_offset)
+ return -EINVAL;
+ if (mddev->raid_disks > raid_disks &&
+ rdev->data_offset > rdev->new_data_offset)
+ return -EINVAL;
+ }
+
mddev->delta_disks = raid_disks - mddev->raid_disks;
+ if (mddev->delta_disks < 0)
+ mddev->reshape_backwards = 1;
+ else if (mddev->delta_disks > 0)
+ mddev->reshape_backwards = 0;
rv = mddev->pers->check_reshape(mddev);
- if (rv < 0)
+ if (rv < 0) {
mddev->delta_disks = 0;
+ mddev->reshape_backwards = 0;
+ }
return rv;
}
@@ -6039,6 +6282,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
return -EINVAL;
mddev->bitmap_info.offset =
mddev->bitmap_info.default_offset;
+ mddev->bitmap_info.space =
+ mddev->bitmap_info.default_space;
mddev->pers->quiesce(mddev, 1);
rv = bitmap_create(mddev);
if (!rv)
@@ -6050,7 +6295,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
/* remove the bitmap */
if (!mddev->bitmap)
return -ENOENT;
- if (mddev->bitmap->file)
+ if (mddev->bitmap->storage.file)
return -EINVAL;
mddev->pers->quiesce(mddev, 1);
bitmap_destroy(mddev);
@@ -6373,6 +6618,9 @@ static int md_open(struct block_device *bdev, fmode_t mode)
struct mddev *mddev = mddev_find(bdev->bd_dev);
int err;
+ if (!mddev)
+ return -ENODEV;
+
if (mddev->gendisk != bdev->bd_disk) {
/* we are racing with mddev_put which is discarding this
* bd_disk.
@@ -6584,7 +6832,8 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev)
resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
- if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
+ if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sectors = mddev->resync_max_sectors;
else
max_sectors = mddev->dev_sectors;
@@ -7147,7 +7396,7 @@ void md_do_sync(struct mddev *mddev)
j = mddev->recovery_cp;
} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
- max_sectors = mddev->dev_sectors;
+ max_sectors = mddev->resync_max_sectors;
else {
/* recovery follows the physical size of devices */
max_sectors = mddev->dev_sectors;
@@ -7598,7 +7847,7 @@ void md_check_recovery(struct mddev *mddev)
goto unlock;
if (mddev->pers->sync_request) {
- if (spares && mddev->bitmap && ! mddev->bitmap->file) {
+ if (spares) {
/* We are adding a device or devices to an array
* which has the bitmap stored on all devices.
* So make sure all bitmap pages get written
@@ -7646,6 +7895,20 @@ void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);
+void md_finish_reshape(struct mddev *mddev)
+{
+ /* Called by the personality module when a reshape completes.
+  * For every rdev in the array, fold the pending offset change into
+  * the device: rdev->sectors grows by the difference when
+  * new_data_offset is below data_offset and shrinks by the difference
+  * otherwise, after which data_offset is set to new_data_offset.
+  */
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev) {
+ if (rdev->data_offset > rdev->new_data_offset)
+ rdev->sectors += rdev->data_offset - rdev->new_data_offset;
+ else
+ rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
+ rdev->data_offset = rdev->new_data_offset;
+ }
+}
+EXPORT_SYMBOL(md_finish_reshape);
/* Bad block management.
* We can record which blocks on each device are 'bad' and so just
@@ -7894,10 +8157,15 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
}
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int acknowledged)
+ int is_new)
{
- int rv = md_set_badblocks(&rdev->badblocks,
- s + rdev->data_offset, sectors, acknowledged);
+ int rv;
+ if (is_new)
+ s += rdev->new_data_offset;
+ else
+ s += rdev->data_offset;
+ rv = md_set_badblocks(&rdev->badblocks,
+ s, sectors, 0);
if (rv) {
/* Make sure they get written out promptly */
sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -8003,11 +8271,15 @@ static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
return rv;
}
-int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors)
+int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new)
{
+ if (is_new)
+ s += rdev->new_data_offset;
+ else
+ s += rdev->data_offset;
return md_clear_badblocks(&rdev->badblocks,
- s + rdev->data_offset,
- sectors);
+ s, sectors);
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
View
12 drivers/md/md.h
@@ -55,6 +55,7 @@ struct md_rdev {
int sb_loaded;
__u64 sb_events;
sector_t data_offset; /* start of data in array */
+ sector_t new_data_offset;/* only relevant while reshaping */
sector_t sb_start; /* offset of the super block (in 512byte sectors) */
int sb_size; /* bytes in the superblock */
int preferred_minor; /* autorun support */
@@ -193,8 +194,9 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
return 0;
}
extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
- int acknowledged);
-extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors);
+ int is_new);
+extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
+ int is_new);
extern void md_ack_all_badblocks(struct badblocks *bb);
struct mddev {
@@ -262,6 +264,7 @@ struct mddev {
sector_t reshape_position;
int delta_disks, new_level, new_layout;
int new_chunk_sectors;
+ int reshape_backwards;
atomic_t plug_cnt; /* If device is expecting
* more bios soon.
@@ -390,10 +393,13 @@ struct mddev {
* For external metadata, offset
* from start of device.
*/
+ unsigned long space; /* space available at this offset */
loff_t default_offset; /* this is the offset to use when
* hot-adding a bitmap. It should
* eventually be settable by sysfs.
*/
+ unsigned long default_space; /* space available at
+ * default offset */
struct mutex mutex;
unsigned long chunksize;
unsigned long daemon_sleep; /* how many jiffies between updates? */
@@ -591,6 +597,7 @@ extern void md_write_start(struct mddev *mddev, struct bio *bi);
extern void md_write_end(struct mddev *mddev);
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
+extern void md_finish_reshape(struct mddev *mddev);
extern int mddev_congested(struct mddev *mddev, int bits);
extern void md_flush_request(struct mddev *mddev, struct bio *bio);
@@ -615,6 +622,7 @@ extern int md_run(struct mddev *mddev);
extern void md_stop(struct mddev *mddev);
extern void md_stop_writes(struct mddev *mddev);
extern int md_rdev_init(struct md_rdev *rdev);
+extern void md_rdev_clear(struct md_rdev *rdev);
extern void mddev_suspend(struct mddev *mddev);
extern void mddev_resume(struct mddev *mddev);
View
22 drivers/md/raid1.c
@@ -1859,7 +1859,9 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
rdev = conf->mirrors[d].rdev;
if (rdev &&
- test_bit(In_sync, &rdev->flags) &&
+ (test_bit(In_sync, &rdev->flags) ||
+ (!test_bit(Faulty, &rdev->flags) &&
+ rdev->recovery_offset >= sect + s)) &&
is_badblock(rdev, sect, s,
&first_bad, &bad_sectors) == 0 &&
sync_page_io(rdev, sect, s<<9,
@@ -2024,7 +2026,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
continue;
if (test_bit(BIO_UPTODATE, &bio->bi_flags) &&
test_bit(R1BIO_MadeGood, &r1_bio->state)) {
- rdev_clear_badblocks(rdev, r1_bio->sector, s);
+ rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
}
if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
test_bit(R1BIO_WriteError, &r1_bio->state)) {
@@ -2044,7 +2046,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
struct md_rdev *rdev = conf->mirrors[m].rdev;
rdev_clear_badblocks(rdev,
r1_bio->sector,
- r1_bio->sectors);
+ r1_bio->sectors, 0);
rdev_dec_pending(rdev, conf->mddev);
} else if (r1_bio->bios[m] != NULL) {
/* This drive got a write error. We need to
@@ -2598,7 +2600,8 @@ static struct r1conf *setup_conf(struct mddev *mddev)
if (!disk->rdev ||
!test_bit(In_sync, &disk->rdev->flags)) {
disk->head_position = 0;
- if (disk->rdev)
+ if (disk->rdev &&
+ (disk->rdev->saved_raid_disk < 0))
conf->fullsync = 1;
} else if (conf->last_used < 0)
/*
@@ -2750,9 +2753,16 @@ static int raid1_resize(struct mddev *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems
* worth it.
*/
- md_set_array_sectors(mddev, raid1_size(mddev, sectors, 0));
- if (mddev->array_sectors > raid1_size(mddev, sectors, 0))
+ sector_t newsize = raid1_size(mddev, sectors, 0);
+ if (mddev->external_size &&
+ mddev->array_sectors > newsize)
return -EINVAL;
+ if (mddev->bitmap) {
+ int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0);
+ if (ret)
+ return ret;
+ }
+ md_set_array_sectors(mddev, newsize);
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
View
1,281 drivers/md/raid10.c
1,128 additions, 153 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
View
34 drivers/md/raid10.h
@@ -14,32 +14,38 @@ struct mirror_info {
struct r10conf {
struct mddev *mddev;
struct mirror_info *mirrors;
- int raid_disks;
+ struct mirror_info *mirrors_new, *mirrors_old;
spinlock_t device_lock;
/* geometry */
- int near_copies; /* number of copies laid out
+ struct geom {
+ int raid_disks;
+ int near_copies; /* number of copies laid out
* raid0 style */
- int far_copies; /* number of copies laid out
+ int far_copies; /* number of copies laid out
* at large strides across drives
*/
- int far_offset; /* far_copies are offset by 1
+ int far_offset; /* far_copies are offset by 1
* stripe instead of many
*/
- int copies; /* near_copies * far_copies.
- * must be <= raid_disks
- */
- sector_t stride; /* distance between far copies.
+ sector_t stride; /* distance between far copies.
* This is size / far_copies unless
* far_offset, in which case it is
* 1 stripe.
*/
+ int chunk_shift; /* shift from chunks to sectors */
+ sector_t chunk_mask;
+ } prev, geo;
+ int copies; /* near_copies * far_copies.
+ * must be <= raid_disks
+ */
sector_t dev_sectors; /* temp copy of
* mddev->dev_sectors */
-
- int chunk_shift; /* shift from chunks to sectors */
- sector_t chunk_mask;
+ sector_t reshape_progress;
+ sector_t reshape_safe;
+ unsigned long reshape_checkpoint;
+ sector_t offset_diff;
struct list_head retry_list;
/* queue pending writes and submit them on unplug */
@@ -136,6 +142,7 @@ enum r10bio_state {
R10BIO_Uptodate,
R10BIO_IsSync,
R10BIO_IsRecover,
+ R10BIO_IsReshape,
R10BIO_Degraded,
/* Set ReadError on bios that experience a read error
* so that raid10d knows what to do with them.
@@ -146,5 +153,10 @@ enum r10bio_state {
*/
R10BIO_MadeGood,
R10BIO_WriteError,
+/* During a reshape we might be performing IO on the
+ * 'previous' part of the array, in which case this
+ * flag is set
+ */
+ R10BIO_Previous,
};
#endif
View
252 drivers/md/raid5.c
@@ -488,6 +488,27 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
return sh;
}
+/* Determine if 'data_offset' or 'new_data_offset' should be used
+ * in this stripe_head.
+ *
+ * Returns 1 when new_data_offset applies and 0 when data_offset does.
+ * The old offset is chosen when conf->reshape_progress is MaxSector
+ * or when the stripe's generation is one behind conf->generation;
+ * in every other case the new offset is chosen.
+ */
+static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
+{
+ sector_t progress = conf->reshape_progress;
+ /* Need a memory barrier to make sure we see the value
+ * of conf->generation, or ->data_offset that was set before
+ * reshape_progress was updated.
+ */
+ smp_rmb();
+ if (progress == MaxSector)
+ return 0;
+ if (sh->generation == conf->generation - 1)
+ return 0;
+ /* We are in a reshape, and this is a new-generation stripe,
+ * so use new_data_offset.
+ */
+ return 1;
+}
+
static void
raid5_end_read_request(struct bio *bi, int error);
static void
@@ -518,6 +539,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
replace_only = 1;
} else
continue;
+ if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
+ rw |= REQ_SYNC;
bi = &sh->dev[i].req;
rbi = &sh->dev[i].rreq; /* For writing to replacement */
@@ -603,7 +626,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
__func__, (unsigned long long)sh->sector,
bi->bi_rw, i);
atomic_inc(&sh->count);
- bi->bi_sector = sh->sector + rdev->data_offset;
+ if (use_new_offset(conf, sh))
+ bi->bi_sector = (sh->sector
+ + rdev->new_data_offset);
+ else
+ bi->bi_sector = (sh->sector
+ + rdev->data_offset);
bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_idx = 0;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -627,7 +655,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
__func__, (unsigned long long)sh->sector,
rbi->bi_rw, i);
atomic_inc(&sh->count);
- rbi->bi_sector = sh->sector + rrdev->data_offset;
+ if (use_new_offset(conf, sh))
+ rbi->bi_sector = (sh->sector
+ + rrdev->new_data_offset);
+ else
+ rbi->bi_sector = (sh->sector
+ + rrdev->data_offset);
rbi->bi_flags = 1 << BIO_UPTODATE;
rbi->bi_idx = 0;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -1114,6 +1147,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
dev->sector + STRIPE_SECTORS) {
if (wbi->bi_rw & REQ_FUA)
set_bit(R5_WantFUA, &dev->flags);
+ if (wbi->bi_rw & REQ_SYNC)
+ set_bit(R5_SyncIO, &dev->flags);
tx = async_copy_data(1, wbi, dev->page,
dev->sector, tx);
wbi = r5_next_bio(wbi, dev->sector);
@@ -1131,13 +1166,15 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx;
int i;
- bool fua = false;
+ bool fua = false, sync = false;
pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector);
- for (i = disks; i--; )
+ for (i = disks; i--; ) {
fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
+ sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
+ }
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
@@ -1146,6 +1183,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
set_bit(R5_UPTODATE, &dev->flags);
if (fua)
set_bit(R5_WantFUA, &dev->flags);
+ if (sync)
+ set_bit(R5_SyncIO, &dev->flags);
}
}
@@ -1648,7 +1687,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
char b[BDEVNAME_SIZE];
struct md_rdev *rdev = NULL;
-
+ sector_t s;
for (i=0 ; i<disks; i++)
if (bi == &sh->dev[i].req)
@@ -1671,6 +1710,10 @@ static void raid5_end_read_request(struct bio * bi, int error)
if (!rdev)
rdev = conf->disks[i].rdev;
+ if (use_new_offset(conf, sh))
+ s = sh->sector + rdev->new_data_offset;
+ else
+ s = sh->sector + rdev->data_offset;
if (uptodate) {
set_bit(R5_UPTODATE, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
@@ -1683,8 +1726,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error corrected"
" (%lu sectors at %llu on %s)\n",
mdname(conf->mddev), STRIPE_SECTORS,
- (unsigned long long)(sh->sector
- + rdev->data_offset),
+ (unsigned long long)s,
bdevname(rdev->bdev, b));
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
clear_bit(R5_ReadError, &sh->dev[i].flags);
@@ -1704,8 +1746,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error on replacement device "
"(sector %llu on %s).\n",
mdname(conf->mddev),
- (unsigned long long)(sh->sector
- + rdev->data_offset),
+ (unsigned long long)s,
bdn);
else if (conf->mddev->degraded >= conf->max_degraded)
printk_ratelimited(
@@ -1713,8 +1754,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error not correctable "
"(sector %llu on %s).\n",
mdname(conf->mddev),
- (unsigned long long)(sh->sector
- + rdev->data_offset),
+ (unsigned long long)s,
bdn);
else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
/* Oh, no!!! */
@@ -1723,8 +1763,7 @@ static void raid5_end_read_request(struct bio * bi, int error)
"md/raid:%s: read error NOT corrected!! "
"(sector %llu on %s).\n",
mdname(conf->mddev),
- (unsigned long long)(sh->sector
- + rdev->data_offset),
+ (unsigned long long)s,
bdn);
else if (atomic_read(&rdev->read_errors)
> conf->max_nr_stripes)
@@ -3561,7 +3600,7 @@ static void handle_stripe(struct stripe_head *sh)
if (test_and_clear_bit(R5_MadeGood, &dev->flags)) {
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
- STRIPE_SECTORS);
+ STRIPE_SECTORS, 0);
rdev_dec_pending(rdev, conf->mddev);
}
if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) {
@@ -3570,7 +3609,7 @@ static void handle_stripe(struct stripe_head *sh)
/* rdev have been moved down */
rdev = conf->disks[i].rdev;
rdev_clear_badblocks(rdev, sh->sector,
- STRIPE_SECTORS);
+ STRIPE_SECTORS, 0);
rdev_dec_pending(rdev, conf->mddev);
}
}
@@ -3842,6 +3881,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
raid_bio->bi_next = (void*)rdev;
align_bi->bi_bdev = rdev->bdev;
align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
+ /* No reshape active, so we can trust rdev->data_offset */
align_bi->bi_sector += rdev->data_offset;
if (!bio_fits_rdev(align_bi) ||
@@ -3953,12 +3993,10 @@ static void make_request(struct mddev *mddev, struct bio * bi)
plugged = mddev_check_plugged(mddev);
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
- int disks, data_disks;
int previous;
retry:
previous = 0;
- disks = conf->raid_disks;
prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
if (unlikely(conf->reshape_progress != MaxSector)) {
/* spinlock is needed as reshape_progress may be
@@ -3970,13 +4008,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* to check again.
*/
spin_lock_irq(&conf->device_lock);
- if (mddev->delta_disks < 0
+ if (mddev->reshape_backwards
? logical_sector < conf->reshape_progress
: logical_sector >= conf->reshape_progress) {
- disks = conf->previous_raid_disks;
previous = 1;
} else {
- if (mddev->delta_disks < 0
+ if (mddev->reshape_backwards
? logical_sector < conf->reshape_safe
: logical_sector >= conf->reshape_safe) {
spin_unlock_irq(&conf->device_lock);
@@ -3986,7 +4023,6 @@ static void make_request(struct mddev *mddev, struct bio * bi)
}
spin_unlock_irq(&conf->device_lock);
}
- data_disks = disks - conf->max_degraded;
new_sector = raid5_compute_sector(conf, logical_sector,
previous,
@@ -4009,7 +4045,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
*/
int must_retry = 0;
spin_lock_irq(&conf->device_lock);
- if (mddev->delta_disks < 0
+ if (mddev->reshape_backwards
? logical_sector >= conf->reshape_progress
: logical_sector < conf->reshape_progress)
/* mismatch, need to try again */
@@ -4108,11 +4144,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (sector_nr == 0) {
/* If restarting in the middle, skip the initial sectors */
- if (mddev->delta_disks < 0 &&
+ if (mddev->reshape_backwards &&
conf->reshape_progress < raid5_size(mddev, 0, 0)) {
sector_nr = raid5_size(mddev, 0, 0)
- conf->reshape_progress;
- } else if (mddev->delta_disks >= 0 &&
+ } else if (!mddev->reshape_backwards &&
conf->reshape_progress > 0)
sector_nr = conf->reshape_progress;
sector_div(sector_nr, new_data_disks);
@@ -4133,13 +4169,11 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
else
reshape_sectors = mddev->chunk_sectors;
- /* we update the metadata when there is more than 3Meg
- * in the block range (that is rather arbitrary, should
- * probably be time based) or when the data about to be
- * copied would over-write the source of the data at
- * the front of the range.
- * i.e. one new_stripe along from reshape_progress new_maps
- * to after where reshape_safe old_maps to
+ /* We update the metadata at least every 10 seconds, or when
+ * the data about to be copied would over-write the source of
+ * the data at the front of the range. i.e. one new_stripe
+ * along from reshape_progress new_maps to after where
+ * reshape_safe old_maps to
*/
writepos = conf->reshape_progress;
sector_div(writepos, new_data_disks);
@@ -4147,7 +4181,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sector_div(readpos, data_disks);
safepos = conf->reshape_safe;
sector_div(safepos, data_disks);
- if (mddev->delta_disks < 0) {
+ if (mddev->reshape_backwards) {
writepos -= min_t(sector_t, reshape_sectors, writepos);
readpos += reshape_sectors;
safepos += reshape_sectors;
@@ -4157,11 +4191,29 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
safepos -= min_t(sector_t, reshape_sectors, safepos);
}
+ /* Having calculated the 'writepos' possibly use it
+ * to set 'stripe_addr' which is where we will write to.
+ */
+ if (mddev->reshape_backwards) {
+ BUG_ON(conf->reshape_progress == 0);
+ stripe_addr = writepos;
+ BUG_ON((mddev->dev_sectors &
+ ~((sector_t)reshape_sectors - 1))
+ - reshape_sectors - stripe_addr
+ != sector_nr);
+ } else {
+ BUG_ON(writepos != sector_nr + reshape_sectors);
+ stripe_addr = sector_nr;
+ }
+
/* 'writepos' is the most advanced device address we might write.
* 'readpos' is the least advanced device address we might read.
* 'safepos' is the least address recorded in the metadata as having
* been reshaped.
- * If 'readpos' is behind 'writepos', then there is no way that we can
+ * If there is a min_offset_diff, these are adjusted either by
+ * increasing the safepos/readpos if diff is negative, or
+ * increasing writepos if diff is positive.
+ * If 'readpos' is then behind 'writepos', there is no way that we can
* ensure safety in the face of a crash - that must be done by userspace
* making a backup of the data. So in that case there is no particular
* rush to update metadata.
@@ -4174,7 +4226,13 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
* Maybe that number should be configurable, but I'm not sure it is
* worth it.... maybe it could be a multiple of safemode_delay???
*/
- if ((mddev->delta_disks < 0
+ if (conf->min_offset_diff < 0) {
+ safepos += -conf->min_offset_diff;
+ readpos += -conf->min_offset_diff;
+ } else
+ writepos += conf->min_offset_diff;
+
+ if ((mddev->reshape_backwards
? (safepos > writepos && readpos < writepos)
: (safepos < writepos && readpos > writepos)) ||
time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
@@ -4195,17 +4253,6 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
- if (mddev->delta_disks < 0) {
- BUG_ON(conf->reshape_progress == 0);
- stripe_addr = writepos;
- BUG_ON((mddev->dev_sectors &
- ~((sector_t)reshape_sectors - 1))
- - reshape_sectors - stripe_addr
- != sector_nr);
- } else {
- BUG_ON(writepos != sector_nr + reshape_sectors);
- stripe_addr = sector_nr;
- }
INIT_LIST_HEAD(&stripes);
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j;
@@ -4239,7 +4286,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
list_add(&sh->lru, &stripes);
}
spin_lock_irq(&conf->device_lock);
- if (mddev->delta_disks < 0)
+ if (mddev->reshape_backwards)
conf->reshape_progress -= reshape_sectors * new_data_disks;
else
conf->reshape_progress += reshape_sectors * new_data_disks;
@@ -4952,16 +4999,42 @@ static int run(struct mddev *mddev)
struct md_rdev *rdev;
sector_t reshape_offset = 0;
int i;
+ long long min_offset_diff = 0;
+ int first = 1;
if (mddev->recovery_cp != MaxSector)
printk(KERN_NOTICE "md/raid:%s: not clean"
" -- starting background reconstruction\n",
mdname(mddev));
+
+ rdev_for_each(rdev, mddev) {
+ long long diff;
+ if (rdev->raid_disk < 0)
+ continue;
+ diff = (rdev->new_data_offset - rdev->data_offset);
+ if (first) {
+ min_offset_diff = diff;
+ first = 0;
+ } else if (mddev->reshape_backwards &&
+ diff < min_offset_diff)
+ min_offset_diff = diff;
+ else if (!mddev->reshape_backwards &&
+ diff > min_offset_diff)
+ min_offset_diff = diff;
+ }
+
if (mddev->reshape_position != MaxSector) {
/* Check that we can continue the reshape.
- * Currently only disks can change, it must
- * increase, and we must be past the point where
- * a stripe over-writes itself
+ * Difficulties arise if the stripe we would write to
+ * next is at or after the stripe we would read from next.
+ * For a reshape that changes the number of devices, this
+ * is only possible for a very short time, and mdadm makes
+ * sure that time appears to have passed before assembling
+ * the array. So we fail if that time hasn't passed.
+ * For a reshape that keeps the number of devices the same
+ * mdadm must be monitoring the reshape and keeping the
+ * critical areas read-only and backed up. It will start
+ * the array in read-only mode, so we check for that.
*/
sector_t here_new, here_old;
int old_disks;
@@ -4993,26 +5066,34 @@ static int run(struct mddev *mddev)
/* here_old is the first stripe that we might need to read
* from */
if (mddev->delta_disks == 0) {
+ if ((here_new * mddev->new_chunk_sectors !=
+ here_old * mddev->chunk_sectors)) {
+ printk(KERN_ERR "md/raid:%s: reshape position is"
+ " confused - aborting\n", mdname(mddev));
+ return -EINVAL;
+ }
/* We cannot be sure it is safe to start an in-place
- * reshape. It is only safe if user-space if monitoring
+ * reshape. It is only safe if user-space is monitoring
* and taking constant backups.
* mdadm always starts a situation like this in
* readonly mode so it can take control before
* allowing any writes. So just check for that.
*/
- if ((here_new * mddev->new_chunk_sectors !=
- here_old * mddev->chunk_sectors) ||
- mddev->ro == 0) {
- printk(KERN_ERR "md/raid:%s: in-place reshape must be started"
- " in read-only mode - aborting\n",
+ if (abs(min_offset_diff) >= mddev->chunk_sectors &&
+ abs(min_offset_diff) >= mddev->new_chunk_sectors)
+ /* not really in-place - so OK */;
+ else if (mddev->ro == 0) {
+ printk(KERN_ERR "md/raid:%s: in-place reshape "
+ "must be started in read-only mode "
+ "- aborting\n",
mdname(mddev));
return -EINVAL;
}
- } else if (mddev->delta_disks < 0
- ? (here_new * mddev->new_chunk_sectors <=
+ } else if (mddev->reshape_backwards
+ ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
here_old * mddev->chunk_sectors)
: (here_new * mddev->new_chunk_sectors >=
- here_old * mddev->chunk_sectors)) {
+ here_old * mddev->chunk_sectors + (-min_offset_diff))) {
/* Reading from the same stripe as writing to - bad */
printk(KERN_ERR "md/raid:%s: reshape_position too early for "
"auto-recovery - aborting.\n",
@@ -5037,6 +5118,7 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
+ conf->min_offset_diff = min_offset_diff;
mddev->thread = conf->thread;
conf->thread = NULL;
mddev->private = conf;
@@ -5182,9 +5264,12 @@ static int run(struct mddev *mddev)
blk_queue_io_opt(mddev->queue, chunk_size *
(conf->raid_disks - conf->max_degraded));
- rdev_for_each(rdev, mddev)
+ rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
+ disk_stack_limits(mddev->gendisk, rdev->bdev,
+ rdev->new_data_offset << 9);
+ }
}
return 0;
@@ -5418,12 +5503,18 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
* any io in the removed space completes, but it hardly seems
* worth it.
*/
+ sector_t newsize;
sectors &= ~((sector_t)mddev->chunk_sectors - 1);
- md_set_array_sectors(mddev, raid5_size(mddev, sectors,
- mddev->raid_disks));
- if (mddev->array_sectors >
- raid5_size(mddev, sectors, mddev->raid_disks))
+ newsize = raid5_size(mddev, sectors, mddev->raid_disks);
+ if (mddev->external_size &&
+ mddev->array_sectors > newsize)
return -EINVAL;
+ if (mddev->bitmap) {
+ int ret = bitmap_resize(mddev->bitmap, sectors, 0, 0);
+ if (ret)
+ return ret;
+ }
+ md_set_array_sectors(mddev, newsize);
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
@@ -5468,9 +5559,6 @@ static int check_reshape(struct mddev *mddev)
mddev->new_layout == mddev->layout &&
mddev->new_chunk_sectors == mddev->chunk_sectors)
return 0; /* nothing to do */