Skip to content

Commit e07b2d7

Browse files
author
Keith M Wesolowski
committed
OS-2456 sd timeout/retry settings are absurd
OS-2457 mptsas timeout subsystem needs finer granularity
1 parent 794fcf5 commit e07b2d7

4 files changed

Lines changed: 26 additions & 1 deletion

File tree

overlay/generic/etc/system

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,14 @@ set dump_metrics_on=1
144144
* automatically. See: usr/src/uts/common/io/sata/impl/sata.c:97
145145
*
146146
set sata:sata_auto_online=1
147+
148+
#
149+
# We want to limit the time spent in any one I/O to 10 seconds for targets
150+
# that are not optical. This is still a very long time; our queue depth is
151+
# typically 10 or less, and disks will usually fail a command after 2-3s.
152+
# So we'd have to have multiple reads of bad sectors queued up to have any
153+
# chance of timing out. In practice, timeouts occur because of problems with
154+
# disk controllers or firmware, not media errors, and in those cases it will
155+
# not help at all to wait longer.
156+
#
157+
set sd:sd_io_time=10

overlay/generic/kernel/drv/mpt_sas.conf

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
ddi-vhci-class="scsi_vhci";
2+
mpxio-disable="no";
3+
4+
#
5+
# Command/target timeout checking should be done at a 1-second granularity.
6+
#
7+
scsi-watchdog-tick=1;

overlay/generic/kernel/drv/sd.conf

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,15 @@ ddi-devid-registrant=1;
6060
# the controller will switch to write-through mode, and ensure that any
6161
# underlying drive cache is off. In this case, it should still be safe to
6262
# dispense with cache flush commands. Controllers for which this is not the
63-
# case should not be added here unless data loss and corruption are acceptable.
63+
# case should not have cache-nonvolatile set unless data loss and corruption are
64+
# acceptable.
65+
#
66+
# In addition, *all* devices have their retries capped at 1. There are an
67+
# additional 2 retries for "victim" IOs if a reset is needed. Retrying is
68+
# very rarely successful, and it is preferable to let ZFS do it where needed.
6469
#
6570
sd-config-list=
71+
"", "retries-timeout:1,retries-busy:1,retries-reset:1,retries-victim:2",
6672
"DELL PERC H710", "cache-nonvolatile:true",
6773
"DELL PERC H700", "cache-nonvolatile:true",
6874
"DELL PERC/6i", "cache-nonvolatile:true";

overlay/generic/manifest

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ f etc/zones/SUNWdefault.xml 0444 root bin
6060
f etc/resolv.conf 0644 netadm netadm
6161
d kernel/drv 0755 root sys
6262
f kernel/drv/cpqary3.conf 0644 root sys
63+
f kernel/drv/mpt_sas.conf 0644 root sys
6364
f kernel/drv/sd.conf 0644 root sys
6465
f kernel/drv/amd64/cpqary3 0755 root sys
6566
f kernel/drv/amd64/bnx 0755 root sys

0 commit comments

Comments
 (0)