forked from illumos/illumos-gate
/
nvme.c
4822 lines (3979 loc) · 126 KB
/
nvme.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
* This file and its contents are supplied under the terms of the
* Common Development and Distribution License ("CDDL"), version 1.0.
* You may only use this file in accordance with the terms of version
* 1.0 of the CDDL.
*
* A full copy of the text of the CDDL should have accompanied this
* source. A copy of the CDDL is also available via the Internet at
* http://www.illumos.org/license/CDDL.
*/
/*
* Copyright 2018 Nexenta Systems, Inc.
* Copyright 2016 Tegile Systems, Inc. All rights reserved.
* Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
* Copyright 2019 Joyent, Inc.
* Copyright 2019 Western Digital Corporation.
*/
/*
* blkdev driver for NVMe compliant storage devices
*
* This driver was written to conform to version 1.2.1 of the NVMe
* specification. It may work with newer versions, but that is completely
* untested and disabled by default.
*
* The driver has only been tested on x86 systems and will not work on big-
* endian systems without changes to the code accessing registers and data
* structures used by the hardware.
*
*
* Interrupt Usage:
*
* The driver will use a single interrupt while configuring the device as the
* specification requires, but contrary to the specification it will try to use
* a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
* will switch to multiple-message MSI(-X) if supported. The driver wants to
* have one interrupt vector per CPU, but it will work correctly if less are
* available. Interrupts can be shared by queues, the interrupt handler will
* iterate through the I/O queue array by steps of n_intr_cnt. Usually only
* the admin queue will share an interrupt with one I/O queue. The interrupt
* handler will retrieve completed commands from all queues sharing an interrupt
* vector and will post them to a taskq for completion processing.
*
*
* Command Processing:
*
* NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
* to 65536 I/O commands. The driver will configure one I/O queue pair per
* available interrupt vector, with the queue length usually much smaller than
* the maximum of 65536. If the hardware doesn't provide enough queues, fewer
* interrupt vectors will be used.
*
* Additionally the hardware provides a single special admin queue pair that can
* hold up to 4096 admin commands.
*
* From the hardware perspective both queues of a queue pair are independent,
* but they share some driver state: the command array (holding pointers to
* commands currently being processed by the hardware) and the active command
* counter. Access to a submission queue and the shared state is protected by
* nq_mutex; completion queue is protected by ncq_mutex.
*
* When a command is submitted to a queue pair the active command counter is
* incremented and a pointer to the command is stored in the command array. The
* array index is used as command identifier (CID) in the submission queue
* entry. Some commands may take a very long time to complete, and if the queue
* wraps around in that time a submission may find the next array slot to still
* be used by a long-running command. In this case the array is sequentially
* searched for the next free slot. The length of the command array is the same
* as the configured queue length. Queue overrun is prevented by the semaphore,
* so a command submission may block if the queue is full.
*
*
* Polled I/O Support:
*
* For kernel core dump support the driver can do polled I/O. As interrupts are
* turned off while dumping the driver will just submit a command in the regular
* way, and then repeatedly attempt a command retrieval until it gets the
* command back.
*
*
* Namespace Support:
*
* NVMe devices can have multiple namespaces, each being an independent data
* store. The driver supports multiple namespaces and creates a blkdev interface
* for each namespace found. Namespaces can have various attributes to support
* protection information. This driver does not support any of this and ignores
* namespaces that have these attributes.
*
* As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
* (EUI64). This driver uses the EUI64 if present to generate the devid and
* passes it to blkdev to use it in the device node names. As this is currently
* untested namespaces with EUI64 are ignored by default.
*
* We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
* single controller. This is an artificial limit imposed by the driver to be
* able to address a reasonable number of controllers and namespaces using a
* 32bit minor node number.
*
*
* Minor nodes:
*
* For each NVMe device the driver exposes one minor node for the controller and
* one minor node for each namespace. The only operations supported by those
* minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
* interface for the nvmeadm(1M) utility.
*
*
* Blkdev Interface:
*
* This driver uses blkdev to do all the heavy lifting involved with presenting
* a disk device to the system. As a result, the processing of I/O requests is
* relatively simple as blkdev takes care of partitioning, boundary checks, DMA
* setup, and splitting of transfers into manageable chunks.
*
* I/O requests coming in from blkdev are turned into NVM commands and posted to
* an I/O queue. The queue is selected by taking the CPU id modulo the number of
* queues. There is currently no timeout handling of I/O commands.
*
* Blkdev also supports querying device/media information and generating a
* devid. The driver reports the best block size as determined by the namespace
* format back to blkdev as physical block size to support partition and block
* alignment. The devid is either based on the namespace EUI64, if present, or
* composed using the device vendor ID, model number, serial number, and the
* namespace ID.
*
*
* Error Handling:
*
* Error handling is currently limited to detecting fatal hardware errors,
* either by asynchronous events, or synchronously through command status or
* admin command timeouts. In case of severe errors the device is fenced off,
* all further requests will return EIO. FMA is then called to fault the device.
*
* The hardware has a limit for outstanding asynchronous event requests. Before
* this limit is known the driver assumes it is at least 1 and posts a single
* asynchronous request. Later when the limit is known more asynchronous event
* requests are posted to allow quicker reception of error information. When an
* asynchronous event is posted by the hardware the driver will parse the error
* status fields and log information or fault the device, depending on the
* severity of the asynchronous event. The asynchronous event request is then
* reused and posted to the admin queue again.
*
* On command completion the command status is checked for errors. In case of
* errors indicating a driver bug the driver panics. Almost all other error
* status values just cause EIO to be returned.
*
* Command timeouts are currently detected for all admin commands except
* asynchronous event requests. If a command times out and the hardware appears
* to be healthy the driver attempts to abort the command. The original command
* timeout is also applied to the abort command. If the abort times out too the
* driver assumes the device to be dead, fences it off, and calls FMA to retire
* it. In all other cases the aborted command should return immediately with a
* status indicating it was aborted, and the driver will wait indefinitely for
* that to happen. No timeout handling of normal I/O commands is presently done.
*
* Any command that times out due to the controller dropping dead will be put on
* nvme_lost_cmds list if it references DMA memory. This will prevent the DMA
* memory being reused by the system and later be written to by a "dead" NVMe
* controller.
*
*
* Locking:
*
* Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held
* when accessing shared state and submission queue registers, ncq_mutex
* is held when accessing completion queue state and registers.
* Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
* nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
* mutexes themselves.
*
* Each command also has its own nc_mutex, which is associated with the
* condition variable nc_cv. It is only used on admin commands which are run
* synchronously. In that case it must be held across calls to
* nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
* nvme_admin_cmd(). It must also be held whenever the completion state of the
* command is changed or while an admin command timeout is handled.
*
* If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
* More than one nc_mutex may only be held when aborting commands. In this case,
* the nc_mutex of the command to be aborted must be held across the call to
* nvme_abort_cmd() to prevent the command from completing while the abort is in
* progress.
*
* If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
* acquired first. More than one nq_mutex is never held by a single thread.
* The ncq_mutex is only held by nvme_retrieve_cmd() and
* nvme_process_iocq(). nvme_process_iocq() is only called from the
* interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
* mutex is non-contentious but is required for implementation completeness
* and safety.
*
* Each minor node has its own nm_mutex, which protects the open count nm_ocnt
* and exclusive-open flag nm_oexcl.
*
*
* Quiesce / Fast Reboot:
*
* The driver currently does not support fast reboot. A quiesce(9E) entry point
* is still provided which is used to send a shutdown notification to the
* device.
*
*
* NVMe Hotplug:
*
* The driver supports hot removal. The driver uses the NDI event framework
* to register a callback, nvme_remove_callback, to clean up when a disk is
* removed. In particular, the driver will unqueue outstanding I/O commands and
* set n_dead on the softstate to true so that other operations, such as ioctls
* and command submissions, fail as well.
*
* While the callback registration relies on the NDI event framework, the
* removal event itself is kicked off in the PCIe hotplug framework, when the
* PCIe bridge driver ("pcieb") gets a hotplug interrupt indicating that a
* device was removed from the slot.
*
* The NVMe driver instance itself will remain until the final close of the
* device.
*
*
* DDI UFM Support
*
* The driver supports the DDI UFM framework for reporting information about
* the device's firmware image and slot configuration. This data can be
* queried by userland software via ioctls to the ufm driver. For more
* information, see ddi_ufm(9E).
*
*
* Driver Configuration:
*
* The following driver properties can be changed to control some aspects of the
* drivers operation:
* - strict-version: can be set to 0 to allow devices conforming to newer
* major versions to be used
* - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
* specific command status as a fatal error leading device faulting
* - admin-queue-len: the maximum length of the admin queue (16-4096)
* - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
* - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
* - async-event-limit: the maximum number of asynchronous event requests to be
* posted by the driver
* - volatile-write-cache-enable: can be set to 0 to disable the volatile write
* cache
* - min-phys-block-size: the minimum physical block size to report to blkdev,
* which is among other things the basis for ZFS vdev ashift
* - max-submission-queues: the maximum number of I/O submission queues.
* - max-completion-queues: the maximum number of I/O completion queues,
* can be less than max-submission-queues, in which case the completion
* queues are shared.
*
*
* TODO:
* - figure out sane default for I/O queue depth reported to blkdev
* - FMA handling of media errors
* - support for devices supporting very large I/O requests using chained PRPs
* - support for configuring hardware parameters like interrupt coalescing
* - support for media formatting and hard partitioning into namespaces
* - support for big-endian systems
* - support for fast reboot
* - support for NVMe Subsystem Reset (1.1)
* - support for Scatter/Gather lists (1.1)
* - support for Reservations (1.1)
* - support for power management
*/
#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/ddi_ufm.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
#include <sys/stat.h>
#include <sys/policy.h>
#include <sys/list.h>
#include <sys/nvme.h>
#ifdef __x86
#include <sys/x86_archext.h>
#endif
#include "nvme_reg.h"
#include "nvme_var.h"
/*
* Assertions to make sure that we've properly captured various aspects of the
* packed structures and haven't broken them during updates.
*/
CTASSERT(sizeof (nvme_identify_ctrl_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_oacs) == 256);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_sqes) == 512);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_subnqn) == 768);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_nvmof) == 1792);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_psd) == 2048);
CTASSERT(offsetof(nvme_identify_ctrl_t, id_vs) == 3072);
CTASSERT(sizeof (nvme_identify_nsid_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_nsid_t, id_fpi) == 32);
CTASSERT(offsetof(nvme_identify_nsid_t, id_nguid) == 104);
CTASSERT(offsetof(nvme_identify_nsid_t, id_lbaf) == 128);
CTASSERT(offsetof(nvme_identify_nsid_t, id_vs) == 384);
CTASSERT(sizeof (nvme_identify_primary_caps_t) == 0x1000);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vqfrt) == 32);
CTASSERT(offsetof(nvme_identify_primary_caps_t, nipc_vifrt) == 64);
/* NVMe spec version supported */
static const int nvme_version_major = 1;
/* tunable for admin command timeout in seconds, default is 1s */
int nvme_admin_cmd_timeout = 1;
/* tunable for FORMAT NVM command timeout in seconds, default is 600s */
int nvme_format_cmd_timeout = 600;
/* tunable for firmware commit with NVME_FWC_SAVE, default is 15s */
int nvme_commit_save_cmd_timeout = 15;
static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
static int nvme_quiesce(dev_info_t *);
static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
static int nvme_setup_interrupts(nvme_t *, int, int);
static void nvme_release_interrupts(nvme_t *);
static uint_t nvme_intr(caddr_t, caddr_t);
static void nvme_shutdown(nvme_t *, int, boolean_t);
static boolean_t nvme_reset(nvme_t *, boolean_t);
static int nvme_init(nvme_t *);
static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
bd_xfer_t *);
static void nvme_admin_cmd(nvme_cmd_t *, int);
static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);
static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);
static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
static void nvme_async_event(nvme_t *);
static int nvme_format_nvm(nvme_t *, boolean_t, uint32_t, uint8_t, boolean_t,
uint8_t, boolean_t, uint8_t);
static int nvme_get_logpage(nvme_t *, boolean_t, void **, size_t *, uint8_t,
...);
static int nvme_identify(nvme_t *, boolean_t, uint32_t, void **);
static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
uint32_t *);
static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t *,
void **, size_t *);
static int nvme_write_cache_set(nvme_t *, boolean_t);
static int nvme_set_nqueues(nvme_t *);
static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
nvme_dma_t **);
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
nvme_dma_t **);
static void nvme_free_qpair(nvme_qpair_t *);
static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
static inline uint32_t nvme_get32(nvme_t *, uintptr_t);
static boolean_t nvme_check_regs_hdl(nvme_t *);
static boolean_t nvme_check_dma_hdl(nvme_dma_t *);
static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);
static void nvme_bd_xfer_done(void *);
static void nvme_bd_driveinfo(void *, bd_drive_t *);
static int nvme_bd_mediainfo(void *, bd_media_t *);
static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
static int nvme_bd_read(void *, bd_xfer_t *);
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);
static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);
static void nvme_prepare_devid(nvme_t *, uint32_t);
/* DDI UFM callbacks */
static int nvme_ufm_fill_image(ddi_ufm_handle_t *, void *, uint_t,
ddi_ufm_image_t *);
static int nvme_ufm_fill_slot(ddi_ufm_handle_t *, void *, uint_t, uint_t,
ddi_ufm_slot_t *);
static int nvme_ufm_getcaps(ddi_ufm_handle_t *, void *, ddi_ufm_cap_t *);
static int nvme_open(dev_t *, int, int, cred_t *);
static int nvme_close(dev_t, int, int, cred_t *);
static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
/*
 * DDI UFM operations vector (positional initialization; the first member is
 * not provided, hence NULL). See ddi_ufm(9E).
 */
static ddi_ufm_ops_t nvme_ufm_ops = {
	NULL,
	nvme_ufm_fill_image,
	nvme_ufm_fill_slot,
	nvme_ufm_getcaps
};
#define NVME_MINOR_INST_SHIFT 9
#define NVME_MINOR(inst, nsid) (((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
#define NVME_MINOR_INST(minor) ((minor) >> NVME_MINOR_INST_SHIFT)
#define NVME_MINOR_NSID(minor) ((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
#define NVME_MINOR_MAX (NVME_MINOR(1, 0) - 2)
static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;
/*
 * DMA attributes for queue DMA memory
 *
 * Queue DMA memory must be page aligned. The maximum length of a queue is
 * 65536 entries, and an entry can be 64 bytes long.
 */
static ddi_dma_attr_t nvme_queue_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	/* Largest queue: (UINT16_MAX + 1) entries of sizeof (nvme_sqe_t). */
	.dma_attr_count_max	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
	.dma_attr_align		= 0x1000,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x1000,
	.dma_attr_maxxfer	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
	.dma_attr_seg		= 0xffffffffffffffffULL,
	/* A queue must be physically contiguous: exactly one cookie. */
	.dma_attr_sgllen	= 1,
	.dma_attr_granular	= 1,
	.dma_attr_flags		= 0,
};
/*
 * DMA attributes for transfers using Physical Region Page (PRP) entries
 *
 * A PRP entry describes one page of DMA memory using the page size specified
 * in the controller configuration's memory page size register (CC.MPS). It uses
 * a 64bit base address aligned to this page size. There is no limitation on
 * chaining PRPs together for arbitrarily large DMA transfers.
 */
static ddi_dma_attr_t nvme_prp_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	/* Each cookie is at most one 4K page ... */
	.dma_attr_count_max	= 0xfff,
	.dma_attr_align		= 0x1000,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x1000,
	.dma_attr_maxxfer	= 0x1000,
	/* ... and may not cross a 4K boundary. */
	.dma_attr_seg		= 0xfff,
	/* Unlimited scatter/gather list length. */
	.dma_attr_sgllen	= -1,
	.dma_attr_granular	= 1,
	.dma_attr_flags		= 0,
};
/*
 * DMA attributes for transfers using scatter/gather lists
 *
 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
 * 32bit length field. SGL Segment and SGL Last Segment entries require the
 * length to be a multiple of 16 bytes.
 */
static ddi_dma_attr_t nvme_sgl_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= 0xffffffffUL,
	.dma_attr_align		= 1,
	.dma_attr_burstsizes	= 0x7ff,
	/* Transfers and granularity are multiples of 16 bytes. */
	.dma_attr_minxfer	= 0x10,
	.dma_attr_maxxfer	= 0xfffffffffULL,
	.dma_attr_seg		= 0xffffffffffffffffULL,
	/* Unlimited scatter/gather list length. */
	.dma_attr_sgllen	= -1,
	.dma_attr_granular	= 0x10,
	.dma_attr_flags		= 0
};
/*
 * Access attributes for the controller registers and shared DMA structures:
 * little-endian, strictly ordered accesses.
 */
static ddi_device_acc_attr_t nvme_reg_acc_attr = {
	.devacc_attr_version	= DDI_DEVICE_ATTR_V0,
	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
	.devacc_attr_dataorder	= DDI_STRICTORDER_ACC
};
/*
 * Character device entry points for the controller and namespace minor
 * nodes. Only open, close, and ioctl are implemented; everything else is
 * nodev/nochpoll (see the "Minor nodes" section of the theory statement).
 */
static struct cb_ops nvme_cb_ops = {
	.cb_open	= nvme_open,
	.cb_close	= nvme_close,
	.cb_strategy	= nodev,
	.cb_print	= nodev,
	.cb_dump	= nodev,
	.cb_read	= nodev,
	.cb_write	= nodev,
	.cb_ioctl	= nvme_ioctl,
	.cb_devmap	= nodev,
	.cb_mmap	= nodev,
	.cb_segmap	= nodev,
	.cb_chpoll	= nochpoll,
	.cb_prop_op	= ddi_prop_op,
	.cb_str		= 0,
	.cb_flag	= D_NEW | D_MP,
	.cb_rev		= CB_REV,
	.cb_aread	= nodev,
	.cb_awrite	= nodev
};
/*
 * Device operations: attach/detach drive the controller lifecycle and
 * quiesce(9E) is provided to send a shutdown notification to the device.
 */
static struct dev_ops nvme_dev_ops = {
	.devo_rev	= DEVO_REV,
	.devo_refcnt	= 0,
	.devo_getinfo	= ddi_no_info,
	.devo_identify	= nulldev,
	.devo_probe	= nulldev,
	.devo_attach	= nvme_attach,
	.devo_detach	= nvme_detach,
	.devo_reset	= nodev,
	.devo_cb_ops	= &nvme_cb_ops,
	.devo_bus_ops	= NULL,
	.devo_power	= NULL,
	.devo_quiesce	= nvme_quiesce,
};
/* Loadable module description for mod_install(9F)/mod_remove(9F). */
static struct modldrv nvme_modldrv = {
	.drv_modops	= &mod_driverops,
	.drv_linkinfo	= "NVMe v1.1b",
	.drv_dev_ops	= &nvme_dev_ops
};

static struct modlinkage nvme_modlinkage = {
	.ml_rev		= MODREV_1,
	.ml_linkage	= { &nvme_modldrv, NULL }
};
/*
 * blkdev operations vector, registered for each namespace so blkdev can
 * query drive/media info, generate a devid, and issue read/write/sync.
 */
static bd_ops_t nvme_bd_ops = {
	.o_version	= BD_OPS_VERSION_0,
	.o_drive_info	= nvme_bd_driveinfo,
	.o_media_info	= nvme_bd_mediainfo,
	.o_devid_init	= nvme_bd_devid,
	.o_sync_cache	= nvme_bd_sync,
	.o_read		= nvme_bd_read,
	.o_write	= nvme_bd_write,
};
/*
* This list will hold commands that have timed out and couldn't be aborted.
* As we don't know what the hardware may still do with the DMA memory we can't
* free them, so we'll keep them forever on this list where we can easily look
* at them with mdb.
*/
static struct list nvme_lost_cmds;
static kmutex_t nvme_lc_mutex;
int
_init(void)
{
	int error;

	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
	if (error != DDI_SUCCESS)
		return (error);

	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
	list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
	    offsetof(nvme_cmd_t, nc_list));

	bd_mod_init(&nvme_dev_ops);

	error = mod_install(&nvme_modlinkage);
	if (error != DDI_SUCCESS) {
		/*
		 * Undo everything set up above. The kmem cache must be
		 * destroyed here as well; previously it was leaked on a
		 * failed mod_install() even though _fini() destroys it on
		 * the successful-load path.
		 */
		ddi_soft_state_fini(&nvme_state);
		kmem_cache_destroy(nvme_cmd_cache);
		mutex_destroy(&nvme_lc_mutex);
		list_destroy(&nvme_lost_cmds);
		bd_mod_fini(&nvme_dev_ops);
	}

	return (error);
}
int
_fini(void)
{
	int error;

	/*
	 * Refuse to unload while any lost (timed-out, unabortable) commands
	 * remain: their DMA memory may still be written by a dead controller
	 * and must stay allocated forever (see nvme_lost_cmds above).
	 */
	if (!list_is_empty(&nvme_lost_cmds))
		return (DDI_FAILURE);

	error = mod_remove(&nvme_modlinkage);
	if (error == DDI_SUCCESS) {
		/* Tear down in the reverse order of _init(). */
		ddi_soft_state_fini(&nvme_state);
		kmem_cache_destroy(nvme_cmd_cache);
		mutex_destroy(&nvme_lc_mutex);
		list_destroy(&nvme_lost_cmds);
		bd_mod_fini(&nvme_dev_ops);
	}

	return (error);
}
/*
 * Loadable module information entry point; delegates to mod_info(9F).
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&nvme_modlinkage, modinfop));
}
/*
 * Write a 64-bit value to the controller register at byte offset "reg".
 */
static inline void
nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
{
	/* 64-bit register accesses must be 8-byte aligned. */
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
}
/*
 * Write a 32-bit value to the controller register at byte offset "reg".
 */
static inline void
nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
{
	/* 32-bit register accesses must be 4-byte aligned. */
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
}
/*
 * Read a 64-bit value from the controller register at byte offset "reg".
 */
static inline uint64_t
nvme_get64(nvme_t *nvme, uintptr_t reg)
{
	/* 64-bit register accesses must be 8-byte aligned. */
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	return (ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg)));
}
/*
 * Read a 32-bit value from the controller register at byte offset "reg".
 */
static inline uint32_t
nvme_get32(nvme_t *nvme, uintptr_t reg)
{
	/* 32-bit register accesses must be 4-byte aligned. */
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	return (ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg)));
}
/*
 * Check the FMA error status of the register access handle. Returns B_TRUE
 * if an access error has been recorded, B_FALSE otherwise.
 */
static boolean_t
nvme_check_regs_hdl(nvme_t *nvme)
{
	ddi_fm_error_t err;

	ddi_fm_acc_err_get(nvme->n_regh, &err, DDI_FME_VERSION);

	return (err.fme_status != DDI_FM_OK ? B_TRUE : B_FALSE);
}
/*
 * Check the FMA error status of a DMA handle. A NULL "dma" is treated as
 * error-free. Returns B_TRUE if a DMA error has been recorded.
 */
static boolean_t
nvme_check_dma_hdl(nvme_dma_t *dma)
{
	ddi_fm_error_t err;

	if (dma == NULL)
		return (B_FALSE);

	ddi_fm_dma_err_get(dma->nd_dmah, &err, DDI_FME_VERSION);

	return (err.fme_status != DDI_FM_OK ? B_TRUE : B_FALSE);
}
/*
 * Release the DMA binding, memory, and handle held in "dma", in reverse
 * order of allocation. Safe to call on a partially set up nvme_dma_t: each
 * resource is released only if present. Does not free the nvme_dma_t itself.
 */
static void
nvme_free_dma_common(nvme_dma_t *dma)
{
	if (dma->nd_dmah != NULL)
		(void) ddi_dma_unbind_handle(dma->nd_dmah);
	if (dma->nd_acch != NULL)
		ddi_dma_mem_free(&dma->nd_acch);
	if (dma->nd_dmah != NULL)
		ddi_dma_free_handle(&dma->nd_dmah);
}
/*
 * Free a heap-allocated nvme_dma_t together with all the DMA resources it
 * holds. Counterpart to nvme_zalloc_dma().
 */
static void
nvme_free_dma(nvme_dma_t *dma)
{
	nvme_free_dma_common(dma);
	kmem_free(dma, sizeof (*dma));
}
/*
 * kmem cache destructor for cached PRP DMA buffers; releases the DMA
 * resources but leaves the object memory to the cache.
 */
/* ARGSUSED */
static void
nvme_prp_dma_destructor(void *buf, void *private)
{
	nvme_free_dma_common((nvme_dma_t *)buf);
}
/*
 * Allocate a DMA handle, DMA memory of "len" bytes, and a binding described
 * by "dma_attr" into the caller-provided nvme_dma_t. On bind failure all
 * partially acquired resources are released via nvme_free_dma_common().
 */
static int
nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
    size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
{
	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
	    &dma->nd_dmah) != DDI_SUCCESS) {
		/*
		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
		 * the only other possible error is DDI_DMA_BADATTR which
		 * indicates a driver bug which should cause a panic.
		 */
		dev_err(nvme->n_dip, CE_PANIC,
		    "!failed to get DMA handle, check DMA attributes");
		return (DDI_FAILURE);
	}

	/*
	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
	 * or the flags are conflicting, which isn't the case here.
	 */
	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
	    &dma->nd_len, &dma->nd_acch);

	/* Binding can fail; count it and clean up rather than panic. */
	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to bind DMA memory");
		atomic_inc_32(&nvme->n_dma_bind_err);
		nvme_free_dma_common(dma);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}
/*
 * Allocate a zeroed nvme_dma_t with "len" bytes of bound DMA memory.
 * On success *ret points to the new allocation; on failure it is NULL and
 * the nvme_dma_t has been freed again.
 */
static int
nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
    ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
{
	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);

	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) ==
	    DDI_SUCCESS) {
		bzero(dma->nd_memp, dma->nd_len);
		*ret = dma;
		return (DDI_SUCCESS);
	}

	kmem_free(dma, sizeof (nvme_dma_t));
	*ret = NULL;

	return (DDI_FAILURE);
}
/*
 * kmem cache constructor for cached PRP DMA buffers: allocates one page of
 * bound DMA memory per object using the PRP DMA attributes.
 */
/* ARGSUSED */
static int
nvme_prp_dma_constructor(void *buf, void *private, int flags)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;
	nvme_t *nvme = (nvme_t *)private;

	/* Start from a clean slate so failure cleanup is well-defined. */
	dma->nd_dmah = NULL;
	dma->nd_acch = NULL;

	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
		return (-1);
	}

	/* A single page must map to exactly one DMA cookie. */
	ASSERT(dma->nd_ncookie == 1);

	/*
	 * Flag the buffer as cache-managed — presumably consulted on the
	 * free path to avoid freeing it like a one-off allocation; confirm
	 * against nvme_free_cmd()/users of nd_cached.
	 */
	dma->nd_cached = B_TRUE;

	return (0);
}
/*
 * Allocate zeroed, page-rounded DMA memory for a queue of "nentry" entries
 * of "qe_len" bytes each. The queue must be physically contiguous, i.e. it
 * must map to a single DMA cookie. On failure *dma is freed and NULLed.
 */
static int
nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
    uint_t flags, nvme_dma_t **dma)
{
	uint32_t len = nentry * qe_len;
	/* Copy the template attributes; the original stays untouched. */
	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;

	len = roundup(len, nvme->n_pagesize);

	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to get DMA memory for queue");
		goto fail;
	}

	if ((*dma)->nd_ncookie != 1) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!got too many cookies for queue DMA");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	if (*dma) {
		nvme_free_dma(*dma);
		*dma = NULL;
	}

	return (DDI_FAILURE);
}
/*
 * Free a completion queue: its mutex, its DMA memory (if it was allocated),
 * and the nvme_cq_t itself. Also used to clean up a partially constructed
 * cq from nvme_alloc_cq().
 */
static void
nvme_free_cq(nvme_cq_t *cq)
{
	mutex_destroy(&cq->ncq_mutex);

	if (cq->ncq_dma != NULL)
		nvme_free_dma(cq->ncq_dma);

	kmem_free(cq, sizeof (*cq));
}
/*
 * Tear down a queue pair: destroy its locks, free the submission queue DMA
 * memory, free any commands still tracked in the command array, and finally
 * free the array and the qpair itself.
 */
static void
nvme_free_qpair(nvme_qpair_t *qp)
{
	int i;

	mutex_destroy(&qp->nq_mutex);
	sema_destroy(&qp->nq_sema);

	if (qp->nq_sqdma != NULL)
		nvme_free_dma(qp->nq_sqdma);

	/* Free any commands that were still outstanding on this queue. */
	if (qp->nq_active_cmds > 0)
		for (i = 0; i != qp->nq_nentry; i++)
			if (qp->nq_cmd[i] != NULL)
				nvme_free_cmd(qp->nq_cmd[i]);

	if (qp->nq_cmd != NULL)
		kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);

	kmem_free(qp, sizeof (nvme_qpair_t));
}
/*
 * Destroy the pre-allocated cq array, but only free individual completion
 * queues from the given starting index. This allows callers that failed
 * part-way through nvme_create_cq_array() to release only the queues they
 * created.
 */
static void
nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
{
	uint_t i;

	for (i = start; i < nvme->n_cq_count; i++)
		if (nvme->n_cq[i] != NULL)
			nvme_free_cq(nvme->n_cq[i]);

	/* The pointer array itself is always freed in full. */
	kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count);
}
/*
 * Allocate and initialize a completion queue of "nentry" entries with queue
 * id "idx", including its DMA memory and head doorbell register offset.
 * Returns DDI_SUCCESS with *cqp set, or DDI_FAILURE with *cqp NULL.
 */
static int
nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx)
{
	nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);

	mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(nvme->n_intr_pri));

	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
	    DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS)
		goto fail;

	cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp;
	cq->ncq_nentry = nentry;
	cq->ncq_id = idx;
	cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);

	*cqp = cq;
	return (DDI_SUCCESS);

fail:
	/* nvme_free_cq() copes with the partially constructed cq. */
	nvme_free_cq(cq);
	*cqp = NULL;

	return (DDI_FAILURE);
}
/*
 * Create the n_cq array big enough to hold "ncq" completion queues.
 * If the array already exists it will be re-sized (but only larger).
 * The admin queue is included in this array, which boosts the
 * max number of entries to UINT16_MAX + 1.
 */
static int
nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry)
{
	nvme_cq_t **cq;
	uint_t i, cq_count;

	ASSERT3U(ncq, >, nvme->n_cq_count);

	/* Remember the old array so it can be copied from or restored. */
	cq = nvme->n_cq;
	cq_count = nvme->n_cq_count;

	nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP);
	nvme->n_cq_count = ncq;

	/* Carry over the already-allocated completion queues. */
	for (i = 0; i < cq_count; i++)
		nvme->n_cq[i] = cq[i];

	/* Allocate the additional queues. */
	for (; i < nvme->n_cq_count; i++)
		if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i) !=
		    DDI_SUCCESS)
			goto fail;

	if (cq != NULL)
		kmem_free(cq, sizeof (*cq) * cq_count);

	return (DDI_SUCCESS);

fail:
	/* Free only the queues created by this call (index >= cq_count). */
	nvme_destroy_cq_array(nvme, cq_count);
	/*
	 * Restore the original array
	 */
	nvme->n_cq_count = cq_count;
	nvme->n_cq = cq;

	return (DDI_FAILURE);
}
static int
nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
uint_t idx)
{
nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
uint_t cq_idx;
mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
DDI_INTR_PRI(nvme->n_intr_pri));
/*
* The NVMe spec defines that a full queue has one empty (unused) slot;
* initialize the semaphore accordingly.
*/
sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL);
if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
goto fail;
/*
* idx == 0 is adminq, those above 0 are shared io completion queues.
*/
cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1);
qp->nq_cq = nvme->n_cq[cq_idx];
qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
qp->nq_nentry = nentry;
qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
qp->nq_next_cmd = 0;
*nqp = qp;
return (DDI_SUCCESS);