From 6d85df9c5991c26ead6195ef6eed31e604b14db5 Mon Sep 17 00:00:00 2001
From: Robert Mustacchi
Date: Mon, 24 Sep 2012 21:51:53 +0000
Subject: [PATCH] HVM-750 guest virtio drivers are racy with respect to
 interrupts

HVM-751 CONFIGURE_ONLY check in build.sh is wrong
HVM-752 Import qemu-kvm.git barrier changes
---
 build.sh        |   4 +-
 hw/virtio-net.c | 176 +++++++++++++++++++++++++++++++++++++++++++++++-
 hw/virtio.c     |  85 +++++++++++++++++++----
 hw/virtio.h     |   4 +-
 qemu-barrier.h  |  61 ++++++++++++++++-
 qemu_mdb.c      | 117 ++++++++++++++++++++++++++++++++
 6 files changed, 427 insertions(+), 20 deletions(-)

diff --git a/build.sh b/build.sh
index 415f13e..6618a05 100755
--- a/build.sh
+++ b/build.sh
@@ -75,9 +75,9 @@ KERNEL_SOURCE="${KERNEL_SOURCE:-$(pwd)/../../illumos}"
 CTFBINDIR="$KERNEL_SOURCE"/usr/src/tools/proto/root_i386-nd/opt/onbld/bin/i386
 export PATH="$PATH:$CTFBINDIR"
 
-if [[ -z "CONFIGURE_ONLY" ]]; then
+if [[ -z "$CONFIGURE_ONLY" ]]; then
 	echo "==> Make"
-	gmake
+	V=1 gmake
 else
 	echo "Not running make per-request"
 fi
diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index e9775a6..bb7f49a 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -26,6 +26,89 @@
 #define MAC_TABLE_ENTRIES    64
 #define MAX_VLAN    (1 << 12)   /* Per 802.1Q definition */
 
+/*
+ * Unfortunately some guest virtio drivers are a little racy with respect to
+ * when they notify us and when they unmask their respective interrupts.
+ * Currently we have to work around this in QEMU. While OSes normally work
+ * around pathological devices, virtual devices here will have to work around
+ * virtual hardware. To put this more concretely, a Linux guest will notify
+ * the host to do processing work before it unmasks interrupts. Therefore, by
+ * the time we get to virtio_notify, interrupts on the available ring are
+ * still masked, so we won't inject the interrupt; the guest, meanwhile,
+ * waits indefinitely for one. This leads to us losing data.
+ *
+ * We need to note whether or not we injected an interrupt during a
+ * virtio_notify. If we did not, we check two conditions on the ring
+ * buffers:
+ *
+ *   o The last available index processed equals the used index
+ *   o The last available index processed does not equal the current
+ *     available index
+ *
+ * If either is true, we set up a small timer that runs for 500 ticks, each
+ * tick being 10ms long. If we reach 500 ticks, we simply give up; that is a
+ * valid outcome, because the guest may have transmitted a small number of
+ * packets, but not enough to actually need an interrupt injected. If we get
+ * notified, i.e. virtio_net_handle_tx_timer runs, we stop the timer, because
+ * we're about to do processing that may inject an interrupt itself. Finally,
+ * on each tick we check two conditions. First, if the last processed
+ * available ring index is not equal to the current available ring index,
+ * then we effectively call virtqueue_flush as virtio_net_handle_tx_timer
+ * would. Second, if the last available ring index is equal to the used ring
+ * index and interrupts are not masked, then we simply inject the interrupt
+ * and continue.
+ *
+ * This is summarized by the following rough state transition diagram:
+ *
+ *            Otherwise +---+
+ * virtqueue_ --+ increment +---* |
+ * flush() | tick count \|/ | + avail ring
+ * finishes | +-------------+ | | index >
+ * without +---*-------------------->| |--+ | last avail
+ * injecting| | Timer | | index pro-
+ * an intr. | +-----*-------------| Active | | cessed
+ * | | | | |-----*-----------+
+ * | | | +-------------+ |
+ * | | +- 500 ticks | | |
+ * | | elapse | *--+ Avail ring |
+ * | \|/ | | unmasked |
+ * +-------------+ | | |
+ * | |<--*-----------+ | +--------+ |
+ * | Timer | | | | | |
+ * | Inactive | +- virtio_net_ +---->| Inject | |
+ * | | handle_tx_ | MSI/x | |
+ * +-------------+ timer() runs | | |
+ * ^ ^ +--------+ |
+ * | | +- always | |
+ * | | | | |
+ * | +-----------------------*------------+ |
+ * | |
+ * | +- always +------------------+ |
+ * | | | | |
+ * +---------------*---------------| Flush Virtqueues |<-----+
+ * | |
+ * +------------------+
+ */
+
+#define REINJECT_TICK_RATE (10000000)   /* 10ms in ns */
+#define REINJECT_DEADMAN 500            /* 5s in ticks */
+
+typedef enum rein_act {
+    REIN_INJECT,
+    REIN_DEADMAN,
+    REIN_RUN
+} rein_act_t;
+
+#define REIN_RING_MAX 64
+
+typedef struct rein_event {
+    rein_act_t re_act;
+    hrtime_t re_time;
+    uint64_t re_other;
+    struct timeval re_tval;
+} rein_event_t;
+
 typedef struct VirtIONet
 {
     VirtIODevice vdev;
@@ -63,8 +146,78 @@
     } mac_table;
     uint32_t *vlans;
     DeviceState *qdev;
+    QEMUTimer *rein_timer;
+    uint32_t rein_timer_ticks;
+    uint8_t rein_timer_act;
+    uint32_t rein_ring_idx;
+    rein_event_t rein_ring[REIN_RING_MAX];
+    uint64_t rein_n_dead;
+    uint64_t rein_n_inject;
+    uint64_t rein_n_rerun;
 } VirtIONet;
 
+static void virtio_net_handle_tx_timer(VirtIODevice *, VirtQueue *);
+
+static void virtio_net_rein_event(VirtIONet *n, rein_act_t act, uint64_t other)
+{
+    int index = n->rein_ring_idx;
+    n->rein_ring_idx = (n->rein_ring_idx + 1) % REIN_RING_MAX;
+    rein_event_t *rep = n->rein_ring + index;
+    rep->re_time = gethrtime();
+    rep->re_act = act;
+    rep->re_other = other;
+    (void) gettimeofday(&rep->re_tval, NULL);
+}
+
+static void virtio_net_rein_disable(VirtIONet *n)
+{
+    qemu_del_timer(n->rein_timer);
+    n->rein_timer_act = 0;
+}
+
+static void virtio_net_rein_enable(VirtIONet *n)
+{
+    n->rein_timer_ticks = 0;
+    qemu_mod_timer(n->rein_timer,
+        qemu_get_clock(vm_clock) + REINJECT_TICK_RATE);
+    n->rein_timer_act = 1;
+}
+
+static void virtio_net_rein_tick(void *opaque)
+{
+    int ret;
+    VirtIONet *n = opaque;
+    assert(n->rein_timer_act);
+
+    n->rein_timer_ticks++;
+
+    /* Give up; this may be completely reasonable */
+    if (n->rein_timer_ticks > REINJECT_DEADMAN) {
+        virtio_net_rein_event(n, REIN_DEADMAN, n->rein_timer_ticks);
+        virtio_net_rein_disable(n);
+        n->rein_n_dead++;
+        return;
+    }
+
+    ret = virtqueue_stalled(n->tx_vq);
+    if (ret == 1) {
+        virtio_net_rein_event(n, REIN_INJECT, n->rein_timer_ticks);
+        virtio_net_rein_disable(n);
+        n->rein_n_inject++;
+        return;
+    } else if (ret == 2) {
+        virtio_net_rein_event(n, REIN_RUN, n->rein_timer_ticks);
+        virtio_net_rein_disable(n);
+        virtio_net_handle_tx_timer(&n->vdev, n->tx_vq);
+        n->rein_n_rerun++;
+        return;
+    }
+
+    assert(ret == 0);
+    qemu_mod_timer(n->rein_timer,
+        qemu_get_clock(vm_clock) + REINJECT_TICK_RATE);
+}
+
 /* TODO
  * - we could suppress RX interrupt if we were so inclined.
  */
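To make the tick handler's control flow concrete, here is a small, self-contained sketch of its decision logic. It is a model rather than code from the patch: queue_probe, rein_tick_once, and fake_probe are hypothetical stand-ins for virtqueue_stalled() and the QEMUTimer callback; only the 0/1/2 return convention and the deadman arithmetic mirror virtio_net_rein_tick() above.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define REINJECT_DEADMAN 500    /* as above: 500 ticks * 10ms = 5s */

/* Stand-in for virtqueue_stalled(): 0 = inconclusive, 1 = an interrupt was
 * injected, 2 = the guest queued more work and the tx path must rerun. */
typedef int (*queue_probe)(void *vq);

/* One timer tick, modeled on virtio_net_rein_tick(). Returns 1 when the
 * timer should be disarmed, 0 when it should be re-armed for another tick. */
static int rein_tick_once(queue_probe stalled, void *vq, uint32_t *ticks)
{
    if (++*ticks > REINJECT_DEADMAN) {
        return 1;       /* deadman: give up, the guest may owe us nothing */
    }
    switch (stalled(vq)) {
    case 1:             /* interrupt injected; the guest will drain the ring */
    case 2:             /* more work queued; the caller reruns the tx path */
        return 1;
    default:
        return 0;       /* nothing conclusive yet; keep ticking */
    }
}

/* Toy probe: pretend the queue looks stalled on the third tick. */
static int fake_probe(void *vq)
{
    uint32_t *calls = vq;
    return (++*calls == 3) ? 1 : 0;
}

int main(void)
{
    uint32_t ticks = 0, calls = 0;
    while (!rein_tick_once(fake_probe, &calls, &ticks)) {
        continue;
    }
    printf("disarmed after %u ticks\n", ticks);
    assert(ticks == 3);
    return 0;
}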
@@ -707,6 +860,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
 {
     VirtQueueElement elem;
     int32_t num_packets = 0;
+    int32_t inject = 1;
     if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) {
         return num_packets;
     }
@@ -758,12 +912,16 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
         len += ret;
 
         virtqueue_push(vq, &elem, len);
-        virtio_notify(&n->vdev, vq);
+        inject = virtio_notify(&n->vdev, vq);
 
         if (++num_packets >= n->tx_burst) {
             break;
         }
     }
+
+    if (inject == 0 && virtqueue_handled(vq))
+        virtio_net_rein_enable(n);
+
     return num_packets;
 }
 
@@ -777,6 +935,16 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
         return;
     }
 
+    /*
+     * Kill the broken guest timer. We are here because the guest has kicked
+     * us to send packets, so we don't need to go back and consider injecting
+     * it with interrupts; we will do that again naturally. We also don't
+     * reset the tick count here, since virtio_net_rein_enable() zeroes it
+     * whenever the timer is re-armed.
+     */
+    if (n->rein_timer_act)
+        virtio_net_rein_disable(n);
+
     if (n->tx_waiting) {
         virtio_queue_set_notification(vq, 1);
         qemu_del_timer(n->tx_timer);
@@ -1024,6 +1192,12 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf,
         n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx_bh);
         n->tx_bh = qemu_bh_new(virtio_net_tx_bh, n);
     }
+    n->rein_timer = qemu_new_timer(vm_clock, virtio_net_rein_tick, n);
+    n->rein_ring_idx = 0;
+    bzero(n->rein_ring, sizeof (rein_event_t) * REIN_RING_MAX);
+    n->rein_n_dead = 0;
+    n->rein_n_inject = 0;
+    n->rein_n_rerun = 0;
     n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
     qemu_macaddr_default_if_unset(&conf->macaddr);
     memcpy(&n->mac[0], &conf->macaddr, sizeof(n->mac));
diff --git a/hw/virtio.c b/hw/virtio.c
index 31bd9e3..e9f5cdc 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -17,20 +17,12 @@
 #include "qemu-error.h"
 #include "virtio.h"
 #include "sysemu.h"
+#include "qemu-barrier.h"
 
 /* The alignment to use between consumer and producer parts of vring.
  * x86 pagesize again. */
 #define VIRTIO_PCI_VRING_ALIGN         4096
 
-/* QEMU doesn't strictly need write barriers since everything runs in
- * lock-step. We'll leave the calls to wmb() in though to make it obvious for
- * KVM or if kqemu gets SMP support.
- * In any case, we must prevent the compiler from reordering the code.
- * TODO: we likely need some rmb()/mb() as well.
- */
-
-#define wmb() __asm__ __volatile__("": : :"memory")
-
 typedef struct VRingDesc
 {
     uint64_t addr;
@@ -169,6 +161,13 @@ static inline void vring_used_idx_increment(VirtQueue *vq, uint16_t val)
     stw_phys(pa, vring_used_idx(vq) + val);
 }
 
+static inline uint16_t vring_used_flags(VirtQueue *vq)
+{
+    target_phys_addr_t pa;
+    pa = vq->vring.used + offsetof(VRingUsed, flags);
+    return lduw_phys(pa);
+}
+
 static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask)
 {
     target_phys_addr_t pa;
@@ -235,7 +234,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
 void virtqueue_flush(VirtQueue *vq, unsigned int count)
 {
     /* Make sure buffer is written before we update index. */
-    wmb();
+    smp_wmb();
     trace_virtqueue_flush(vq, count);
     vring_used_idx_increment(vq, count);
     vq->inuse -= count;
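The smp_wmb() in virtqueue_flush() pairs with the smp_rmb() added to virtqueue_num_heads() in the next hunk: the producer must order its buffer writes before publishing the index, and the consumer must order its index read before reading the buffer. Here is a minimal sketch of that pairing, using __sync_synchronize() as a conservative stand-in for both barriers; the names and the pthread scaffolding are illustrative, not from the patch.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/* Conservative stand-ins: on x86 these are stronger than the patch's
 * compiler-only smp_wmb()/smp_rmb(), but they are always safe. */
#define pub_wmb() __sync_synchronize()
#define pub_rmb() __sync_synchronize()

static uint32_t slot;                  /* the "used ring entry" */
static volatile uint16_t used_idx;     /* the published index */

static void *producer(void *arg)
{
    slot = 42;          /* fill the entry first... */
    pub_wmb();          /* ...order it before the index update... */
    used_idx = 1;       /* ...then publish, as virtqueue_flush() does */
    return NULL;
}

static void *consumer(void *arg)
{
    while (used_idx == 0) {
        continue;       /* spin until the index moves */
    }
    pub_rmb();          /* don't let the entry read bypass the index read */
    printf("consumed %u\n", slot);
    return NULL;
}

int main(void)
{
    pthread_t p, c;
    pthread_create(&c, NULL, consumer, NULL);
    pthread_create(&p, NULL, producer, NULL);
    pthread_join(p, NULL);
    pthread_join(c, NULL);
    return 0;
}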
@@ -258,6 +257,11 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx)
                      idx, vring_avail_idx(vq));
         exit(1);
     }
+    /* On success, callers read a descriptor at vq->last_avail_idx.
+     * Make sure descriptor read does not bypass avail index read. */
+    if (num_heads) {
+        smp_rmb();
+    }
 
     return num_heads;
 }
@@ -291,7 +295,7 @@ static unsigned virtqueue_next_desc(target_phys_addr_t desc_pa,
     /* Check they're not leading us off end of descriptors. */
     next = vring_desc_next(desc_pa, i);
     /* Make sure compiler knows to grab that: we don't want it changing! */
-    wmb();
+    smp_wmb();
 
     if (next >= max) {
         error_report("Desc next is %u", next);
@@ -629,17 +633,20 @@ void virtio_irq(VirtQueue *vq)
     virtio_notify_vector(vq->vdev, vq->vector);
 }
 
-void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
+int virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
+    /* We need to expose used array entries before checking used event. */
+    smp_mb();
     /* Always notify when queue is empty (when feature acknowledge) */
     if ((vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT) &&
         (!(vdev->guest_features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)) ||
          (vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx)))
-        return;
+        return 0;
 
     trace_virtio_notify(vdev, vq);
     vdev->isr |= 0x01;
     virtio_notify_vector(vdev, vq->vector);
+    return 1;
 }
 
 void virtio_notify_config(VirtIODevice *vdev)
@@ -880,3 +887,55 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq)
 {
     return &vq->host_notifier;
 }
+
+int virtqueue_handled(VirtQueue *vq)
+{
+    smp_mb();
+    return (vq->last_avail_idx == vring_used_idx(vq) ||
+        vq->last_avail_idx != vring_avail_idx(vq));
+}
+
+/*
+ * We need to go through and check if we have hit the 'stalled' condition.
+ * Due to the way that the virtio driver is implemented in the Linux kernel,
+ * it will potentially kick the host to process data, disable the queue, but
+ * not enable interrupts before the host is done processing packets. When
+ * this happens, all network traffic from the guest ends up getting corked
+ * up, because the guest disabled the queue and is waiting for an interrupt
+ * from the host to go and enable it again. In fact, when in this state a
+ * little bit of libproc magic gets us going again rather reliably.
+ *
+ * Eventually the guest will go through and unmask interrupts, saying that
+ * it wants an injection. If we reach a point in time where the last seen
+ * available index is equal to the available ring index and to the used
+ * ring index, then we'll go ahead and inject the interrupt.
+ */
+int virtqueue_stalled(VirtQueue *vq)
+{
+    smp_mb();
+
+    if (vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT)
+        return (0);
+
+    if (vring_used_flags(vq) & VRING_USED_F_NO_NOTIFY)
+        return (0);
+
+    if (vq->inuse)
+        return (0);
+
+    /* We could have also lost the interrupt the other way */
+    if (vq->last_avail_idx != vring_avail_idx(vq))
+        return (2);
+
+    if (vq->last_avail_idx != vring_used_idx(vq))
+        return (0);
+
+    /*
+     * Interrupts are enabled and we're at a point in time where we would
+     * have stalled. Let's go ahead and inject the interrupt.
+     */
+    trace_virtio_notify(vq->vdev, vq);
+    vq->vdev->isr |= 0x01;
+    virtio_notify_vector(vq->vdev, vq->vector);
+    return (1);
+}
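virtqueue_stalled() boils down to five flag and index checks. Here is the same decision table on bare ring indices as a stand-alone model; struct ring_state and model_stalled are hypothetical names, and the flag bit values match the virtio ring ABI, where both mask flags happen to be bit 0.

#include <stdint.h>
#include <stdio.h>

/* Flag bits as defined by the virtio ring ABI. */
#define NO_INTERRUPT 1   /* VRING_AVAIL_F_NO_INTERRUPT */
#define NO_NOTIFY    1   /* VRING_USED_F_NO_NOTIFY */

struct ring_state {
    uint16_t avail_flags;    /* guest's interrupt mask */
    uint16_t used_flags;     /* host's notify mask */
    uint16_t avail_idx;      /* where the guest has queued up to */
    uint16_t used_idx;       /* where the host has consumed up to */
    uint16_t last_avail_idx; /* last entry the host processed */
    unsigned inuse;          /* buffers the host still owns */
};

/* 0 = not stalled, 1 = inject an interrupt, 2 = rerun the tx path */
static int model_stalled(const struct ring_state *r)
{
    if (r->avail_flags & NO_INTERRUPT)
        return 0;            /* guest doesn't want an interrupt */
    if (r->used_flags & NO_NOTIFY)
        return 0;            /* we told the guest not to kick us */
    if (r->inuse)
        return 0;            /* host work still outstanding */
    if (r->last_avail_idx != r->avail_idx)
        return 2;            /* guest queued more: we lost a kick, not an irq */
    if (r->last_avail_idx != r->used_idx)
        return 0;            /* host hasn't caught up yet */
    return 1;                /* fully drained and unmasked: inject */
}

int main(void)
{
    struct ring_state stalled = { 0, 0, 8, 8, 8, 0 };
    printf("verdict: %d\n", model_stalled(&stalled)); /* prints 1 */
    return 0;
}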
+ */ + trace_virtio_notify(vq->vdev, vq); + vq->vdev->isr |= 0x01; + virtio_notify_vector(vq->vdev, vq->vector); + return (1); +} diff --git a/hw/virtio.h b/hw/virtio.h index d1fb8a0..ed4743c 100644 --- a/hw/virtio.h +++ b/hw/virtio.h @@ -153,7 +153,7 @@ void virtqueue_map_sg(struct iovec *sg, target_phys_addr_t *addr, int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem); int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes); -void virtio_notify(VirtIODevice *vdev, VirtQueue *vq); +int virtio_notify(VirtIODevice *vdev, VirtQueue *vq); void virtio_save(VirtIODevice *vdev, QEMUFile *f); @@ -226,4 +226,6 @@ EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq); EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq); void virtio_queue_notify_vq(VirtQueue *vq); void virtio_irq(VirtQueue *vq); +int virtqueue_stalled(VirtQueue *vq); +int virtqueue_handled(VirtQueue *vq); #endif diff --git a/qemu-barrier.h b/qemu-barrier.h index b77fce2..7e11197 100644 --- a/qemu-barrier.h +++ b/qemu-barrier.h @@ -1,10 +1,65 @@ #ifndef __QEMU_BARRIER_H #define __QEMU_BARRIER_H 1 -/* FIXME: arch dependant, x86 version */ -#define smp_wmb() asm volatile("" ::: "memory") - /* Compiler barrier */ #define barrier() asm volatile("" ::: "memory") +#if defined(__i386__) + +/* + * Because of the strongly ordered x86 storage model, wmb() and rmb() are nops + * on x86(well, a compiler barrier only). Well, at least as long as + * qemu doesn't do accesses to write-combining memory or non-temporal + * load/stores from C code. + */ +#define smp_wmb() barrier() +#define smp_rmb() barrier() +/* + * We use GCC builtin if it's available, as that can use + * mfence on 32 bit as well, e.g. if built with -march=pentium-m. + * However, on i386, there seem to be known bugs as recently as 4.3. + * */ +#if defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 4 +#define smp_mb() __sync_synchronize() +#else +#define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") +#endif + +#elif defined(__x86_64__) + +#define smp_wmb() barrier() +#define smp_rmb() barrier() +#define smp_mb() asm volatile("mfence" ::: "memory") + +#elif defined(_ARCH_PPC) + +/* + * We use an eieio() for wmb() on powerpc. This assumes we don't + * need to order cacheable and non-cacheable stores with respect to + * each other + */ +#define smp_wmb() asm volatile("eieio" ::: "memory") + +#if defined(__powerpc64__) +#define smp_rmb() asm volatile("lwsync" ::: "memory") +#else +#define smp_rmb() asm volatile("sync" ::: "memory") +#endif + +#define smp_mb() asm volatile("sync" ::: "memory") + +#else + +/* + * For (host) platforms we don't have explicit barrier definitions + * for, we use the gcc __sync_synchronize() primitive to generate a + * full barrier. This should be safe on all platforms, though it may + * be overkill for wmb() and rmb(). + */ +#define smp_wmb() __sync_synchronize() +#define smp_mb() __sync_synchronize() +#define smp_rmb() __sync_synchronize() + +#endif + #endif diff --git a/qemu_mdb.c b/qemu_mdb.c index 96fcef5..d6daf1f 100644 --- a/qemu_mdb.c +++ b/qemu_mdb.c @@ -114,6 +114,69 @@ typedef struct VRing target_phys_addr_t used; } VRing; +/* Sigh More definitions ... 
diff --git a/qemu_mdb.c b/qemu_mdb.c
index 96fcef5..d6daf1f 100644
--- a/qemu_mdb.c
+++ b/qemu_mdb.c
@@ -114,6 +114,69 @@ typedef struct VRing
     target_phys_addr_t used;
 } VRing;
 
+/* Sigh. More definitions ... */
+typedef enum rein_act {
+    REIN_INJECT,
+    REIN_DEADMAN,
+    REIN_RUN
+} rein_act_t;
+
+#define REIN_RING_MAX 64
+
+typedef struct rein_event {
+    rein_act_t re_act;
+    hrtime_t re_time;
+    uint64_t re_other;
+    struct timeval re_tval;
+} rein_event_t;
+
+typedef struct VirtIONet
+{
+    VirtIODevice vdev;
+    uint8_t mac[ETH_ALEN];
+    uint16_t status;
+    VirtQueue *rx_vq;
+    VirtQueue *tx_vq;
+    VirtQueue *ctrl_vq;
+    NICState *nic;
+    QEMUTimer *tx_timer;
+    QEMUBH *tx_bh;
+    uint32_t tx_timeout;
+    int32_t tx_burst;
+    int tx_waiting;
+    uint32_t has_vnet_hdr;
+    uint8_t has_ufo;
+    struct {
+        VirtQueueElement elem;
+        ssize_t len;
+    } async_tx;
+    int mergeable_rx_bufs;
+    uint8_t promisc;
+    uint8_t allmulti;
+    uint8_t alluni;
+    uint8_t nomulti;
+    uint8_t nouni;
+    uint8_t nobcast;
+    uint8_t vhost_started;
+    struct {
+        int in_use;
+        int first_multi;
+        uint8_t multi_overflow;
+        uint8_t uni_overflow;
+        uint8_t *macs;
+    } mac_table;
+    uint32_t *vlans;
+    DeviceState *qdev;
+    QEMUTimer *rein_timer;
+    uint32_t rein_timer_ticks;
+    uint8_t rein_timer_act;
+    uint32_t rein_ring_idx;
+    rein_event_t rein_ring[REIN_RING_MAX];
+    uint64_t rein_n_dead;
+    uint64_t rein_n_inject;
+    uint64_t rein_n_rerun;
+} VirtIONet;
+
 /*
  * NDEVICES comes from the PCIDevice structure and should be changed if this
  * does ever change.
@@ -624,6 +687,58 @@ qemu_mdb_vravail(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
     return (DCMD_OK);
 }
 
+static const char *reintostr[] = {
+    "INJECT",
+    "DEADMAN",
+    "RUN"
+};
+
+static int
+qemu_mdb_nic_reinject(uintptr_t addr, uint_t flags, int argc,
+    const mdb_arg_t *argv)
+{
+    VirtIONet *n;
+    uint32_t ii, end;
+    rein_event_t *rep;
+
+    if (!(flags & DCMD_ADDRSPEC))
+        return (DCMD_USAGE);
+
+    if (argc > 1)
+        return (DCMD_USAGE);
+
+    n = mdb_alloc(sizeof (VirtIONet), UM_SLEEP | UM_GC);
+
+    if (mdb_vread(n, sizeof (VirtIONet), addr) != sizeof (VirtIONet)) {
+        mdb_warn("failed to read VirtIONet");
+        return (DCMD_ERR);
+    }
+
+    if (n->rein_ring_idx == 0)
+        end = REIN_RING_MAX;
+    else
+        end = n->rein_ring_idx - 1;
+
+    mdb_printf("%-?s %-10s %s\n", "TIMESTAMP", "ACTION", "OTHER");
+    ii = n->rein_ring_idx;
+    for (;;) {
+        rep = n->rein_ring + ii;
+        if (rep->re_time == 0 && rep->re_other == 0)
+            break;
+
+        mdb_printf("%-?p %-10s ", rep->re_time, reintostr[rep->re_act]);
+        if (rep->re_other == 0)
+            mdb_printf("%s\n", " - ");
+        else
+            mdb_printf("%llu\n", rep->re_other);
+        if (ii + 1 == end)
+            break;
+        ii = (ii + 1) % REIN_RING_MAX;
+    }
+
+    return (DCMD_OK);
+}
+
 static const mdb_dcmd_t qemu_dcmds[] = {
     { "pcidev2virtio", NULL, "translate a virtio PCI device to its "
         "virtio equivalent", qemu_mdb_pcidev2virtio },
@@ -633,6 +748,8 @@ static const mdb_dcmd_t qemu_dcmds[] = {
         qemu_mdb_vrused },
     { "qemu_vravail", NULL, "Spit out the avail event of the vring",
         qemu_mdb_vravail },
+    { "qemu_nic_reinject", NULL, "Print all of the reinject events",
+        qemu_mdb_nic_reinject },
+    { NULL }
 };
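The dcmd above walks rein_ring as a circular log: rein_ring_idx is the next slot to be overwritten, so once the ring has wrapped it is also the oldest entry. A stand-alone sketch of that oldest-first traversal, assuming a plain array of timestamps in place of rein_event_t; walk_ring is a hypothetical name.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define REIN_RING_MAX 64

/* Print a wrap-around event log oldest-first. idx is the next slot to be
 * written, i.e. the oldest entry once the ring has wrapped; slots that were
 * never written still hold 0 and are skipped. */
static void walk_ring(const uint64_t times[REIN_RING_MAX], uint32_t idx)
{
    for (uint32_t i = 0; i < REIN_RING_MAX; i++) {
        uint32_t slot = (idx + i) % REIN_RING_MAX;
        if (times[slot] != 0) {
            printf("slot %2u: %" PRIu64 "\n", slot, times[slot]);
        }
    }
}

int main(void)
{
    uint64_t times[REIN_RING_MAX] = { 0 };
    /* Simulate 70 events so the ring wraps by six slots. */
    for (uint64_t t = 1; t <= 70; t++) {
        times[(t - 1) % REIN_RING_MAX] = t;
    }
    walk_ring(times, 70 % REIN_RING_MAX); /* oldest surviving event first */
    return 0;
}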