c61d17e3.html

<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
<meta name="viewport" content="width=device-width">
<meta name="theme-color" content="#222" media="(prefers-color-scheme: light)">
<meta name="theme-color" content="#222" media="(prefers-color-scheme: dark)"><meta name="generator" content="Hexo 7.1.1">

  <link rel="apple-touch-icon" sizes="180x180" href="/images/favicon/favicon_io/apple-touch-icon.png">
  <link rel="icon" type="image/png" sizes="32x32" href="/images/favicon/favicon_io/favicon-32x32.png">
  <link rel="icon" type="image/png" sizes="16x16" href="/images/favicon/favicon_io/favicon-16x16.png">

<link rel="stylesheet" href="/css/main.css">


<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css" integrity="sha256-yIDrPSXHZdOZhAqiBP7CKzIwMQmRCJ8UeB8Jo17YC4o=" crossorigin="anonymous">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/animate.css/3.1.1/animate.min.css" integrity="sha256-PR7ttpcvz8qrF57fur/yAx1qXMFJeJFiA6pSzWi0OIE=" crossorigin="anonymous">
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/fancyapps-ui/5.0.28/fancybox/fancybox.css" integrity="sha256-6cQIC71/iBIYXFK+0RHAvwmjwWzkWd+r7v/BX3/vZDc=" crossorigin="anonymous">

<script class="next-config" data-name="main" type="application/json">{"hostname":"saicat.github.io","root":"/","images":"/images","scheme":"Gemini","darkmode":true,"version":"8.19.1","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12},"copycode":{"enable":false,"style":null},"fold":{"enable":false,"height":500},"bookmark":{"enable":false,"color":"#222","save":"auto"},"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"stickytabs":false,"motion":{"enable":true,"async":false,"transition":{"menu_item":"fadeInDown","post_block":"fadeIn","post_header":"fadeInDown","post_body":"fadeInDown","coll_header":"fadeInLeft","sidebar":"fadeInUp"}},"i18n":{"placeholder":"搜索...","empty":"没有找到任何搜索结果：${query}","hits_time":"找到 ${hits} 个搜索结果（用时 ${time} 毫秒）","hits":"找到 ${hits} 个搜索结果"},"path":"/search.xml","localsearch":{"enable":true,"trigger":"auto","top_n_per_article":1,"unescape":false,"preload":false}}</script><script src="/js/config.js"></script>

    <meta name="description" content="【本文已在同名微信公众号&#x2F;知乎&#x2F;个人博客同步上线】 LLM的长文本能力现在已经是各个大模型巨头的必争之地。 我们之前在《LLM长上下文的问题》简单介绍了目前把大模型理解和生成能力推广到32k+&#x2F;128k+的主流方法，在《理解Attention:从起源到MHA,MQA和GQA》一文中也解析了MQA和GQA通过节省KV缓存的方式，支持模型在长上下文情况下推理加速的方案。">
<meta property="og:type" content="article">
<meta property="og:title" content="稀疏注意力计算:sliding window attention">
<meta property="og:url" content="https://saicat.github.io/c61d17e3.html">
<meta property="og:site_name" content="Linsight">
<meta property="og:description" content="【本文已在同名微信公众号&#x2F;知乎&#x2F;个人博客同步上线】 LLM的长文本能力现在已经是各个大模型巨头的必争之地。 我们之前在《LLM长上下文的问题》简单介绍了目前把大模型理解和生成能力推广到32k+&#x2F;128k+的主流方法，在《理解Attention:从起源到MHA,MQA和GQA》一文中也解析了MQA和GQA通过节省KV缓存的方式，支持模型在长上下文情况下推理加速的方案。">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/ms_invest_mistral.png">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/mistral_large_performance.jpeg">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/mistral_architechture.png">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/receptive_field_cnn.png">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/mistral_swa.png">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/rolling_buffer.png">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/prefill_and_chunking.png">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/mistral_perf.png">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/longformer_attention.png">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/dilated_conv.png">
<meta property="og:image" content="https://saicat.github.io/c61d17e3/big_bird_attention.png">
<meta property="og:image" content="https://saicat.github.io/images/qrcode.jpg">
<meta property="article:published_time" content="2024-03-12T09:26:00.000Z">
<meta property="article:modified_time" content="2024-03-20T11:38:30.908Z">
<meta property="article:author" content="Lin">
<meta property="article:tag" content="NLP">
<meta property="article:tag" content="LLM">
<meta property="article:tag" content="transformer">
<meta property="article:tag" content="attention">
<meta property="article:tag" content="sliding window attention">
<meta property="article:tag" content="sparse attention">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://saicat.github.io/c61d17e3/ms_invest_mistral.png">


<link rel="canonical" href="https://saicat.github.io/c61d17e3.html">


<script class="next-config" data-name="page" type="application/json">{"sidebar":"","isHome":false,"isPost":true,"lang":"zh-CN","comments":true,"permalink":"https://saicat.github.io/c61d17e3.html","path":"c61d17e3.html","title":"稀疏注意力计算:sliding window attention"}</script>

<script class="next-config" data-name="calendar" type="application/json">""</script>
<title>稀疏注意力计算:sliding window attention | Linsight</title>
  

  <noscript>
    <link rel="stylesheet" href="/css/noscript.css">
  </noscript>
</head>

<body itemscope itemtype="http://schema.org/WebPage" class="use-motion">
  <div class="headband"></div>

  <main class="main">
    <div class="column">
      <header class="header" itemscope itemtype="http://schema.org/WPHeader"><div class="site-brand-container">
  <div class="site-nav-toggle">
    <div class="toggle" aria-label="切换导航栏" role="button">
        <span class="toggle-line"></span>
        <span class="toggle-line"></span>
        <span class="toggle-line"></span>
    </div>
  </div>

  <div class="site-meta">

    <a href="/" class="brand" rel="start">
      <i class="logo-line"></i>
      <p class="site-title">Linsight</p>
      <i class="logo-line"></i>
    </a>
      <p class="site-subtitle" itemprop="description">聊聊技术，也聊聊其他的</p>
  </div>

  <div class="site-nav-right">
    <div class="toggle popup-trigger" aria-label="搜索" role="button">
        <i class="fa fa-search fa-fw fa-lg"></i>
    </div>
  </div>
</div>


<nav class="site-nav">
  <ul class="main-menu menu"><li class="menu-item menu-item-home"><a href="/" rel="section"><i class="fa fa-home fa-fw"></i>首页</a></li><li class="menu-item menu-item-tags"><a href="/tags/" rel="section"><i class="fa fa-tags fa-fw"></i>标签</a></li><li class="menu-item menu-item-categories"><a href="/categories/" rel="section"><i class="fa fa-th fa-fw"></i>分类</a></li><li class="menu-item menu-item-archives"><a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档</a></li>
      <li class="menu-item menu-item-search">
        <a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>搜索
        </a>
      </li>
  </ul>
</nav>


  <div class="search-pop-overlay">
    <div class="popup search-popup"><div class="search-header">
  <span class="search-icon">
    <i class="fa fa-search"></i>
  </span>
  <div class="search-input-container">
    <input autocomplete="off" autocapitalize="off" maxlength="80"
           placeholder="搜索..." spellcheck="false"
           type="search" class="search-input">
  </div>
  <span class="popup-btn-close" role="button">
    <i class="fa fa-times-circle"></i>
  </span>
</div>
<div class="search-result-container no-result">
  <div class="search-result-icon">
    <i class="fa fa-spinner fa-pulse fa-5x"></i>
  </div>
</div>

    </div>
  </div>

</header>
        
  
  <aside class="sidebar">

    <div class="sidebar-inner sidebar-nav-active sidebar-toc-active">
      <ul class="sidebar-nav">
        <li class="sidebar-nav-toc">
          文章目录
        </li>
        <li class="sidebar-nav-overview">
          站点概览
        </li>
      </ul>

      <div class="sidebar-panel-container">
        <!--noindex-->
        <div class="post-toc-wrap sidebar-panel">
            <div class="post-toc animated"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#swa"><span class="nav-number">1.</span> <span class="nav-text">SWA</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#mistral-7b"><span class="nav-number">1.1.</span> <span class="nav-text">Mistral 7B</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E8%AE%A1%E7%AE%97%E9%87%8F%E5%92%8C%E7%BC%93%E5%AD%98"><span class="nav-number">1.2.</span> <span class="nav-text">计算量和缓存</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#swa%E6%80%9D%E8%B7%AF"><span class="nav-number">1.3.</span> <span class="nav-text">SWA思路</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%92%8Ckv-cache%E7%9A%84%E9%85%8D%E5%90%88%E5%AE%9E%E7%8E%B0"><span class="nav-number">1.4.</span> <span class="nav-text">和KV Cache的配合实现</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E9%95%BFprompt%E7%9A%84%E5%88%86%E5%9D%97"><span class="nav-number">1.5.</span> <span class="nav-text">长Prompt的分块</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#sparse-attention"><span class="nav-number">2.</span> <span class="nav-text">Sparse Attention</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#longformer"><span class="nav-number">2.1.</span> <span class="nav-text">Longformer</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#big-bird"><span class="nav-number">2.2.</span> <span class="nav-text">Big Bird</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E5%B0%8F%E7%BB%93"><span class="nav-number">3.</span> <span class="nav-text">小结</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#reference"><span class="nav-number">4.</span> <span class="nav-text">Reference</span></a></li></ol></div>
        </div>
        <!--/noindex-->

        <div class="site-overview-wrap sidebar-panel">
          <div class="site-author animated" itemprop="author" itemscope itemtype="http://schema.org/Person">
    <img class="site-author-image" itemprop="image" alt="Lin"
      src="/images/avatar/Picasso_Elephant.png">
  <p class="site-author-name" itemprop="name">Lin</p>
  <div class="site-description" itemprop="description">AI | NLP</div>
</div>
<div class="site-state-wrap animated">
  <nav class="site-state">
      <div class="site-state-item site-state-posts">
        <a href="/archives/">
          <span class="site-state-item-count">31</span>
          <span class="site-state-item-name">日志</span>
        </a>
      </div>
      <div class="site-state-item site-state-categories">
          <a href="/categories/">
        <span class="site-state-item-count">3</span>
        <span class="site-state-item-name">分类</span></a>
      </div>
      <div class="site-state-item site-state-tags">
          <a href="/tags/">
        <span class="site-state-item-count">37</span>
        <span class="site-state-item-name">标签</span></a>
      </div>
  </nav>
</div>
  <div class="links-of-author animated">
      <span class="links-of-author-item">
        <a href="mailto:331603034@qq.com" title="E-Mail → mailto:331603034@qq.com" rel="noopener me" target="_blank"><i class="fa-regular fa-envelope fa-fw"></i>E-Mail</a>
      </span>
  </div>
  <div class="cc-license animated" itemprop="license">
    <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" class="cc-opacity" rel="noopener" target="_blank"><img src="https://cdnjs.cloudflare.com/ajax/libs/creativecommons-vocabulary/2020.11.3/assets/license_badges/small/by_nc_sa.svg" alt="Creative Commons"></a>
  </div>

<!--
<script type="text/javascript" src="//rf.revolvermaps.com/0/0/1.js?i=5acfv0hqzp5&amp;s=220&amp;m=1&amp;v=false&amp;r=false&amp;b=000000&amp;n=false&amp;c=ff0000" async="async"></script>
-->

        </div>
      </div>
    </div>

    
  </aside>


    </div>

    <div class="main-inner post posts-expand">


<div class="post-block">
  
  
  <article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh-CN">
    <link itemprop="mainEntityOfPage" href="https://saicat.github.io/c61d17e3.html">

    <span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
      <meta itemprop="image" content="/images/avatar/Picasso_Elephant.png">
      <meta itemprop="name" content="Lin">
    </span>

    <span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
      <meta itemprop="name" content="Linsight">
      <meta itemprop="description" content="AI | NLP">
    </span>

    <span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
      <meta itemprop="name" content="稀疏注意力计算:sliding window attention | Linsight">
      <meta itemprop="description" content="">
    </span>
      <header class="post-header">
        <h1 class="post-title" itemprop="name headline">
          稀疏注意力计算:sliding window attention
        </h1>

        <div class="post-meta-container">
          <div class="post-meta">
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar"></i>
      </span>
      <span class="post-meta-item-text">发表于</span>

      <time title="创建时间：2024-03-12 17:26:00" itemprop="dateCreated datePublished" datetime="2024-03-12T17:26:00+08:00">2024-03-12</time>
    </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-calendar-check"></i>
      </span>
      <span class="post-meta-item-text">更新于</span>
      <time title="修改时间：2024-03-20 19:38:30" itemprop="dateModified" datetime="2024-03-20T19:38:30+08:00">2024-03-20</time>
    </span>
    <span class="post-meta-item">
      <span class="post-meta-item-icon">
        <i class="far fa-folder"></i>
      </span>
      <span class="post-meta-item-text">分类于</span>
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/categories/CS/" itemprop="url" rel="index"><span itemprop="name">CS</span></a>
        </span>
          ，
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/categories/CS/NLP/" itemprop="url" rel="index"><span itemprop="name">NLP</span></a>
        </span>
          ，
        <span itemprop="about" itemscope itemtype="http://schema.org/Thing">
          <a href="/categories/CS/NLP/LLM/" itemprop="url" rel="index"><span itemprop="name">LLM</span></a>
        </span>
    </span>

  
    <span class="post-meta-break"></span>
    <span class="post-meta-item" title="本文字数">
      <span class="post-meta-item-icon">
        <i class="far fa-file-word"></i>
      </span>
      <span class="post-meta-item-text">本文字数：</span>
      <span>11k</span>
    </span>
    <span class="post-meta-item" title="阅读时长">
      <span class="post-meta-item-icon">
        <i class="far fa-clock"></i>
      </span>
      <span class="post-meta-item-text">阅读时长 &asymp;</span>
      <span>20 分钟</span>
    </span>
</div>

        </div>
      </header>

    
    <div class="post-body" itemprop="articleBody"><p>【本文已在同名微信公众号/知乎/个人博客同步上线】</p>
<p>LLM的长文本能力现在已经是各个大模型巨头的必争之地。</p>
<p>我们之前在<a target="_blank" rel="noopener" href="http://www.linsight.cn/c4da56c0.html">《LLM长上下文的问题》</a>简单介绍了目前把大模型理解和生成能力推广到32k+/128k+的主流方法，在<a target="_blank" rel="noopener" href="http://www.linsight.cn/3dc22f96.html">《理解Attention:从起源到MHA,MQA和GQA》</a>一文中也解析了MQA和GQA通过节省KV缓存的方式，支持模型在长上下文情况下推理加速的方案。</p>
<p>在这讲一下另一种（理论有损）提升注意力计算效率的方法：SWA（sliding
window attention）。</p>
<p>一些效果受到广泛关注的模型，如Qwen系列和Mistral就使用了SWA。</p>
<p>关于Mistral：</p>
<p>Mistral
AI是法国一家AI独角兽公司，2023年5月才成立，但是在2023年9月和12月就分别推出了Mistral
7B和MoE模型Mistral 8x7B并开源。</p>
<p>2024年2月，微软也投资了它。</p>
<img src="/c61d17e3/ms_invest_mistral.png" class title="MS">
<p>它在2024年2月发布的Mistral Large，支持多语言 &amp;
32k的上下文长度，在MMLU上也是获得了直逼GPT4的效果。</p>
<img src="/c61d17e3/mistral_large_performance.jpeg" class title="Mistral Large MMLU Performance">
<p>（大家也因此对Mistral寄予了厚望，希望它能成为大模型行业的鲶鱼，激活一下OPENAI和META加速一下开源。）</p>
<h1 id="swa">SWA</h1>
<p>虽然SWA的思路最早不是Mistral提出的，我们还是先以Mistral
7B为例来看下SWA的具体做法。</p>
<h2 id="mistral-7b">Mistral 7B</h2>
<p>2023年10月，Mistral发布了Mistral 7B的<a target="_blank" rel="noopener" href="https://arxiv.org/pdf/2310.06825.pdf">技术报告</a>。其中开篇就说到，相比Llama，Mistral在结构上做了一些改动，除了GQA，另一个用于支持长文本下高效推理的改动就是SWA。</p>
<p>来看下Mistral 7B的模型结构参数</p>
<img src="/c61d17e3/mistral_architechture.png" class title="Mistral Architecture">
<p>Mistral使用了kv组数=8的GQA，intermediate
size相比Llama2（11008）大一些，其他基本没有太大变化。</p>
<h2 id="计算量和缓存">计算量和缓存</h2>
<p>对于原始的causal
attention，其注意力矩阵是一个下三角矩阵，这样每个token都能看到自己和在自己前面的token。</p>
<p>这样随着输入长度 <span class="math inline">\(s\)</span>
增大，这个下三角矩阵中1的元素数量以 <span class="math inline">\(s^2\)</span> 的速度增长，带来的是注意力计算量以平方的速度增长，同时所需的KV
Cache也随长度线性增长。</p>
<p>（我们知道计算量和长度 <span class="math inline">\(s\)</span>
成平方关系、缓存和长度成线性关系，这里放一些更具体的推算细节，已经熟悉的朋友可以跳过）</p>
<p>（1）计算量</p>
<p>对于两个这样大小的矩阵相乘： <span class="math inline">\([m,n]\times[n,p]\)</span> ，输出矩阵大小为 <span class="math inline">\([m,p]\)</span>，共有 <span class="math inline">\(m\times p\)</span> 个元素，每个元素需要 <span class="math inline">\(n\)</span> 次乘法和 <span class="math inline">\(n\)</span> 次加法，因此一次矩阵乘法有 <span class="math inline">\(2mpn\)</span> 个floating point
operations（FLOPs）。</p>
<p>计算量上，按<a target="_blank" rel="noopener" href="https://arxiv.org/pdf/2203.15556.pdf">《Training
Compute-Optimal Large Language Models》</a>的算法来。</p>
<p>对于一般MHA，输入长度为 <span class="math inline">\(s\)</span>
，层数为 <span class="math inline">\(L\)</span> ，模型hidden size为
<span class="math inline">\(d_{model}\)</span> ，每个头的维度为 <span class="math inline">\(d_{q}\)</span> ， 头的数量为 <span class="math inline">\(n_{q}\)</span>（这里假设有 <span class="math inline">\(d_{model} = n_{q}\times d_{q}\)</span>
），各个operation的FLOPs如下</p>
<center>
<table>
<colgroup>
<col style="width: 45%">
<col style="width: 54%">
</colgroup>
<thead>
<tr class="header">
<th style="text-align: left;">Operation</th>
<th style="text-align: center;">FLOPs（MHA）</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">Attention: QKV</td>
<td style="text-align: center;"><span class="math inline">\(6\times
s\times d_{model}^{2}\)</span></td>
</tr>
<tr class="even">
<td style="text-align: left;">Attention: QK logits ( <span class="math inline">\(QK^T\)</span> )</td>
<td style="text-align: center;"><span class="math inline">\(n_{q}\times
2\times s^2\times d_{q}\)</span></td>
</tr>
<tr class="odd">
<td style="text-align: left;">Attention: Softmax</td>
<td style="text-align: center;"><span class="math inline">\(n_{q}\times
3\times s^2\)</span></td>
</tr>
<tr class="even">
<td style="text-align: left;">Attention: Reduction (apply to <span class="math inline">\(V\)</span>)</td>
<td style="text-align: center;"><span class="math inline">\(n_{q}\times
2\times s^2\times d_{q}\)</span></td>
</tr>
<tr class="odd">
<td style="text-align: left;">Attention: Output Linear Projection</td>
<td style="text-align: center;"><span class="math inline">\(2\times
s\times d_{model}^{2}\)</span></td>
</tr>
</tbody>
</table>
</center>
<p>Softmax项中，对一个 <span class="math inline">\([1,s]\)</span>
的向量做softmax，计算量为 <span class="math inline">\(3s\)</span> （一个
<span class="math inline">\(s\)</span> 是算每个元素的exp，一个 <span class="math inline">\(s\)</span> 是求和算分母，一个 <span class="math inline">\(s\)</span> 是算除法），而对 <span class="math inline">\([s,s]\)</span> 的矩阵做softmax，则计算量为 <span class="math inline">\(3s^2\)</span> ，每个头都要计算一遍，因此再乘以
<span class="math inline">\(n_{q}\)</span> 。</p>
<p>（这里忽略了其他一些operation，比如scaling，dropout等，有兴趣的朋友可以自己推算一下）</p>
<p>顺便算下对于Mistral 7B这样使用了GQA的情况。</p>
<p>其实只有第一项的KV有变化，其他都没变。假设kv头的数量为 <span class="math inline">\(n_{kv}\)</span>，则有</p>
<center>
<table>
<colgroup>
<col style="width: 45%">
<col style="width: 54%">
</colgroup>
<thead>
<tr class="header">
<th style="text-align: left;">Operation</th>
<th style="text-align: center;">FLOPs（GQA）</th>
</tr>
</thead>
<tbody>
<tr class="odd">
<td style="text-align: left;">Attention: QKV</td>
<td style="text-align: center;"><span class="math inline">\(2\times
s\times d_{model}^{2}\\+4\times s\times d_{model}\times (d_{q}\times
n_{kv})\)</span></td>
</tr>
</tbody>
</table>
</center>
<p>从上面的推算可以看到QK logits、Softmax和Reduction三项是和长度 <span class="math inline">\(s\)</span> 成平方关系的，其他则是线性关系。</p>
<p>（2）缓存</p>
<p>KV Cache需要缓存的参数量为</p>
<p><span class="math display">\[
2\times L\times s\times d_{q}\times n_{kv}
\]</span></p>
<p>如果使用的是半精度浮点数，那么总共所需的空间就是</p>
<p><span class="math display">\[
2\times 2\times L\times s\times d_{q}\times n_{kv}
\]</span></p>
<p>对于Mistral 7B，在输入长度为16k的情况下，所需的KV_Cache约为2G。</p>
<p>看来虽然用了GQA，但是在长文本（16k+）的情况下计算量和显存还是颇有压力。</p>
<h2 id="swa思路">SWA思路</h2>
<p>看来要提升attention计算效率，需要想办法减小上面推算中的 <span class="math inline">\(s\)</span> ，但是怎么在减小 <span class="math inline">\(s\)</span>
的同时，还能保持模型长上下文的理解和生成能力呢？</p>
<p>来看一下，CNN中的感受野</p>
<img src="/c61d17e3/receptive_field_cnn.png" class title="CNN Receptive Field">
<p>如上图，假设模型有3层，每层卷积核大小为 <span class="math inline">\(3\times 3\)</span>
（实际上CNN里卷积操作就是一个sliding window）。</p>
<p>那对于layer 3，每一个像素能看到layer 2中的一个 <span class="math inline">\(3\times 3\)</span> 的区域，layer
2中其他较远的像素就看不到了。</p>
<p>但我们再往前推，layer 2里的每个像素也可以看到layer 1中的一个 <span class="math inline">\(3\times 3\)</span> 区域，那么layer 2中的 <span class="math inline">\(3\times 3\)</span> 区域就可以看到layer 1中一个
<span class="math inline">\(5\times 5\)</span> 的区域，相当于layer
3中一个像素可以<u><strong>间接</strong></u>看到一个 <span class="math inline">\(5\times 5\)</span> 的输入。</p>
<p>以此类推，如果我们再增加一层layer 4，那么layer
4中一个像素就能获取输入层（layer 1） 一个 <span class="math inline">\(7\times 7\)</span> 区域的信息。</p>
<p>虽然每层只能多看周围一格的信息，但是只要我们层数够多，理论上靠近输出端的层想看多远就能看多远。</p>
<p>值得注意的一点是，我们一般认为模型低层部分提取比较基础的特征，而高层会提取高级的语义特征。</p>
<p>在CNN里，前几层提取的可能更多是关于简单的边界、颜色、形状等基础特征，而后面的层则提取较复杂的语义特征，比如在分类任务中会是和分类类别相关的花纹、物体大小、风格等特征。</p>
<p>如果我们把模型设计成，最后一层的一个像素刚好要到第一层才能接收到全局信息（在其它层都只能看到局部），那对于图像边缘的语义特征识别能力可能会受到一些限制。</p>
<p>具体来说，假设我们做猫和狗的图像分类任务，如果这个时候决定性的特征出现在图像最边缘几个像素里，那这种情况下的错误率会比特征出现在图像中间时要高。</p>
<p>而对于语言模型，一般情况下，越远距离的信息，对当前位置的重要性越低，因此只要我们的窗口大小不要太小，问题应该都还不大。</p>
<p>看下Mistral的SWA具体是怎么做的</p>
<img src="/c61d17e3/mistral_swa.png" class title="Mistral SWA">
<p>左边是正常的causal
attention，每个位置能看到自己和前面的位置，attention
mask是个下三角矩阵。</p>
<p>中间则是SWA的attention
mask，这里的窗口大小为3。包括自己在内，每个位置只能往前看3个输入。</p>
<p>同CNN的感受野一样，随着层数的堆叠，模型理论上能处理的最远距离也逐层线性递增。只是LLM里递增的方向是单向的，只能往前。</p>
<p>Mistral 7B使用了4096的窗口大小，模型层数为32，则最终输出的“感受野”为
<span class="math inline">\(4096\times 32=131,072\)</span>
达到131k的长度。</p>
<p>前面我们推算了attention的计算量，其中QK
logits、Softmax和Reduction三项是和长度 <span class="math inline">\(s\)</span>
成平方关系。在使用了SWA之后，理论上，这几个operation仅使用4k的计算量，就能获得131k的上下文效果。当输入长度为131k时，除去已经缓存部分的数值，新的输入计算量相差
<span class="math inline">\(32\times 32=1024\)</span> 倍。</p>
<p>而缓存和上下文长度 <span class="math inline">\(s\)</span>
成线性关系，当上下文长度为131k时，最大也能节省 <span class="math inline">\(31/32\)</span> 的显存。</p>
<p>即SWA在上下文长度在4k以下时，和普通causal
attention一样；当上下文长度超过4k时，则相对节省资源，长度越大，节省的比例越高。</p>
<blockquote>
<p>In practice, for a sequence length of 16K and W = 4096, changes made
to FlashAttention [11] and xFormers [18] yield a 2x speed improvement
over a vanilla attention baseline.</p>
</blockquote>
<p>实际使用中，Mistral通过把SWA实现在FlashAttention和xFormers中，对于16k的上下文长度，获得了2倍的速度提升。</p>
<h2 id="和kv-cache的配合实现">和KV Cache的配合实现</h2>
<p>在不使用sliding window的情况下，随着自回归推理的进行，KV
Cache是只增不减的。</p>
<p>而在使用SWA的情况下，超出窗口长度的kv就可以不用再缓存了，因此使用一个轮转替换的策略。</p>
<p>比如窗口大小 <span class="math inline">\(W=4\)</span>
，则当第5个token需要缓存时，直接替换掉第1个token，这样就可以保持kv缓存有一个最大值（为窗口大小），而不会无限增长。</p>
<img src="/c61d17e3/rolling_buffer.png" class title="swa rolling buffer">
<p>这样便于我们估计硬件设备所能支持的throughput，也不会因为少量超长的case而造成堵塞，在工程上有利于提高硬件利用率，降低成本。</p>
<h2 id="长prompt的分块">长Prompt的分块</h2>
<p>更进一步，考虑到我们使用RAG或者function
call的时候，都会使用比较长的，固定的prompt来指导模型的行为。</p>
<p>比如GPT4就被诱导说出它接收到的长system
prompt（当然未必真的就是OPENAI用的）</p>
<blockquote>
<p>Your user's user agent is "Mozilla/5.0 (Windows NT 10.0; Win64; x64)
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
and the user's locale is "en-US" Your knowledge cutoff date is 2023-04.
The current date is 2024-02-07. Image input capabilities: Enabled</p>
<p>Tools</p>
<p>python</p>
<p>When you send a message containing Python code to python, it will be
executed in a stateful Jupyter notebook environment. python will respond
with the output of the execution or time out after 60.0 seconds. The
drive at '/mnt/data' can be used to save and persist user files.
Internet access for this session is disabled. Do not make external web
requests or API calls as they will fail.</p>
<p>dalle</p>
<p>Whenever a description of an image is given, create a prompt that
dalle can use to generate the image and abide to the following policy:
1. The prompt must be in English. Translate to English if needed. 2. DO
NOT ask for permission to generate the image, just do it! 3. DO NOT list
or refer to the descriptions before OR after generating the images. 4.
Do not create more than 1 image, even if the user requests more. 5. Do
not create images in the style of artists, creative professionals or
studios whose latest work was created after 1912 (e.g. Picasso, Kahlo).
- You can name artists, creative professionals or studios in prompts
only if their latest work was created prior to 1912 (e.g. Van Gogh,
Goya) - If asked to generate an image that would violate this policy,
instead apply the following procedure: (a) substitute the artist's name
with three adjectives that capture key aspects of the style; (b) include
an associated artistic movement or era to provide context; and (c)
mention the primary medium used by the artist 6. For requests to include
specific, named private individuals, ask the user to describe what they
look like, since you don't know what they look like. 7. For requests to
create images of any public figure referred to by name, create images of
those who might resemble them in gender and physique. But they shouldn't
look like them. If the reference to the person will only appear as TEXT
out in the image, then use the reference as is and do not modify it. 8.
Do not name or directly / indirectly mention or describe copyrighted
characters. Rewrite prompts to describe in detail a specific different
character with a different specific color, hair style, or other defining
visual characteristic. Do not discuss copyright policies in responses.
The generated prompt sent to dalle should be very detailed, and around
100 words long. Example dalle invocation: { "prompt":
"&lt;insert prompt here&gt;" } namespace dalle {</p>
<p>Create images from a text-only prompt. type text2im = (_: { The size
of the requested image. Use 1024x1024 (square) as the default, 1792x1024
if the user requests a wide image, and 1024x1792 for full-body
portraits. Always include this parameter in the request. n?: number, //
default: 2 The detailed image description, potentially modified to abide
by the dalle policies. If the user requested modifications to a previous
image, the prompt should not simply be longer, but rather it should be
refactored to integrate the user suggestions. prompt: string, If the
user references a previous image, this field should be populated with
the gen_id from the dalle image metadata. referenced_image_ids?:
string[], }) =&gt; any; } // namespace dalle</p>
<p>voice_mode Voice mode functions are not available in text
conversations. namespace voice_mode { } // namespace voice_mode</p>
<p>browser</p>
<p>You have the tool <code>browser</code>. Use <code>browser</code> in
the following circumstances: - User is asking about current events or
something that requires real-time information (weather, sports scores,
etc.) - User is asking about some term you are totally unfamiliar with
(it might be new) - User explicitly asks you to browse or provide links
to references</p>
<p>Given a query that requires retrieval, your turn will consist of
three steps: 1. Call the search function to get a list of results. 2.
Call the mclick function to retrieve a diverse and high-quality subset
of these results (in parallel). Remember to SELECT AT LEAST 3 sources
when using <code>mclick</code>. 3. Write a response to the user based on
these results. In your response, cite sources using the citation format
below.</p>
<p>In some cases, you should repeat step 1 twice, if the initial results
are unsatisfactory, and you believe that you can refine the query to get
better results.</p>
<p>You can also open a url directly if one is provided by the user. Only
use the <code>open_url</code> command for this purpose; do not open urls
returned by the search function or found on webpages.</p>
<p>The <code>browser</code> tool has the following commands:
<code>search(query: str, recency_days: int)</code> Issues a query to a
search engine and displays the results.
<code>mclick(ids: list[str])</code>. Retrieves the contents of the
webpages with provided IDs (indices). You should ALWAYS SELECT AT LEAST
3 and at most 10 pages. Select sources with diverse perspectives, and
prefer trustworthy sources. Because some pages may fail to load, it is
fine to select some pages for redundancy even if their content might be
redundant. <code>open_url(url: str)</code> Opens the given URL and
displays it.</p>
<p>For citing quotes from the 'browser' tool: please render in this
format: 【{message idx}†{link text}】. For long citations: please render
in this format: <code>[link text](message idx)</code>. Otherwise do not
render links.</p>
</blockquote>
<p>除了预先计算好system
prompt的kv值，并保存在缓存中方便每次用户输入使用外，如果system
prompt很长（比sliding window大），还可以通过对system
prompt的kv值进行切分来进一步优化计算。</p>
<p>比如窗口大小 <span class="math inline">\(W=4\)</span>，system
prompt大小为9时，就可以把system prompt的kv缓存切成 [4,4,1] 三块。</p>
<p>第一块由于和当前的输入距离超过了一个window的大小，所以是完全看不见的，对应的attention
mask全为0，因此可以完全忽略。</p>
<p>第二块的attention
mask则是一个上三角矩阵，当前的输入需要用到这部分信息。</p>
<p>第三块是一个下三角矩阵（的左边部分），包含了当前的输入在内。</p>
<p>在推理的时候，我们只需要用到第二块和第三块的内容，这就节省了缓存的操作。</p>
<p>而且无论prompt有多长，只要我们按窗口大小分块，一定只会用到最后两块。</p>
<img src="/c61d17e3/prefill_and_chunking.png" title="prefill and chunking" alt="按窗口大小对system prompt的kv缓存进行分块的示意图">
<p>（实际上现在推理框架基本上都有FlashAttention/PagedAttention等技术加持，能够进一步节省资源，提高效率，这个后续再开一篇讲）</p>
<p>Mistral
7B的整体效果相比Llama是有优势的，部分任务甚至超过了Llama
34B。</p>
<img src="/c61d17e3/mistral_perf.png" title="mistral performance" alt="Mistral 7B与Llama系列模型的效果对比">
<p>Mistral认为大语言模型压缩知识的能力实际超过我们的认知，7B这个规模的效果还有提升空间。</p>
<h1 id="sparse-attention">Sparse Attention</h1>
<p>SWA实际上是一种sparse attention，而sparse
attention也有许多工作做了深入探索。</p>
<p>这里简单说一小部分，有机会再完整梳理一遍sparse
attention的理论和实践。</p>
<h2 id="longformer">Longformer</h2>
<p>前面提到，Mistral并不是第一个使用SWA的。</p>
<p>2020年，<a target="_blank" rel="noopener" href="https://arxiv.org/pdf/2004.05150.pdf">《Longformer:
The Long-Document Transformer》</a>就提出包含SWA在内的一系列sparse
attention的做法。</p>
<p>从文章名字就看得出来，Longformer的主要目的也是解决长上下文的问题。</p>
<img src="/c61d17e3/longformer_attention.png" title="longformer" alt="Longformer中的几种attention模式示意图">
<p>上图中的（b）就是SWA，只是用在Bert中的时候它是双向的。</p>
<p>在SWA的基础上，还可以进行空洞滑窗（dilated sliding
window），在不增加计算量的情况下，提升感受野。这也是从空洞卷积（下图）来的灵感了。</p>
<img src="/c61d17e3/dilated_conv.png" title="dilated convolution" alt="空洞卷积示意图">
<p>还可以更进一步优化attention。无论是SWA还是dilated sliding
window，每个位置都只能看到局部的信息。</p>
<p>但是实际上有些位置就是对全局信息有很高的需求。</p>
<p>在Bert中，[CLS]
token就常常作为分类token或者相似度向量使用，这种情况下就需要它能获取整个上下文的完整信息。</p>
<p>而在GPT中，instruction，或者说prompt的部分也对全局信息有更高要求，因为我们希望在整个对话过程中，模型都能遵循我们给出的规则。</p>
<p>对于这些token，我们让它可以看到其他所有位置，使用完整的global
attention，而其他位置则使用sliding window，如（d）中所示。</p>
<h2 id="big-bird">Big Bird</h2>
<p>无独有偶，同样在2020年，和Longformer差不多在同一时期，也有另外一个通过sparse
attention来优化长文本效果的工作，<a target="_blank" rel="noopener" href="https://arxiv.org/abs/2007.14062">《Big Bird: Transformers for
Longer Sequences》</a>。</p>
<p>其中sliding window和global attention结合的思路和Longformer相似。Big
Bird还额外加入了一个random attention的做法。</p>
<img src="/c61d17e3/big_bird_attention.png" title="big bird attention" alt="Big Bird的attention模式示意图">
<p>上图中 <span class="math inline">\(r=2\)</span>
即每个位置使用2个随机注意力。</p>
<h1 id="小结">小结</h1>
<p>SWA在优化长上下文的计算效率上有明显的收益。而在模型效果上，目前基本没有看到不可接受的损失。对长上下文有需求的业务，值得探索。</p>
<p>除了SWA，sparse
attention还有许多其他探索。目前来看，这些做法都有一定的理论基础，效果也不错。但是阻碍这些方案大规模使用的一个原因就是<strong>工程实现</strong>，比如如何高效计算global
+ local attention，如何在flash attention中支持random
attention，这些都是要考虑的内容。</p>
<hr>
<p>读到这了，来一发点赞收藏关注吧~</p>
<p>博客：<a target="_blank" rel="noopener" href="http://www.linsight.cn/">http://www.linsight.cn/</a><br>
知乎：<a target="_blank" rel="noopener" href="https://www.zhihu.com/people/us4ever">Linsight</a><br>
微信公众号：Linsight<br>
<img src="/images/qrcode.jpg" alt="微信公众号 Linsight 二维码"></p>
<h1 id="reference">Reference</h1>
<p>【1】Mistral 7B https://arxiv.org/pdf/2310.06825.pdf<br>
【2】Longformer: The Long-Document Transformer
https://arxiv.org/pdf/2004.05150.pdf<br>
【3】Training Compute-Optimal Large Language Models
https://arxiv.org/pdf/2203.15556.pdf<br>
【4】GPT-4 System Prompt Revealed
https://patmcguinness.substack.com/p/gpt-4-system-prompt-revealed<br>
【5】Big Bird: Transformers for Longer Sequences
https://arxiv.org/abs/2007.14062</p>

    </div>

    
    <footer class="post-footer">
          

<div class="post-copyright">
<ul>
  <li class="post-copyright-author">
      <strong>本文作者： </strong>Lin
  </li>
  <li class="post-copyright-link">
      <strong>本文链接：</strong>
      <a href="https://saicat.github.io/c61d17e3.html" title="稀疏注意力计算:sliding window attention">https://saicat.github.io/c61d17e3.html</a>
  </li>
  <li class="post-copyright-license">
      <strong>版权声明： </strong>本博客所有文章除特别声明外，均采用 <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" rel="noopener" target="_blank"><i class="fab fa-fw fa-creative-commons"></i>BY-NC-SA</a> 许可协议。转载请注明出处！
  </li>
</ul>
</div>

          <div class="post-tags">
              <a href="/tags/NLP/" rel="tag"><i class="fa fa-tag"></i> NLP</a>
              <a href="/tags/LLM/" rel="tag"><i class="fa fa-tag"></i> LLM</a>
              <a href="/tags/transformer/" rel="tag"><i class="fa fa-tag"></i> transformer</a>
              <a href="/tags/attention/" rel="tag"><i class="fa fa-tag"></i> attention</a>
              <a href="/tags/sliding-window-attention/" rel="tag"><i class="fa fa-tag"></i> sliding window attention</a>
              <a href="/tags/sparse-attention/" rel="tag"><i class="fa fa-tag"></i> sparse attention</a>
          </div>

        
          <div class="post-nav">
            <div class="post-nav-item">
                <a href="/3dc22f96.html" rel="prev" title="理解Attention:从起源到MHA,MQA和GQA">
                  <i class="fa fa-angle-left"></i> 理解Attention:从起源到MHA,MQA和GQA
                </a>
            </div>
            <div class="post-nav-item">
                <a href="/3345028a.html" rel="next" title="大模型算法题(1)">
                  大模型算法题(1) <i class="fa fa-angle-right"></i>
                </a>
            </div>
          </div>
    </footer>
  </article>
</div>


    <div class="comments utterances-container"></div>
</div>
  </main>

  <footer class="footer">
    <div class="footer-inner">

  <div class="copyright">
    &copy; 
    <span itemprop="copyrightYear">2024</span>
    <span class="with-love">
      <i class="fa fa-heart"></i>
    </span>
    <span class="author" itemprop="copyrightHolder">Lin</span>
  </div>
<div class="wordcount">
  <span class="post-meta-item">
    <span class="post-meta-item-icon">
      <i class="fa fa-chart-line"></i>
    </span>
    <span title="站点总字数">242k</span>
  </span>
  <span class="post-meta-item">
    <span class="post-meta-item-icon">
      <i class="fa fa-coffee"></i>
    </span>
    <span title="站点阅读时长">7:20</span>
  </span>
</div>
<div class="busuanzi-count">
</div>

<!--
-->


<!-- 网站运行时间的设置 -->
<span id="timeDate">载入天数...</span>
<span id="times">载入时分秒...</span>
<script>
    // Timestamp captured once, when this script first executes.
    var now = new Date();
    /**
     * Refresh the site-uptime counter shown in the #timeDate and #times elements.
     *
     * Elapsed time is measured from the hard-coded launch date to the real
     * current clock (Date.now()), so the display stays correct even when the
     * browser throttles or pauses the timer that calls this function.
     * All intermediate values are local; nothing is written to the global scope.
     */
    function createtime() {
        // Site launch time; change this to your own go-live date.
        var launch = new Date("03/01/2023 10:00:00");

        // Work in whole seconds so that no field can ever round up to "60".
        // Clamp at zero in case the visitor's clock is set before the launch date.
        var total = Math.max(0, Math.floor((Date.now() - launch.getTime()) / 1000));
        var dnum = Math.floor(total / 86400);
        var hnum = Math.floor((total % 86400) / 3600);
        var mnum = Math.floor((total % 3600) / 60);
        var snum = total % 60;

        // Left-pad single-digit values with a zero ("7" -> "07").
        function pad(value) {
            return value < 10 ? "0" + value : String(value);
        }

        document.getElementById("timeDate").innerHTML = "本站已安全运行 " + dnum + " 天 ";
        document.getElementById("times").innerHTML = pad(hnum) + " 小时 " + pad(mnum) + " 分 " + pad(snum) + " 秒.";
    }
// Refresh the uptime display every 250 ms. The function reference is passed
// directly; a code string would be evaluated like eval() and is blocked by a strict CSP.
setInterval(createtime, 250);
</script>

    </div>
  </footer>

  
  <div class="back-to-top" role="button" aria-label="返回顶部">
    <i class="fa fa-arrow-up fa-lg"></i>
    <span>0%</span>
  </div>

<noscript>
  <div class="noscript-warning">Theme NexT works best with JavaScript enabled</div>
</noscript>


  <script src="https://cdnjs.cloudflare.com/ajax/libs/animejs/3.2.1/anime.min.js" integrity="sha256-XL2inqUJaslATFnHdJOi9GfQ60on8Wx1C2H8DYiN1xY=" crossorigin="anonymous"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/fancyapps-ui/5.0.28/fancybox/fancybox.umd.js" integrity="sha256-ytMJGN3toR+a84u7g7NuHm91VIR06Q41kMWDr2pq7Zo=" crossorigin="anonymous"></script>
<script src="/js/comments.js"></script><script src="/js/utils.js"></script><script src="/js/motion.js"></script><script src="/js/next-boot.js"></script>

  <script src="https://cdnjs.cloudflare.com/ajax/libs/hexo-generator-searchdb/1.4.1/search.js" integrity="sha256-1kfA5uHPf65M5cphT2dvymhkuyHPQp5A53EGZOnOLmc=" crossorigin="anonymous"></script>
<script src="/js/third-party/search/local-search.js"></script>


  <script src="/js/third-party/fancybox.js"></script>


  <script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>


  <script class="next-config" data-name="enableMath" type="application/json">true</script><script class="next-config" data-name="mathjax" type="application/json">{"enable":true,"tags":"ams","js":{"url":"https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.2.2/es5/tex-mml-chtml.js","integrity":"sha256-MASABpB4tYktI2Oitl4t+78w/lyA+D7b/s9GEP0JOGI="}}</script>
<script src="/js/third-party/math/mathjax.js"></script>


<script class="next-config" data-name="utterances" type="application/json">{"enable":true,"repo":"Saicat/comment-utterance","issue_term":"pathname","theme":"github-light"}</script>
<script src="/js/third-party/comments/utterances.js"></script>

</body>
</html>