-
Notifications
You must be signed in to change notification settings - Fork 0
/
f5fb75e4.html
639 lines (524 loc) · 37.1 KB
/
f5fb75e4.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width">
<meta name="theme-color" content="#222" media="(prefers-color-scheme: light)">
<meta name="theme-color" content="#222" media="(prefers-color-scheme: dark)"><meta name="generator" content="Hexo 7.1.1">
<link rel="apple-touch-icon" sizes="180x180" href="/images/favicon/favicon_io/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon/favicon_io/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon/favicon_io/favicon-16x16.png">
<link rel="stylesheet" href="/css/main.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css" integrity="sha256-yIDrPSXHZdOZhAqiBP7CKzIwMQmRCJ8UeB8Jo17YC4o=" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/animate.css/3.1.1/animate.min.css" integrity="sha256-PR7ttpcvz8qrF57fur/yAx1qXMFJeJFiA6pSzWi0OIE=" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/fancyapps-ui/5.0.28/fancybox/fancybox.css" integrity="sha256-6cQIC71/iBIYXFK+0RHAvwmjwWzkWd+r7v/BX3/vZDc=" crossorigin="anonymous">
<script class="next-config" data-name="main" type="application/json">{"hostname":"saicat.github.io","root":"/","images":"/images","scheme":"Gemini","darkmode":true,"version":"8.19.1","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12},"copycode":{"enable":false,"style":null},"fold":{"enable":false,"height":500},"bookmark":{"enable":false,"color":"#222","save":"auto"},"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"stickytabs":false,"motion":{"enable":true,"async":false,"transition":{"menu_item":"fadeInDown","post_block":"fadeIn","post_header":"fadeInDown","post_body":"fadeInDown","coll_header":"fadeInLeft","sidebar":"fadeInUp"}},"i18n":{"placeholder":"搜索...","empty":"没有找到任何搜索结果:${query}","hits_time":"找到 ${hits} 个搜索结果(用时 ${time} 毫秒)","hits":"找到 ${hits} 个搜索结果"},"path":"/search.xml","localsearch":{"enable":true,"trigger":"auto","top_n_per_article":1,"unescape":false,"preload":false}}</script><script src="/js/config.js"></script>
<meta name="description" content="【本文已在同名 微信公众号 / 知乎 / 个人博客linsight.cn 上线】 智谱在《Understanding Emergent Abilities of Language Models from the Loss Perspective》中提出一个观察大模型涌现能力的视角 -- 预训练loss,主要内容是通过一系列实验结果来解释一些关于涌现能力的观察。可以作为一个理解大模型的参考角度,也">
<meta property="og:type" content="article">
<meta property="og:title" content="从loss视角理解大模型涌现能力">
<meta property="og:url" content="https://saicat.github.io/f5fb75e4.html">
<meta property="og:site_name" content="Linsight">
<meta property="og:description" content="【本文已在同名 微信公众号 / 知乎 / 个人博客linsight.cn 上线】 智谱在《Understanding Emergent Abilities of Language Models from the Loss Perspective》中提出一个观察大模型涌现能力的视角 -- 预训练loss,主要内容是通过一系列实验结果来解释一些关于涌现能力的观察。可以作为一个理解大模型的参考角度,也">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="https://saicat.github.io/f5fb75e4/eng_data.png">
<meta property="og:image" content="https://saicat.github.io/f5fb75e4/downstream_dataset.png">
<meta property="og:image" content="https://saicat.github.io/f5fb75e4/downstream_dataset_num.png">
<meta property="og:image" content="https://saicat.github.io/f5fb75e4/exp1_param.png">
<meta property="og:image" content="https://saicat.github.io/f5fb75e4/exp1_plot.png">
<meta property="og:image" content="https://saicat.github.io/f5fb75e4/exp1_compute.png">
<meta property="og:image" content="https://saicat.github.io/f5fb75e4/exp2_param.png">
<meta property="og:image" content="https://saicat.github.io/f5fb75e4/exp2_plot.png">
<meta property="og:image" content="https://saicat.github.io/f5fb75e4/exp3_plot.png">
<meta property="og:image" content="https://saicat.github.io/f5fb75e4/metrics.png">
<meta property="og:image" content="https://saicat.github.io/images/qrcode.jpg">
<meta property="article:published_time" content="2024-06-15T08:13:55.000Z">
<meta property="article:modified_time" content="2024-06-16T09:16:55.156Z">
<meta property="article:author" content="Lin">
<meta property="article:tag" content="NLP">
<meta property="article:tag" content="LLM">
<meta property="article:tag" content="transformer">
<meta property="article:tag" content="涌现能力">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://saicat.github.io/f5fb75e4/eng_data.png">
<link rel="canonical" href="https://saicat.github.io/f5fb75e4.html">
<script class="next-config" data-name="page" type="application/json">{"sidebar":"","isHome":false,"isPost":true,"lang":"zh-CN","comments":true,"permalink":"https://saicat.github.io/f5fb75e4.html","path":"f5fb75e4.html","title":"从loss视角理解大模型涌现能力"}</script>
<script class="next-config" data-name="calendar" type="application/json">""</script>
<title>从loss视角理解大模型涌现能力 | Linsight</title>
<noscript>
<link rel="stylesheet" href="/css/noscript.css">
</noscript>
</head>
<body itemscope itemtype="http://schema.org/WebPage" class="use-motion">
<div class="headband"></div>
<main class="main">
<div class="column">
<header class="header" itemscope itemtype="http://schema.org/WPHeader"><div class="site-brand-container">
<div class="site-nav-toggle">
<div class="toggle" aria-label="切换导航栏" role="button">
<span class="toggle-line"></span>
<span class="toggle-line"></span>
<span class="toggle-line"></span>
</div>
</div>
<div class="site-meta">
<a href="/" class="brand" rel="start">
<i class="logo-line"></i>
<p class="site-title">Linsight</p>
<i class="logo-line"></i>
</a>
<p class="site-subtitle" itemprop="description">聊聊技术,也聊聊其他的</p>
</div>
<div class="site-nav-right">
<div class="toggle popup-trigger" aria-label="搜索" role="button">
<i class="fa fa-search fa-fw fa-lg"></i>
</div>
</div>
</div>
<nav class="site-nav">
<ul class="main-menu menu"><li class="menu-item menu-item-home"><a href="/" rel="section"><i class="fa fa-home fa-fw"></i>首页</a></li><li class="menu-item menu-item-tags"><a href="/tags/" rel="section"><i class="fa fa-tags fa-fw"></i>标签</a></li><li class="menu-item menu-item-categories"><a href="/categories/" rel="section"><i class="fa fa-th fa-fw"></i>分类</a></li><li class="menu-item menu-item-archives"><a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档</a></li>
<li class="menu-item menu-item-search">
<a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>搜索
</a>
</li>
</ul>
</nav>
<div class="search-pop-overlay">
<div class="popup search-popup"><div class="search-header">
<span class="search-icon">
<i class="fa fa-search"></i>
</span>
<div class="search-input-container">
<input autocomplete="off" autocapitalize="off" maxlength="80"
placeholder="搜索..." spellcheck="false"
type="search" class="search-input">
</div>
<span class="popup-btn-close" role="button">
<i class="fa fa-times-circle"></i>
</span>
</div>
<div class="search-result-container no-result">
<div class="search-result-icon">
<i class="fa fa-spinner fa-pulse fa-5x"></i>
</div>
</div>
</div>
</div>
</header>
<aside class="sidebar">
<div class="sidebar-inner sidebar-nav-active sidebar-toc-active">
<ul class="sidebar-nav">
<li class="sidebar-nav-toc">
文章目录
</li>
<li class="sidebar-nav-overview">
站点概览
</li>
</ul>
<div class="sidebar-panel-container">
<!--noindex-->
<div class="post-toc-wrap sidebar-panel">
<div class="post-toc animated"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#%E8%83%8C%E6%99%AF"><span class="nav-number">1.</span> <span class="nav-text">背景</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#pretraining-loss%E5%92%8C%E4%B8%8B%E6%B8%B8%E4%BB%BB%E5%8A%A1%E8%A1%A8%E7%8E%B0%E7%9A%84%E5%85%B3%E7%B3%BB"><span class="nav-number">2.</span> <span class="nav-text">pretraining
loss和下游任务表现的关系</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#%E8%AE%BE%E7%BD%AE"><span class="nav-number">2.1.</span> <span class="nav-text">设置</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%AE%9E%E9%AA%8C%E4%B8%80pretraining-loss-vs.-performance"><span class="nav-number">2.2.</span> <span class="nav-text">实验一:pretraining loss
vs. performance</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E5%AE%9E%E9%AA%8C%E4%BA%8Ctraining-token-count-vs.-performance"><span class="nav-number">2.3.</span> <span class="nav-text">实验二:training
token count vs. performance</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#llamas-loss-vs.-performance"><span class="nav-number">2.4.</span> <span class="nav-text">LLaMA’s loss vs. performance</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E8%BF%9B%E4%B8%80%E6%AD%A5%E5%88%86%E6%9E%90"><span class="nav-number">3.</span> <span class="nav-text">进一步分析</span></a><ol class="nav-child"><li class="nav-item nav-level-2"><a class="nav-link" href="#%E4%B8%8D%E5%90%8C%E4%BB%BB%E5%8A%A1%E7%9A%84%E8%B6%8B%E5%8A%BF"><span class="nav-number">3.1.</span> <span class="nav-text">不同任务的趋势</span></a></li><li class="nav-item nav-level-2"><a class="nav-link" href="#%E8%AF%84%E6%B5%8B%E6%8C%87%E6%A0%87%E7%9A%84%E5%BD%B1%E5%93%8D"><span class="nav-number">3.2.</span> <span class="nav-text">评测指标的影响</span></a></li></ol></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E4%BB%8Eloss%E8%A7%92%E5%BA%A6%E5%AE%9A%E4%B9%89emergent-abilities"><span class="nav-number">4.</span> <span class="nav-text">从loss角度定义emergent
abilities</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E5%B0%8F%E7%BB%93"><span class="nav-number">5.</span> <span class="nav-text">小结</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#reference"><span class="nav-number">6.</span> <span class="nav-text">Reference</span></a></li></ol></div>
</div>
<!--/noindex-->
<div class="site-overview-wrap sidebar-panel">
<div class="site-author animated" itemprop="author" itemscope itemtype="http://schema.org/Person">
<img class="site-author-image" itemprop="image" alt="Lin"
src="/images/avatar/Picasso_Elephant.png">
<p class="site-author-name" itemprop="name">Lin</p>
<div class="site-description" itemprop="description">AI | NLP</div>
</div>
<div class="site-state-wrap animated">
<nav class="site-state">
<div class="site-state-item site-state-posts">
<a href="/archives/">
<span class="site-state-item-count">31</span>
<span class="site-state-item-name">日志</span>
</a>
</div>
<div class="site-state-item site-state-categories">
<a href="/categories/">
<span class="site-state-item-count">3</span>
<span class="site-state-item-name">分类</span></a>
</div>
<div class="site-state-item site-state-tags">
<a href="/tags/">
<span class="site-state-item-count">37</span>
<span class="site-state-item-name">标签</span></a>
</div>
</nav>
</div>
<div class="links-of-author animated">
<span class="links-of-author-item">
<a href="mailto:331603034@qq.com" title="E-Mail → mailto:331603034@qq.com" rel="noopener me" target="_blank"><i class="fa-regular fa-envelope fa-fw"></i>E-Mail</a>
</span>
</div>
<div class="cc-license animated" itemprop="license">
<a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" class="cc-opacity" rel="noopener" target="_blank"><img src="https://cdnjs.cloudflare.com/ajax/libs/creativecommons-vocabulary/2020.11.3/assets/license_badges/small/by_nc_sa.svg" alt="Creative Commons"></a>
</div>
<!--
<script type="text/javascript" src="//rf.revolvermaps.com/0/0/1.js?i=5acfv0hqzp5&s=220&m=1&v=false&r=false&b=000000&n=false&c=ff0000" async="async"></script>
-->
</div>
</div>
</div>
</aside>
</div>
<div class="main-inner post posts-expand">
<div class="post-block">
<article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh-CN">
<link itemprop="mainEntityOfPage" href="https://saicat.github.io/f5fb75e4.html">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="image" content="/images/avatar/Picasso_Elephant.png">
<meta itemprop="name" content="Lin">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Linsight">
<meta itemprop="description" content="AI | NLP">
</span>
<span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
<meta itemprop="name" content="从loss视角理解大模型涌现能力 | Linsight">
<meta itemprop="description" content="">
</span>
<header class="post-header">
<h1 class="post-title" itemprop="name headline">
从loss视角理解大模型涌现能力
</h1>
<div class="post-meta-container">
<div class="post-meta">
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建时间:2024-06-15 16:13:55" itemprop="dateCreated datePublished" datetime="2024-06-15T16:13:55+08:00">2024-06-15</time>
</span>
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar-check"></i>
</span>
<span class="post-meta-item-text">更新于</span>
<time title="修改时间:2024-06-16 17:16:55" itemprop="dateModified" datetime="2024-06-16T17:16:55+08:00">2024-06-16</time>
</span>
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-folder"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/CS/" itemprop="url" rel="index"><span itemprop="name">CS</span></a>
</span>
,
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/CS/NLP/" itemprop="url" rel="index"><span itemprop="name">NLP</span></a>
</span>
,
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/CS/NLP/LLM/" itemprop="url" rel="index"><span itemprop="name">LLM</span></a>
</span>
</span>
<span class="post-meta-break"></span>
<span class="post-meta-item" title="本文字数">
<span class="post-meta-item-icon">
<i class="far fa-file-word"></i>
</span>
<span class="post-meta-item-text">本文字数:</span>
<span>5.4k</span>
</span>
<span class="post-meta-item" title="阅读时长">
<span class="post-meta-item-icon">
<i class="far fa-clock"></i>
</span>
<span class="post-meta-item-text">阅读时长 ≈</span>
<span>10 分钟</span>
</span>
</div>
</div>
</header>
<div class="post-body" itemprop="articleBody"><p>【本文已在同名 微信公众号 / 知乎 / <a target="_blank" rel="noopener" href="http://www.linsight.cn/">个人博客linsight.cn</a> 上线】</p>
<hr>
<p>智谱在《Understanding Emergent Abilities of Language Models from the
Loss Perspective》中提出一个观察大模型涌现能力的视角 --
预训练loss,主要内容是通过一系列实验结果来解释一些关于涌现能力的观察。可以作为一个理解大模型的参考角度,也可以用于指导预训练模型的开发和优化。</p>
<h1 id="背景">背景</h1>
<p>《Emergent abilities of large language models》把emergent
ability定义为在大规模模型中有,而在参数量较小的模型没有的能力。</p>
<p>这个看法现在受到一些挑战:<br>
-
目前很多在更大规模数据集训练出来的小模型,展现出比之前大规模模型更强的能力,比如LLaMA3在大部分评测指标上就比GPT-3强,很多以前千亿模型才能做到的任务,现在百亿甚至十亿的模型也能做好。<br>
- 《Are emergent abilities of large language models a
mirage?》认为涌现能力现象是由评测指标的非线性和不连续性带来的,如果使用更细粒度的连续指标,就能观察到指标的平滑提升。</p>
<p>而《Training compute-optimal large language
models》指出,相同的计算量下,不同的模型规模和数据量的组合会产生不同的效果。这说明单纯的模型规模或者数据规模并不是一个好的下游任务能力的indicator,预训练loss才是更合适的指标。</p>
<p>但是训练loss和下游任务表现具体是什么关系却还没有确定的说法,智谱针对这个问题做了一些预训练实验,并从预训练loss角度定义了emergent
ability。</p>
<h1 id="pretraining-loss和下游任务表现的关系">pretraining
loss和下游任务表现的关系</h1>
<h2 id="设置">设置</h2>
<p>后续所有预训练实验使用相同的模型结构和同一份预训练数据(但是训练数据量可能有区别),一些通用设置如下:<br>
- 分词用BPE<br>
- 模型结构在LLaMA基础上,全部使用GQA,而RoPE只在一半的Q/K上应用<br>
- 使用AdamW优化器,<span class="math inline">\(\beta_1=0.9\)</span>,<span class="math inline">\(\beta_2=0.95\)</span><br>
- 训练窗口长度为2048<br>
-
所有模型都在中英文比例为1:4的预训练数据集上训练,英文数据集分布如下</p>
<img src="/f5fb75e4/eng_data.png" class alt="英文数据" title="英文数据">
<p>所有模型都是从零开始预训练。</p>
<p>评测模型的下游任务共有6类12个数据集,具体信息如下</p>
<img src="/f5fb75e4/downstream_dataset.png" class alt="下游任务" title="下游任务">
<img src="/f5fb75e4/downstream_dataset_num.png" class alt="下游任务" title="下游任务">
<h2 id="实验一pretraining-loss-vs.-performance">实验一:pretraining loss
vs. performance</h2>
<p>第一个实验训练了3个规模的模型:1.5B、6B、32B,训练数据量分别为3T、3T、2.5T。具体设置如下</p>
<img src="/f5fb75e4/exp1_param.png" class alt="实验设置" title="实验设置">
<p>大约每训练43B
token就会保存一次checkpoint。把3个模型所有checkpoint下,对应的预训练loss和下游任务评测结果画出来,如下所示</p>
<img src="/f5fb75e4/exp1_plot.png" class alt="loss vs. performance" title="loss vs. performance">
<p>从上图可以观察到3个现象:<br>
-
无论模型规模如何,下游任务评测结果都随着预训练loss的降低而提升。从提升的具体情况可以分成两类,这个后面部分再分析。<br>
-
各个规模的模型所画出的点都落在了同一条曲线上,这说明下游任务的评测结果和预训练loss高度相关,而和模型规模没有直接关系。这点很重要。<br>
-
预训练loss对下游任务指标的表征能力同时适用于中英文,这说明中英文token在多语言预训练中具有相似的learning
dynamics。</p>
<p>而把计算量和下游任务指标的关系画出来,则有如下结果</p>
<img src="/f5fb75e4/exp1_compute.png" class alt="下游任务效果和预训练计算量的关系" title="下游任务效果和预训练计算量的关系">
<p>可以看到各个规模的模型所画出的点并没有落在同一条曲线上,这说明计算量并不是表征下游任务效果的好指标。</p>
<h2 id="实验二training-token-count-vs.-performance">实验二:training
token count vs. performance</h2>
<p>第二个实验使用了不同的数据量训练了28个小一些的模型,具体设置如下</p>
<img src="/f5fb75e4/exp2_param.png" class alt="实验设置" title="实验设置">
<p>第一个实验中,每个规模的模型设置了一个固定的训练token数,然后取中间checkpoint进行评测。第二个实验是对每个规模的模型设置了多个不同的总训练token数。二者的区别在于,预训练的最后阶段会逐渐把学习率decay到最小值,而这样的学习率退火策略对效果有很大的影响。</p>
<p>取28个模型的最终checkpoint,画出对应的预训练loss和下游任务评测结果如下</p>
<img src="/f5fb75e4/exp2_plot.png" class alt="token count vs. performance" title="token count vs. performance">
<p>结果和实验一类似,各个模型的点都落在了同一条曲线上。说明无论模型规模和训练量如何,只要loss相同,在下游任务上就有相同的表现。</p>
<p>由于这28个模型相比实验一的较小,在图中最后一排的任务上效果都接近于随机。这个现象后续分析。</p>
<h2 id="llamas-loss-vs.-performance">LLaMA’s loss vs. performance</h2>
<p>实验一和二是在从零开始训练的模型上评测的,这里用LLaMA来验证前面得到的结论。</p>
<p>由于LLaMA没有放出中间checkpoint,这里直接从LLaMA的报告里抽出相应的数据点,在6个下游任务上的结果如下图</p>
<img src="/f5fb75e4/exp3_plot.png" class alt="loss vs. performance" title="loss vs. performance">
<p>可以看到基本上各个模型的点也是落在同一条曲线上。LLaMA和实验一实验二的训练框架、模型结构、训练数据都有所不同,但是也有相同的结论,说明这样的结论是具有普遍性的。</p>
<blockquote>
<p>pre-training loss is a good indicator of LMs’ performance on
downstream tasks, independent of model sizes, training tokens,
languages, and pretraining frameworks</p>
</blockquote>
<h1 id="进一步分析">进一步分析</h1>
<h2 id="不同任务的趋势">不同任务的趋势</h2>
<p>12个下游任务可以分为2类:<br>
- 第一类:TriviaQA, HellaSwag, RACE, WinoGrande, NLPCC-KBQA, ClozeT,
CLUEWSC, C3。这些任务的效果随着预训练loss的下降,平滑上升。<br>
- 第二类:MMLU, C-Eval, GSM8K,
GSM8K-Chinese。这些任务上,只有当预训练loss低于一定阈值,评测结果才开始提升。可以观察到,在实验一实验二的配置下,大概在预训练loss小于2.2这个阈值之后,下游任务表现开始提升。整体来说,第二类任务难度是大于第一类的。所以虽然第一类中有些任务的prompt或者形式与第二类中的任务有些相似,但是依然有不同的表现。</p>
<p>第二类任务这个现象和《Grokking: Generalization beyond overfitting on
small algorithmic datasets》提出的grokking有关联。</p>
<p>grokking描述了下游任务的效果从随机水平(乱猜)提高到perfect
generalization的improvement。这种improvement只有在过拟合到一定程度才会发生。在预训练中,模型整体上通常是欠拟合的。不过由于预训练语料库是不同文档的混合,因此模型可能在某些能力上过拟合(比如数值计算的能力,情感分析的能力),而在整体上依然欠拟合。</p>
<p>当然第二类任务这个现象也和emergent ability有关联。按scaling
law的说法,在训练token数固定的情况下,预训练loss与模型规模呈幂律关系。也就是说,模型大小和预训练损失之间存在单调关系。对于第二类任务,存在一个与预训练loss中的临界点相对应的模型规模阈值。当模型大小超过这个阈值时,模型就可以展现出超过随机猜测的能力。</p>
<h2 id="评测指标的影响">评测指标的影响</h2>
<p>前面提到,emergent
ability这个现象有可能是因为评测指标的非线性和不连续性带来的。比如
MMLU这样的多项选择题,打分结果只能是0分或者满分。</p>
<p>现在把这个评测指标换成两个连续的指标:<br>
- 一个是probability of the correct answer (CorrectChoiceProb)<br>
- 第二个是《Are emergent abilities of large language models a
mirage?》中提出的Brier Score:</p>
<p><span class="math display">\[\text{BrierScore}=\frac1N\sum_{i=1}^N\sum_{j=1}^C(y_{ij}-\hat{y}_{ij})^2\]</span></p>
<p>N是样本数,C是类别数。</p>
<p>把MMLU和C-Eval在这两个新指标上的评测结果画出来,如下所示</p>
<img src="/f5fb75e4/metrics.png" class alt="指标" title="指标">
<p>可以发现涌现能力的现象依然存在。</p>
<p>值得注意的是,Brier Score的下降并不总是表示下游任务效果的提升。</p>
<p>比如对于有A/B/C/D四个选项的多项选择题任务,假设正确答案的分布是均匀的。现在有两个模型,一个总是预测A,即(1,0,0,0),另一个总是给出平均分布的预测,即(0.25,0.25,0.25,0.25)。</p>
<p>那么前者的Brier
Score是1.5,而后者是0.75,但这并不能说明后者就更好。对于这个任务,实际上高于0.75的Brier
Score都说明比随机猜测差。而低于随机猜测的指标变化并不能当做真正的提升,比如Brier
Score从1.5下降到1.0并不能算作提升。</p>
<p>另外《Training trajectories of language models across
scales》提出用perplexity of correct
options来作为评测,可以看到平滑的提升。但perplexity of correct
options其实不能作为一个合适的指标。</p>
<p>比如对于多项选择题,区分各个答案的能力才是我们想要的。而随着预训练进行,正确答案和错误答案的perplexity都在下降,只有当训练到二者的perplexity差异开始变大的时候,才能算是有提升。因此单纯的正确答案perplexity下降并不能作为能力提升的指标,因为错误答案的perplexity可能下降更多。</p>
<h1 id="从loss角度定义emergent-abilities">从loss角度定义emergent
abilities</h1>
<p>基于前面的实验和分析,现在从预训练loss角度重新定义emergent
ability:</p>
<blockquote>
<p>Definition. An ability is emergent if it is not present in models
with higher pre-training loss but is present in models with lower
pre-training loss.</p>
</blockquote>
<p>一个emergent ability的normalized
performance(比如多项选择题随机猜测的得分是0.25分,那这个任务原始的0.25分在normalized
performance下就是0分)是预训练loss <span class="math inline">\(L\)</span> 的函数</p>
<p><span class="math display">\[\begin{cases}f(L)&\mathrm{if~}L<\eta\\0&\mathrm{otherwise}&\end{cases}\]</span></p>
<p>其中f是一个单调递减函数,<span class="math inline">\(\eta\)</span>
是阈值。</p>
<p>《Scaling laws for autoregressive generative
modeling》中提出,在固定的训练token数 <span class="math inline">\(D\)</span> 下,模型规模 <span class="math inline">\(N\)</span> 和预训练损失的关系是</p>
<p><span class="math display">\[L(N)=L_\infty+\left(\frac{N_0}N\right)^{\alpha_N}\]</span></p>
<p>其中 <span class="math inline">\(L_{\infty}\)</span> 是irreducible
loss,<span class="math inline">\(\alpha_{N}\)</span> 是固定的系数。</p>
<p>把上面两个式子结合起来,就有</p>
<p><span class="math display">\[\begin{cases}f\left(L_\infty+\left(\frac{N_0}N\right)^{\alpha_N}\right)&\text{if
}N\geq
N_0\cdot\left(\eta-L_\infty\right)^{-\frac1{\alpha_N}}\\0&\text{otherwise}&\end{cases}\]</span></p>
<p>当模型规模小于 <span class="math inline">\(N_0\cdot(\eta-L_\infty)^{-1/\alpha_N}\)</span>
这个阈值时,normalized
performance为0;当模型规模超过这个阈值时,模型规模的增长带来了预训练loss的下降,从而带来了normalized
performance的提升。</p>
<h1 id="小结">小结</h1>
<p>通过预训练loss来预测下游任务的提升,这点用在预训练模型的分析和优化上还是有些帮助的。比如在loss较高的时候,在下游任务上的效果的变化可能更多是随机波动而不是真正的提升。</p>
<p>不过文中只对一个model
family做了实验,而loss和模型结构,词表等都有关系,因此还需要进一步的探索。</p>
<hr>
<p>读到这了,来一发点赞收藏关注吧~</p>
<p>博客:<a target="_blank" rel="noopener" href="http://www.linsight.cn/">http://www.linsight.cn/</a><br>
知乎:<a target="_blank" rel="noopener" href="https://www.zhihu.com/people/us4ever">Linsight</a><br>
微信公众号:Linsight<br>
<img src="/images/qrcode.jpg" alt="微信公众号Linsight二维码"></p>
<hr>
<p>【往期文章】</p>
<p><a target="_blank" rel="noopener" href="http://www.linsight.cn/44e38c1b.html">MoE模型的前世今生</a><br>
<a target="_blank" rel="noopener" href="https://www.linsight.cn/1d5bcd45.html">昆仑万维-SkyworkMoE</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/c4da56c0.html">LLM长上下文的问题</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/cc852861.html">解锁大模型长上下文能力</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/45ee1a6d.html">大模型推理窗口-从有限到无限大</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/3dc22f96.html">理解Attention:从起源到MHA,MQA和GQA</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/f5c015c.html">大模型推理加速-投机解码</a><br>
<a target="_blank" rel="noopener" href="https://www.linsight.cn/7bbe2df6.html">大模型推理加速-MEDUSA</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/473f2b43.html">大模型偏好对齐-DPO</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/da871ebe.html">大模型偏好对齐-ODPO</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/280fa97a.html">大模型偏好对齐-simPO</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/4fe7b810.html">大模型偏好对齐-IPO</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/41b6a819.html">Yi技术报告-划重点看细节</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/6a40bfa5.html">transformer中normalization的二三事</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/b70b4a2d.html">从代码实现看normalization-到底做了什么</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/c61d17e3.html">稀疏注意力计算:sliding
window attention</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/a051710f.html">理解LLM位置编码:RoPE</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/3345028a.html">大模型算法题(1)</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/ad0bba9d.html">大模型算法题(2)</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/1736008.html">大模型算法题(3)</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/1736008.html">大模型算法题(4)</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/336f2f3e.html">大模型算法题(5)</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/7c04944d.html">大模型算法题(6)</a><br>
<a target="_blank" rel="noopener" href="https://www.linsight.cn/dd614e12.html">大模型算法题(7)</a></p>
<hr>
<h1 id="reference">Reference</h1>
<p>【1】Understanding Emergent Abilities of Language Models from the
Loss Perspective <a target="_blank" rel="noopener" href="https://arxiv.org/abs/2403.15796">https://arxiv.org/abs/2403.15796</a></p>
</div>
<footer class="post-footer">
<div class="post-copyright">
<ul>
<li class="post-copyright-author">
<strong>本文作者: </strong>Lin
</li>
<li class="post-copyright-link">
<strong>本文链接:</strong>
<a href="https://saicat.github.io/f5fb75e4.html" title="从loss视角理解大模型涌现能力">https://saicat.github.io/f5fb75e4.html</a>
</li>
<li class="post-copyright-license">
<strong>版权声明: </strong>本博客所有文章除特别声明外,均采用 <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" rel="noopener" target="_blank"><i class="fab fa-fw fa-creative-commons"></i>BY-NC-SA</a> 许可协议。转载请注明出处!
</li>
</ul>
</div>
<div class="post-tags">
<a href="/tags/NLP/" rel="tag"><i class="fa fa-tag"></i> NLP</a>
<a href="/tags/LLM/" rel="tag"><i class="fa fa-tag"></i> LLM</a>
<a href="/tags/transformer/" rel="tag"><i class="fa fa-tag"></i> transformer</a>
<a href="/tags/%E6%B6%8C%E7%8E%B0%E8%83%BD%E5%8A%9B/" rel="tag"><i class="fa fa-tag"></i> 涌现能力</a>
</div>
<div class="post-nav">
<div class="post-nav-item">
<a href="/dd614e12.html" rel="prev" title="大模型算法题(7)">
<i class="fa fa-angle-left"></i> 大模型算法题(7)
</a>
</div>
<div class="post-nav-item">
<a href="/7381cae3.html" rel="next" title="LLM的重复生成和ICL">
LLM的重复生成和ICL <i class="fa fa-angle-right"></i>
</a>
</div>
</div>
</footer>
</article>
</div>
<div class="comments utterances-container"></div>
</div>
</main>
<footer class="footer">
<div class="footer-inner">
<div class="copyright">
©
<span itemprop="copyrightYear">2024</span>
<span class="with-love">
<i class="fa fa-heart"></i>
</span>
<span class="author" itemprop="copyrightHolder">Lin</span>
</div>
<div class="wordcount">
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="fa fa-chart-line"></i>
</span>
<span title="站点总字数">242k</span>
</span>
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="fa fa-coffee"></i>
</span>
<span title="站点阅读时长">7:20</span>
</span>
</div>
<div class="busuanzi-count">
</div>
<!--
-->
<!-- 网站运行时间的设置 -->
<span id="timeDate">载入天数...</span>
<span id="times">载入时分秒...</span>
<script>
// Site uptime counter: fills the #timeDate and #times elements with the time
// elapsed since the site went online.
var now = new Date();

/**
 * Recompute the time elapsed since the launch date and write it into the
 * #timeDate (days) and #times (hours / minutes / seconds) elements.
 * An element that is not present in the page is skipped.
 */
function createtime() {
  // Edit this to your site's creation / go-live time (local time zone).
  // Numeric components replace the string "03/01/2023 10:00:00" because
  // parsing non-ISO date strings is implementation-defined.
  // The month is zero-based: 2 = March.
  var grt = new Date(2023, 2, 1, 10, 0, 0);

  // Read the real clock on every tick; adding a fixed 250 ms per call drifts
  // whenever the browser delays or throttles the timer.
  now.setTime(Date.now());

  // Work in whole seconds so every field is floored consistently; rounding
  // the seconds field on its own could display "60".
  var elapsed = Math.max(0, Math.floor((now - grt) / 1000));

  var dnum = Math.floor(elapsed / 86400);
  var hnum = pad2(Math.floor((elapsed % 86400) / 3600));
  var mnum = pad2(Math.floor((elapsed % 3600) / 60));
  var snum = pad2(elapsed % 60);

  var dateEl = document.getElementById("timeDate");
  var timeEl = document.getElementById("times");
  if (dateEl) {
    dateEl.innerHTML = "本站已安全运行 " + dnum + " 天 ";
  }
  if (timeEl) {
    timeEl.innerHTML = hnum + " 小时 " + mnum + " 分 " + snum + " 秒.";
  }

  // Left-pad a single-digit value with "0" so fields are always two digits.
  function pad2(value) {
    var text = String(value);
    return text.length === 1 ? "0" + text : text;
  }
}

// Render once right away, then refresh every 250 ms. A function reference is
// passed instead of a code string so no implicit eval is involved.
createtime();
setInterval(createtime, 250);
</script>
</div>
</footer>
<div class="back-to-top" role="button" aria-label="返回顶部">
<i class="fa fa-arrow-up fa-lg"></i>
<span>0%</span>
</div>
<noscript>
<div class="noscript-warning">Theme NexT works best with JavaScript enabled</div>
</noscript>
<script src="https://cdnjs.cloudflare.com/ajax/libs/animejs/3.2.1/anime.min.js" integrity="sha256-XL2inqUJaslATFnHdJOi9GfQ60on8Wx1C2H8DYiN1xY=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/fancyapps-ui/5.0.28/fancybox/fancybox.umd.js" integrity="sha256-ytMJGN3toR+a84u7g7NuHm91VIR06Q41kMWDr2pq7Zo=" crossorigin="anonymous"></script>
<script src="/js/comments.js"></script><script src="/js/utils.js"></script><script src="/js/motion.js"></script><script src="/js/next-boot.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/hexo-generator-searchdb/1.4.1/search.js" integrity="sha256-1kfA5uHPf65M5cphT2dvymhkuyHPQp5A53EGZOnOLmc=" crossorigin="anonymous"></script>
<script src="/js/third-party/search/local-search.js"></script>
<script src="/js/third-party/fancybox.js"></script>
<script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>
<script class="next-config" data-name="enableMath" type="application/json">true</script><script class="next-config" data-name="mathjax" type="application/json">{"enable":true,"tags":"ams","js":{"url":"https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.2.2/es5/tex-mml-chtml.js","integrity":"sha256-MASABpB4tYktI2Oitl4t+78w/lyA+D7b/s9GEP0JOGI="}}</script>
<script src="/js/third-party/math/mathjax.js"></script>
<script class="next-config" data-name="utterances" type="application/json">{"enable":true,"repo":"Saicat/comment-utterance","issue_term":"pathname","theme":"github-light"}</script>
<script src="/js/third-party/comments/utterances.js"></script>
</body>
</html>