-
Notifications
You must be signed in to change notification settings - Fork 0
/
4fe7b810.html
557 lines (442 loc) · 27.3 KB
/
4fe7b810.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width">
<meta name="theme-color" content="#222" media="(prefers-color-scheme: light)">
<meta name="theme-color" content="#222" media="(prefers-color-scheme: dark)"><meta name="generator" content="Hexo 7.1.1">
<link rel="apple-touch-icon" sizes="180x180" href="/images/favicon/favicon_io/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="/images/favicon/favicon_io/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="/images/favicon/favicon_io/favicon-16x16.png">
<link rel="stylesheet" href="/css/main.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.5.0/css/all.min.css" integrity="sha256-yIDrPSXHZdOZhAqiBP7CKzIwMQmRCJ8UeB8Jo17YC4o=" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/animate.css/3.1.1/animate.min.css" integrity="sha256-PR7ttpcvz8qrF57fur/yAx1qXMFJeJFiA6pSzWi0OIE=" crossorigin="anonymous">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/fancyapps-ui/5.0.28/fancybox/fancybox.css" integrity="sha256-6cQIC71/iBIYXFK+0RHAvwmjwWzkWd+r7v/BX3/vZDc=" crossorigin="anonymous">
<script class="next-config" data-name="main" type="application/json">{"hostname":"saicat.github.io","root":"/","images":"/images","scheme":"Gemini","darkmode":true,"version":"8.19.1","exturl":false,"sidebar":{"position":"left","display":"post","padding":18,"offset":12},"copycode":{"enable":false,"style":null},"fold":{"enable":false,"height":500},"bookmark":{"enable":false,"color":"#222","save":"auto"},"mediumzoom":false,"lazyload":false,"pangu":false,"comments":{"style":"tabs","active":null,"storage":true,"lazyload":false,"nav":null},"stickytabs":false,"motion":{"enable":true,"async":false,"transition":{"menu_item":"fadeInDown","post_block":"fadeIn","post_header":"fadeInDown","post_body":"fadeInDown","coll_header":"fadeInLeft","sidebar":"fadeInUp"}},"i18n":{"placeholder":"搜索...","empty":"没有找到任何搜索结果:${query}","hits_time":"找到 ${hits} 个搜索结果(用时 ${time} 毫秒)","hits":"找到 ${hits} 个搜索结果"},"path":"/search.xml","localsearch":{"enable":true,"trigger":"auto","top_n_per_article":1,"unescape":false,"preload":false}}</script><script src="/js/config.js"></script>
<meta name="description" content="【本文已在同名 微信公众号 / 知乎 / 个人博客linsight.cn 上线】 前面我们对DPO、ODPO、simPO的思路做了整理:大模型偏好对齐-DPO,大模型偏好对齐-ODPO,大模型偏好对齐-simPO。">
<meta property="og:type" content="article">
<meta property="og:title" content="大模型偏好对齐-IPO">
<meta property="og:url" content="https://saicat.github.io/4fe7b810.html">
<meta property="og:site_name" content="Linsight">
<meta property="og:description" content="【本文已在同名 微信公众号 / 知乎 / 个人博客linsight.cn 上线】 前面我们对DPO、ODPO、simPO的思路做了整理:大模型偏好对齐-DPO,大模型偏好对齐-ODPO,大模型偏好对齐-simPO。">
<meta property="og:locale" content="zh_CN">
<meta property="og:image" content="https://saicat.github.io/4fe7b810/curve.png">
<meta property="og:image" content="https://saicat.github.io/images/qrcode.jpg">
<meta property="article:published_time" content="2024-06-02T03:58:52.000Z">
<meta property="article:modified_time" content="2024-06-06T14:52:00.000Z">
<meta property="article:author" content="Lin">
<meta property="article:tag" content="NLP">
<meta property="article:tag" content="LLM">
<meta property="article:tag" content="transformer">
<meta property="article:tag" content="强化学习">
<meta property="article:tag" content="微调">
<meta property="article:tag" content="SFT">
<meta property="article:tag" content="偏好对齐">
<meta name="twitter:card" content="summary">
<meta name="twitter:image" content="https://saicat.github.io/4fe7b810/curve.png">
<link rel="canonical" href="https://saicat.github.io/4fe7b810.html">
<script class="next-config" data-name="page" type="application/json">{"sidebar":"","isHome":false,"isPost":true,"lang":"zh-CN","comments":true,"permalink":"https://saicat.github.io/4fe7b810.html","path":"4fe7b810.html","title":"大模型偏好对齐-IPO"}</script>
<script class="next-config" data-name="calendar" type="application/json">""</script>
<title>大模型偏好对齐-IPO | Linsight</title>
<noscript>
<link rel="stylesheet" href="/css/noscript.css">
</noscript>
</head>
<body itemscope itemtype="http://schema.org/WebPage" class="use-motion">
<div class="headband"></div>
<main class="main">
<div class="column">
<header class="header" itemscope itemtype="http://schema.org/WPHeader"><div class="site-brand-container">
<div class="site-nav-toggle">
<div class="toggle" aria-label="切换导航栏" role="button">
<span class="toggle-line"></span>
<span class="toggle-line"></span>
<span class="toggle-line"></span>
</div>
</div>
<div class="site-meta">
<a href="/" class="brand" rel="start">
<i class="logo-line"></i>
<p class="site-title">Linsight</p>
<i class="logo-line"></i>
</a>
<p class="site-subtitle" itemprop="description">聊聊技术,也聊聊其他的</p>
</div>
<div class="site-nav-right">
<div class="toggle popup-trigger" aria-label="搜索" role="button">
<i class="fa fa-search fa-fw fa-lg"></i>
</div>
</div>
</div>
<nav class="site-nav">
<ul class="main-menu menu"><li class="menu-item menu-item-home"><a href="/" rel="section"><i class="fa fa-home fa-fw"></i>首页</a></li><li class="menu-item menu-item-tags"><a href="/tags/" rel="section"><i class="fa fa-tags fa-fw"></i>标签</a></li><li class="menu-item menu-item-categories"><a href="/categories/" rel="section"><i class="fa fa-th fa-fw"></i>分类</a></li><li class="menu-item menu-item-archives"><a href="/archives/" rel="section"><i class="fa fa-archive fa-fw"></i>归档</a></li>
<li class="menu-item menu-item-search">
<a role="button" class="popup-trigger"><i class="fa fa-search fa-fw"></i>搜索
</a>
</li>
</ul>
</nav>
<div class="search-pop-overlay">
<div class="popup search-popup"><div class="search-header">
<span class="search-icon">
<i class="fa fa-search"></i>
</span>
<div class="search-input-container">
<input autocomplete="off" autocapitalize="off" maxlength="80"
placeholder="搜索..." spellcheck="false"
type="search" class="search-input">
</div>
<span class="popup-btn-close" role="button">
<i class="fa fa-times-circle"></i>
</span>
</div>
<div class="search-result-container no-result">
<div class="search-result-icon">
<i class="fa fa-spinner fa-pulse fa-5x"></i>
</div>
</div>
</div>
</div>
</header>
<aside class="sidebar">
<div class="sidebar-inner sidebar-nav-active sidebar-toc-active">
<ul class="sidebar-nav">
<li class="sidebar-nav-toc">
文章目录
</li>
<li class="sidebar-nav-overview">
站点概览
</li>
</ul>
<div class="sidebar-panel-container">
<!--noindex-->
<div class="post-toc-wrap sidebar-panel">
<div class="post-toc animated"><ol class="nav"><li class="nav-item nav-level-1"><a class="nav-link" href="#%CF%88po"><span class="nav-number">1.</span> <span class="nav-text">ΨPO</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#ipo"><span class="nav-number">2.</span> <span class="nav-text">IPO</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#%E5%B0%8F%E7%BB%93"><span class="nav-number">3.</span> <span class="nav-text">小结</span></a></li><li class="nav-item nav-level-1"><a class="nav-link" href="#reference"><span class="nav-number">4.</span> <span class="nav-text">Reference</span></a></li></ol></div>
</div>
<!--/noindex-->
<div class="site-overview-wrap sidebar-panel">
<div class="site-author animated" itemprop="author" itemscope itemtype="http://schema.org/Person">
<img class="site-author-image" itemprop="image" alt="Lin"
src="/images/avatar/Picasso_Elephant.png">
<p class="site-author-name" itemprop="name">Lin</p>
<div class="site-description" itemprop="description">AI | NLP</div>
</div>
<div class="site-state-wrap animated">
<nav class="site-state">
<div class="site-state-item site-state-posts">
<a href="/archives/">
<span class="site-state-item-count">62</span>
<span class="site-state-item-name">日志</span>
</a>
</div>
<div class="site-state-item site-state-categories">
<a href="/categories/">
<span class="site-state-item-count">5</span>
<span class="site-state-item-name">分类</span></a>
</div>
<div class="site-state-item site-state-tags">
<a href="/tags/">
<span class="site-state-item-count">66</span>
<span class="site-state-item-name">标签</span></a>
</div>
</nav>
</div>
<div class="links-of-author animated">
<span class="links-of-author-item">
<a href="mailto:331603034@qq.com" title="E-Mail → mailto:331603034@qq.com" rel="noopener me" target="_blank"><i class="fa-regular fa-envelope fa-fw"></i>E-Mail</a>
</span>
</div>
<div class="cc-license animated" itemprop="license">
<a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" class="cc-opacity" rel="noopener" target="_blank"><img src="https://cdnjs.cloudflare.com/ajax/libs/creativecommons-vocabulary/2020.11.3/assets/license_badges/small/by_nc_sa.svg" alt="Creative Commons"></a>
</div>
<!--
<script type="text/javascript" src="//rf.revolvermaps.com/0/0/1.js?i=5acfv0hqzp5&s=220&m=1&v=false&r=false&b=000000&n=false&c=ff0000" async="async"></script>
-->
</div>
</div>
</div>
</aside>
</div>
<div class="main-inner post posts-expand">
<div class="post-block">
<article itemscope itemtype="http://schema.org/Article" class="post-content" lang="zh-CN">
<link itemprop="mainEntityOfPage" href="https://saicat.github.io/4fe7b810.html">
<span hidden itemprop="author" itemscope itemtype="http://schema.org/Person">
<meta itemprop="image" content="/images/avatar/Picasso_Elephant.png">
<meta itemprop="name" content="Lin">
</span>
<span hidden itemprop="publisher" itemscope itemtype="http://schema.org/Organization">
<meta itemprop="name" content="Linsight">
<meta itemprop="description" content="AI | NLP">
</span>
<span hidden itemprop="post" itemscope itemtype="http://schema.org/CreativeWork">
<meta itemprop="name" content="大模型偏好对齐-IPO | Linsight">
<meta itemprop="description" content="">
</span>
<header class="post-header">
<h1 class="post-title" itemprop="name headline">
大模型偏好对齐-IPO
</h1>
<div class="post-meta-container">
<div class="post-meta">
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar"></i>
</span>
<span class="post-meta-item-text">发表于</span>
<time title="创建时间:2024-06-02 11:58:52" itemprop="dateCreated datePublished" datetime="2024-06-02T11:58:52+08:00">2024-06-02</time>
</span>
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-calendar-check"></i>
</span>
<span class="post-meta-item-text">更新于</span>
<time title="修改时间:2024-06-06 22:52:00" itemprop="dateModified" datetime="2024-06-06T22:52:00+08:00">2024-06-06</time>
</span>
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="far fa-folder"></i>
</span>
<span class="post-meta-item-text">分类于</span>
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/CS/" itemprop="url" rel="index"><span itemprop="name">CS</span></a>
</span>
,
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/CS/NLP/" itemprop="url" rel="index"><span itemprop="name">NLP</span></a>
</span>
,
<span itemprop="about" itemscope itemtype="http://schema.org/Thing">
<a href="/categories/CS/NLP/LLM/" itemprop="url" rel="index"><span itemprop="name">LLM</span></a>
</span>
</span>
<span class="post-meta-break"></span>
<span class="post-meta-item" title="本文字数">
<span class="post-meta-item-icon">
<i class="far fa-file-word"></i>
</span>
<span class="post-meta-item-text">本文字数:</span>
<span>3.5k</span>
</span>
<span class="post-meta-item" title="阅读时长">
<span class="post-meta-item-icon">
<i class="far fa-clock"></i>
</span>
<span class="post-meta-item-text">阅读时长 ≈</span>
<span>6 分钟</span>
</span>
</div>
</div>
</header>
<div class="post-body" itemprop="articleBody"><p>【本文已在同名 微信公众号 / 知乎 / <a target="_blank" rel="noopener" href="http://www.linsight.cn/">个人博客linsight.cn</a> 上线】</p>
<hr>
<p>前面我们对DPO、ODPO、simPO的思路做了整理:<a target="_blank" rel="noopener" href="http://www.linsight.cn/473f2b43.html">大模型偏好对齐-DPO</a>,<a target="_blank" rel="noopener" href="http://www.linsight.cn/da871ebe.html">大模型偏好对齐-ODPO</a>,<a target="_blank" rel="noopener" href="http://www.linsight.cn/280fa97a.html">大模型偏好对齐-simPO</a>。</p>
<p>而《A General Theoretical Paradigm to Understand Learning from Human
Preferences》提出了可以将RLHF和DPO的目标函数视为其中一个特例的更general的目标函数ΨPO,并对ΨPO的一些问题进行了分析,最终设计了Identity-PO
(IPO)来绕过这些问题。</p>
<h1 id="ψpo">ΨPO</h1>
<p>回顾一下RLHF,它的目标函数是</p>
<p><span class="math display">\[\mathbb{E}_\pi[r(x,y)]-\beta
D_{\text{KL}}(\pi\mid\mid\pi_{\text{ref}})\]</span></p>
<p>而DPO从等价的目标函数推导出DPO的损失函数如下</p>
<p><span class="math display">\[\begin{aligned}\min_{\pi}\mathbb{E}_{(x,y_w,y_l)\sim\mathcal{D}}\Bigg[-\log\sigma\Bigg(\beta\log\Bigg(\frac{\pi(y_w|x)}{\pi(y_l|x)}\Bigg)-\beta\log\Bigg(\frac{\pi_{\mathrm{ref}}(y_w|x)}{\pi_{\mathrm{ref}}(y_l|x)}\Bigg)\Bigg)\Bigg]\end{aligned}\]</span></p>
<p>IPO这篇论文则提出一个general的目标函数。考虑一个对preference
probability进行非线性变换的non-decreasing function Ψ</p>
<p><span class="math display">\[\Psi:\begin{bmatrix}0,1\end{bmatrix}\to\mathbb{R}\]</span></p>
<p>Ψ-preference optimisation objective定义为</p>
<p><span class="math display">\[\max_\pi\quad\mathbb{E}_{x\thicksim\rho,\,y\thicksim\pi(\cdot|x),\,y'\thicksim\mu(\cdot|x)}\quad[\Psi(p^*(y\succ
y'|x))]-\beta
D_{\mathrm{KL}}(\pi\mid\mid\pi_{\mathrm{ref}})\]</span></p>
<p>如果我们给Ψ一个具体定义,如下式</p>
<p><span class="math display">\[\Psi(q)=\log(q/(1-q))\]</span></p>
<p>那么在Bradley-Terry model的假设下,我们有</p>
<p><span class="math display">\[\begin{aligned}
\mathbb{E}_{y'\thicksim\mu}[\Psi(p^*(y\succ y'))]&
=\underset{y'\thicksim\mu}{\operatorname*{\mathbb{E}}}\left[\Psi\left(\frac{e^{r(y)}}{e^{r(y)}+e^{r(y')}}\right)\right] \\
&=\mathbb{E}_{y^{\prime}\thicksim\mu}[\log(e^{r(y)}/e^{r(y^{\prime})})]
\\
&=\mathbb{E}_{y'\thicksim\mu}[r(y)-r(y')] \\
&=r(y)-\underset{y'\thicksim\mu}{\mathbb{E}}[r(y')]
\end{aligned}\]</span></p>
<p>右边最终结果里的第二项可视为常数。除去这个常数,ΨPO的优化目标和RLHF的优化目标是等价的,同时也就和DPO的目标是等价的。</p>
<p>同DPO的做法一样,这里我们可以推出ΨPO在Bradley-Terry
model下的解析解</p>
<p><span class="math display">\[\pi^*(y)\propto\pi_{\mathrm{ref}}(y)\exp\left(\beta^{-1}\mathbb{E}_{y^{\prime}\thicksim\mu}[\Psi(p^*(y\succ
y^{\prime}))]\right)\]</span></p>
<p>我们把Ψ(q)的图像画出来,如下所示</p>
<img src="/4fe7b810/curve.png" title="log" alt="Ψ(q)=log(q/(1-q)) 的函数曲线">
<p>可以看到在两端,Ψ(q)的曲线有很强的非线性化特征,并且值会趋向于无穷大。</p>
<p>那么当我们对一对质量差异很大的样本,即</p>
<p><span class="math display">\[p^*(y\succ y')=1\]</span></p>
<p>进行学习时,在BT模型的假设下,就有</p>
<p><span class="math display">\[(r(y)-r(y'))\to+\infty\]</span></p>
<p>把 <span class="math inline">\((r(y)-r(y'))\to+\infty\)</span>
代入到ΨPO上面推出来的解析解里,有</p>
<p><span class="math display">\[\begin{aligned}
&\frac{\pi^*(y_l)}{\pi^*(y_w)}\\
=&\frac{\pi_{\mathrm{ref}}(y_l)}{\pi_{\mathrm{ref}}(y_w)}\mathrm{exp}\left(\beta^{-1}\sum_{y^{\prime}}[\Psi(p(y_l\succ
y^{\prime}))-\Psi(p(y_w\succ y^{\prime}))]\right)\\
=&\frac{\pi_{\mathrm{ref}}(y_l)}{\pi_{\mathrm{ref}}(y_w)}\mathrm{exp}(\beta^{-1}\sum_{y^{\prime}}[r(y_l)-r(y_w)])\\
=&\frac{\pi_{\mathrm{ref}}(y_l)}{\pi_{\mathrm{ref}}(y_w)}\mathrm{exp}(\beta^{-1}\sum_{y^{\prime}}[-\infty])\\
=&0
\end{aligned}\]</span></p>
<p>那么此时无论 <span class="math inline">\(\beta\)</span>
取什么值,都有 <span class="math inline">\(\pi^*(y_l)=0\)</span>。说明当偏好越确定,KL项的约束能力越弱,模型就很容易摆脱KL项的约束,过度追求reward的最大化,最终导致过拟合。</p>
<p>不过RLHF在实践上并没有表现出如这里推算结果一样特别容易过拟合的特性,原因是训练出来的reward模型通常由于欠拟合,没有给出那么极端的偏好概率。反而是DPO因为省去了reward模型的训练,更加容易受到这种过拟合的困扰。</p>
<h1 id="ipo">IPO</h1>
<p>既然高度非线性化(且极值无限大)的Ψ(q)会导致DPO容易过拟合,那么一个自然的想法就是把Ψ(q)替换成一个有界的函数,identity
mapping恒等变换就是一个符合要求的选择。这样就得到IPO的目标函数</p>
<p><span class="math display">\[\max_\pi\quad\mathbb{E}_{x\thicksim\rho,\,y\thicksim\pi(\cdot|x),\,y'\thicksim\mu(\cdot|x)}\quad[p^*(y\succ
y'|x)]-\beta
D_{\mathrm{KL}}(\pi\mid\mid\pi_{\mathrm{ref}})\]</span></p>
<p>根据这个,可以推导出IPO的损失函数为</p>
<p><span class="math display">\[\mathbb{E}_{(y_w,y_l,x)\thicksim
D}\left(h_\pi(y_w,y_l,x)-\frac{\beta^{-1}}2\right)^2\]</span></p>
<p><span class="math display">\[h_\pi(y,y',x)=\log\left(\frac{\pi(y|x)\pi_{\text{ref}}(y'|x)}{\pi(y'|x)\pi_{\text{ref}}(y|x)}\right)\]</span></p>
<h1 id="小结">小结</h1>
<p>ΨPO/IPO从理论上对DPO进行了一系列的分析,也推出了一个相对更不容易过拟合的偏好学习方法。不过在实践上的验证还不够完善,可以作为一个理解DPO的角度来参考吧。</p>
<hr>
<p>读到这了,来一发点赞收藏关注吧~</p>
<p>博客:<a target="_blank" rel="noopener" href="http://www.linsight.cn/">http://www.linsight.cn/</a><br>
知乎:<a target="_blank" rel="noopener" href="https://www.zhihu.com/people/us4ever">Linsight</a><br>
微信公众号:Linsight<br>
<img src="/images/qrcode.jpg" alt="Linsight 微信公众号二维码"></p>
<hr>
<p>【往期文章】</p>
<p><a target="_blank" rel="noopener" href="http://www.linsight.cn/44e38c1b.html">MoE模型的前世今生</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/c4da56c0.html">LLM长上下文的问题</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/cc852861.html">解锁大模型长上下文能力</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/45ee1a6d.html">大模型推理窗口-从有限到无限大</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/3dc22f96.html">理解Attention:从起源到MHA,MQA和GQA</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/f5c015c.html">大模型推理加速-投机解码</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/473f2b43.html">大模型偏好对齐-DPO</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/da871ebe.html">大模型偏好对齐-ODPO</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/280fa97a.html">大模型偏好对齐-simPO</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/41b6a819.html">Yi技术报告-划重点看细节</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/6a40bfa5.html">transformer中normalization的二三事</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/b70b4a2d.html">从代码实现看normalization-到底做了什么</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/c61d17e3.html">稀疏注意力计算:sliding
window attention</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/a051710f.html">理解LLM位置编码:RoPE</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/3345028a.html">大模型算法题(1)</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/ad0bba9d.html">大模型算法题(2)</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/1736008.html">大模型算法题(3)</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/1736008.html">大模型算法题(4)</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/336f2f3e.html">大模型算法题(5)</a><br>
<a target="_blank" rel="noopener" href="http://www.linsight.cn/7c04944d.html">大模型算法题(6)</a></p>
<hr>
<h1 id="reference">Reference</h1>
<p>【1】A General Theoretical Paradigm to Understand Learning from Human
Preferences <a target="_blank" rel="noopener" href="https://arxiv.org/abs/2310.12036">https://arxiv.org/abs/2310.12036</a></p>
</div>
<footer class="post-footer">
<div class="post-copyright">
<ul>
<li class="post-copyright-author">
<strong>本文作者: </strong>Lin
</li>
<li class="post-copyright-link">
<strong>本文链接:</strong>
<a href="https://saicat.github.io/4fe7b810.html" title="大模型偏好对齐-IPO">https://saicat.github.io/4fe7b810.html</a>
</li>
<li class="post-copyright-license">
<strong>版权声明: </strong>本博客所有文章除特别声明外,均采用 <a href="https://creativecommons.org/licenses/by-nc-sa/4.0/" rel="noopener" target="_blank"><i class="fab fa-fw fa-creative-commons"></i>BY-NC-SA</a> 许可协议。转载请注明出处!
</li>
</ul>
</div>
<div class="post-tags">
<a href="/tags/NLP/" rel="tag"><i class="fa fa-tag"></i> NLP</a>
<a href="/tags/LLM/" rel="tag"><i class="fa fa-tag"></i> LLM</a>
<a href="/tags/transformer/" rel="tag"><i class="fa fa-tag"></i> transformer</a>
<a href="/tags/%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0/" rel="tag"><i class="fa fa-tag"></i> 强化学习</a>
<a href="/tags/%E5%BE%AE%E8%B0%83/" rel="tag"><i class="fa fa-tag"></i> 微调</a>
<a href="/tags/SFT/" rel="tag"><i class="fa fa-tag"></i> SFT</a>
<a href="/tags/%E5%81%8F%E5%A5%BD%E5%AF%B9%E9%BD%90/" rel="tag"><i class="fa fa-tag"></i> 偏好对齐</a>
</div>
<div class="post-nav">
<div class="post-nav-item">
<a href="/280fa97a.html" rel="prev" title="大模型偏好对齐-simPO">
<i class="fa fa-angle-left"></i> 大模型偏好对齐-simPO
</a>
</div>
<div class="post-nav-item">
<a href="/1d5bcd45.html" rel="next" title="昆仑万维-SkyworkMoE">
昆仑万维-SkyworkMoE <i class="fa fa-angle-right"></i>
</a>
</div>
</div>
</footer>
</article>
</div>
<div class="comments utterances-container"></div>
</div>
</main>
<footer class="footer">
<div class="footer-inner">
<div class="copyright">
©
<span itemprop="copyrightYear">2024</span>
<span class="with-love">
<i class="fa fa-heart"></i>
</span>
<span class="author" itemprop="copyrightHolder">Lin</span>
</div>
<div class="wordcount">
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="fa fa-chart-line"></i>
</span>
<span title="站点总字数">486k</span>
</span>
<span class="post-meta-item">
<span class="post-meta-item-icon">
<i class="fa fa-coffee"></i>
</span>
<span title="站点阅读时长">14:44</span>
</span>
</div>
<div class="busuanzi-count">
</div>
<!--
-->
<!-- 网站运行时间的设置 -->
<span id="timeDate">载入天数...</span>
<span id="times">载入时分秒...</span>
<script>
// Shared clock for the uptime counter; createtime() re-syncs it on every tick.
var now = new Date();

/**
 * Recompute how long the site has been running and write the result into
 * the #timeDate element (whole days) and the #times element (hours, minutes
 * and seconds, each zero-padded to two digits).
 *
 * Takes no arguments and returns nothing; its only effect is updating the
 * text of those two elements.
 */
function createtime() {
  // Site launch time in local time: 2023-03-01 10:00:00 (month is 0-based).
  // Change this to your own site's creation / go-live time.
  // Built from numeric components because parsing "MM/DD/YYYY hh:mm:ss"
  // strings is implementation-defined.
  var grt = new Date(2023, 2, 1, 10, 0, 0);

  // Read the real clock instead of adding 250 ms per tick: timers are
  // throttled in background tabs, which made the simulated clock fall behind.
  now.setTime(Date.now());

  // Work in whole seconds so the fields cannot round up to "60".
  // Clamp at zero in case the visitor's clock is set before the launch time.
  var totalSeconds = Math.max(0, Math.floor((now - grt) / 1000));

  var dnum = Math.floor(totalSeconds / 86400);
  var hnum = pad2(Math.floor((totalSeconds % 86400) / 3600));
  var mnum = pad2(Math.floor((totalSeconds % 3600) / 60));
  var snum = pad2(totalSeconds % 60);

  // Plain text only, so textContent is sufficient and avoids HTML parsing.
  document.getElementById("timeDate").textContent = "本站已安全运行 "+dnum+" 天 ";
  document.getElementById("times").textContent = hnum + " 小时 " + mnum + " 分 " + snum + " 秒.";

  // Left-pad a non-negative integer below 100 to two digits.
  function pad2(value) {
    return value < 10 ? "0" + value : String(value);
  }
}

// Pass the function itself rather than a code string: string arguments are
// evaluated like eval() and are rejected by a strict Content-Security-Policy.
setInterval(createtime, 250);
</script>
</div>
</footer>
<div class="back-to-top" role="button" aria-label="返回顶部">
<i class="fa fa-arrow-up fa-lg"></i>
<span>0%</span>
</div>
<noscript>
<div class="noscript-warning">Theme NexT works best with JavaScript enabled</div>
</noscript>
<script src="https://cdnjs.cloudflare.com/ajax/libs/animejs/3.2.1/anime.min.js" integrity="sha256-XL2inqUJaslATFnHdJOi9GfQ60on8Wx1C2H8DYiN1xY=" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/fancyapps-ui/5.0.28/fancybox/fancybox.umd.js" integrity="sha256-ytMJGN3toR+a84u7g7NuHm91VIR06Q41kMWDr2pq7Zo=" crossorigin="anonymous"></script>
<script src="/js/comments.js"></script><script src="/js/utils.js"></script><script src="/js/motion.js"></script><script src="/js/next-boot.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/hexo-generator-searchdb/1.4.1/search.js" integrity="sha256-1kfA5uHPf65M5cphT2dvymhkuyHPQp5A53EGZOnOLmc=" crossorigin="anonymous"></script>
<script src="/js/third-party/search/local-search.js"></script>
<script src="/js/third-party/fancybox.js"></script>
<script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>
<script class="next-config" data-name="enableMath" type="application/json">true</script><script class="next-config" data-name="mathjax" type="application/json">{"enable":true,"tags":"ams","js":{"url":"https://cdnjs.cloudflare.com/ajax/libs/mathjax/3.2.2/es5/tex-mml-chtml.js","integrity":"sha256-MASABpB4tYktI2Oitl4t+78w/lyA+D7b/s9GEP0JOGI="}}</script>
<script src="/js/third-party/math/mathjax.js"></script>
<script class="next-config" data-name="utterances" type="application/json">{"enable":true,"repo":"Saicat/comment-utterance","issue_term":"pathname","theme":"github-light"}</script>
<script src="/js/third-party/comments/utterances.js"></script>
</body>
</html>