# Graphormer论文笔记（二）
> 代码来自于：https://github.com/ytchx1999/Graphormer/blob/main/graphormer/
本篇笔记主要分析Graphormer模型中的几段代码，包含主要的transformer的组件，最短距离嵌入bias，节点度的嵌入方法等。


### 1、主要组件：
```python
class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Linear.

    Args:
        hidden_size: Width of the input and output features.
        ffn_size: Width of the intermediate projection.
        dropout_rate: Accepted for signature compatibility with the other
            layers; this block applies no dropout itself.
    """

    def __init__(self, hidden_size, ffn_size, dropout_rate):
        super().__init__()

        # Attribute names double as state_dict keys, so they are kept stable.
        self.layer1 = nn.Linear(hidden_size, ffn_size)
        self.gelu = nn.GELU()
        self.layer2 = nn.Linear(ffn_size, hidden_size)

    def forward(self, x):
        """Map ``x`` to ``ffn_size`` features, apply GELU, map back to ``hidden_size``."""
        return self.layer2(self.gelu(self.layer1(x)))
```
FeedForwardNetwork类就是FFN层，了解了transformer的架构后该模块不需要单独介绍，只需要注意几点：这里中间的激活函数用的是GELU；并且论文中提到这里没有采用传统的“先升到4倍维度再降维”的方式，而是中间隐藏层维度保持不变，原论文经过测试这种方式没有明显的性能损失。

```python
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with an optional additive bias.

    Args:
        hidden_size: Feature width of the inputs and of the output.
        attention_dropout_rate: Dropout probability applied to the attention
            weights after the softmax.
        num_heads: Number of heads; each head has width
            ``hidden_size // num_heads``.
    """

    def __init__(self, hidden_size, attention_dropout_rate, num_heads):
        super().__init__()

        self.num_heads = num_heads
        head_dim = hidden_size // num_heads
        self.att_size = head_dim
        self.scale = head_dim ** -0.5

        # All heads sit side by side in one projection of this width. It can
        # be smaller than hidden_size when the division above rounds down.
        proj_dim = num_heads * head_dim
        self.linear_q = nn.Linear(hidden_size, proj_dim)
        self.linear_k = nn.Linear(hidden_size, proj_dim)
        self.linear_v = nn.Linear(hidden_size, proj_dim)
        self.att_dropout = nn.Dropout(attention_dropout_rate)

        # Maps the concatenated heads back to hidden_size.
        self.output_layer = nn.Linear(proj_dim, hidden_size)

    def forward(self, q, k, v, attn_bias=None):
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: Query input of shape [b, q_len, hidden_size].
            k: Key input of shape [b, k_len, hidden_size].
            v: Value input of shape [b, v_len, hidden_size].
            attn_bias: Optional tensor added to the attention logits before
                the softmax; it must broadcast against [b, h, q_len, k_len].

        Returns:
            Tensor with the same size as the ``q`` that was passed in.
        """
        expected_size = q.size()
        batch = q.size(0)
        heads, head_dim = self.num_heads, self.att_size

        def split_heads(t):
            # [b, len, h * d] -> [b, len, h, d]
            return t.view(batch, -1, heads, head_dim)

        q = split_heads(self.linear_q(q)).transpose(1, 2)      # [b, h, q_len, d]
        k = split_heads(self.linear_k(k)).permute(0, 2, 3, 1)  # [b, h, d, k_len]
        v = split_heads(self.linear_v(v)).transpose(1, 2)      # [b, h, v_len, d]

        # softmax(Q K^T / sqrt(d) + bias) V, with the scale folded into Q.
        scores = torch.matmul(q * self.scale, k)  # [b, h, q_len, k_len]
        if attn_bias is not None:
            scores = scores + attn_bias
        weights = self.att_dropout(torch.softmax(scores, dim=-1))
        context = torch.matmul(weights, v)  # [b, h, q_len, d]

        # Merge the heads again: [b, h, q_len, d] -> [b, q_len, h * d].
        context = context.transpose(1, 2).contiguous()
        context = context.view(batch, -1, heads * head_dim)

        out = self.output_layer(context)
        assert out.size() == expected_size
        return out
```
这里的MultiHeadAttention层也不需要多介绍：q、k、v直接通过linear层生成，多头是在隐藏层维度上直接切分出头的维度（att_size = hidden_size // num_heads）。最后还加了一层linear层（output_layer），把 num_heads * att_size 维映射回 hidden_size，防止向下取整后隐藏层维度对不齐的情况。

```python
class EncoderLayer(nn.Module):
    """Pre-norm encoder layer: self-attention then feed-forward.

    Each sub-layer is wrapped as ``x + dropout(sublayer(layer_norm(x)))``.

    Args:
        hidden_size: Feature width of the layer input and output.
        ffn_size: Intermediate width passed to the feed-forward network.
        dropout_rate: Dropout probability applied to each sub-layer output
            (also forwarded to ``FeedForwardNetwork``).
        attention_dropout_rate: Dropout probability forwarded to
            ``MultiHeadAttention``.
        num_heads: Number of attention heads.
    """

    def __init__(self, hidden_size, ffn_size, dropout_rate, attention_dropout_rate, num_heads):
        super().__init__()

        # Self-attention sub-layer.
        self.self_attention_norm = nn.LayerNorm(hidden_size)
        self.self_attention = MultiHeadAttention(
            hidden_size, attention_dropout_rate, num_heads)
        self.self_attention_dropout = nn.Dropout(dropout_rate)

        # Feed-forward sub-layer.
        self.ffn_norm = nn.LayerNorm(hidden_size)
        self.ffn = FeedForwardNetwork(hidden_size, ffn_size, dropout_rate)
        self.ffn_dropout = nn.Dropout(dropout_rate)

    def forward(self, x, attn_bias=None):
        """Run both residual sub-layers on ``x``.

        Args:
            x: Input features.
            attn_bias: Optional bias handed unchanged to the self-attention
                module, which adds it to the attention logits.

        Returns:
            The transformed features.
        """
        # The normalised input serves as query, key and value alike.
        normed = self.self_attention_norm(x)
        attended = self.self_attention(normed, normed, normed, attn_bias)
        x = x + self.self_attention_dropout(attended)

        # Same pre-norm + residual pattern around the feed-forward network.
        x = x + self.ffn_dropout(self.ffn(self.ffn_norm(x)))
        return x
```
这里的EncoderLayer层就是将以上组件组合而成的Graphormer编码层，只需要注意forward中传给self_attention的attn_bias是最短距离bias和edge-feature bias。

### 2、Embedding层：
```python
# Excerpt of a module's __init__: creates the embedding tables, sized per
# dataset. Every table built with padding_idx=0 reserves index 0 as padding.
# Tables of width hidden_dim produce feature vectors; tables of width
# num_heads produce one scalar per attention head.
if dataset_name == 'ZINC':
    # ZINC branch: small fixed vocabularies (64 / 40 entries).
    self.atom_encoder = nn.Embedding(64, hidden_dim, padding_idx=0)
    self.edge_encoder = nn.Embedding(64, num_heads, padding_idx=0)
    self.edge_type = edge_type
    if self.edge_type == 'multi_hop':
        # Only created in multi-hop mode; width 1 and no padding index.
        self.edge_dis_encoder = nn.Embedding(
            40 * num_heads * num_heads, 1)
    # NOTE(review): presumably indexed by shortest-path distance -- confirm
    # against the code that builds the index tensor.
    self.rel_pos_encoder = nn.Embedding(40, num_heads, padding_idx=0)
    # Separate tables for in-degree and out-degree, both hidden_dim wide.
    self.in_degree_encoder = nn.Embedding(
        64, hidden_dim, padding_idx=0)
    self.out_degree_encoder = nn.Embedding(
        64, hidden_dim, padding_idx=0)
else:
    # All other datasets: larger vocabularies.
    # NOTE(review): 512 * 9 + 1 and 512 * 3 + 1 look like "512 ids per feature
    # column, plus index 0 for padding", matching convert_to_single_emb's
    # default offset of 512 -- confirm the column counts against the dataset.
    self.atom_encoder = nn.Embedding(
        512 * 9 + 1, hidden_dim, padding_idx=0)
    self.edge_encoder = nn.Embedding(
        512 * 3 + 1, num_heads, padding_idx=0)
    self.edge_type = edge_type
    if self.edge_type == 'multi_hop':
        # Only created in multi-hop mode; width 1 and no padding index.
        self.edge_dis_encoder = nn.Embedding(
            128 * num_heads * num_heads, 1)
    self.rel_pos_encoder = nn.Embedding(512, num_heads, padding_idx=0)
    # Separate tables for in-degree and out-degree, both hidden_dim wide.
    self.in_degree_encoder = nn.Embedding(
        512, hidden_dim, padding_idx=0)
    self.out_degree_encoder = nn.Embedding(
        512, hidden_dim, padding_idx=0)
```
这里可以看到ZINC数据集Embedding层输入维度为64，应该表示字典加上unk最大为64种标号；其他数据集的原子嵌入输入维度为512\*9+1，因为ogbg-mol\*数据的节点特征是9维，每一维预留512个编号，再加1个padding位；边的特征是3维，所以边嵌入的输入维度是512\*3+1。由于输入的原始特征是多维的（9维或3维），因此还需要先做一次编码，给每一维特征加上各自的偏移量，使所有维度共用同一张Embedding表，代码如下：

```python
def convert_to_single_emb(x, offset=512):
    """Shift each feature column into its own index range of one shared table.

    Column ``i`` of ``x`` is increased by ``1 + i * offset``. The leading
    ``1`` keeps index 0 unused by real values.

    Args:
        x: Integer tensor, either 1-D (treated as a single feature column) or
            2-D with shape [num_items, feature_num].
        offset: Size of the index range reserved for each feature column.
            Raw values are assumed to be smaller than this so that ranges do
            not overlap; this is not checked.

    Returns:
        A new tensor of the same shape as ``x`` holding the shifted indices;
        ``x`` itself is not modified.
    """
    feature_num = x.size(1) if x.dim() > 1 else 1
    # Build the offsets on x's device so that inputs living outside the CPU
    # can be added without a device-mismatch error.
    feature_offset = 1 + torch.arange(
        0, feature_num * offset, offset, dtype=torch.long, device=x.device)
    return x + feature_offset
```

另外边特征bias（edge_encoder）和最短距离bias（rel_pos_encoder）的计算也是使用的Embedding层，但是输出维度不再是隐藏层的特征维度了，而是多头的头数，因为这些bias是直接加到attention矩阵中的，最后得到的bias矩阵是[num_of_nodes, num_of_nodes, heads]大小的；而中心度（入度、出度）编码的输出维度仍然是hidden_dim。另外最短距离bias的输入维度也是512，因为最短距离的取值范围只能在0到图中节点总数之间。