# Transformer 기초내용

`-` 이번 내용 : Transformer 이전에, RNN의 순차 계산 한계를 줄이고 긴 시퀀스에서 레이어가 깊어져도 학습이 잘 되도록 제안된 합성곱(CNN) 기반 방법론들(Extended Neural GPU, ByteNet, ConvS2S) 소개

## ExtendedNeuralGPU
```python
import torch
import torch.nn as nn

class ExtendedNeuralGPU(nn.Module):
    """Stack of gated 1-D convolutions, mean-pooled over time, then a linear head.

    Args:
        input_dim: Number of input channels.
        hidden_dim: Number of channels kept after each gated convolution.
        output_dim: Size of the final linear projection.
        n_layers: Number of gated convolution layers.

    Input to ``forward`` is ``(batch, input_dim, seq_len)``; the result is
    ``(batch, output_dim)``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers):
        super().__init__()
        # GLU splits its input in half along one axis (value half * sigmoid(gate
        # half)). Each conv therefore emits 2 * hidden_dim channels so that
        # hidden_dim channels remain after gating.
        self.convs = nn.ModuleList([
            nn.Conv1d(
                input_dim if i == 0 else hidden_dim,
                2 * hidden_dim,
                kernel_size=3,
                padding=1,
            )
            for i in range(n_layers)
        ])
        # Gate over the channel axis. The default dim=-1 would halve the
        # sequence axis instead and fail as soon as its length becomes odd.
        self.glu = nn.GLU(dim=1)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply the gated conv stack, average over the sequence axis, project."""
        for conv in self.convs:
            # kernel_size=3 with padding=1 keeps seq_len unchanged.
            x = self.glu(conv(x))
        x = x.mean(dim=2)  # (batch, hidden_dim)
        return self.fc(x)

# Example usage: build a 5-layer model and run one random batch through it.
model = ExtendedNeuralGPU(
    input_dim=10,
    hidden_dim=20,
    output_dim=30,
    n_layers=5,
)
# Random input laid out as (batch_size, input_dim, seq_length).
input_seq = torch.rand((32, 10, 50))
output = model(input_seq)
print(output.size())  # Should be (32, 30)

```


## ByteNet

```python
import torch
import torch.nn as nn

class ByteNet(nn.Module):
    """Stack of dilated 1-D convolutions with residual connections.

    Layer ``i`` uses dilation ``2**i``. The sequence is mean-pooled over time
    after the stack and passed through a linear head.

    Args:
        input_dim: Number of input channels.
        hidden_dim: Number of channels used inside the residual stack.
        output_dim: Size of the final linear projection.
        n_layers: Number of dilated residual layers.

    Input to ``forward`` is ``(batch, input_dim, seq_len)``; the result is
    ``(batch, output_dim)``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers):
        super().__init__()
        # The residual add needs matching channel counts on both sides, so the
        # input is lifted to hidden_dim once up front. When the sizes already
        # agree no extra parameters are introduced.
        if input_dim == hidden_dim:
            self.input_proj = nn.Identity()
        else:
            self.input_proj = nn.Conv1d(input_dim, hidden_dim, kernel_size=1)
        # With kernel_size=3 the output length is L + 2*padding - 2*dilation,
        # so padding must equal the dilation to keep the length (and therefore
        # the residual add) valid at every layer.
        self.convs = nn.ModuleList([
            nn.Conv1d(
                hidden_dim,
                hidden_dim,
                kernel_size=3,
                padding=2**i,
                dilation=2**i,
            )
            for i in range(n_layers)
        ])
        self.residual_conns = nn.ModuleList([
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=1)
            for _ in range(n_layers)
        ])
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run the residual dilated-conv stack, average over time, project."""
        x = self.input_proj(x)
        for conv, res_conn in zip(self.convs, self.residual_conns):
            residual = x
            x = torch.relu(conv(x))
            x = res_conn(x) + residual
        x = x.mean(dim=2)  # (batch, hidden_dim)
        return self.fc(x)

# Example usage: build a 5-layer model and run one random batch through it.
model = ByteNet(
    input_dim=10,
    hidden_dim=20,
    output_dim=30,
    n_layers=5,
)
# Random input laid out as (batch_size, input_dim, seq_length).
input_seq = torch.rand((32, 10, 50))
output = model(input_seq)
print(output.size())  # Should be (32, 30)

```

## ConvS2S (Convolutional Sequence to Sequence)

```python
import torch
import torch.nn as nn

class ConvS2S(nn.Module):
    """Embedding followed by gated 1-D convolutions, mean pooling and a linear head.

    Args:
        input_dim: Embedding size, i.e. channels fed to the first convolution.
        hidden_dim: Number of channels kept after each gated convolution.
        output_dim: Size of the final linear projection.
        n_layers: Number of gated convolution layers.
        max_seq_len: Number of rows in the embedding table; every index passed
            to ``forward`` must lie in ``[0, max_seq_len)``.

    Input to ``forward`` is an integer tensor ``(batch, seq_len)``; the result
    is ``(batch, output_dim)``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, max_seq_len):
        super().__init__()
        # NOTE(review): max_seq_len is used as the embedding table size (the
        # range of valid input indices), not as a limit on sequence length —
        # confirm this is the intended meaning.
        self.embed = nn.Embedding(max_seq_len, input_dim)
        # GLU halves the axis it gates over, so each conv emits 2 * hidden_dim
        # channels to leave hidden_dim channels after gating.
        self.convs = nn.ModuleList([
            nn.Conv1d(
                input_dim if i == 0 else hidden_dim,
                2 * hidden_dim,
                kernel_size=3,
                padding=1,
            )
            for i in range(n_layers)
        ])
        # Gate over channels. The default dim=-1 would halve the sequence axis
        # and fail as soon as its length becomes odd.
        self.glu = nn.GLU(dim=1)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Embed indices, apply the gated conv stack, average over time, project."""
        x = self.embed(x)      # (batch, seq_len, input_dim)
        x = x.transpose(1, 2)  # Conv1d expects (batch, channels, seq_len)
        for conv in self.convs:
            x = self.glu(conv(x))
        x = x.mean(dim=2)      # (batch, hidden_dim)
        return self.fc(x)

# Example usage: build a 5-layer model and run one batch of random indices through it.
model = ConvS2S(
    input_dim=10,
    hidden_dim=20,
    output_dim=30,
    n_layers=5,
    max_seq_len=50,
)
# Integer input laid out as (batch_size, seq_length).
input_seq = torch.randint(low=0, high=50, size=(32, 50))
output = model(input_seq)
print(output.size())  # Should be (32, 30)
```