### Running Inference on Different Fine-Tuning Models

This notebook is intended for CS 182/282A project reviewers to verify that the models run. To re-run experiments to see how the models were trained, please take a look at the subdirectories after.

In [1]:
import h5py
import torch
from torch import nn
from collections import OrderedDict

#### 0. Test Embeddings

In [2]:
test_path = f'./sample_data/test_chunk_X1.h5'
f = h5py.File(test_path, 'r')
dset = f['embeddings']

#### 1. Linear Transformation

In [3]:
class LinearTransform(nn.Module):
    """Takes in input (B, 1536, 896) and outputs predictions (B, 18, 896)."""

    def __init__(self):
        super().__init__()
        self.conv_layer = nn.Conv1d(in_channels=1536, out_channels=18, kernel_size=1)
        nn.init.kaiming_normal_(self.conv_layer.weight, nonlinearity='relu')
        nn.init.zeros_(self.conv_layer.bias)
        self.activation = nn.Softplus()

    def forward(self, x):
        out = None
        out = self.activation(self.conv_layer(x))
        return out

In [4]:
trained_probe = LinearTransform()
trained_probe.load_state_dict(torch.load('../cs282a_linear-probing/first_full_run.pth', map_location=torch.device('cpu')))
trained_probe.eval()

LinearTransform(
  (conv_layer): Conv1d(1536, 18, kernel_size=(1,), stride=(1,))
  (activation): Softplus(beta=1, threshold=20)
)

In [6]:
for i in range(len(dset)):
    inputs = torch.Tensor(dset[i])
    predictions = trained_probe(inputs.transpose(0,1))
    print(predictions)

tensor([[185.4840, 191.4664, 192.6241,  ..., 151.3887, 155.1280, 152.4800],
        [161.7277, 163.1386, 169.3251,  ..., 104.0094, 109.2395, 103.1794],
        [164.3876, 167.4644, 170.2731,  ..., 130.6068, 136.7782, 134.1537],
        ...,
        [  5.4140,   5.1493,   7.5385,  ...,   4.1214,   4.5688,   6.1868],
        [ 21.0667,  32.2378,  33.6077,  ...,  15.7496,  31.8367,  35.1272],
        [ 10.4077,  13.6044,  13.3284,  ...,   6.8675,  11.5857,  14.0709]],
       grad_fn=<SoftplusBackward0>)
tensor([[149.2984, 153.1806, 147.6395,  ..., 161.8470, 160.1565, 155.4014],
        [167.3268, 170.5730, 169.7932,  ..., 183.3823, 172.8373, 173.1581],
        [163.4417, 167.6719, 162.6482,  ..., 163.1466, 160.1153, 159.9908],
        ...,
        [  8.0236,   8.1998,   8.9862,  ...,   6.2350,   2.3821,   4.0134],
        [ 21.2210,  16.4974,  23.3127,  ...,  24.2986,   2.7081,  23.4745],
        [  9.4221,   6.9032,   8.4551,  ...,  10.5540,   6.5561,  13.4993]],
       grad_fn=<Softplus

#### 2. 1D CNN + Perceptron

In [7]:
class MLPModel(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super(MLPModel, self).__init__(*args, **kwargs)

        self.layers = nn.Sequential(OrderedDict([
            ('conv1x1', nn.Conv1d(1536, 500, 1)),
            ('gelu1', nn.GELU()),
            ('flatten', nn.Flatten()),
            ('fc1', nn.Linear(448000, 18))
        ]))
    
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [8]:
mlp_model = MLPModel()
mlp_model.load_state_dict(torch.load('../cs282a_conv1d_perceptron/model_20231128_063541_2'))
mlp_model.eval()

MLPModel(
  (layers): Sequential(
    (conv1x1): Conv1d(1536, 500, kernel_size=(1,), stride=(1,))
    (gelu1): GELU(approximate='none')
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (fc1): Linear(in_features=448000, out_features=18, bias=True)
  )
)

In [9]:
for i in range(len(dset)):
    inputs = torch.Tensor(dset[i]).reshape(1,896,1536)
    predictions = mlp_model(inputs.transpose(1,2))
    print(predictions)

tensor([[185.1219, 165.0662, 172.5271, 169.7588, 183.6932, 167.8994, 186.5149,
         182.3538, 168.5229, 176.6938, 176.6521, 160.2308,  13.9360,   9.6113,
          67.5173,   3.1071,  77.7010,   9.5490]], grad_fn=<AddmmBackward0>)
tensor([[150.9066, 182.2299, 153.4019, 159.0182, 148.4989, 159.1315, 152.8746,
         141.5418, 171.3732, 160.7533, 153.8532, 165.8867,  14.1168,   9.8270,
         125.0091,   3.1626,  92.7511,   8.0396]], grad_fn=<AddmmBackward0>)
tensor([[130.3526, 155.1663, 149.7936, 150.6678, 148.9104, 131.7747, 154.3013,
         136.7864, 141.5564, 164.5454, 136.9123, 130.3914,   9.1082,   4.9956,
         150.3599,   3.4720, 143.1677,   9.8213]], grad_fn=<AddmmBackward0>)
tensor([[102.7417, 111.9067, 121.7186, 124.8033, 115.8242, 115.5144, 118.2789,
         117.0127, 105.2756, 111.5346, 111.8413, 114.4374,   2.5230,   3.3681,
         207.3947,   9.7393, 218.1850,  23.6601]], grad_fn=<AddmmBackward0>)
tensor([[ 84.6395,  82.0686,  96.8328, 113.9138, 110.0792,  

#### 3. 1d CNN + Max Pooling + Perceptron

In [10]:
class MLPModelPooling(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super(MLPModelPooling, self).__init__(*args, **kwargs)

        self.layers = nn.Sequential(OrderedDict([
            ('conv1x1', nn.Conv1d(1536, 500, 1)),
            ('gelu1', nn.GELU()),
            ('maxpool1', nn.MaxPool1d(896)),
            ('flatten', nn.Flatten()),
            ('fc1', nn.Linear(500, 18))
        ]))
    
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

In [11]:
pool_model = MLPModelPooling()
pool_model.load_state_dict(torch.load('../cs282a_perceptron-maxpool/model_20231128_072156_3'))
pool_model.eval()

MLPModelPooling(
  (layers): Sequential(
    (conv1x1): Conv1d(1536, 500, kernel_size=(1,), stride=(1,))
    (gelu1): GELU(approximate='none')
    (maxpool1): MaxPool1d(kernel_size=896, stride=896, padding=0, dilation=1, ceil_mode=False)
    (flatten): Flatten(start_dim=1, end_dim=-1)
    (fc1): Linear(in_features=500, out_features=18, bias=True)
  )
)

In [12]:
for i in range(len(dset)):
    inputs = torch.Tensor(dset[i]).reshape(1,896,1536)
    predictions = pool_model(inputs.transpose(1,2))
    print(predictions)

tensor([[187.6417, 182.9631, 172.3025, 171.0777, 177.0425, 178.5789, 174.7300,
         177.8415, 186.8805, 172.4592, 180.7357, 180.9333,  16.4236,  11.8089,
          96.5649,   4.4737,  74.6781,  10.4619]], grad_fn=<AddmmBackward0>)
tensor([[167.4127, 174.4773, 159.5704, 157.5585, 155.1230, 167.7140, 155.4463,
         158.8519, 173.2176, 157.5184, 164.5945, 172.0140,  14.6018,  10.5576,
         115.2723,   3.8817,  91.8531,   8.5334]], grad_fn=<AddmmBackward0>)
tensor([[128.7370, 131.3702, 131.6922, 136.2482, 134.5864, 123.2989, 139.5895,
         132.2911, 127.2798, 139.2241, 126.8751, 122.9097,   7.8661,   5.7660,
         150.1024,   6.3933, 123.9708,  14.5684]], grad_fn=<AddmmBackward0>)
tensor([[101.0909,  97.1315, 115.4432, 117.8345, 105.8991, 114.1162, 112.4615,
         119.6248, 104.3572, 105.1508, 104.3456, 112.9376,   2.9304,   3.1745,
         226.8024,  11.2567, 221.3595,  22.9852]], grad_fn=<AddmmBackward0>)
tensor([[ 87.5435,  77.3884,  94.4952, 111.0738, 109.6526,  

#### 4. Transformer

In [13]:
class TransformerDecoder(nn.Module):
    def __init__(self, d_model, heads, forward_expansion, dropout, max_length):
        super(TransformerDecoder, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, forward_expansion * d_model),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(forward_expansion * d_model, d_model)
        )

        self.dropout = nn.Dropout(dropout)

        # Additional linear layer for output transformation
        self.output_transform = nn.Linear(d_model, 18)

        # Adaptive pooling layer to handle sequence length
        self.sequence_pooling = nn.AdaptiveAvgPool1d(1)

    def forward(self, x, enc_out=None, src_mask=None, trg_mask=None):
        attention_output, _ = self.attention(x, x, x, attn_mask=trg_mask)
        query = self.dropout(self.norm1(attention_output + x))

        out = self.feed_forward(query)
        out = self.dropout(self.norm2(out + query))

        out_transformed = self.output_transform(out)

        out_pooled = self.sequence_pooling(out_transformed.transpose(1, 2)).transpose(1, 2)

        return out_pooled

In [14]:
trained_basenji_transformer = TransformerDecoder(d_model=1536, heads=6, forward_expansion=2, dropout=0.2, max_length=896)
trained_filepath  = '../cs282a_self-attention/model_20231128_080512_7'
trained_basenji_transformer.load_state_dict(torch.load(trained_filepath))
trained_basenji_transformer.eval()

TransformerDecoder(
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=1536, out_features=1536, bias=True)
  )
  (norm1): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
  (norm2): LayerNorm((1536,), eps=1e-05, elementwise_affine=True)
  (feed_forward): Sequential(
    (0): Linear(in_features=1536, out_features=3072, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=3072, out_features=1536, bias=True)
  )
  (dropout): Dropout(p=0.2, inplace=False)
  (output_transform): Linear(in_features=1536, out_features=18, bias=True)
  (sequence_pooling): AdaptiveAvgPool1d(output_size=1)
)

In [15]:
for i in range(len(dset)):
    inputs = torch.Tensor(dset[i]).reshape(1,896,1536)
    predictions = trained_basenji_transformer(inputs)
    print(predictions)

tensor([[[192.1179, 150.0023, 172.0852, 163.9631, 183.5785, 166.4874, 183.9130,
          188.9735, 162.8639, 170.7770, 177.2941, 156.4945,  17.7902,  10.1166,
           76.4997,   5.4054,  74.5288,  11.7602]]],
       grad_fn=<TransposeBackward0>)
tensor([[[151.0708, 179.3533, 151.1176, 154.5428, 150.8366, 157.7057, 142.9166,
          144.1196, 171.6321, 152.7803, 152.4716, 163.7518,  16.7840,  10.1042,
          117.5346,   3.4626, 103.1409,   9.8003]]],
       grad_fn=<TransposeBackward0>)
tensor([[[130.2856, 147.0574, 146.2804, 143.7901, 145.7762, 128.9431, 154.5514,
          136.5906, 134.7770, 159.8839, 138.5390, 124.8242,  10.4996,   5.6922,
          157.1832,   4.0811, 139.3578,  10.1028]]],
       grad_fn=<TransposeBackward0>)
tensor([[[ 98.6346, 100.6480, 113.1690, 117.4675, 112.4247, 112.6396, 109.6569,
          111.0721, 103.6091, 102.4488, 107.0185, 110.5289,   4.1152,   2.7382,
          218.2716,  11.6817, 227.7730,  24.4503]]],
       grad_fn=<TransposeBackward0>)
