# MyModels: contains the models and the code for creating and recreating model architectures.

## Libraries

In [1]:
import torch
from torch import nn
import math
from torch.nn.utils.rnn import pad_sequence

## Models

In [2]:
# A simple fully connected neural network with five hidden layers of 128 units.
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 784 flattened inputs -> 10 output scores.

    ``parameters`` is accepted so the constructor has the same signature as
    the other models in this file; this class does not read it.
    """

    def __init__(self, parameters):
        super().__init__()
        self.flatten = nn.Flatten()

        # 784 -> 128 -> 128 -> 128 -> 128 -> 128 -> 10, with a ReLU after
        # every linear layer except the final one.
        widths = [28 * 28, 128, 128, 128, 128, 128]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(widths[-1], 10))
        self.linear_relu_stack = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten ``x`` per sample and return the 10 scores of the stack."""
        return self.linear_relu_stack(self.flatten(x))

    

In [3]:
# A simple shape-based neural network (512 input features) with five hidden layers of 128 units.
class ShapeBasedNeuralNetwork(nn.Module):
    """Fully connected classifier: 512 input features -> 10 output scores.

    The input goes to the first linear layer as-is (nothing is flattened
    here), so its last dimension must be 512. ``parameters`` is accepted
    but not read by this class.
    """

    def __init__(self, parameters):
        super().__init__()

        # 512 -> 128 -> 128 -> 128 -> 128 -> 128 -> 10, with a ReLU after
        # every linear layer except the final one.
        widths = [512, 128, 128, 128, 128, 128]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.append(nn.Linear(widths[-1], 10))
        self.linear_relu_stack = nn.Sequential(*stack)

    def forward(self, x):
        """Return the 10 scores produced by the linear/ReLU stack."""
        return self.linear_relu_stack(x)

    

In [4]:
# A simple convolutional neural network.
class CONVNet(nn.Module):
    """Small convolutional classifier: two conv/pool stages, then three linear layers.

    For a 1x28x28 input the two 5x5 convolutions and 2x2 poolings leave a
    16x4x4 feature map, which is what the first linear layer expects.
    ``parameters`` is accepted but not read by this class.
    """

    def __init__(self, parameters):
        super().__init__()
        self.relu = nn.ReLU()
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return 10 scores per sample."""
        # Convolution -> ReLU -> max-pool, twice.
        for conv in (self.conv1, self.conv2):
            x = self.pool(self.relu(conv(x)))

        # One row of 16*4*4 features per sample.
        x = x.view(-1, 16 * 4 * 4)

        for hidden in (self.fc1, self.fc2):
            x = self.relu(hidden(x))
        return self.fc3(x)

In [5]:
# A simple neural network with custom hidden layers. Not tested.
class CustomNeuralNetwork(nn.Module):
    """Fully connected network whose layer sizes are taken from ``parameters``.

    ``parameters`` must provide a non-empty sequence of ``(inputs, outputs)``
    pairs, one pair per linear layer. The sequence is looked up under the
    name ``layer_sizes`` and then ``layers`` (the earlier code read both
    names), either as a dictionary key or as an attribute. A ReLU follows
    every linear layer except the last one.

    Raises:
        ValueError: If no layer sizes are found or the sequence is empty.
    """

    def __init__(self, parameters):
        super().__init__()
        self.flatten = nn.Flatten()

        layer_sizes = list(self._read_layer_sizes(parameters))
        if not layer_sizes:
            raise ValueError(
                "CustomNeuralNetwork needs at least one (inputs, outputs) pair"
            )

        layers = []
        for inputs, outputs in layer_sizes[:-1]:
            layers.append(nn.Linear(inputs, outputs))
            layers.append(nn.ReLU())
        last_inputs, last_outputs = layer_sizes[-1]
        layers.append(nn.Linear(last_inputs, last_outputs))

        # nn.ModuleList rather than a plain list: only registered submodules
        # show up in .parameters(), .to(device) and state_dict().
        self.array_of_layers = nn.ModuleList(layers)

    @staticmethod
    def _read_layer_sizes(parameters):
        """Return the layer-size sequence stored on ``parameters``."""
        for name in ("layer_sizes", "layers"):
            if isinstance(parameters, dict):
                if name in parameters:
                    return parameters[name]
            elif hasattr(parameters, name):
                return getattr(parameters, name)
        raise ValueError(
            "parameters must provide 'layer_sizes' (or 'layers') as a sequence "
            "of (inputs, outputs) pairs"
        )

    def forward(self, x):
        """Flatten ``x`` per sample and apply the configured layers in order."""
        x = self.flatten(x)
        for layer in self.array_of_layers:
            x = layer(x)
        return x

    

In [6]:
# An autoencoder network that compresses a 28x28 input to 32 values and recreates the input from them.
class SimpleAutoencoder(nn.Module):
    """Autoencoder that squeezes 784 flattened inputs through a 32-value code.

    Set ``encode_only`` to True to make ``forward`` return the code itself
    (computed without gradient tracking) and skip the decoder.
    ``parameters`` is accepted but not read by this class.
    """

    def __init__(self, parameters):
        super().__init__()
        self.encode_only = False

        # Encoder: flatten, then 784 -> 128 -> 128 -> 128 -> 128 -> 64 -> 32.
        # The final 32 values are the compressed feature vector.
        self.encoder = nn.Sequential(
            nn.Flatten(),
            *self._linear_relu_chain([28 * 28, 128, 128, 128, 128, 64, 32]),
        )

        # Decoder: 32 -> 64 -> 128 -> 784; the sigmoid keeps every output
        # value between 0 and 1.
        self.decoder = nn.Sequential(
            *self._linear_relu_chain([32, 64, 128, 28 * 28]),
            nn.Sigmoid(),
        )

    @staticmethod
    def _linear_relu_chain(widths):
        """Return linear layers for consecutive ``widths`` with ReLUs in between."""
        chain = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            if chain:
                chain.append(nn.ReLU())
            chain.append(nn.Linear(fan_in, fan_out))
        return chain

    def forward(self, x):
        """Return the reconstruction, or the 32-value code when ``encode_only`` is set."""
        if not self.encode_only:
            reconstruction = self.decoder(self.encoder(x))
            return reconstruction.view(-1, 1, 28, 28)

        # Encoding-only mode never tracks gradients.
        with torch.no_grad():
            return self.encoder(x)


In [7]:
#Classifier that takes encoded data 
class ClassifierOfEncodedData(nn.Module):
    """Classifier stacked on top of an encoder that yields 32 features.

    ``parameters["encoder"]`` supplies the encoder module. Its
    ``encode_only`` attribute is set to True here, which changes the
    caller's object as well. With a ``SimpleAutoencoder`` that flag makes
    the encoder return its 32-value code without gradient tracking.
    """

    def __init__(self, parameters):
        super().__init__()
        encoder = parameters["encoder"]
        self.encoder = encoder
        self.encoder.encode_only = True

        # 32 -> 128 -> 128 -> 128 -> 128 -> 128 -> 10, with a ReLU after
        # every linear layer except the final one.
        widths = [32, 128, 128, 128, 128, 128]
        head = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            head.append(nn.Linear(fan_in, fan_out))
            head.append(nn.ReLU())
        head.append(nn.Linear(widths[-1], 10))
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Encode ``x`` and return 10 scores per sample."""
        # Gradient handling for the encoding is left to the encoder module.
        return self.classifier(self.encoder(x))


In [25]:
# Classifier that uses the transformer model

class PatchEmbedding(nn.Module):
    """Cut an image into non-overlapping square patches and embed each one.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every patch to an ``embed_dim``-sized vector.
    """

    def __init__(self, img_size=28, patch_size=7, in_channels=1, embed_dim=64):
        super().__init__()
        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side ** 2
        self.patch_size = patch_size

        # One output position per patch, one output channel per embedding value.
        self.projection = nn.Conv2d(
            in_channels, embed_dim, kernel_size=patch_size, stride=patch_size
        )

    def forward(self, x):
        """Return patch embeddings shaped (batch, patches, embed_dim)."""
        patch_grid = self.projection(x)       # (batch, embed_dim, rows, cols)
        patch_seq = patch_grid.flatten(2)     # (batch, embed_dim, rows * cols)
        return patch_seq.transpose(1, 2)      # (batch, rows * cols, embed_dim)

class MultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention over a token sequence.

    Args:
        embed_dim: Size of each token embedding; must be divisible by
            ``num_heads``.
        num_heads: Number of heads the embedding is split across.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()

        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Output projection. It is created exactly once, and before the
        # q/k/v projections, so it stays first in the parameter order.
        self.out_linear = nn.Linear(embed_dim, embed_dim)

        # Linear transformations for q, k and v.
        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)

        self.scale = 1.0 / math.sqrt(self.head_dim)

    def forward(self, x):
        """Let every token attend to every token of its own sequence.

        Args:
            x: Tensor unpacked as ``(batch_size, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, embed_dim)``.
        """
        batch_size, seq_len, _ = x.size()

        def split_heads(projected):
            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
            return projected.view(
                batch_size, seq_len, self.num_heads, self.head_dim
            ).transpose(1, 2)

        Q = split_heads(self.q_linear(x))
        K = split_heads(self.k_linear(x))
        V = split_heads(self.v_linear(x))

        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
        attn_weights = nn.functional.softmax(attn_scores, dim=-1)

        # Weighted sum of values, then merge the heads back together.
        x = torch.matmul(attn_weights, V)
        x = x.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim)
        return self.out_linear(x)

class TransformerEncoderBlock(nn.Module):
    """Encoder block: self-attention and an MLP, each wrapped in a residual.

    Layer normalisation is applied to the input of each sub-layer (before
    attention and before the MLP), and dropout to each sub-layer's output
    before it is added back.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        # Position-wise feed-forward network.
        feed_forward = [
            nn.Linear(embed_dim, mlp_dim),
            nn.ReLU(),
            nn.Linear(mlp_dim, embed_dim),
        ]
        self.mlp = nn.Sequential(*feed_forward)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        attended = self.attn(self.norm1(x))
        x = x + self.dropout(attended)

        transformed = self.mlp(self.norm2(x))
        return x + self.dropout(transformed)




class VisualTransformer(nn.Module):
    """Vision-transformer classifier built from the blocks defined above.

    The image is split into patches, a learnable class token is prepended,
    a learnable positional embedding is added, and the class token's final
    state is normalised and mapped to ``num_classes`` scores. With the
    default ``img_size`` and ``patch_size`` (both 28) the whole image is
    one patch. ``parameters`` is accepted but not read by this class.
    """

    def __init__(self, parameters, img_size=28, patch_size=28, in_channels=1, num_classes=10,
                 embed_dim=64, num_heads=4, depth=6, mlp_dim=128):
        super().__init__()

        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        token_count = self.patch_embed.num_patches + 1  # patches + class token

        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.randn(1, token_count, embed_dim))

        encoder_blocks = []
        for _ in range(depth):
            encoder_blocks.append(TransformerEncoderBlock(embed_dim, num_heads, mlp_dim))
        self.transformer = nn.Sequential(*encoder_blocks)

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return ``num_classes`` scores per image."""
        tokens = self.patch_embed(x)

        # Prepend one copy of the class token to every sequence in the batch.
        cls_tokens = self.cls_token.expand(tokens.shape[0], -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)

        # Add positional embedding.
        tokens += self.pos_embed

        encoded = self.transformer(tokens)

        # Only the class token (position 0) feeds the classification head.
        return self.head(self.norm(encoded[:, 0]))


In [18]:
# Classifier that uses the alternative transformer model

class AltPatchEmbedding(nn.Module):
    """Cut an image into non-overlapping square patches and embed each one.

    A convolution whose kernel size and stride both equal ``patch_size``
    projects every patch to an ``embed_dim``-sized vector.
    """

    def __init__(self, img_size=28, patch_size=7, in_channels=1, embed_dim=64):
        super().__init__()
        patches_per_side = img_size // patch_size
        self.num_patches = patches_per_side ** 2
        self.patch_size = patch_size

        # One output position per patch, one output channel per embedding value.
        self.projection = nn.Conv2d(
            in_channels, embed_dim, kernel_size=patch_size, stride=patch_size
        )

    def forward(self, x):
        """Return patch embeddings shaped (batch, patches, embed_dim)."""
        patch_grid = self.projection(x)       # (batch, embed_dim, rows, cols)
        patch_seq = patch_grid.flatten(2)     # (batch, embed_dim, rows * cols)
        return patch_seq.transpose(1, 2)      # (batch, rows * cols, embed_dim)

class AltMultiHeadSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention over a token sequence.

    Args:
        embed_dim: Size of each token embedding; must be divisible by
            ``num_heads``.
        num_heads: Number of heads the embedding is split across.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()

        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads"

        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Output projection. It is created exactly once, and before the
        # q/k/v projections, so it stays first in the parameter order.
        self.out_linear = nn.Linear(embed_dim, embed_dim)

        # Linear transformations for q, k and v.
        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)

        self.scale = 1.0 / math.sqrt(self.head_dim)

    def forward(self, x):
        """Let every token attend to every token of its own sequence.

        Args:
            x: Tensor unpacked as ``(batch_size, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch_size, seq_len, embed_dim)``.
        """
        batch_size, seq_len, _ = x.size()

        def split_heads(projected):
            # (batch, seq, embed) -> (batch, heads, seq, head_dim)
            return projected.view(
                batch_size, seq_len, self.num_heads, self.head_dim
            ).transpose(1, 2)

        Q = split_heads(self.q_linear(x))
        K = split_heads(self.k_linear(x))
        V = split_heads(self.v_linear(x))

        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
        attn_weights = nn.functional.softmax(attn_scores, dim=-1)

        # Weighted sum of values, then merge the heads back together.
        x = torch.matmul(attn_weights, V)
        x = x.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim)
        return self.out_linear(x)

class AltTransformerEncoderBlock(nn.Module):
    """Encoder block: self-attention and an MLP, each wrapped in a residual.

    Layer normalisation is applied to the input of each sub-layer (before
    attention and before the MLP), and dropout to each sub-layer's output
    before it is added back.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()
        self.attn = AltMultiHeadSelfAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        # Position-wise feed-forward network.
        feed_forward = [
            nn.Linear(embed_dim, mlp_dim),
            nn.ReLU(),
            nn.Linear(mlp_dim, embed_dim),
        ]
        self.mlp = nn.Sequential(*feed_forward)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        attended = self.attn(self.norm1(x))
        x = x + self.dropout(attended)

        transformed = self.mlp(self.norm2(x))
        return x + self.dropout(transformed)


class AltVisualTransformer(nn.Module):
    """Vision-transformer classifier built from the ``Alt*`` blocks.

    The image is split into patches, a learnable class token is prepended,
    a learnable positional embedding is added, and the class token's final
    state is normalised and mapped to ``num_classes`` scores. With the
    default ``img_size`` and ``patch_size`` (both 28) the whole image is
    one patch. ``parameters`` is accepted but not read by this class.
    """

    def __init__(self, parameters, img_size=28, patch_size=28, in_channels=1, num_classes=10,
                 embed_dim=64, num_heads=4, depth=6, mlp_dim=128):
        super().__init__()

        self.patch_embed = AltPatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        token_count = self.patch_embed.num_patches + 1  # patches + class token

        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.randn(1, token_count, embed_dim))

        encoder_blocks = []
        for _ in range(depth):
            encoder_blocks.append(AltTransformerEncoderBlock(embed_dim, num_heads, mlp_dim))
        self.transformer = nn.Sequential(*encoder_blocks)

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Return ``num_classes`` scores per image."""
        tokens = self.patch_embed(x)

        # Prepend one copy of the class token to every sequence in the batch.
        cls_tokens = self.cls_token.expand(tokens.shape[0], -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)

        # Add positional embedding.
        tokens += self.pos_embed

        encoded = self.transformer(tokens)

        # Only the class token (position 0) feeds the classification head.
        return self.head(self.norm(encoded[:, 0]))


In [19]:
class ShapeSelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Query, key and value come from one combined linear projection that is
    split into three equal parts along the last dimension.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.qkv_linear = nn.Linear(embed_dim, 3 * embed_dim)  # Combine Q, K, V
        self.out_linear = nn.Linear(embed_dim, embed_dim)
        self.scale = 1.0 / math.sqrt(embed_dim)

    def forward(self, x):
        """Return the attended sequence; the last two dims of ``x`` are (seq, embed)."""
        Q, K, V = self.qkv_linear(x).chunk(3, dim=-1)
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) * self.scale
        attn_weights = torch.softmax(attn_scores, dim=-1)
        return self.out_linear(torch.matmul(attn_weights, V))


class ShapeTransformerEncoderBlock(nn.Module):
    """Pre-norm encoder block: self-attention and a GELU MLP, both residual.

    NOTE(review): a later cell of this file defines another class with this
    same name. Whichever definition is bound to the name when
    ``ShapeVisualTransformer`` is instantiated is the one it uses; confirm
    which is intended.
    """

    def __init__(self, embed_dim, mlp_dim, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = ShapeSelfAttention(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, embed_dim)
        )
        # Constructed but not applied anywhere in forward().
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x


class ShapeVisualTransformer(nn.Module):
    """Transformer classifier over per-sample sets of shape features.

    Args:
        parameter: Accepted for signature compatibility; not read here.
        input_dim: Number of values per feature (default 3).
        embed_dim: Embedding size used inside the encoder (default 8).
        num_classes: Number of output scores per sample.
        depth: Number of encoder blocks.
        mlp_dim: Hidden size of each block's MLP.
    """

    def __init__(self, parameter, input_dim=3, embed_dim=8, num_classes=10, depth=3, mlp_dim=8):
        super().__init__()

        # Input projection layer: maps input_dim -> embed_dim.
        self.input_proj = nn.Linear(input_dim, embed_dim)

        self.transformer = nn.Sequential(
            *[ShapeTransformerEncoderBlock(embed_dim, mlp_dim) for _ in range(depth)]
        )

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def forward(self, batch):
        """Classify every sample in ``batch``.

        Args:
            batch: Iterable of tensors, one per sample. Each is presumably
                shaped ``(num_features, input_dim)`` and may have its own
                ``num_features`` — TODO confirm against the data loader.

        Returns:
            Tensor of shape ``(len(batch), num_classes)``.

        Raises:
            ValueError: If ``batch`` contains no samples.
        """
        pooled = []
        for x in batch:
            tokens = self.transformer(self.input_proj(x))  # (num_features, embed_dim)
            # Global average pooling over this sample's features. Samples are
            # pooled one by one because their lengths can differ.
            pooled.append(tokens.mean(dim=0))

        if not pooled:
            raise ValueError("batch must contain at least one sample")

        return self.head(self.norm(torch.stack(pooled)))

In [20]:
class ShapeTransformerEncoderBlock(nn.Module):
    """Pre-norm encoder block: 2-head self-attention and a GELU MLP, both residual."""

    def __init__(self, embed_dim, mlp_dim):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads=2, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, embed_dim)
        )

    def forward(self, x, mask=None):
        """Return ``x`` after the attention and MLP residual updates.

        Args:
            x: Input sequence(s); batch-first when batched.
            mask: Optional boolean key-padding mask, True at padded positions.
        """
        # The normalised input is evaluated separately for query, key and value.
        # NOTE(review): sharing one tensor would save two LayerNorm passes, but
        # nn.MultiheadAttention may take a different code path when all three
        # arguments are the same object — confirm outputs match before hoisting.
        attn_out, _ = self.attn(self.norm1(x), self.norm1(x), self.norm1(x), key_padding_mask=mask)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x


class ShapeTransformer(nn.Module):
    """Transformer classifier over variable-length sets of shape features.

    Args:
        parameter: Accepted for signature compatibility; not read here.
        input_dim: Number of values per feature (default 3).
        embed_dim: Embedding size used inside the encoder.
        num_classes: Number of output scores per sample.
        depth: Number of encoder blocks.
        mlp_dim: Hidden size of each block's MLP.
        max_features: Samples with more features than this are randomly
            subsampled down to this many on every forward pass.
    """

    def __init__(self, parameter, input_dim=3, embed_dim=128, num_classes=10, depth=4, mlp_dim=64, max_features=50):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, embed_dim)
        self.encoder_blocks = nn.ModuleList(
            [ShapeTransformerEncoderBlock(embed_dim, mlp_dim) for _ in range(depth)]
        )
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)
        self.max_features = max_features

    def forward(self, batch):
        """Classify every sample in ``batch``.

        Args:
            batch: list of feature tensors (each tensor is (num_features, 3)).
                Every sample needs at least one feature row; an empty sample
                would be divided by a valid count of zero.

        Returns:
            Tensor of shape ``(len(batch), num_classes)``.
        """
        sampled_batch = []
        for x in batch:
            # Randomly sample up to `max_features`
            if x.shape[0] > self.max_features:
                indices = torch.randperm(x.shape[0])[:self.max_features]  # Random indices
                x = x[indices]
            sampled_batch.append(x)

        # Project inputs to embedding space
        batch_embedded = [self.input_proj(x) for x in sampled_batch]

        # Pad sequences to max length in batch
        padded_batch = pad_sequence(batch_embedded, batch_first=True, padding_value=0)

        # Padding mask (True = padding) built from the known sequence lengths
        # rather than by looking for all-zero embeddings.
        device = padded_batch.device
        lengths = torch.tensor([x.shape[0] for x in sampled_batch], device=device)
        positions = torch.arange(padded_batch.shape[1], device=device)
        mask = positions.unsqueeze(0) >= lengths.unsqueeze(1)

        # Apply Transformer Encoder blocks
        for block in self.encoder_blocks:
            padded_batch = block(padded_batch, mask=mask)

        # Global average pooling over valid features only. Padded positions are
        # non-zero after the encoder blocks, so they are zeroed before the sum;
        # otherwise a sample's result would depend on how much padding it got.
        valid_counts = lengths.unsqueeze(1).to(padded_batch.dtype)
        valid_sum = padded_batch.masked_fill(mask.unsqueeze(-1), 0.0).sum(dim=1)
        pooled = valid_sum / valid_counts

        return self.head(self.norm(pooled))


## Functions

In [21]:
def create_empty_model(model_name, parameters=None, device="cpu"):
    """Instantiate the architecture called ``model_name`` and move it to ``device``.

    The returned model has freshly initialised weights; no trained weights,
    loss function or optimizer are attached.

    Args:
        model_name: Key into the table of known model classes below.
        parameters: Object handed to the model's constructor as its first
            argument. Defaults to a new empty dict on every call, so the
            default is never shared between calls.
        device: Target passed to ``.to()``.

    Raises:
        KeyError: If ``model_name`` is not a known model.
    """
    if parameters is None:
        parameters = {}

    models_class_map = {
        "NeuralNetwork": NeuralNetwork,
        "ShapeBasedNeuralNetwork": ShapeBasedNeuralNetwork,
        "CONVNet": CONVNet,
        "SimpleAutoencoder": SimpleAutoencoder,
        "ClassifierOfEncodedData": ClassifierOfEncodedData,
        "VisualTransformer": VisualTransformer,
        "AltVisualTransformer": AltVisualTransformer,
        "ShapeVisualTransformer": ShapeVisualTransformer,
        "ShapeTransformer": ShapeTransformer,
    }
    return models_class_map[model_name](parameters).to(device)
    

In [22]:
def create_loss_function(loss_function):
    """Return a new loss-function instance for the given name.

    Known names are ``"CrossEntropyLoss"`` and ``"MSELoss"``; any other
    name raises ``KeyError``. The loss is built with default arguments.
    """
    known_losses = {
        "CrossEntropyLoss": nn.CrossEntropyLoss,
        "MSELoss": nn.MSELoss,
    }
    loss_cls = known_losses[loss_function]
    return loss_cls()

In [23]:
def create_optimizer(model, optimizer_name, optimizer_params):
    """Return an optimizer of the named kind over ``model``'s parameters.

    Known names are ``"SGD"`` and ``"Adam"``; any other name raises
    ``KeyError``. ``optimizer_params`` is unpacked as keyword arguments of
    the optimizer's constructor.
    """
    known_optimizers = {
        "SGD": torch.optim.SGD,
        "Adam": torch.optim.Adam,
    }
    optimizer_cls = known_optimizers[optimizer_name]
    return optimizer_cls(model.parameters(), **optimizer_params)