In [1]:
import math

import numpy as np

import torch
from torch import nn

from transformers import AlbertTokenizer, AlbertModel

In [2]:
# activation functions
def gelu(x):
    """Exact (erf-based) GELU activation: ``x * Phi(x)``, with ``Phi`` the standard normal CDF.

    This is the form originally used in the Google BERT repo. OpenAI GPT's gelu
    is slightly different (and gives slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Also see https://arxiv.org/abs/1606.08415

    Args:
        x: input tensor.

    Returns:
        Tensor with the same shape as ``x``.
    """
    # 0.5 * (1 + erf(x / sqrt(2))) is the standard normal CDF evaluated at x.
    normal_cdf = 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
    return x * normal_cdf


def gelu_new(x):
    """Tanh approximation of the GELU activation.

    Implementation of the gelu activation function currently in the Google BERT
    repo (identical to OpenAI GPT).
    Also see https://arxiv.org/abs/1606.08415

    Args:
        x: input tensor.

    Returns:
        Tensor with the same shape as ``x``.
    """
    # Argument of the tanh: sqrt(2/pi) * (x + 0.044715 * x^3).
    inner = math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))
    return 0.5 * x * (1 + torch.tanh(inner))


def swish(x):
    """Swish activation: the input multiplied by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate


def mish(x):
    """Mish activation: the input multiplied by tanh(softplus(input))."""
    soft = nn.functional.softplus(x)
    return x * torch.tanh(soft)


# Lookup table from activation name to the callable implementing it.
ACT2FN = {
    "gelu": gelu,
    "relu": torch.nn.functional.relu,
    "swish": swish,
    "gelu_new": gelu_new,
    "mish": mish,
}

In [4]:
# Decoder for Model
class Decoder(nn.Module):
    """Prediction head that turns hidden states into per-position vocabulary scores.

    Pipeline: ``dense`` (hidden_size -> embedding_size) -> activation ->
    ``layernorm`` -> ``decoder`` (embedding_size -> vocab_size).

    Args:
        hidden_size: Size of the last dimension of the incoming hidden states.
        embedding_size: Width of the intermediate projection that is
            layer-normalized and fed to ``decoder``.
        vocab_size: Number of scores produced for each position.
        hidden_act: Either a key of ``ACT2FN`` or a callable applied to the
            output of ``dense``.
    """

    def __init__(self, hidden_size=768, embedding_size=128, vocab_size=30000, hidden_act='gelu_new'):
        super().__init__()

        self.layernorm = nn.LayerNorm(embedding_size)
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        # dense must emit embedding_size features: that is the width both
        # layernorm and decoder are constructed for.
        self.dense = nn.Linear(hidden_size, embedding_size)
        self.decoder = nn.Linear(embedding_size, vocab_size)
        self.activation = ACT2FN[hidden_act] if isinstance(hidden_act, str) else hidden_act

        # Tie the output layer's bias to self.bias so both names refer to one Parameter.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Compute vocabulary scores.

        Args:
            hidden_states: Tensor whose last dimension is ``hidden_size``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of
            ``vocab_size``.
        """
        hidden_states = self.dense(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.layernorm(hidden_states)

        # decoder already applies the tied bias; adding self.bias again would
        # count it twice.
        prediction_scores = self.decoder(hidden_states)

        return prediction_scores

In [None]:
# SpellCheck Model
class SpellCheckModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.albert