In [None]:
######MoE

In [15]:
import os
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
from transformers import BertTokenizer, BertModel
from torch.autograd import Function


class TopKBinarizer(Function):
    """
    Top-k Binarizer.
    Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
    is among the k% highest values of S.
    """
    @staticmethod
    def forward(ctx, inputs: torch.tensor, threshold: float):
        # Get the subnetwork by sorting the inputs and using the top threshold %
        mask = inputs.clone()
        _, idx = inputs.flatten().sort(descending=True)
        j = int(threshold * inputs.numel())

        # flat_out and mask access the same memory.
        flat_out = mask.flatten()
        flat_out[idx[j:]] = 0
        flat_out[idx[:j]] = 1
        return mask

    @staticmethod
    def backward(ctx, gradOutput):
        return gradOutput, None

class SelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        assert config.hidden_size % config.num_attention_heads == 0
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.out = nn.Linear(config.hidden_size, config.hidden_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # Trainable mask scores for attention heads
        self.mask_score_heads_1 = nn.Parameter(torch.ones(self.num_attention_heads))
        self.mask_score_heads_2 = nn.Parameter(torch.ones(self.num_attention_heads))

        self.topk_threshold = config.topk_threshold

        # Gate for selecting mask: small two-layer fully connected network
        self.gate_heads = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 2)  # 2 possible masks for heads
        )

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        # Generate the corresponding binary mask for attention heads
        mask_heads_1 = TopKBinarizer.apply(self.mask_score_heads_1, self.topk_threshold)
        mask_heads_2 = TopKBinarizer.apply(self.mask_score_heads_2, self.topk_threshold)

        # Compute the gate scores for attention heads
        gate_scores_heads = self.gate_heads(hidden_states.mean(dim=1))
        gate_probs_heads = F.softmax(gate_scores_heads, dim=-1)

        # Select the mask with the highest probability for attention heads
        selected_mask_index_heads = torch.argmax(gate_probs_heads, dim=1)

        if selected_mask_index_heads[0] == 0:
            selected_mask_heads = mask_heads_1
        else:
            selected_mask_heads = mask_heads_2

        # Apply mask to the attention heads
        # `selected_mask_heads` has shape [num_attention_heads]
        # We need to expand it to [batch_size, num_attention_heads, seq_length, seq_length]
        batch_size, seq_length, _ = hidden_states.size()
        masked_attention_heads = selected_mask_heads.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand(batch_size, self.num_attention_heads, seq_length, seq_length)

        # Forward pass without modifying q, k, v
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)

        # Apply mask to the attention probs
        attention_probs = attention_probs * masked_attention_heads

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        attention_output = self.out(context_layer)

        return attention_output


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.fc1 = nn.Linear(config.hidden_size, 4 * config.hidden_size) 
        self.gelu = nn.GELU()
        self.fc2 = nn.Linear(4 * config.hidden_size, config.hidden_size) 
        
        # Trainable mask scores
        self.mask_score_1 = nn.Parameter(torch.ones(4 * config.hidden_size, config.hidden_size))
        self.mask_score_2 = nn.Parameter(torch.ones(4 * config.hidden_size, config.hidden_size))
        
        self.topk_threshold = config.topk_threshold

        # Gate for selecting mask: small two-layer fully connected network
        self.gate = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 2)
        )

    def forward(self, x):
        # Generate the corresponding binary mask
        mask1 = TopKBinarizer.apply(self.mask_score_1, self.topk_threshold)
        mask2 = TopKBinarizer.apply(self.mask_score_2, self.topk_threshold)
        
        # Compute the gate scores
        gate_scores = self.gate(x.mean(dim=1))  # Using mean of x as a simple representation
        gate_probs = F.softmax(gate_scores, dim=-1)
        
        # Select the mask with the highest probability
        selected_mask_index = torch.argmax(gate_probs, dim=1)
        
        if selected_mask_index[0] == 0:
            selected_mask = mask1
        else:
            selected_mask = mask2
        
        # Apply mask to the dense layer's weights
        masked_weights = self.fc1.weight * selected_mask

        # Forward pass with masked weights
        x = F.linear(x, masked_weights, self.fc1.bias)
        x = self.gelu(x)
        x = self.fc2(x)
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.hidden_size)
        self.attn = SelfAttention(config)
        self.ln2 = nn.LayerNorm(config.hidden_size)
        self.mlp = MLP(config)

    def forward(self, x, attention_mask=None):
        attn_output = self.attn(self.ln1(x), attention_mask)
        x = x + attn_output
        mlp_output = self.mlp(self.ln2(x))
        x = x + mlp_output
        return x

@dataclass
class BERTConfig:
    hidden_size: int = 768
    num_attention_heads: int = 12
    intermediate_size: int = 4 * hidden_size 
    hidden_dropout_prob: float = 0.1
    attention_probs_dropout_prob: float = 0.1
    num_hidden_layers: int = 12
    vocab_size: int = 30522
    max_position_embeddings: int = 512
    topk_threshold: float = 0.5

class BERT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layers = nn.ModuleList([Block(config) for _ in range(config.num_hidden_layers)])
        self.ln_f = nn.LayerNorm(config.hidden_size)

    def forward(self, input_ids, attention_mask=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        token_embeddings = self.embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        x = token_embeddings + position_embeddings

        for layer in self.layers:
            x = layer(x, attention_mask)

        x = self.ln_f(x)
        return x

    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained BERT model weights from HuggingFace"""
        print(f"Loading weights from pretrained BERT: {model_type}")

        # Instantiate the BERT model
        config_args = {
            'bert-base-uncased': dict(num_hidden_layers=12, num_attention_heads=12, hidden_size=768, intermediate_size=4 * 768),
            'bert-large-uncased': dict(num_hidden_layers=24, num_attention_heads=16, hidden_size=1024, intermediate_size=4 * 1024),
        }[model_type]
        config_args['vocab_size'] = 30522
        config_args['max_position_embeddings'] = 512
        config = BERTConfig(**config_args)
        model = BERT(config)

        # Load HuggingFace BERT model for weight extraction
        model_hf = BertModel.from_pretrained(model_type)

        # Copy weights from HuggingFace model to our model
        pretrained_dict = model_hf.state_dict()
        model_dict = model.state_dict()

        # Filter out unnecessary keys
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}

        # Overwrite entries in the existing state dict
        model_dict.update(pretrained_dict)

        # Load the new state dict
        model.load_state_dict(model_dict)

        return model



In [103]:
#####for test

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function

class TopKBinarizer(Function):
    @staticmethod
    def forward(ctx, inputs: torch.Tensor, threshold: float):
        mask = inputs.clone()
        _, idx = inputs.flatten().sort(descending=True)
        j = int(threshold * inputs.numel())
        
        flat_out = mask.flatten()
        flat_out[idx[j:]] = 0
        flat_out[idx[:j]] = 1
        return mask

    @staticmethod
    def backward(ctx, gradOutput):
        return gradOutput, None

class MLP(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(hidden_size, 4 * hidden_size) 
        self.gelu = nn.GELU()
        self.fc2 = nn.Linear(4 * hidden_size, hidden_size) 
        
        # Trainable mask scores
        self.mask_score_1 = nn.Parameter(torch.rand(4 * hidden_size, hidden_size))
        self.mask_score_2 = nn.Parameter(torch.rand(4 * hidden_size, hidden_size))
        
        self.topk_threshold = 0.2

        # Gate for selecting mask: small two-layer fully connected network
        self.gate = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2)
        )

    def forward(self, x):
        # Generate the corresponding binary mask
        mask1 = TopKBinarizer.apply(self.mask_score_1, self.topk_threshold)
        mask2 = TopKBinarizer.apply(self.mask_score_2, self.topk_threshold)
        
        # Compute the gate scores
        gate_scores = self.gate(x.mean(dim=1))  # Using mean of x as a simple representation
        gate_probs = F.softmax(gate_scores, dim=-1)
        
        # Select the mask with the highest probability
        selected_mask_index = torch.argmax(gate_probs, dim=1) 
        
        # Apply the appropriate mask to the dense layer's weights for each sample
        masked_weights = self.fc1.weight.unsqueeze(0).repeat(x.size(0), 1, 1)  # Repeat weights for batch size

        for i in range(x.size(0)):
            if selected_mask_index[i] == 0:
                masked_weights[i] *= mask1
            else:
                masked_weights[i] *= mask2

        # Forward pass with masked weights
        x = torch.bmm(x, masked_weights.transpose(1, 2))  # Batch matrix multiplication
        x = self.gelu(x)
        x = self.fc2(x)

        return selected_mask_index

#example
hidden_size = 3
mlp = MLP(hidden_size)
x = torch.randn(2, 3, hidden_size)  # 2个sample，3个时间步，每个时间步hidden_size维度

# 计算selected_mask
output = mlp(x)
print(output) 


tensor([1, 1])


In [None]:
#### init

In [30]:
import os
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
from transformers import BertTokenizer, BertModel
from torch.autograd import Function
import torch.nn.init as init


class TopKBinarizer(Function):
    """
    Top-k Binarizer.
    Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
    is among the k% highest values of S.
    """
    @staticmethod
    def forward(ctx, inputs: torch.Tensor, threshold: float):
        # Get the subnetwork by sorting the inputs and using the top threshold %
        mask = inputs.clone()
        _, idx = inputs.flatten().sort(descending=True)
        j = int(threshold * inputs.numel())

        # flat_out and mask access the same memory.
        flat_out = mask.flatten()
        flat_out[idx[j:]] = 0
        flat_out[idx[:j]] = 1
        ctx.save_for_backward(inputs, mask)
        return mask

    @staticmethod
    def backward(ctx, grad_output):
        inputs, mask = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[mask == 0] = 0  # Only pass gradients for elements that are 1 in the mask
        return grad_input, None

class SelfAttention(nn.Module):
    def __init__(self, config, mask_scale=1.0):
        super().__init__()
        assert config.hidden_size % config.num_attention_heads == 0
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.out = nn.Linear(config.hidden_size, config.hidden_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # Trainable mask scores for attention heads
        self.mask_score_heads_1 = nn.Parameter(torch.Tensor(self.num_attention_heads))
        self.mask_score_heads_2 = nn.Parameter(torch.Tensor(self.num_attention_heads))

        self.mask_scale = mask_scale
        self.init_mask()

        self.topk_threshold = config.topk_threshold

        # Gate for selecting mask: small two-layer fully connected network
        self.gate_heads = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 2)  # 2 possible masks for heads
        )

    def init_mask(self):
        init.constant_(self.mask_score_heads_1, val=self.mask_scale)
        init.constant_(self.mask_score_heads_2, val=self.mask_scale)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        # Generate the corresponding binary mask for attention heads
        mask_heads_1 = TopKBinarizer.apply(self.mask_score_heads_1, self.topk_threshold)
        mask_heads_2 = TopKBinarizer.apply(self.mask_score_heads_2, self.topk_threshold)

        # Compute the gate scores for attention heads
        gate_scores_heads = self.gate_heads(hidden_states.mean(dim=1))
        gate_probs_heads = F.softmax(gate_scores_heads, dim=-1)

        # Select the mask with the highest probability for attention heads
        selected_mask_index_heads = torch.argmax(gate_probs_heads, dim=1)

        if selected_mask_index_heads[0] == 0:
            selected_mask_heads = mask_heads_1
        else:
            selected_mask_heads = mask_heads_2

        # Apply mask to the attention heads
        # `selected_mask_heads` has shape [num_attention_heads]
        # We need to expand it to [batch_size, num_attention_heads, seq_length, seq_length]
        batch_size, seq_length, _ = hidden_states.size()
        masked_attention_heads = selected_mask_heads.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand(batch_size, self.num_attention_heads, seq_length, seq_length)

        # Forward pass without modifying q, k, v
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)

        # Apply mask to the attention probs
        attention_probs = attention_probs * masked_attention_heads

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)
        attention_output = self.out(context_layer)

        return attention_output

class MLP(nn.Module):
    def __init__(self, config, mask_scale=1.0):
        super().__init__()
        self.fc1 = nn.Linear(config.hidden_size, 4 * config.hidden_size)
        self.gelu = nn.GELU()
        self.fc2 = nn.Linear(4 * config.hidden_size, config.hidden_size)

        # Trainable mask scores
        self.mask_score_1 = nn.Parameter(torch.Tensor(4 * config.hidden_size, config.hidden_size))
        self.mask_score_2 = nn.Parameter(torch.Tensor(4 * config.hidden_size, config.hidden_size))

        self.mask_scale = mask_scale
        self.init_mask()

        self.topk_threshold = config.topk_threshold

        # Gate for selecting mask: small two-layer fully connected network
        self.gate_mlp = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 2)
        )

    def init_mask(self):
        init.constant_(self.mask_score_1, val=self.mask_scale)
        init.constant_(self.mask_score_2, val=self.mask_scale)

    def forward(self, x):
        # Generate the corresponding binary mask
        mask1 = TopKBinarizer.apply(self.mask_score_1, self.topk_threshold)
        mask2 = TopKBinarizer.apply(self.mask_score_2, self.topk_threshold)

        # Compute the gate scores
        gate_scores = self.gate_mlp(x.mean(dim=1))  # Using mean of x as a simple representation
        gate_probs = F.softmax(gate_scores, dim=-1)

        # Select the mask with the highest probability
        selected_mask_index = torch.argmax(gate_probs, dim=1)

        if selected_mask_index[0] == 0:
            selected_mask = mask1
        else:
            selected_mask = mask2

        # Apply mask to the dense layer's weights
        masked_weights = self.fc1.weight * selected_mask

        # Forward pass with masked weights
        x = F.linear(x, masked_weights, self.fc1.bias)
        x = self.gelu(x)
        x = self.fc2(x)
        return x

class Block(nn.Module):
    def __init__(self, config, mask_scale=1.0):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.hidden_size)
        self.attn = SelfAttention(config, mask_scale)
        self.ln2 = nn.LayerNorm(config.hidden_size)
        self.mlp = MLP(config, mask_scale)

    def forward(self, x, attention_mask=None):
        attn_output = self.attn(self.ln1(x), attention_mask)
        x = x + attn_output
        mlp_output = self.mlp(self.ln2(x))
        x = x + mlp_output
        return x

@dataclass
class BERTConfig:
    hidden_size: int = 768
    num_attention_heads: int = 12
    intermediate_size: int = 4 * hidden_size
    hidden_dropout_prob: float = 0.1
    attention_probs_dropout_prob: float = 0.1
    num_hidden_layers: int = 12
    vocab_size: int = 30522
    max_position_embeddings: int = 512
    topk_threshold: float = 0.5

class BERT(nn.Module):
    def __init__(self, config, mask_scale=1.0):
        super().__init__()
        self.config = config

        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.layers = nn.ModuleList([Block(config, mask_scale) for _ in range(config.num_hidden_layers)])
        self.ln_f = nn.LayerNorm(config.hidden_size)

    def forward(self, input_ids, attention_mask=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        token_embeddings = self.embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        x = token_embeddings + position_embeddings

        for layer in self.layers:
            x = layer(x, attention_mask)

        x = self.ln_f(x)
        return x

    @classmethod
    def from_pretrained(cls, model_type, mask_scale=1.0):
        """Loads pretrained BERT model weights from HuggingFace"""
        print(f"Loading weights from pretrained BERT: {model_type}")

        # Instantiate the BERT model
        config_args = {
            'bert-base-uncased': dict(num_hidden_layers=12, num_attention_heads=12, hidden_size=768, intermediate_size=4 * 768),
            'bert-large-uncased': dict(num_hidden_layers=24, num_attention_heads=16, hidden_size=1024, intermediate_size=4 * 1024),
        }[model_type]
        config_args['vocab_size'] = 30522
        config_args['max_position_embeddings'] = 512
        config = BERTConfig(**config_args)
        model = BERT(config, mask_scale)

        # Load HuggingFace BERT model for weight extraction
        model_hf = BertModel.from_pretrained(model_type)

        # Copy weights from HuggingFace model to our model
        pretrained_dict = model_hf.state_dict()
        model_dict = model.state_dict()

        # Filter out unnecessary keys
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}

        # Overwrite entries in the existing state dict
        model_dict.update(pretrained_dict)

        # Load the new state dict
        model.load_state_dict(model_dict)

        return model


In [31]:
import torch
from torch.nn import CrossEntropyLoss
from transformers import BertTokenizer

class QuestionAnsweringModel(nn.Module):
    def __init__(self, model_type, mask_scale=1.0):
        super().__init__()
        self.bert = BERT.from_pretrained(model_type, mask_scale)
        self.qa_outputs = nn.Linear(self.bert.config.hidden_size, 2)  # For start and end logits

    def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None):
        # Get BERT outputs
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        
        # Get logits for start and end positions
        logits = self.qa_outputs(outputs)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        
        outputs = (start_logits, end_logits)
        
        if start_positions is not None and end_positions is not None:
            # Compute loss
            loss_fct = CrossEntropyLoss()
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs
        
        return outputs

In [34]:
# Example usage
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def prepare_data(question, context, tokenizer, max_length=512):
    inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    return input_ids, attention_mask

# Initialize model
model = QuestionAnsweringModel('bert-base-uncased')

# Sample data
question = "What is the capital of France?"
context = "The capital of France is Paris."
input_ids, attention_mask = prepare_data(question, context, tokenizer)

# Forward pass
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    start_logits, end_logits = outputs

# Get the most likely beginning and end of answer span
start_index = torch.argmax(start_logits)
end_index = torch.argmax(end_logits)

# Convert token ids to tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids[0][start_index:end_index+1])

# Join tokens to form the answer
answer = tokenizer.convert_tokens_to_string(tokens)
print(f"Answer: {answer}")


Loading weights from pretrained BERT: bert-base-uncased
Answer: [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [33]:
# 加载BERT分词器
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 准备输入数据
text = "Hello, how are you? I am using a simplified BERT model for inference."
inputs = tokenizer(text, return_tensors="pt")

# 提取输入ID和注意力掩码
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# 加载预训练模型
model = BERT.from_pretrained("bert-base-uncased")

# 模型推理
model.eval()  # 设置模型为评估模式
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# 输出结果
print(outputs)



Loading weights from pretrained BERT: bert-base-uncased
tensor([[[ 0.7452,  1.4914,  2.3781,  ..., -0.6299, -1.5615, -0.6429],
         [ 1.8190, -0.7379, -1.7863,  ..., -2.0617,  0.6686,  0.7144],
         [ 1.9354,  0.9880,  0.2105,  ...,  1.1575, -0.5406,  1.0018],
         ...,
         [ 0.9247, -0.0671, -1.1082,  ..., -0.2936,  0.2140, -0.4268],
         [ 1.1636,  1.1287,  0.9556,  ..., -0.6875,  0.3120,  2.1783],
         [ 0.7863,  0.4213,  0.9697,  ...,  0.0096,  0.5468, -0.6275]]])
