In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import torch
model_id='prajjwal1/bert-tiny'
# model_id='bert-base-uncased'
# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model_classification = AutoModelForSequenceClassification.from_pretrained(model_id,num_labels=2)
model = AutoModel.from_pretrained(model_id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### I. Bert Embedding 
note:  pooler_output or token_embeddings are useful for data science analysis

In [29]:

# Tokenize input text
text = "BERT is amazing."

tokenizer.tokenize(text)


['bert', 'is', 'amazing', '.']

In [30]:
# Bert tokenizer 
inputs = tokenizer(text, return_tensors='pt')

print('Tokenized sentence "{}":'.format(text),inputs['input_ids'])

Tokenized sentence "BERT is amazing.": tensor([[  101, 14324,  2003,  6429,  1012,   102]])


In [31]:
inputs = tokenizer(text, return_tensors='pt')

# Get hidden states
with torch.no_grad():
    outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    pooler_output = outputs.pooler_output

# Extract word embeddings
cls_embedding = hidden_states[0][0]  # Embedding for [CLS] token
token_embeddings = hidden_states[0]  # Embeddings for all tokens

# Print the embeddings
print("Token Embeddings:", token_embeddings.shape)
print("CLS Embedding:", cls_embedding.shape)
print("pooler_output:", pooler_output.shape)
print("hidden_states", hidden_states.shape)

Token Embeddings: torch.Size([6, 128])
CLS Embedding: torch.Size([128])
pooler_output: torch.Size([1, 128])
hidden_states torch.Size([1, 6, 128])


In [32]:
### for multiple sentence

# Prepare multiple sequences
sentences = ["BERT is amazing.", "Transformers are powerful.", "NLP is fascinating."]

# Tokenize input texts
inputs = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True)

# Get hidden states
with torch.no_grad():
    outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    pooler_output = outputs.pooler_output

# Extract word embeddings
cls_embeddings = hidden_states[:, 0, :]  # Embeddings for [CLS] token for each sequence
token_embeddings = hidden_states  # Embeddings for all tokens in all sequences

# Print the embeddings
print("pooler_output:", pooler_output.shape)
print("Token Embeddings:", token_embeddings.shape)
print("CLS Embeddings:", cls_embeddings.shape)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


pooler_output: torch.Size([3, 128])
Token Embeddings: torch.Size([3, 7, 128])
CLS Embeddings: torch.Size([3, 128])


### II. Bert for classificaiton (Sentiment Analysis)
note: model_classification needs fine-tuning (must)

In [33]:
 
# Example text of classificaiotn
text = "BERT Tiny is a smaller version of BERT."

# Tokenize the input text
inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
# Perform inference
with torch.no_grad():
    outputs = model_classification(**inputs)
    logits = outputs.logits

# Get the predicted class
soft_output = torch.softmax(logits, dim=1)
print(f'Softmax output: {soft_output}')

predicted_class = torch.argmax(logits, dim=1).item()
print(f'Predicted class: {predicted_class}')

Softmax output: tensor([[0.4117, 0.5883]])
Predicted class: 1
