In [1]:
import spacy

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

sentence = "Invoice number to be generated voice in"
tokens = word_tokenize(sentence)
print(tokens)


['Invoice', 'number', 'to', 'be', 'generated', 'voice', 'in']


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prithvikiran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
sentence = "Invoice number to be generated voice in"
tokens = tokenizer.tokenize(sentence)
print(tokens)




['in', '##vo', '##ice', 'number', 'to', 'be', 'generated', 'voice', 'in']


In [4]:
import spacy

nlp = spacy.load("en_core_web_sm")
sentence = "Invoice number to be generated voice in"
doc = nlp(sentence)
tokens = [token.text for token in doc]
print(tokens)


['Invoice', 'number', 'to', 'be', 'generated', 'voice', 'in']


In [5]:
import nltk
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertModel
import torch

# Download necessary NLTK resources
nltk.download('punkt')

# Given sentences
sentences = [
    "The client received the Invoice via email and promptly processed the payment.",
    "After completing the project, the freelance graphic designer sent an Invoice to the marketing agency.",
    "We noticed a discrepancy on the Invoice, so we contacted customer support for clarification.",
    "The Invoice for the office supplies was due on the 15th of the month, but it was paid early.",
    "The accounting department is responsible for managing and archiving all company Invoices.",
    "Jane carefully reviewed each line item on the Invoice to ensure there were no overcharges.",
    "The vendor issued a revised Invoice after correcting the error in the initial billing.",
    "To maintain a smooth cash flow, the company ensures all Invoices are sent out promptly after services are rendered.",
    "The automated system generates an Invoice immediately after an online purchase is made.",
    "Before filing taxes, the small business owner meticulously organized every Invoice from the past year.",
    "CA-22 Invoice-3334AD# Date Time"
]

# Tokenize each sentence using NLTK
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embeddings(tokenized_sentences, tokenizer, model):
    embeddings = []
    for sentence_tokens in tokenized_sentences:
        # Tokenize the sentence with BERT tokenizer
        inputs = tokenizer(sentence_tokens, return_tensors='pt', is_split_into_words=True, padding=True, truncation=True)
        
        # Get the BERT model's output
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Get the embeddings for each token
        last_hidden_states = outputs.last_hidden_state.squeeze(0)  # Remove the batch dimension
        
        # Align tokens and their embeddings
        word_ids = inputs.word_ids()
        token_embeddings = {}
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                word = sentence_tokens[word_id]
                if word not in token_embeddings:
                    token_embeddings[word] = []
                token_embeddings[word].append(last_hidden_states[idx].numpy())
        
        # Average embeddings for tokens split into multiple word pieces
        averaged_embeddings = {word: torch.tensor(embs).mean(dim=0).numpy() for word, embs in token_embeddings.items()}
        embeddings.append(averaged_embeddings)
    
    return embeddings

# Generate embeddings
bert_embeddings = get_bert_embeddings(tokenized_sentences, tokenizer, model)

# Display the embeddings for the first sentence as an example
print("First sentence tokens and their embeddings:")
for token, embedding in bert_embeddings[0].items():
    print(f"Token: {token}\nEmbedding: {embedding}\n")


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/prithvikiran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


ValueError: word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast` class).