# Feature extraction: embeddings from text
1) Using a pretrained BERT model


In [41]:
import torch
from transformers import BertModel, BertTokenizer

# Model BERT text to embeddings
class TextFeatureExtractor:
    """Turn text into fixed-size sentence embeddings with a pretrained BERT model.

    The tokenizer and model are loaded via ``from_pretrained`` (cached under
    ``./cache``) and the model is moved to CUDA when available, otherwise CPU.

    NOTE(review): the default checkpoint ``bert-base-uncased`` is, per its model
    card, trained on English text. If the inputs are in another language, a
    multilingual checkpoint passed as ``model_name`` may give more
    discriminative similarities -- confirm before relying on the scores.

    Attributes:
        tokenizer: The ``BertTokenizer`` used to encode input text.
        model: The ``BertModel`` (with ``output_hidden_states=True``).
        device: The ``torch.device`` that the model and all inputs live on.
    """

    def __init__(self, model_name='bert-base-uncased'):
        """Load tokenizer and model weights and place the model on the device.

        Args:
            model_name: Name or path of the pretrained checkpoint to load.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_name, cache_dir="./cache")
        self.model = BertModel.from_pretrained(model_name, output_hidden_states=True, cache_dir="./cache")

        # Keep the device on the instance: inputs must be moved to the same
        # device as the model before every forward pass.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        # Explicit inference mode (dropout off) so repeated calls are deterministic.
        self.model.eval()

    def extract_features(self, text: str) -> torch.Tensor:
        """Compute a sentence embedding for ``text``.

        The text is tokenized (truncated to 512 tokens), run through the model
        without gradient tracking, and the second-to-last entry of
        ``hidden_states`` is mean-pooled over the token axis. Padding positions
        are excluded from the mean via the attention mask, so the result does
        not depend on how much padding the tokenizer added.

        Args:
            text: The input text. The value is passed straight to the
                tokenizer, so a list of strings yields one row per string.

        Returns:
            A tensor of shape ``(batch, hidden_size)`` located on ``self.device``.
        """
        # Tokenize input text
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Move every input tensor to the model's device; without this the
        # forward pass fails with a device mismatch when the model is on CUDA.
        inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}

        # Get embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Extract embeddings from the second-to-last hidden state
        embeddings = outputs.hidden_states[-2]

        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            # No mask available: fall back to a plain mean over all positions.
            return torch.mean(embeddings, dim=1)

        # Masked mean: zero out padded positions, then divide by the number of
        # real tokens per row (clamped to avoid division by zero).
        mask = attention_mask.unsqueeze(-1).to(embeddings.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        sentence_embedding = (embeddings * mask).sum(dim=1) / token_counts

        return sentence_embedding

    def cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        """Return the row-wise cosine similarity of ``a`` and ``b`` along ``dim=1``.

        Args:
            a: First batch of embeddings.
            b: Second batch of embeddings, same layout as ``a``.

        Returns:
            A tensor with one similarity value per row.
        """
        return torch.nn.functional.cosine_similarity(a, b, dim=1)



# Test for feature extraction below

In [42]:
# Smoke test: three pasta-related texts followed by three car-related texts
texts = [
    "Рецепты итальянской пасты.  Различные блюда из пасты. ",
    "Спагетти карбонара. Классическая римская паста.",
    "Техники приготовления пасты. Как сделать свежую пасту.",
    "Советы по уходу за автомобилем. Основной уход за автомобилем.",
    "Замена спущенного колеса. Пошаговое руководство.",
    "Основы ремонта автомобилей. Ремонт автомобилей своими руками."
]

extractor = TextFeatureExtractor()
# One embedding per text, in the same order as `texts`
embeddings = list(map(extractor.extract_features, texts))

print(f"Embedding dimension: {embeddings[0].shape}")

# Sanity check: the first embedding should not be an all-zero vector
print(
    f"Sum of absolute values of the first embedding: {torch.sum(torch.abs(embeddings[0]))}",
    f"Mean value of the first embedding: {torch.mean(embeddings[0])}",
    sep="\n",
)

# Compare a same-topic pair against a cross-topic pair
print(
    f"Similarity between text 1 and 2 (both about pasta): {extractor.cosine_similarity(embeddings[0], embeddings[1]).item()}",
    f"Similarity between text 1 and 4 (pasta and car): {extractor.cosine_similarity(embeddings[0], embeddings[3]).item()}",
    sep="\n",
)

# Consistency check: embedding the same text twice should give matching vectors
embedding1, embedding2 = (extractor.extract_features(texts[0]) for _ in range(2))
print(f"Similarity between two embeddings of the same text: {extractor.cosine_similarity(embedding1, embedding2).item()}")

Embedding dimension: torch.Size([1, 768])
Sum of absolute values of the first embedding: 298.31231689453125
Mean value of the first embedding: -0.036628805100917816
Similarity between text 1 and 2 (both about pasta): 0.967228889465332
Similarity between text 1 and 4 (pasta and car): 0.9320670366287231
Similarity between two embeddings of the same text: 1.0000001192092896


# Real usage
`text` — the input: the string to embed

`embeddings` — the output: the sentence embedding tensor computed for `text`

In [44]:
# Build the extractor and embed a single input string
extractor = TextFeatureExtractor()

text = "Искусство приготовления пиццы. Секреты идеального теста и соуса."

embeddings = extractor.extract_features(text)

# Summary statistics of the resulting embedding, one per output line
print(
    f"Embedding shape: {embeddings.shape}",
    f"Embedding mean: {torch.mean(embeddings)}",
    f"Embedding sum: {torch.sum(embeddings)}",
    f"Embedding max: {torch.max(embeddings)}",
    f"Embedding min: {torch.min(embeddings)}",
    sep="\n",
)

# To inspect the raw embedding values, uncomment the line below.
# Caution: the output is fairly large.

# print(f"Embedding values: {embeddings}")

Embedding shape: torch.Size([1, 768])
Embedding mean: -0.037250421941280365
Embedding sum: -28.60832405090332
Embedding max: 1.4767484664916992
Embedding min: -5.696453094482422
