# Feature extraction: text embeddings
1) Using a pretrained BERT model


In [3]:
import torch
from transformers import BertModel, BertTokenizer

# BERT-based model that converts text into embeddings
class FeatureExtractor:
    """Convert text into sentence embeddings using a pretrained BERT model.

    The embedding is the attention-mask-weighted mean of the token vectors
    taken from the second-to-last hidden layer.

    NOTE(review): the default checkpoint, 'bert-base-uncased', is an English
    model. If the inputs are mostly non-English, a multilingual checkpoint
    (e.g. 'bert-base-multilingual-cased') will probably separate topics
    better -- confirm before changing the default.
    """

    def __init__(self, model_name='bert-base-uncased'):
        """Load the tokenizer and the model, and place the model on a device.

        Args:
            model_name: Checkpoint name or path handed to ``from_pretrained``.
        """
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name, output_hidden_states=True)

        # Remember the device: the inputs must be moved to the same device as
        # the model before the forward pass, otherwise inference fails on GPU.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        # Inference only: make sure dropout is disabled.
        self.model.eval()

    def extract_features(self, text: str) -> torch.Tensor:
        """Return the sentence embedding of ``text``.

        Args:
            text: Input text. Anything the tokenizer accepts as a batch
                (e.g. a list of strings) is pooled per item as well, because
                padding positions are excluded from the average.

        Returns:
            A tensor with one row per input text (shape ``(1, hidden_size)``
            for a single string), located on ``self.device``.
        """
        # Tokenize input text (inputs longer than 512 tokens are truncated)
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Move every input tensor to the model's device
        inputs = {name: tensor.to(self.device) for name, tensor in inputs.items()}

        # Forward pass without gradient tracking
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Token vectors from the second-to-last hidden layer
        embeddings = outputs.hidden_states[-2]

        # Average over real tokens only: padding positions have mask 0 and
        # therefore contribute nothing to the sum or to the token count.
        mask = inputs["attention_mask"].unsqueeze(-1).to(embeddings.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        sentence_embedding = (embeddings * mask).sum(dim=1) / token_counts

        return sentence_embedding

    def cosine_similarity(self, a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        """Return the row-wise cosine similarity of ``a`` and ``b`` (along dim 1)."""
        return torch.nn.functional.cosine_similarity(a, b, dim=1)



# Test for feature extraction below

In [4]:
# Smoke test: three pasta-related and three car-related video descriptions
texts = [
    "Название видео: Рецепты итальянской пасты. Описание: Различные блюда из пасты.",
    "Название видео: Спагетти карбонара. Описание: Классическая римская паста.",
    "Название видео: Техники приготовления пасты. Описание: Как сделать свежую пасту.",
    "Название видео: Советы по уходу за автомобилем. Описание: Основной уход за автомобилем.",
    "Название видео: Замена спущенного колеса. Описание: Пошаговое руководство.",
    "Название видео: Основы ремонта автомобилей. Описание: Ремонт автомобилей своими руками.",
]

extractor = FeatureExtractor()
# One embedding per text, in the same order as `texts`
embeddings = list(map(extractor.extract_features, texts))

print(f"Embedding dimension: {embeddings[0].shape}")

# Sanity check: the first embedding must not be an all-zero vector
print(f"Sum of absolute values of the first embedding: {embeddings[0].abs().sum()}")
print(f"Mean value of the first embedding: {embeddings[0].mean()}")

# Texts on the same topic should score higher than texts on different topics
pasta_vs_pasta = extractor.cosine_similarity(embeddings[0], embeddings[1])
pasta_vs_car = extractor.cosine_similarity(embeddings[0], embeddings[3])
print(f"Similarity between text 1 and 2 (both about pasta): {pasta_vs_pasta.item()}")
print(f"Similarity between text 1 and 4 (pasta and car): {pasta_vs_car.item()}")

# Determinism check: embedding the same text twice must give the same vector
embedding1 = extractor.extract_features(texts[0])
embedding2 = extractor.extract_features(texts[0])
self_similarity = extractor.cosine_similarity(embedding1, embedding2)
print(f"Similarity between two embeddings of the same text: {self_similarity.item()}")

Embedding dimension: torch.Size([1, 768])
Sum of absolute values of the first embedding: 294.7742919921875
Mean value of the first embedding: -0.03844819590449333
Similarity between text 1 and 2 (both about pasta): 0.9817097187042236
Similarity between text 1 and 4 (pasta and car): 0.9646569490432739
Similarity between two embeddings of the same text: 1.0


# Real usage
`text` is the input: the string to embed.

`embeddings` is the output: the embedding tensor computed for that string.

In [5]:
# Build the extractor (loads the tokenizer and the BERT model)
extractor = FeatureExtractor()

# Input: one video title together with its description
text = "Название видео: Искусство приготовления пиццы. Описание: Секреты идеального теста и соуса."

# Output: the sentence embedding of that text
embeddings = extractor.extract_features(text)

# Summary statistics of the embedding
print(f"Embedding shape: {embeddings.shape}")
print(f"Embedding mean: {embeddings.mean()}")
print(f"Embedding sum: {embeddings.sum()}")
print(f"Embedding max: {embeddings.max()}")
print(f"Embedding min: {embeddings.min()}")

# Uncomment to print the full embedding tensor.
# Warning: the output is fairly large.

# print(f"Embedding values: {embeddings}")

Embedding shape: torch.Size([1, 768])
Embedding mean: -0.03887264430522919
Embedding sum: -29.854190826416016
Embedding max: 1.4039347171783447
Embedding min: -6.451247215270996
