In [1]:
# File: feature_extractor.py
import torch
from sentence_transformers import SentenceTransformer
import numpy as np

class CharacteristicVectorExtractor:
    """Build "characteristic vectors" for texts from sentence embeddings.

    Each characteristic is a dict with three keys:

    * ``'embedding'`` -- the embedding returned by the model for that text,
    * ``'mean'`` -- ``np.mean`` of that embedding,
    * ``'std'`` -- ``np.std`` of that embedding.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-transformers model identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    @staticmethod
    def _as_text_list(texts):
        """Return ``texts`` as a list, treating a lone string as one text."""
        # A str is itself iterable; without this guard it would be split
        # into single characters and each character embedded separately.
        if isinstance(texts, str):
            return [texts]
        return list(texts)

    @staticmethod
    def _summarize(embeddings):
        """Wrap each embedding row in a characteristic dict."""
        # Assumes the model returned one row per input text, in input order.
        return [
            {
                'embedding': vector,
                'mean': np.mean(vector),
                'std': np.std(vector),
            }
            for vector in np.asarray(embeddings)
        ]

    def extract_characteristics(self, texts):
        """
        Extract characteristic vectors for a list of texts.

        All texts are handed to the model in a single ``encode`` call rather
        than one call per text, so the model can batch them internally.

        Args:
            texts: An iterable of strings, or a single string (treated as a
                one-element list).

        Returns:
            A list with one characteristic dict per text, in input order.
            Empty input yields an empty list without calling the model.
        """
        text_list = self._as_text_list(texts)
        if not text_list:
            return []

        # For simplicity, the embedding itself is the characteristic vector.
        embeddings = self.model.encode(text_list, show_progress_bar=False)
        return self._summarize(embeddings)

    def batch_extract(self, texts, batch_size=32):
        """Extract characteristics, encoding ``batch_size`` texts at a time.

        Args:
            texts: An iterable of strings, or a single string.
            batch_size: Number of texts the model encodes per batch; must be
                at least 1.

        Returns:
            A list with one characteristic dict per text, in input order.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        text_list = self._as_text_list(texts)
        if not text_list:
            return []

        # Let the model do the batching so batch_size really controls how
        # many texts go through it at once.
        embeddings = self.model.encode(
            text_list,
            batch_size=batch_size,
            show_progress_bar=False,
        )
        return self._summarize(embeddings)

# Smoke test: embed two short sentences and report the embedding size.
if __name__ == "__main__":
    test_texts = ["This is a great movie", "Amazing storyline"]
    extractor = CharacteristicVectorExtractor()
    characteristics = extractor.extract_characteristics(test_texts)
    print("Characteristic vectors extracted successfully!")
    # Only the first text's embedding is inspected for the reported size.
    first_embedding = characteristics[0]['embedding']
    print(f"Vector dimension: {len(first_embedding)}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Characteristic vectors extracted successfully!
Vector dimension: 384
