In [1]:
import torch
from sentence_transformers import SentenceTransformer
import numpy as np

class CharacteristicVectorExtractor:
    """Build simple "characteristic vectors" from sentence embeddings.

    Each text is encoded with a ``SentenceTransformer`` model and described
    by a dict holding the embedding itself plus its mean and standard
    deviation.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-transformers model called *model_name*."""
        self.model = SentenceTransformer(model_name)

    def extract_characteristics(self, texts, batch_size=32):
        """Extract characteristic vectors for a list of texts.

        Args:
            texts: An iterable of strings. A single ``str`` is treated as
                one text rather than being iterated character by character.
            batch_size: Number of texts handed to the model per forward
                pass. Must be at least 1.

        Returns:
            A list with one dict per input text, in input order, with keys
            ``'embedding'`` (the embedding vector), ``'mean'`` and ``'std'``
            (``np.mean`` / ``np.std`` of that vector). Empty input yields
            an empty list.

        Raises:
            ValueError: If *batch_size* is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        # A bare string is iterable; without this guard every character
        # would be encoded as its own "text".
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        # Nothing to encode -- skip the model call entirely.
        if not texts:
            return []

        # One encode() call over the whole list lets the model batch the
        # inputs instead of running a separate forward pass per text.
        embeddings = self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=False,
        )

        # For simplicity, the embedding itself is the characteristic vector.
        # In an advanced version, features would be extracted from each layer.
        return [
            {
                'embedding': embedding,
                'mean': np.mean(embedding),
                'std': np.std(embedding),
            }
            for embedding in embeddings
        ]

    def batch_extract(self, texts, batch_size=32):
        """Extract characteristics in batches.

        Delegates to ``extract_characteristics`` and forwards *batch_size*
        to the encoder so that the batching actually happens inside the
        model rather than only in how the input list is sliced.

        Raises:
            ValueError: If *batch_size* is smaller than 1.
        """
        return self.extract_characteristics(texts, batch_size=batch_size)

# Smoke test: encode two sample sentences and report the embedding size
if __name__ == "__main__":
    test_texts = ["This is a great movie", "Amazing storyline"]
    extractor = CharacteristicVectorExtractor()
    characteristics = extractor.extract_characteristics(test_texts)

    # Every entry carries its vector under 'embedding'; the first one is
    # enough to read off the dimensionality.
    first_embedding = characteristics[0]['embedding']
    print("Characteristic vectors extracted successfully!")
    print(f"Vector dimension: {len(first_embedding)}")

Characteristic vectors extracted successfully!
Vector dimension: 384
