In [None]:
!pip install transformers
import torch
import json
from transformers import XLMRobertaTokenizerFast
import argparse
import os
import logging

# Set up logging: timestamped INFO-level messages on the root logger.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

# Constants - should match those used in training
# (the checkpoint is loaded with load_state_dict, so layer sizes must agree).
MAX_SEQ_LENGTH = 512  # tokenizer max_length; inputs are padded/truncated to this many tokens
NUM_LABELS_SPAN = 3  # 0:O, 1:B, 2:I (default output size of SpanDetector)
NUM_LABELS_POLARITY = 4  # Positive, Negative, Neutral, None (default output size of PolarityClassifier)
NUM_LABELS_INTENSITY = 3  # Strong, Average, Weak (default output size of IntensityClassifier)
SPAN_EMBEDDING_DIM = 768  # NOTE(review): not referenced anywhere else in the visible code
RELATION_EMBEDDING_DIM = 256  # output width of CrossSpanAttention; input width of the relation/polarity/intensity heads
ADAPTER_SIZE = 128  # default bottleneck width of LanguageAdapter

# Model classes, redefined here for inference.
# StructuredSentimentModel and all of its component modules are defined below in
# this file (they are not imported). They must mirror the architecture used in
# training so that the saved checkpoint's state dict can be loaded.

from torch import nn

# Neural network modules
class SelfAttentionLayer(nn.Module):
    """Multi-head self-attention followed by a residual connection and LayerNorm.

    Args:
        input_dim: Size of the last dimension of the input (and of the output).
        num_heads: Number of attention heads.
        head_dim: Width of each head; the internal width is num_heads * head_dim.
    """

    def __init__(self, input_dim, num_heads=8, head_dim=96):
        super(SelfAttentionLayer, self).__init__()
        self.input_dim = input_dim
        self.num_heads = num_heads
        self.head_dim = head_dim
        # Attribute names are state-dict keys of saved checkpoints; keep them stable.
        self.query = nn.Linear(input_dim, num_heads * head_dim)
        self.key = nn.Linear(input_dim, num_heads * head_dim)
        self.value = nn.Linear(input_dim, num_heads * head_dim)
        self.output_projection = nn.Linear(num_heads * head_dim, input_dim)
        self.layer_norm = nn.LayerNorm(input_dim)

    def forward(self, x, mask=None):
        """Apply self-attention over the sequence dimension.

        Args:
            x: Tensor of shape (batch, seq_len, input_dim).
            mask: Optional (batch, seq_len) tensor; positions equal to 0 are
                excluded as attention keys.

        Returns:
            Tensor of shape (batch, seq_len, input_dim).
        """
        batch_size, seq_len, _ = x.size()

        def split_heads(projected):
            # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
            return projected.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        q = split_heads(self.query(x))
        k = split_heads(self.key(x))
        v = split_heads(self.value(x))

        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        if mask is not None:
            # Broadcast the key mask over heads and query positions.
            key_is_masked = (mask == 0).unsqueeze(1).unsqueeze(2)
            # Use the dtype's own minimum rather than a literal -1e9, which is
            # not representable in float16. In float32 the softmax result is
            # the same: masked keys receive zero weight.
            scores = scores.masked_fill(key_is_masked, torch.finfo(scores.dtype).min)
        attn_weights = torch.softmax(scores, dim=-1)

        context = torch.matmul(attn_weights, v).transpose(1, 2).contiguous()
        context = context.view(batch_size, seq_len, self.num_heads * self.head_dim)
        output = self.output_projection(context)
        return self.layer_norm(output + x)

class SpanDetector(nn.Module):
    """Per-token tagger: Linear -> ReLU -> Linear, producing label logits."""

    def __init__(self, input_dim, hidden_dim=256, num_labels=NUM_LABELS_SPAN):
        super().__init__()
        # Attribute names are state-dict keys; they must not be renamed.
        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, x):
        """Map features with last dimension input_dim to logits with last dimension num_labels."""
        hidden_features = self.activation(self.hidden(x))
        return self.classifier(hidden_features)

class CrossSpanAttention(nn.Module):
    """Let spans attend to each other, then project and layer-normalise the result."""

    def __init__(self, input_dim, output_dim=RELATION_EMBEDDING_DIM):
        super().__init__()
        # Attribute names are state-dict keys; they must not be renamed.
        self.attention = nn.MultiheadAttention(input_dim, num_heads=4, batch_first=True)
        self.projection = nn.Linear(input_dim, output_dim)
        self.layer_norm = nn.LayerNorm(output_dim)

    def forward(self, spans, span_masks=None):
        """Return span embeddings contextualised over the other spans.

        Args:
            spans: Span embeddings used as query, key and value.
            span_masks: Optional boolean mask, True for real spans. It is
                inverted because MultiheadAttention expects True for padding.
        """
        if span_masks is None:
            padding_mask = None
        else:
            padding_mask = ~span_masks
        attended, _unused_weights = self.attention(spans, spans, spans, key_padding_mask=padding_mask)
        return self.layer_norm(self.projection(attended))

class RelationClassifier(nn.Module):
    """Score concatenated span-pair embeddings: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, input_dim, hidden_dim=256, num_labels=2):  # NUM_LABELS_RELATION
        super().__init__()
        # A pair is two span embeddings concatenated, hence the doubled input width.
        pair_dim = input_dim * 2
        self.hidden = nn.Linear(pair_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, span_pairs):
        """Return relation logits for inputs whose last dimension is 2 * input_dim."""
        features = self.activation(self.hidden(span_pairs))
        return self.classifier(self.dropout(features))

class PolarityClassifier(nn.Module):
    """Polarity head: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, input_dim, hidden_dim=128, num_labels=NUM_LABELS_POLARITY):
        super().__init__()
        # Attribute names are state-dict keys; they must not be renamed.
        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, x):
        """Return polarity logits for inputs whose last dimension is input_dim."""
        features = self.activation(self.hidden(x))
        return self.classifier(self.dropout(features))

class IntensityClassifier(nn.Module):
    """Intensity head: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, input_dim, hidden_dim=128, num_labels=NUM_LABELS_INTENSITY):
        super().__init__()
        # Attribute names are state-dict keys; they must not be renamed.
        self.hidden = nn.Linear(input_dim, hidden_dim)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, x):
        """Return intensity logits for inputs whose last dimension is input_dim."""
        features = self.activation(self.hidden(x))
        return self.classifier(self.dropout(features))

class LanguageAdapter(nn.Module):
    """Bottleneck adapter: down-project, ReLU, up-project, then residual + LayerNorm."""

    def __init__(self, input_dim, bottleneck_dim=ADAPTER_SIZE):
        super().__init__()
        # Attribute names are state-dict keys; they must not be renamed.
        self.down_project = nn.Linear(input_dim, bottleneck_dim)
        self.activation = nn.ReLU()
        self.up_project = nn.Linear(bottleneck_dim, input_dim)
        self.layer_norm = nn.LayerNorm(input_dim)

    def forward(self, x):
        """Return the adapted features, with the same last dimension as x."""
        bottleneck = self.activation(self.down_project(x))
        restored = self.up_project(bottleneck)
        # Residual connection around the bottleneck, then normalise.
        return self.layer_norm(restored + x)

# Main model class
class StructuredSentimentModel(nn.Module):
    """XLM-RoBERTa encoder with span, relation, polarity and intensity heads.

    Inference pipeline (see ``forward``): encode tokens, tag holder / target /
    expression spans with BIO labels, mean-pool each predicted span, let the
    spans attend to each other, then classify holder-expression and
    target-expression pairs as well as the polarity and intensity of each
    expression.

    Args:
        pretrained_model_name: Name or path passed to
            ``XLMRobertaModel.from_pretrained``.
        use_adapters: If True, create one ``LanguageAdapter`` per language.
        num_languages: Number of adapters to create when ``use_adapters``.
    """

    def __init__(self, pretrained_model_name="xlm-roberta-base", use_adapters=False, num_languages=8):
        super(StructuredSentimentModel, self).__init__()
        from transformers import XLMRobertaModel
        self.encoder = XLMRobertaModel.from_pretrained(pretrained_model_name)
        self.hidden_size = self.encoder.config.hidden_size
        # Attribute names below are state-dict keys of saved checkpoints.
        self.span_attention = SelfAttentionLayer(self.hidden_size)
        self.holder_detector = SpanDetector(self.hidden_size)
        self.target_detector = SpanDetector(self.hidden_size)
        self.expression_detector = SpanDetector(self.hidden_size)
        self.cross_span_attention = CrossSpanAttention(self.hidden_size)
        self.relation_classifier = RelationClassifier(RELATION_EMBEDDING_DIM)
        self.polarity_classifier = PolarityClassifier(RELATION_EMBEDDING_DIM)
        self.intensity_classifier = IntensityClassifier(RELATION_EMBEDDING_DIM)
        self.use_adapters = use_adapters
        if use_adapters:
            self.language_adapters = nn.ModuleList([LanguageAdapter(self.hidden_size) for _ in range(num_languages)])
        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise matrix weights and zero the biases of the task heads.

        The pretrained encoder (and the language adapters, if any) are not touched.
        """
        modules = [self.span_attention, self.holder_detector, self.target_detector,
                   self.expression_detector, self.cross_span_attention,
                   self.relation_classifier, self.polarity_classifier, self.intensity_classifier]
        for module in modules:
            for name, param in module.named_parameters():
                if 'weight' in name and len(param.shape) >= 2:
                    nn.init.xavier_uniform_(param)
                elif 'bias' in name:
                    nn.init.zeros_(param)

    def extract_spans(self, span_logits, attention_mask):
        """Decode BIO logits into inclusive (start, end) token spans.

        Args:
            span_logits: (batch, seq_len, num_labels) logits; label 0 is O,
                1 is B and 2 is I.
            attention_mask: (batch, seq_len) mask; zero marks padding.

        Returns:
            One list of (start, end) tuples per batch element. Indices are
            positions in the full (unmasked) sequence, so they can be used
            directly to index hidden states and tokenizer offsets.
        """
        # argmax of the logits equals argmax of their softmax.
        span_preds = torch.argmax(span_logits, dim=-1)
        # Treat masked positions as "O" instead of compacting them away:
        # compaction shifts indices whenever padding is not strictly on the
        # right, while for right-padded input both approaches agree.
        span_preds = span_preds.masked_fill(~attention_mask.bool(), 0)

        all_spans = []
        for labels in span_preds.tolist():
            spans = []
            start_idx = None
            for j, label in enumerate(labels):
                if label == 1:  # B: close any open span, open a new one
                    if start_idx is not None:
                        spans.append((start_idx, j - 1))
                    start_idx = j
                elif label == 0:  # O: close any open span
                    if start_idx is not None:
                        spans.append((start_idx, j - 1))
                        start_idx = None
                elif label == 2:  # I: continue, or open a span if none is open
                    if start_idx is None:
                        start_idx = j
            if start_idx is not None:
                spans.append((start_idx, len(labels) - 1))
            all_spans.append(spans)
        return all_spans

    def get_span_embeddings(self, hidden_states, spans, attention_mask):
        """Mean-pool the hidden states of each span into one embedding.

        Args:
            hidden_states: (batch, seq_len, hidden_size) token features.
            spans: Per-sample lists of inclusive (start, end) token indices.
            attention_mask: Unused; kept for interface compatibility.

        Returns:
            Tuple ``(span_embeddings, span_masks)`` of shapes
            (batch, max_spans, hidden_size) and (batch, max_spans); the boolean
            mask is True where a real span is stored. ``max_spans`` may be 0.
        """
        batch_size = hidden_states.size(0)
        max_spans = max((len(sample_spans) for sample_spans in spans), default=0)
        # new_zeros keeps the device and dtype of the hidden states.
        span_embeddings = hidden_states.new_zeros((batch_size, max_spans, self.hidden_size))
        span_masks = torch.zeros((batch_size, max_spans), dtype=torch.bool, device=hidden_states.device)
        for i, sample_spans in enumerate(spans):
            for j, (start, end) in enumerate(sample_spans):
                span_embeddings[i, j] = hidden_states[i, start:end + 1].mean(dim=0)
                span_masks[i, j] = True
        return span_embeddings, span_masks

    def _combine_spans(self, holder_emb, holder_mask, target_emb, target_mask, expr_emb, expr_mask):
        """Concatenate holder, target and expression spans along the span axis.

        Returns:
            Tuple ``(combined_emb, combined_mask)`` laid out as
            [holders | targets | expressions].
        """
        combined_emb = torch.cat([holder_emb, target_emb, expr_emb], dim=1)
        combined_mask = torch.cat([holder_mask, target_mask, expr_mask], dim=1)
        return combined_emb, combined_mask

    def _create_span_pairs(self, span_embeddings, holder_mask, target_mask, expr_mask):
        """Build every (holder, expression) and (target, expression) pair.

        Args:
            span_embeddings: (batch, total_spans, dim) embeddings laid out as
                [holders | targets | expressions].
            holder_mask, target_mask, expr_mask: Boolean masks of the three
                span groups, True for real spans.

        Returns:
            ``(pair_embeddings, pair_indices)`` of shapes
            (batch, max_pairs, 2 * dim) and (batch, max_pairs, 2), where the
            indices address ``span_embeddings``; unused rows stay zero.
            ``(None, None)`` when no sample has any pair.
        """
        batch_size = span_embeddings.size(0)
        holder_size = holder_mask.size(1)
        expr_offset = holder_size + target_mask.size(1)

        pairs_per_sample = (holder_mask.sum(dim=1) + target_mask.sum(dim=1)) * expr_mask.sum(dim=1)
        # Convert to a Python int so it can be used as a tensor dimension.
        max_pairs = int(pairs_per_sample.max().item())
        if max_pairs == 0:
            return None, None

        pair_embeddings = span_embeddings.new_zeros((batch_size, max_pairs, span_embeddings.size(-1) * 2))
        pair_indices = torch.zeros((batch_size, max_pairs, 2), dtype=torch.long, device=span_embeddings.device)
        for i in range(batch_size):
            holder_positions = torch.nonzero(holder_mask[i]).flatten().tolist()
            target_positions = [holder_size + t for t in torch.nonzero(target_mask[i]).flatten().tolist()]
            expr_positions = [expr_offset + e for e in torch.nonzero(expr_mask[i]).flatten().tolist()]
            pair_idx = 0
            # Holder pairs first, then target pairs, each against every expression.
            for left in holder_positions + target_positions:
                for right in expr_positions:
                    pair_embeddings[i, pair_idx] = torch.cat([span_embeddings[i, left], span_embeddings[i, right]])
                    pair_indices[i, pair_idx, 0] = left
                    pair_indices[i, pair_idx, 1] = right
                    pair_idx += 1
        return pair_embeddings, pair_indices

    def forward(self, input_ids, attention_mask, language_id=None, labels=None):
        """Run inference and return logits plus the decoded spans.

        Args:
            input_ids: (batch, seq_len) token ids.
            attention_mask: (batch, seq_len) mask; zero marks padding.
            language_id: Optional per-sample adapter indices; only used when
                the model was built with ``use_adapters=True``.
            labels: Must be None; the training path is not part of this script.

        Returns:
            Dict with keys ``holder_logits``, ``target_logits``,
            ``expression_logits``, ``relation_logits``, ``polarity_logits``,
            ``intensity_logits``, ``holder_spans``, ``target_spans``,
            ``expression_spans`` and ``pair_indices``. The relation, polarity
            and intensity logits and ``pair_indices`` are None when no
            holder/target-expression pair exists.

        Raises:
            NotImplementedError: If ``labels`` is given.
        """
        if labels is not None:
            # Previously this path silently returned None.
            raise NotImplementedError("Training mode (labels) is not implemented in this inference script")

        batch_size = input_ids.size(0)
        encoder_outputs = self.encoder(input_ids, attention_mask=attention_mask)
        hidden_states = encoder_outputs.last_hidden_state

        if self.use_adapters and language_id is not None:
            adapted_states = torch.zeros_like(hidden_states)
            for i in range(batch_size):
                adapted_states[i] = self.language_adapters[language_id[i].item()](hidden_states[i])
            hidden_states = adapted_states

        span_aware_states = self.span_attention(hidden_states, attention_mask)
        holder_logits = self.holder_detector(span_aware_states)
        target_logits = self.target_detector(span_aware_states)
        expression_logits = self.expression_detector(span_aware_states)

        holder_spans = self.extract_spans(holder_logits, attention_mask)
        target_spans = self.extract_spans(target_logits, attention_mask)
        expression_spans = self.extract_spans(expression_logits, attention_mask)
        holder_embeddings, holder_masks = self.get_span_embeddings(span_aware_states, holder_spans, attention_mask)
        target_embeddings, target_masks = self.get_span_embeddings(span_aware_states, target_spans, attention_mask)
        expression_embeddings, expression_masks = self.get_span_embeddings(span_aware_states, expression_spans, attention_mask)
        all_span_embeddings, all_span_masks = self._combine_spans(
            holder_embeddings, holder_masks, target_embeddings, target_masks,
            expression_embeddings, expression_masks)

        if all_span_embeddings.size(1) == 0:
            # No spans at all: do not run attention over a zero-length sequence.
            relation_aware_embeddings = all_span_embeddings.new_zeros((batch_size, 0, RELATION_EMBEDDING_DIM))
        else:
            relation_aware_embeddings = self.cross_span_attention(all_span_embeddings, all_span_masks)
        relation_pairs, pair_indices = self._create_span_pairs(
            relation_aware_embeddings, holder_masks, target_masks, expression_masks)

        relation_logits = None
        polarity_logits = None
        intensity_logits = None
        # NOTE(review): polarity/intensity are only produced when at least one
        # pair exists, so expressions without any holder or target get no
        # prediction - confirm this matches the training setup.
        if relation_pairs is not None:
            relation_logits = self.relation_classifier(relation_pairs)
            expression_start = holder_embeddings.size(1) + target_embeddings.size(1)
            expression_relation_aware = relation_aware_embeddings[:, expression_start:, :]
            polarity_logits = self.polarity_classifier(expression_relation_aware)
            intensity_logits = self.intensity_classifier(expression_relation_aware)

        return {
            'holder_logits': holder_logits,
            'target_logits': target_logits,
            'expression_logits': expression_logits,
            'relation_logits': relation_logits,
            'polarity_logits': polarity_logits,
            'intensity_logits': intensity_logits,
            'holder_spans': holder_spans,
            'target_spans': target_spans,
            'expression_spans': expression_spans,
            'pair_indices': pair_indices
        }

class SentimentAnalyzer:
    """Wrap a trained StructuredSentimentModel for single-text inference."""

    def __init__(self, model_path, pretrained_model="xlm-roberta-base", device=None):
        """
        Initialize the sentiment analyzer with a trained model

        Args:
            model_path: Path to the saved model checkpoint
            pretrained_model: Name of the pretrained model used during training
            device: Device to run inference on (cuda or cpu)
        """
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = device

        logger.info(f"Using device: {self.device}")

        # Load tokenizer
        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(pretrained_model)

        # Initialize model
        self.model = StructuredSentimentModel(pretrained_model_name=pretrained_model)

        # Load trained weights.
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        checkpoint = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.to(self.device)
        self.model.eval()

        logger.info(f"Model loaded from {model_path}")

        # Define polarity and intensity mappings
        self.polarity_map = {0: "Positive", 1: "Negative", 2: "Neutral", 3: "None"}
        self.intensity_map = {0: "Strong", 1: "Average", 2: "Weak"}
        # Define BIO labels for easier interpretation
        self.bio_labels = {0: "O", 1: "B", 2: "I"}

    def _get_actual_text_spans(self, text, tokens, spans, keep_alignment=False):
        """
        Convert token indices to character spans in the original text

        Args:
            text: Original text
            tokens: Tokenized text info (must expose ``offset_mapping``)
            spans: List of (start_token, end_token) tuples
            keep_alignment: If True, return one entry per input span, with
                None for spans that cannot be mapped to text, so that
                position i of the result still refers to ``spans[i]``.
                If False (default), unmappable spans are dropped.

        Returns:
            List of (start_char, end_char, text_span) tuples (and None
            placeholders when ``keep_alignment`` is True)
        """
        offset_mapping = tokens.offset_mapping[0].tolist()
        num_tokens = len(offset_mapping)

        text_spans = []
        for start_token, end_token in spans:
            resolved = None
            if start_token < num_tokens and end_token < num_tokens:
                start_char = offset_mapping[start_token][0]
                end_char = offset_mapping[end_token][1]
                # Empty ranges (start_char >= end_char, e.g. tokens with a
                # (0, 0) offset) and ranges past the text are unmappable.
                if start_char < end_char and end_char <= len(text):
                    resolved = (start_char, end_char, text[start_char:end_char])
            if resolved is not None or keep_alignment:
                text_spans.append(resolved)

        return text_spans

    def _get_bio_encoding(self, holder_logits, target_logits, expression_logits, attention_mask):
        """
        Convert logits to BIO labels for visualization

        Args:
            holder_logits: Predicted logits for holder spans
            target_logits: Predicted logits for target spans
            expression_logits: Predicted logits for expression spans
            attention_mask: Attention mask to filter out padding tokens

        Returns:
            Dictionary with BIO encodings for each entity type (first batch
            element only, non-padding tokens only)
        """
        active = attention_mask[0].bool().cpu()

        def decode(logits):
            label_ids = torch.argmax(logits, dim=-1)[0].cpu()
            return [self.bio_labels[label_id] for label_id in label_ids[active].tolist()]

        return {
            "holder_bio": decode(holder_logits),
            "target_bio": decode(target_logits),
            "expression_bio": decode(expression_logits)
        }

    def _build_opinions(self, outputs, holder_aligned, target_aligned, expression_aligned):
        """Assemble one opinion dict per expression that maps to real text.

        The ``*_aligned`` lists hold one entry per model span (None when the
        span has no text), so model-side span indices can address them directly.
        """
        if outputs['polarity_logits'] is None or outputs['intensity_logits'] is None:
            return []

        polarity_preds = torch.argmax(outputs['polarity_logits'], dim=-1)[0].tolist()
        intensity_preds = torch.argmax(outputs['intensity_logits'], dim=-1)[0].tolist()
        pairs = outputs['pair_indices'][0].tolist() if outputs['pair_indices'] is not None else []

        num_holders = len(holder_aligned)
        # Combined span layout is [holders | targets | expressions].
        expr_offset = num_holders + len(target_aligned)

        opinions = []
        for i, expression in enumerate(expression_aligned[:len(polarity_preds)]):
            if expression is None:
                continue

            opinion = {
                "expression": expression[2],
                "expression_span": f"{expression[0]}:{expression[1]}",
                "polarity": self.polarity_map[polarity_preds[i]],
                "intensity": self.intensity_map[intensity_preds[i]],
            }

            # NOTE(review): relation_logits are not consulted, so every
            # candidate pair counts as related and the last holder/target
            # paired with this expression wins.
            for first_idx, second_idx in pairs:
                if second_idx != expr_offset + i:
                    continue
                if first_idx < num_holders:
                    role = "holder"
                    related = holder_aligned[first_idx]
                else:
                    role = "target"
                    target_idx = first_idx - num_holders
                    related = target_aligned[target_idx] if target_idx < len(target_aligned) else None
                if related is not None:
                    opinion[role] = related[2]
                    opinion[f"{role}_span"] = f"{related[0]}:{related[1]}"

            # Add default values for missing fields
            for role in ("holder", "target"):
                if role not in opinion:
                    opinion[role] = ""
                    opinion[f"{role}_span"] = "0:0"

            opinions.append(opinion)

        return opinions

    def analyze(self, text):
        """
        Analyze a text for structured sentiment

        Args:
            text: Text to analyze

        Returns:
            Dictionary with sentiment analysis results: the input ``text``,
            the ``holders`` / ``targets`` / ``expressions`` strings, the
            ``opinions`` list, per-token ``bio_encodings`` and an
            ``encoded_embeddings`` summary of the raw encoder output
        """
        # Tokenize input
        tokens = self.tokenizer(
            text,
            max_length=MAX_SEQ_LENGTH,
            padding='max_length',
            truncation=True,
            return_offsets_mapping=True,
            return_tensors='pt'
        )

        input_ids = tokens['input_ids'].to(self.device)
        attention_mask = tokens['attention_mask'].to(self.device)

        # Get model predictions
        with torch.no_grad():
            # Raw XLM-RoBERTa embeddings, reported in the output.
            # NOTE: this runs the encoder a second time; the model call below
            # runs it again internally.
            encoder_outputs = self.model.encoder(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = encoder_outputs.last_hidden_state

            # Full model inference
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Extract spans (single text, so batch element 0)
        holder_spans = outputs['holder_spans'][0]
        target_spans = outputs['target_spans'][0]
        expression_spans = outputs['expression_spans'][0]

        # Character spans that stay aligned with the model's span indices.
        # Dropping unmappable spans first would shift every later index, so
        # polarity and pair lookups could attach to the wrong span.
        holder_aligned = self._get_actual_text_spans(text, tokens, holder_spans, keep_alignment=True)
        target_aligned = self._get_actual_text_spans(text, tokens, target_spans, keep_alignment=True)
        expression_aligned = self._get_actual_text_spans(text, tokens, expression_spans, keep_alignment=True)

        # Get BIO encodings
        bio_encodings = self._get_bio_encoding(
            outputs['holder_logits'],
            outputs['target_logits'],
            outputs['expression_logits'],
            attention_mask
        )

        # Get polarity and intensity for expressions
        sentiment_opinions = self._build_opinions(outputs, holder_aligned, target_aligned, expression_aligned)

        # Prepare encoded embeddings info for all non-padding tokens
        active_positions = torch.nonzero(attention_mask[0]).flatten()
        tokens_list = self.tokenizer.convert_ids_to_tokens(input_ids[0][active_positions].tolist())

        # For each token, include the first few dimensions of its embedding
        num_dims_to_show = 5  # Show first 5 dimensions for each token
        embeddings_list = embeddings[0, active_positions, :num_dims_to_show].cpu().tolist()

        # Also include the full embedding for the first token as an example
        first_token_full_embedding = embeddings[0, active_positions[0], :].cpu().tolist()

        embeddings_info = {
            "tokens": tokens_list,
            "embeddings": embeddings_list,  # First few dimensions for all tokens
            "first_token_full_embedding": first_token_full_embedding,
            "embeddings_shape": list(embeddings.shape),
            "num_dimensions_shown": num_dims_to_show
        }

        return {
            "text": text,
            "holders": [span[2] for span in holder_aligned if span is not None],
            "targets": [span[2] for span in target_aligned if span is not None],
            "expressions": [span[2] for span in expression_aligned if span is not None],
            "opinions": sentiment_opinions,
            "bio_encodings": bio_encodings,
            "encoded_embeddings": embeddings_info
        }

# ---------------------------------------------------------------------------
# Notebook driver: configure inputs, load the analyzer, analyze either the
# inline text or a file of texts, print a report, and dump the results to JSON.
# All names below are module-level, so they stay visible to later cells.
# ---------------------------------------------------------------------------
model_path = "/content/drive/MyDrive/NLP-Project/mpqa_best_model_f1_0.5422.pt"  # ← update this
pretrained_model = "xlm-roberta-base"
text_to_analyze = "राष्ट्रपति द्रौपदी मुर्मू ने राष्ट्र का अभिनंदन किया और देशवासियों से आग्रह किया कि वे सभी के जीवन को निरंतर प्रगति, समृद्धि और खुशहाली के रंगों से भर दें।"
input_file_path = None  # or set path to a file
output_file_path = "sentiment_analysis_results.json"

# Initialize analyzer (loads the tokenizer, the encoder and the checkpoint)
analyzer = SentimentAnalyzer(model_path, pretrained_model)

# One analysis dict per processed text; written to output_file_path at the end.
results = []

# Analyze a single text.
# NOTE: the inline text takes precedence; input_file_path is only read when
# text_to_analyze is empty (see the elif below).
if text_to_analyze:
    analysis = analyzer.analyze(text_to_analyze)
    results.append(analysis)

    print(f"\nAnalysis for: '{text_to_analyze}'")
    print(f"Holders: {analysis['holders']}")
    print(f"Targets: {analysis['targets']}")
    print(f"Expressions: {analysis['expressions']}")

    print("\nOpinions:")
    for i, opinion in enumerate(analysis['opinions']):
        print(f"  Opinion {i+1}:")
        print(f"    Holder: '{opinion['holder']}'")
        print(f"    Target: '{opinion['target']}'")
        print(f"    Expression: '{opinion['expression']}'")
        print(f"    Polarity: {opinion['polarity']}")
        print(f"    Intensity: {opinion['intensity']}")

    # Print BIO encodings
    print("\nBIO Encodings:")
    bio = analysis['bio_encodings']

    # Get tokens aligned with BIO tags (both lists cover the non-padding tokens)
    tokens = analysis['encoded_embeddings']['tokens']

    print("\nToken\t\tHolder\tTarget\tExpression")
    print("-" * 50)
    for i, token in enumerate(tokens):
        # Format to align columns: truncate long tokens, then pad to a fixed width
        token_str = token[:10] + "..." if len(token) > 10 else token
        token_str = token_str.ljust(15)
        print(f"{token_str}\t{bio['holder_bio'][i]}\t{bio['target_bio'][i]}\t{bio['expression_bio'][i]}")

    # Print embeddings information
    emb_info = analysis['encoded_embeddings']
    print("\nEncoded Embeddings (XLM-RoBERTa):")
    print(f"Embeddings shape: {emb_info['embeddings_shape']} (batch, sequence_length, hidden_dim)")
    print(f"\nShowing first {emb_info['num_dimensions_shown']} dimensions for each token:")

    print("\nToken\t\tEmbedding dimensions (first few)")
    print("-" * 70)
    for i, token in enumerate(emb_info['tokens']):
        token_str = token[:10] + "..." if len(token) > 10 else token
        token_str = token_str.ljust(15)
        emb_str = "[" + ", ".join([f"{val:.4f}" for val in emb_info['embeddings'][i]]) + ", ...]"
        print(f"{token_str}\t{emb_str}")

    # Print a full example embedding for reference
    print("\nExample of full embedding vector for first token:")
    first_token = emb_info['tokens'][0]
    print(f"Token: {first_token}")

    # Print the full embedding in a readable format (chunked, 10 values per line)
    full_emb = emb_info['first_token_full_embedding']
    chunk_size = 10
    for i in range(0, len(full_emb), chunk_size):
        chunk = full_emb[i:i+chunk_size]
        print(f"Dims {i}-{i+len(chunk)-1}: " + ", ".join([f"{val:.4f}" for val in chunk]))

# Or analyze from a file: one text per non-empty line; no per-text report is printed
elif input_file_path:
    with open(input_file_path, 'r', encoding='utf-8') as f:
        texts = [line.strip() for line in f if line.strip()]

    for text in texts:
        # Best effort: a failing text is reported and skipped so the rest still run
        try:
            analysis = analyzer.analyze(text)
            results.append(analysis)
        except Exception as e:
            print(f"Error analyzing text: {text}")
            print(str(e))

# Save results (written even when the list is empty); ensure_ascii=False keeps
# non-ASCII text readable in the JSON file
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print(f"\n✅ Results saved to: {output_file_path}")