In [None]:
# Tensorboard  (optional)
# In the command prompt (to kill a running TensorBoard and clear its cached info):
#taskkill /im tensorboard.exe /f
#del /q %TMP%\.tensorboard-info\*
%load_ext tensorboard
#%tensorboard --logdir runs/train
%tensorboard --logdir ./logs
#%tensorboard --logdir {logs_base_dir}  --host localhost

In [None]:
####################################### Variable and column names in this script assume the target property is hardness, but the 'hardness' column of the dataset actually holds UTS values ##########################

import warnings  # For suppressing warnings
warnings.filterwarnings("ignore")  # Suppress all warning messages

import joblib
import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
                          AdamW, get_linear_schedule_with_warmup, TrainerCallback, EarlyStoppingCallback)
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns  # Import Seaborn
from datetime import datetime
import os
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import re
from statsmodels.graphics.tsaplots import plot_acf

# ==========================================
# 1. Parameters and Setup
# ==========================================

# Parameters
K = 5  # Number of cross-validation folds
random_seed = 42  # Used for the KFold shuffle and passed to TrainingArguments
output_base_dir = './'  # Specify your saving directory here

# Create the base directory if it doesn't exist. exist_ok=True replaces the
# earlier check-then-create sequence, which could fail if the directory
# appeared between the check and the call.
os.makedirs(output_base_dir, exist_ok=True)

# ==========================================
# 2. Data Loading and Preprocessing
# ==========================================

# Load the data. The table must provide a 'composition' column (tokenized
# below) and a 'hardness' target column; every other column is scaled as a
# numeric feature inside the cross-validation loop.
#data = pd.read_csv('LLM_Elongation_features_Rounded_Cleaned.csv')
data = pd.read_csv('LLM_UTS_Features.csv')




def custom_tokenize(composition):
    """
    Split a composition string into element-fraction tokens ordered by element.

    Each capital letter with optional lowercase letters that is directly
    followed by digits and/or dots is read as one (element, fraction) pair.
    The pairs are ordered alphabetically by element symbol (ties keep their
    input order) and each pair is glued back into a single token.

    Example:
        Input: "Ni1 Co1.2 F0.8"
        Output: ['Co1.2', 'F0.8', 'Ni1']
    """
    pairs = re.findall(r'([A-Z][a-z]*)([0-9.]+)', composition)
    # list.sort is stable, so repeated elements stay in their original order
    pairs.sort(key=lambda pair: pair[0])
    return [f"{element}{fraction}" for element, fraction in pairs]

# Smoke-test the tokenizer on a small composition
print(custom_tokenize("Co1.2 F0.8 Ni1"))  # Expected: ['Co1.2', 'F0.8', 'Ni1']

# Tokenize every composition string; each row of the new
# 'tokenized_elements' column holds the list returned by custom_tokenize
data['tokenized_elements'] = data['composition'].map(custom_tokenize)

# ==========================================
# 3. K-Fold Cross-Validation Setup
# ==========================================

# Shuffled K-fold splitter; the fixed seed keeps the fold assignment reproducible
kf = KFold(shuffle=True, n_splits=K, random_state=random_seed)

# Validation metrics per fold: one list entry is appended for every fold
fold_metrics = {metric_name: [] for metric_name in ('mse', 'mae', 'r2')}

# Validation predictions and targets in original (unscaled) units,
# pooled across all folds for the combined metrics and plots
all_predictions_unscaled, all_actual_values_unscaled = [], []

# Define the CustomDataset class outside the K-Fold loop
class CustomDataset(Dataset):
    """
    Map-style PyTorch dataset pairing tokenizer encodings with regression labels.

    Args:
        encodings: Mapping from field name (e.g. 'input_ids') to an indexable
            batch of values; indexing with a sample index must yield that
            sample's values.
        labels: Indexable sequence of numeric targets, one per sample.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors plus a float 'labels' entry."""
        item = {}
        for key, val in self.encodings.items():
            sample = val[idx]
            if isinstance(sample, torch.Tensor):
                # torch.tensor(existing_tensor) emits a copy-construct
                # UserWarning; clone().detach() makes the same independent
                # copy without it.
                item[key] = sample.clone().detach()
            else:
                item[key] = torch.tensor(sample)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

# ==========================================
# 4. K-Fold Cross-Validation Loop
# ==========================================

# Iterate over each fold
for fold, (train_index, val_index) in enumerate(kf.split(data)):
    print(f"\n########### Fold {fold + 1} / {K} ###########")

    # Split data; the reset index keeps the per-fold Series aligned row by row
    data_train = data.iloc[train_index].reset_index(drop=True)
    data_val = data.iloc[val_index].reset_index(drop=True)

    # Normalize the 'hardness' target within the fold. The scaler is fitted
    # on the training rows only and merely applied to the validation rows.
    hardness_scaler = StandardScaler()
    data_train['normalized_hardness'] = hardness_scaler.fit_transform(data_train[['hardness']])
    data_val['normalized_hardness'] = hardness_scaler.transform(data_val[['hardness']])

    # Every scaler fitted in this fold, keyed by column name plus '_scaler'
    scalers = {'hardness_scaler': hardness_scaler}

    # All remaining columns are treated as numeric features
    non_feature_columns = ('composition', 'hardness', 'tokenized_elements')
    feature_columns = [col for col in data.columns if col not in non_feature_columns]

    # Text form of each row: str() of the token list, followed by the
    # space-separated normalized feature values appended in the loop below
    combined_features_train = data_train['tokenized_elements'].astype(str)
    combined_features_val = data_val['tokenized_elements'].astype(str)

    # feature_columns is already filtered above, so the loop needs no second
    # exclusion check
    for feature in feature_columns:
        scaler = StandardScaler()
        data_train[f'normalized_{feature}'] = scaler.fit_transform(data_train[[feature]])
        data_val[f'normalized_{feature}'] = scaler.transform(data_val[[feature]])
        combined_features_train += ' ' + data_train[f'normalized_{feature}'].astype(str)
        combined_features_val += ' ' + data_val[f'normalized_{feature}'].astype(str)

        # Add the scaler to the dictionary with a unique key
        scalers[f'{feature}_scaler'] = scaler

    # Assign the combined features back to the dataframe
    # NOTE(review): 'combined_features' is stored here, but the tokenizer
    # calls further down encode only 'tokenized_elements' - confirm whether
    # the numeric features are meant to reach the model.
    data_train['combined_features'] = combined_features_train
    data_val['combined_features'] = combined_features_val

    # Save all scalers for this fold using joblib
    save_dir = os.path.join(output_base_dir, f'fold_{fold + 1}')
    os.makedirs(save_dir, exist_ok=True)  # no separate existence check needed
    scalers_save_path = os.path.join(save_dir, 'scalers.pkl')
    joblib.dump(scalers, scalers_save_path)
    print(f"Scalers saved to {scalers_save_path}")
    
    # ==========================================
    # 5. Model Initialization and Tokenization
    # ==========================================

    # A fresh tokenizer and single-output model are loaded from the
    # pretrained checkpoint in every fold
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')  # Update as needed 150K_With_Hardness, 6K_pretraining
    model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

    if torch.cuda.is_available():
        model.cuda()

    # Settings shared by the training and validation tokenizer calls
    tokenizer_kwargs = dict(truncation=True, padding=True, max_length=512, return_tensors="pt")

    # Each row's token list is joined into one space-separated string
    train_texts = [" ".join(tokens) for tokens in data_train['tokenized_elements'].to_list()]
    val_texts = [" ".join(tokens) for tokens in data_val['tokenized_elements'].to_list()]

    # Tokenize training data, then validation data
    train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
    val_encodings = tokenizer(val_texts, **tokenizer_kwargs)

    # Create datasets with the fold-normalized target as labels
    train_dataset = CustomDataset(train_encodings, data_train['normalized_hardness'].values)
    val_dataset = CustomDataset(val_encodings, data_val['normalized_hardness'].values)
    
    def compute_metrics(p):
        """
        Compute evaluation metrics: MSE, MAE, R2.

        Args:
            p: Object exposing ``predictions`` and ``label_ids`` arrays.

        Returns:
            dict with the keys 'mse', 'mae' and 'r2'.
        """
        # reshape(-1) always yields a 1-D array. squeeze() would collapse a
        # single-sample evaluation set to a 0-d scalar, which the metric
        # functions cannot handle.
        predictions, labels = p.predictions.reshape(-1), p.label_ids
        mse = mean_squared_error(labels, predictions)
        mae = mean_absolute_error(labels, predictions)
        r2 = r2_score(labels, predictions)
        return {'mse': mse, 'mae': mae, 'r2': r2}
    
    # Unique log directory for Tensorboard for each fold
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    log_dir = f'./logs/fold_{fold + 1}_{current_time}'

    # ==========================================
    # 6. Training Arguments and Callbacks
    # ==========================================

    # Training Arguments. They are built before the optimizer so that the
    # scheduler horizon below is derived from the same settings.
    training_args = TrainingArguments(
        output_dir=os.path.join(output_base_dir, f'fold_{fold + 1}'),
        num_train_epochs=100,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        logging_dir=log_dir,
        logging_steps=1,
        save_steps=1000,
        evaluation_strategy="steps",
        eval_steps=1,  # Ensure evaluation happens every step
        load_best_model_at_end=True,
        report_to='tensorboard',
        seed=random_seed,
        save_total_limit=1
    )

    # Define optimizer groups for decay mechanism: parameters whose name
    # contains one of the numbered segments below (".0.", ".1.", ...) get
    # weight decay, all remaining parameters get none.
    decay_layers = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"]  # Adjust as necessary
    decay_params, no_decay_params = [], []
    for param_name, param in model.named_parameters():
        if any(f".{layer}." in param_name for layer in decay_layers):
            decay_params.append(param)
        else:
            no_decay_params.append(param)
    optimizer_grouped_parameters = [
        {"params": decay_params, "weight_decay": 0.02},
        {"params": no_decay_params, "weight_decay": 0.0}
    ]

    # NOTE(review): lr=6e-5 was reported as "best so far" while the schedule
    # horizon was still over-estimated (see below); re-validate it.
    optimizer = AdamW(optimizer_grouped_parameters, lr=6e-5)

    # The linear schedule must span the number of optimizer steps, i.e.
    # batches per epoch times epochs. The previous value,
    # len(train_dataset) * 100, counted samples instead of batches, so the
    # learning rate barely decayed over the whole run.
    effective_batch_size = training_args.train_batch_size
    batches_per_epoch = (len(train_dataset) + effective_batch_size - 1) // effective_batch_size
    updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
    num_training_steps = int(updates_per_epoch * training_args.num_train_epochs)
    lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    
    # Create a callback to save metrics at each evaluation step
    class MetricsCallback(TrainerCallback):
        """
        Collects the newest ``state.log_history`` entry after every evaluation.
        """
        def __init__(self):
            super().__init__()
            self.metrics = []

        def on_evaluate(self, args, state, control, **kwargs):
            history = state.log_history
            if not history:
                # Nothing has been logged yet, so there is nothing to record
                return
            self.metrics.append(history[-1])

    metrics_callback = MetricsCallback()

    # Initialize Trainer with the hand-built optimizer/scheduler pair and
    # the metrics-recording callback
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[metrics_callback],
        optimizers=(optimizer, lr_scheduler),
    )
    
    # ==========================================
    # 7. Training and Evaluation
    # ==========================================

    # Training
    trainer.train()

    fold_number = fold + 1

    # Persist the entries the callback captured at every evaluation step
    step_metrics = pd.DataFrame(metrics_callback.metrics)
    step_metrics.to_csv(f'{save_dir}/fold_{fold_number}_step_metrics.csv', index=False)

    # Row with the lowest eval_mse over all recorded evaluation steps
    best_row_label = step_metrics['eval_mse'].idxmin()
    best_metric = step_metrics.loc[best_row_label]
    print(f"Best Step Metrics for Fold {fold_number}:")
    print(best_metric)

    # Save best metric to a CSV file (as a single-row table)
    best_metric.to_frame().transpose().to_csv(f'{save_dir}/fold_{fold_number}_best_metric.csv', index=False)

    # Final evaluation of the trainer's current model on the validation set
    results = trainer.evaluate()
    print(f"Fold {fold_number} Metrics: {results}")

    # Record this fold's metrics for the cross-validation summary
    for metric_name in ('mse', 'mae', 'r2'):
        fold_metrics[metric_name].append(results[f'eval_{metric_name}'])

    # Save the fine-tuned model and its tokenizer into the fold directory
    fold_model_dir = os.path.join(output_base_dir, f'fold_{fold_number}')
    trainer.save_model(fold_model_dir)
    tokenizer.save_pretrained(fold_model_dir)

    # Save metrics to CSV
    fold_results_df = pd.DataFrame([results])
    fold_results_df.to_csv(f'{save_dir}/fold_{fold_number}_metrics.csv', index=False)
    
    # ==========================================
    # 8. Visualization for the Fold
    # ==========================================

    # Validation predictions in normalized units. reshape(-1) keeps the
    # array 1-D even for a single-sample fold, where squeeze() would return
    # a 0-d array that list.extend() below cannot iterate.
    predictions = trainer.predict(val_dataset).predictions.reshape(-1)
    actual_values = data_val['normalized_hardness'].values

    # Convert normalized target values back to original scale
    actual_values_unscaled = hardness_scaler.inverse_transform(actual_values.reshape(-1, 1)).reshape(-1)
    predictions_unscaled = hardness_scaler.inverse_transform(predictions.reshape(-1, 1)).reshape(-1)

    # Collect predictions and actual values for combined plots later
    all_predictions_unscaled.extend(predictions_unscaled)
    all_actual_values_unscaled.extend(actual_values_unscaled)
    
    fold_label = f"Fold {fold + 1}"
    savefig_options = dict(bbox_inches='tight', dpi=300)

    # Actual vs predicted scatter; the dashed red diagonal marks perfect predictions
    plt.figure(figsize=(10, 7))
    sns.scatterplot(x=actual_values_unscaled, y=predictions_unscaled, alpha=0.5)
    diagonal = [actual_values_unscaled.min(), actual_values_unscaled.max()]
    plt.plot(diagonal, diagonal, color='red', linestyle='--', label="Ideal Prediction")
    plt.xlabel("Actual UTS Values", fontsize=15)
    plt.ylabel("Predicted UTS Values", fontsize=15)
    plt.title(f"{fold_label}: Actual vs Predicted UTS Values", fontsize=18)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'{save_dir}/actual_vs_predicted_UTS.png', **savefig_options)
    plt.show()

    # Residuals in original units (positive = model under-predicts)
    residuals = actual_values_unscaled - predictions_unscaled
    residual_axis_label = "Residuals (Actual - Predicted)"

    # 1. Residual box plot overlaid with the individual residuals
    plt.figure(figsize=(10, 4))
    sns.boxplot(x=residuals, color='lightblue', fliersize=0)  # fliersize=0 hides outliers
    sns.stripplot(x=residuals, color='green', alpha=0.5, size=4, jitter=True, label='Residuals')
    plt.xlabel(residual_axis_label, fontsize=12)
    plt.title(f"{fold_label}: Residuals Box Plot with Individual Points", fontsize=14)
    plt.legend()
    plt.grid(True, axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(f'{save_dir}/residuals_box_plot_seaborn.png', **savefig_options)
    plt.show()

    # 2. Residual distribution: density histogram plus kernel density estimate
    plt.figure(figsize=(10, 6))
    plt.hist(residuals, bins=30, alpha=0.7, color='skyblue', edgecolor='black', density=True, label='Histogram')
    sns.kdeplot(residuals, color='red', linewidth=2, label='KDE')
    plt.xlabel(residual_axis_label, fontsize=12)
    plt.ylabel("Density", fontsize=12)
    plt.title(f"{fold_label}: Residuals Distribution", fontsize=14)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'{save_dir}/residuals_distribution.png', **savefig_options)
    plt.show()

    # 3. Residuals against predicted values, with a zero reference line
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=predictions_unscaled, y=residuals, alpha=0.5)
    plt.axhline(0, color='red', linestyle='--')
    plt.xlabel("Predicted Values", fontsize=12)
    plt.ylabel(residual_axis_label, fontsize=12)
    plt.title(f"{fold_label}: Residuals vs. Predicted Values", fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(f'{save_dir}/residuals_vs_predicted.png', **savefig_options)
    plt.show()
    
    # 5. Autocorrelation of Residuals
    # plot_acf opens a figure of its own unless it is handed an axis. Without
    # ax=..., the sized figure created here stayed empty and the saved plot
    # had the default size, so the axis is now passed explicitly.
    acf_fig, acf_ax = plt.subplots(figsize=(10, 6))
    # An autocorrelation at lag k needs at least k + 1 observations, so the
    # requested 30 lags are capped for small validation folds.
    acf_lags = min(30, len(residuals) - 1)
    plot_acf(residuals, lags=acf_lags, alpha=0.05, ax=acf_ax)
    plt.title(f"Fold {fold + 1}: Autocorrelation of Residuals", fontsize=14)
    plt.xlabel("Lag", fontsize=12)
    plt.ylabel("Autocorrelation", fontsize=12)
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig(f'{save_dir}/autocorrelation_residuals.png', bbox_inches='tight', dpi=300)
    plt.show()
    
# ==========================================
# 10. Aggregating Cross-Validation Results
# ==========================================

print("\n########### Cross-Validation Results ###########")

# Build one summary table and use it for both the console output and the
# CSV so the two always agree. pandas' 'std' is the sample standard
# deviation (ddof=1); the console previously used np.std (ddof=0) and
# therefore reported a different spread than the saved file.
metrics_df = pd.DataFrame(fold_metrics)
metrics_summary = metrics_df.agg(['mean', 'std']).transpose().reset_index()
metrics_summary.columns = ['Metric', 'Mean', 'Std']

for metric, mean_score, std_score in metrics_summary.itertuples(index=False, name=None):
    print(f"{metric.upper()}: Mean = {mean_score:.4f}, Std = {std_score:.4f}")

# Save the aggregated metrics to a CSV
metrics_summary.to_csv(os.path.join(output_base_dir, 'cross_validation_metrics.csv'), index=False)
print("\nCross-validation metrics saved to 'cross_validation_metrics.csv'")

# Turn the pooled per-fold lists into numpy arrays for the metrics and plots below
all_predictions_unscaled = np.array(all_predictions_unscaled)
all_actual_values_unscaled = np.array(all_actual_values_unscaled)

# Overall metrics on the pooled validation predictions of all folds
mse, mae, r2 = (
    score_fn(all_actual_values_unscaled, all_predictions_unscaled)
    for score_fn in (mean_squared_error, mean_absolute_error, r2_score)
)
print("\n########### Overall Metrics on Combined Data ###########")
print(f"MSE: {mse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R2 Score: {r2:.4f}")

# Save the overall metrics to a single-row CSV file
overall_metrics_path = os.path.join(output_base_dir, 'overall_metrics.csv')
overall_metrics_df = pd.DataFrame({'MSE': [mse], 'MAE': [mae], 'R2': [r2]})
overall_metrics_df.to_csv(overall_metrics_path, index=False)
print("\nOverall metrics saved to 'overall_metrics.csv'")

# ==========================================
# 11. Combined Visualization
# ==========================================

combined_savefig_options = dict(bbox_inches='tight', dpi=300)

# 1. Combined actual vs predicted scatter; the dashed red diagonal marks perfect predictions
plt.figure(figsize=(10, 7))
sns.scatterplot(x=all_actual_values_unscaled, y=all_predictions_unscaled, alpha=0.5)
combined_diagonal = [all_actual_values_unscaled.min(), all_actual_values_unscaled.max()]
plt.plot(combined_diagonal, combined_diagonal, color='red', linestyle='--', label="Ideal Prediction")
plt.xlabel("Actual UTS Values", fontsize=15)
plt.ylabel("Predicted UTS Values", fontsize=15)
plt.title("Combined: Actual vs Predicted UTS Values", fontsize=18)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(os.path.join(output_base_dir, 'combined_actual_vs_predicted_UTS.png'), **combined_savefig_options)
plt.show()

# Residuals of the pooled predictions in original units
combined_residuals = all_actual_values_unscaled - all_predictions_unscaled
combined_residual_label = "Residuals (Actual - Predicted)"

# 2. Combined residual box plot overlaid with the individual residuals
plt.figure(figsize=(10, 4))
sns.boxplot(x=combined_residuals, color='lightblue', fliersize=0)
sns.stripplot(x=combined_residuals, color='green', alpha=0.5, size=4, jitter=True, label='Residuals')
plt.xlabel(combined_residual_label, fontsize=12)
plt.title("Combined: Residuals Box Plot with Individual Points", fontsize=14)
plt.legend()
plt.grid(True, axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(output_base_dir, 'combined_residuals_box_plot_seaborn.png'), **combined_savefig_options)
plt.show()

# 3. Combined residual distribution: density histogram plus kernel density estimate
plt.figure(figsize=(10, 6))
plt.hist(combined_residuals, bins=30, alpha=0.7, color='skyblue', edgecolor='black', density=True, label='Histogram')
sns.kdeplot(combined_residuals, color='red', linewidth=2, label='KDE')
plt.xlabel(combined_residual_label, fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.title("Combined: Residuals Distribution", fontsize=14)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(os.path.join(output_base_dir, 'combined_residuals_distribution.png'), **combined_savefig_options)
plt.show()

# 4. Combined residuals against predicted values, with a zero reference line
plt.figure(figsize=(10, 6))
sns.scatterplot(x=all_predictions_unscaled, y=combined_residuals, alpha=0.5)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel("Predicted Values", fontsize=12)
plt.ylabel(combined_residual_label, fontsize=12)
plt.title("Combined: Residuals vs. Predicted Values", fontsize=14)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(output_base_dir, 'combined_residuals_vs_predicted.png'), **combined_savefig_options)
plt.show()

# 5. Combined Autocorrelation of Residuals
# plot_acf opens a figure of its own unless it is handed an axis. Without
# ax=..., the sized figure created here stayed empty and the saved plot had
# the default size, so the axis is now passed explicitly.
combined_acf_fig, combined_acf_ax = plt.subplots(figsize=(10, 6))
# An autocorrelation at lag k needs at least k + 1 observations, so the
# requested 30 lags are capped for very small datasets.
combined_acf_lags = min(30, len(combined_residuals) - 1)
plot_acf(combined_residuals, lags=combined_acf_lags, alpha=0.05, ax=combined_acf_ax)
plt.title("Combined: Autocorrelation of Residuals", fontsize=14)
plt.xlabel("Lag", fontsize=12)
plt.ylabel("Autocorrelation", fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(output_base_dir, 'combined_autocorrelation_residuals.png'), bbox_inches='tight', dpi=300)
plt.show()

# # ==========================================
# # End of Script
# # ==========================================


In [None]:
###################### Attention Map  ####################################

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel
import re

# Load your trained model and tokenizer
# NOTE(review): the training cell saves each fold's model under a fold_N
# sub-directory of output_base_dir; './' only works if a model and tokenizer
# were placed there - confirm the intended path.
model_path = './'
tokenizer = AutoTokenizer.from_pretrained(model_path)

# eval() returns the module itself, so loading and switching to evaluation
# mode can be chained
model = AutoModel.from_pretrained(model_path, output_attentions=True).eval()

# Input text
input_text = "Co1 Cr1 Fe1 Mn1 Ni1 V1"

# Tokenize with offset mappings (character span of every token)
inputs = tokenizer(
    input_text,
    add_special_tokens=True,
    return_offsets_mapping=True,
    return_tensors="pt",
)
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# The model receives only ids and mask; 'offset_mapping' is kept out of the call
model_inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

# Get Attention Weights
outputs = model(**model_inputs)
attentions = outputs.attentions  # Attention weights for each layer

# Last layer only: drop the batch dimension, then average over the leading
# remaining axis (aggregating across all heads for simplicity)
last_layer_attention = attentions[-1].squeeze(0)
avg_attention = last_layer_attention.mean(0).detach().numpy()

# Exclude the first and last positions ([CLS] and [SEP]) from the attention
# matrix, the offsets and the token list alike
avg_attention = avg_attention[1:-1, 1:-1]
offset_mapping = inputs['offset_mapping'][0][1:-1]
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])[1:-1]

# Whitespace-separated components of the input text
components = input_text.strip().split()

# Character span (start, end) of every component inside input_text
component_spans = []
search_from = 0
for component in components:
    span_start = input_text.index(component, search_from)
    span_end = span_start + len(component)
    component_spans.append((span_start, span_end))
    search_from = span_end + 1  # Assuming one space between components

# For each token, the index of the first component whose span fully
# contains the token's character span, or -1 if there is none
token_to_component = []
for token_start, token_end in offset_mapping:
    token_start, token_end = token_start.item(), token_end.item()
    owner = next(
        (
            comp_idx
            for comp_idx, (comp_start, comp_end) in enumerate(component_spans)
            if comp_start <= token_start and token_end <= comp_end
        ),
        -1,
    )
    token_to_component.append(owner)

# Inverse lookup: component index -> positions of its tokens
component_to_token_indices = {}
for token_idx, comp_idx in enumerate(token_to_component):
    if comp_idx != -1:
        component_to_token_indices.setdefault(comp_idx, []).append(token_idx)

# Extract element symbols from components
def extract_element(component):
    """
    Return the leading run of ASCII letters of ``component``.

    If ``component`` does not start with a letter, it is returned unchanged.
    """
    leading_letters = re.match(r"([A-Za-z]+)", component)
    return leading_letters.group(1) if leading_letters else component

component_elements = list(map(extract_element, components))

# Element symbol -> indices of the components that carry it
element_to_component_indices = {}
for idx, elem in enumerate(component_elements):
    element_to_component_indices.setdefault(elem, []).append(idx)

# Element symbol -> positions of all tokens belonging to its components
element_to_token_indices = {
    elem: [
        token_idx
        for comp_idx in comp_indices
        for token_idx in component_to_token_indices.get(comp_idx, [])
    ]
    for elem, comp_indices in element_to_component_indices.items()
}

# Unique elements, in order of first appearance
unique_elements = list(element_to_token_indices.keys())
n_elements = len(unique_elements)

# Element-by-element attention: each cell is the mean attention over all
# token pairs of the two elements; cells without tokens keep their initial 0
reduced_attention = np.zeros((n_elements, n_elements))
for i, elem_i in enumerate(unique_elements):
    row_tokens = element_to_token_indices[elem_i]
    for j, elem_j in enumerate(unique_elements):
        col_tokens = element_to_token_indices[elem_j]
        if not (row_tokens and col_tokens):
            continue
        reduced_attention[i, j] = avg_attention[np.ix_(row_tokens, col_tokens)].mean()

# Symmetrize: every cell becomes the mean of itself and its mirrored cell
symmetric_attention = 0.5 * (reduced_attention + reduced_attention.T)

# Zero the diagonal so self-pairs do not show up in the map
np.fill_diagonal(symmetric_attention, 0)

# Visualize as an annotated heatmap labelled with the element symbols
heatmap_options = dict(
    annot=True,
    xticklabels=unique_elements,
    yticklabels=unique_elements,
    cmap="viridis",
)
sns.heatmap(symmetric_attention, **heatmap_options)
plt.title("Symmetric Attention Map (Self-Pairs Masked)")
plt.show()
