### An experiment on how an ANN with simple attention weights works


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the Long Method software-metrics table and separate features from the target.
LONG_METHOD_dataset = pd.read_csv("../../../dataset-source/embedding-dataset/software-metrics/LongMethod_code_metrics_values.csv")
# Every column except the sample identifier and the label is used as an input feature.
X_LONG_METHOD_dataset = LONG_METHOD_dataset.drop(['sample_id','label'], axis=1).values
y_LONG_METHOD_dataset = LONG_METHOD_dataset['label'].values
print(f"Dataset shape: X={X_LONG_METHOD_dataset.shape}, y={y_LONG_METHOD_dataset.shape}")

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_LONG_METHOD_dataset,
    y_LONG_METHOD_dataset,
    test_size=0.2,
    random_state=42,
)

# Turn the NumPy arrays into float tensors; targets are reshaped to column
# vectors of shape (n, 1) so they line up with the model's single output.
X_train, X_test = torch.FloatTensor(X_train), torch.FloatTensor(X_test)
y_train = torch.FloatTensor(y_train).view(-1, 1)
y_test = torch.FloatTensor(y_test).view(-1, 1)

# Width of the feature matrix (26 for this dataset, per the printed shape).
input_features = X_LONG_METHOD_dataset.shape[1]


Dataset shape: X=(2408, 26), y=(2408,)


In [2]:
class SimpleFeatureAttention(nn.Module):
    """
    Per-feature soft attention gate.

    A small two-layer network maps the input vector to one score per input
    feature; a softmax over the feature axis turns those scores into
    non-negative weights that sum to 1.0 for each sample. The input is then
    rescaled element-wise by these weights.

    Args:
        input_features: Number of features in each input vector (size of
            the last dimension of the input).
        attention_dim: Width of the hidden layer of the scoring network.
    """
    def __init__(self, input_features, attention_dim=32):
        super().__init__()
        self.attention_layer = nn.Sequential(
            nn.Linear(input_features, attention_dim),
            nn.Tanh(),
            nn.Linear(attention_dim, input_features),
            # Normalise over the feature axis, which is always the last one.
            # Using -1 instead of 1 gives the same result for (batch, features)
            # input and also accepts a single unbatched sample of shape
            # (features,).
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """
        Compute attention weights for ``x`` and apply them.

        Args:
            x: Tensor whose last dimension has size ``input_features``,
                e.g. (batch, input_features) or (input_features,).

        Returns:
            Tuple ``(weighted_features, attention_weights)``, both with the
            same shape as ``x``. ``attention_weights`` sums to 1 along the
            last dimension.
        """
        attention_weights = self.attention_layer(x)
        # Element-wise rescaling of every feature by its attention weight.
        weighted_features = x * attention_weights
        return weighted_features, attention_weights


In [None]:
class ANN_Model_With_Attention(nn.Module):
    """
    Binary classifier: a feature-attention gate followed by a two-hidden-layer MLP.

    The input is first rescaled by ``SimpleFeatureAttention``; the result goes
    through two ReLU hidden layers with dropout and a single sigmoid output
    unit, so ``forward`` returns a probability per sample.

    Args:
        input_features: Number of input features per sample.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.
        attention_dim: Hidden width of the attention scoring network.
        dropout: Dropout probability applied after each hidden layer.

    Attributes:
        last_attention_weights: Attention weights produced by the most recent
            ``forward`` call, detached from the autograd graph; ``None`` until
            the first forward pass.
    """
    def __init__(self, input_features=26, hidden1=64, hidden2=32,
                 attention_dim=32, dropout=0.3):
        super().__init__()

        # Attention mechanism used for explainability.
        self.attention = SimpleFeatureAttention(input_features, attention_dim=attention_dim)

        # Classifier layers.
        self.f_connected1 = nn.Linear(input_features, hidden1)
        self.dropout1 = nn.Dropout(dropout)
        self.f_connected2 = nn.Linear(hidden1, hidden2)
        self.dropout2 = nn.Dropout(dropout)
        self.out = nn.Linear(hidden2, 1)  # single output unit for binary classification

        # Cache of the latest attention weights, for later inspection.
        self.last_attention_weights = None

    def forward(self, x):
        """
        Run the attention gate and the classifier.

        Args:
            x: Input tensor whose last dimension has size ``input_features``.

        Returns:
            Sigmoid probabilities with a trailing dimension of size 1.
        """
        attended_x, attention_weights = self.attention(x)
        # Cache a detached copy: the cached tensor is only read for analysis,
        # and keeping the graph-attached tensor would hold the autograd graph
        # of the last training step alive. Gradients still flow through
        # ``attended_x`` unchanged.
        self.last_attention_weights = attention_weights.detach()

        x = F.relu(self.f_connected1(attended_x))
        x = self.dropout1(x)
        x = F.relu(self.f_connected2(x))
        x = self.dropout2(x)
        x = torch.sigmoid(self.out(x))
        return x

    def get_feature_importance(self):
        """
        Average the cached attention weights over the batch dimension.

        Returns:
            Tensor with one mean weight per feature (mean over dim 0 of the
            weights from the last forward pass), or ``None`` if ``forward``
            has not been called yet.
        """
        if self.last_attention_weights is None:
            return None
        # The cached weights are already detached, so the mean carries no graph.
        return self.last_attention_weights.mean(dim=0)


In [None]:
# Seed before constructing the model so the initial weights are reproducible.
torch.manual_seed(20)
model = ANN_Model_With_Attention(input_features=input_features, hidden1=64, hidden2=32)

# Binary cross-entropy on the model's sigmoid probabilities, optimised with Adam.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 500
final_losses = []  # training loss of every epoch, in order

print("Training started...")
model.train()  # training mode: dropout is active

for i in range(epochs):
    # Discard gradients left over from the previous update.
    optimizer.zero_grad()

    # Full-batch forward pass over the whole training set.
    y_pred = model(X_train)
    loss = loss_fn(y_pred, y_train)

    # Backpropagate the loss and apply one optimiser step.
    loss.backward()
    optimizer.step()

    # Record the loss of this epoch and report progress every 50 epochs.
    final_losses.append(loss.item())
    if (i + 1) % 50 == 0:
        print(f"Epoch {i+1}/{epochs} - Loss: {loss.item():.6f}")

print("Training completed!")


Training started...
Epoch 50/500 - Loss: 0.399996
Epoch 100/500 - Loss: 0.172375
Epoch 150/500 - Loss: 0.107078
Epoch 200/500 - Loss: 0.086150
Epoch 250/500 - Loss: 0.072311
Epoch 300/500 - Loss: 0.057973
Epoch 350/500 - Loss: 0.050287
Epoch 400/500 - Loss: 0.040403
Epoch 450/500 - Loss: 0.035104
Epoch 500/500 - Loss: 0.027991
Training completed!


In [None]:
# Evaluate on the held-out test set.
# Evaluation mode disables dropout, so repeated forward passes over the same
# input give the same predictions. (It would also switch batch-norm layers to
# their running statistics, but this model has none.)
model.eval()

# Gradients are not needed for inference; skipping their tracking saves memory.
with torch.no_grad():
    y_test_pred = model(X_test)
    test_loss = loss_fn(y_test_pred, y_test)
    # Threshold the predicted probabilities at 0.5 to obtain hard class labels.
    y_test_pred_class = y_test_pred.ge(0.5).float()
    # Fraction of test samples whose predicted class matches the true label.
    accuracy = y_test_pred_class.eq(y_test).float().mean()

print(f"\nTest Loss: {test_loss.item():.6f}")
print(f"Test Accuracy: {accuracy.item()*100:.2f}%")





Test Loss: 0.655094
Test Accuracy: 90.66%

FEATURE IMPORTANCE (from Attention Weights)

Top 10 Most Important Features:
1. uniqueWordsQty: 0.418209
2. loc: 0.265773
3. variablesQty: 0.125396
4. comparisonsQty: 0.012094
5. modifiers: 0.011792
6. stringLiteralsQty: 0.011322
7. maxNestedBlocksQty: 0.011115
8. parametersQty: 0.010793
9. assignmentsQty: 0.010504
10. wmc: 0.010406


In [None]:
print("\n" + "="*70)
print("FEATURE IMPORTANCE (from Attention Weights)")
print("="*70)

# Column names in the same order as the columns of the feature matrix.
feature_names = LONG_METHOD_dataset.drop(['sample_id', 'label'], axis=1).columns

# One forward pass over the test set refreshes the attention weights cached
# on the model; they are then averaged over all test samples.
with torch.no_grad():
    _ = model(X_test)
    avg_attention = model.get_feature_importance()

# Rank features by mean attention weight, largest first.
feature_importance = avg_attention.numpy()
sorted_indices = np.argsort(feature_importance)[::-1]

print("\nTop 10 Most Important Features:")
for rank, idx in enumerate(sorted_indices[:10], start=1):
    print(f"{rank}. {feature_names[idx]}: {feature_importance[idx]:.6f}")

In [None]:
print("\n" + "="*70)
print("ATTENTION WEIGHTS FOR INDIVIDUAL SAMPLES")
print("="*70)

# Score the first three test rows. The forward pass also refreshes the
# attention weights cached on the model, which are read back right after.
with torch.no_grad():
    sample_predictions = model(X_test[:3])
    sample_attention = model.last_attention_weights[:3]

# Per-sample report: true label, predicted probability / class, and the five
# features with the largest attention weight. `feature_names` is the column
# index built in the feature-importance cell.
for i in range(3):
    print(f"\nSample {i+1}:")
    print(f"  True Label: {y_test[i].item():.0f}")
    print(f"  Predicted: {sample_predictions[i].item():.4f} (class: {(sample_predictions[i] >= 0.5).float().item():.0f})")

    # Indices of this sample's attention weights in descending order, top five kept.
    top5_idx = torch.argsort(sample_attention[i], descending=True)[:5]
    print(f"  Top 5 Features Influencing This Prediction:")
    for rank, idx in enumerate(top5_idx, start=1):
        print(f"    {rank}. {feature_names[idx.item()]}: {sample_attention[i][idx].item():.4f}")



ATTENTION WEIGHTS FOR INDIVIDUAL SAMPLES

Sample 1:
  True Label: 0
  Predicted: 0.0030 (class: 0)
  Top 5 Features Influencing This Prediction:
    1. uniqueWordsQty: 0.4764
    2. loc: 0.3107
    3. variablesQty: 0.0753
    4. modifiers: 0.0098
    5. stringLiteralsQty: 0.0098

Sample 2:
  True Label: 0
  Predicted: 0.0009 (class: 0)
  Top 5 Features Influencing This Prediction:
    1. uniqueWordsQty: 0.4014
    2. loc: 0.2887
    3. variablesQty: 0.0501
    4. comparisonsQty: 0.0189
    5. modifiers: 0.0189

Sample 3:
  True Label: 0
  Predicted: 0.0003 (class: 0)
  Top 5 Features Influencing This Prediction:
    1. uniqueWordsQty: 0.5994
    2. loc: 0.1164
    3. variablesQty: 0.0283
    4. methodsInvokedIndirectLocalQty: 0.0171
    5. parametersQty: 0.0155
